/brz/remove-bazaar : contents of parser.py at revision 0.64.77

: (revision 0.64.77)

To get this branch, use:

bzr branch
http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar

# Copyright (C) 2008 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

"""Parser of import data into command objects.

In order to reuse existing front-ends, the stream format is a subset of
the one used by git-fast-import (as of the 1.5.4 release of git at least).
The grammar is:

  stream ::= cmd*;

  cmd ::= new_blob
        | new_commit
        | new_tag
        | reset_branch
        | checkpoint
        | progress
        ;

  new_blob ::= 'blob' lf
    mark?
    file_content;
  file_content ::= data;

  new_commit ::= 'commit' sp ref_str lf
    mark?
    ('author' sp name '<' email '>' when lf)?
    'committer' sp name '<' email '>' when lf
    commit_msg
    ('from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)?
    ('merge' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)*
    file_change*
    lf?;
  commit_msg ::= data;

  file_change ::= file_clr
    | file_del
    | file_rnm
    | file_cpy
    | file_obm
    | file_inm;
  file_clr ::= 'deleteall' lf;
  file_del ::= 'D' sp path_str lf;
  file_rnm ::= 'R' sp path_str sp path_str lf;
  file_cpy ::= 'C' sp path_str sp path_str lf;
  file_obm ::= 'M' sp mode sp (hexsha1 | idnum) sp path_str lf;
  file_inm ::= 'M' sp mode sp 'inline' sp path_str lf
    data;

  new_tag ::= 'tag' sp tag_str lf
    'from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf
    'tagger' sp name '<' email '>' when lf
    tag_msg;
  tag_msg ::= data;

  reset_branch ::= 'reset' sp ref_str lf
    ('from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)?
    lf?;

  checkpoint ::= 'checkpoint' lf
    lf?;

  progress ::= 'progress' sp not_lf* lf
    lf?;

     # note: the first idnum in a stream should be 1 and subsequent
     # idnums should not have gaps between values as this will cause
     # the stream parser to reserve space for the gapped values.  An
     # idnum can be updated in the future to a new object by issuing
     # a new mark directive with the old idnum.
     #
  mark ::= 'mark' sp idnum lf;
  data ::= (delimited_data | exact_data)
    lf?;

    # note: delim may be any string but must not contain lf.
    # data_line may contain any data but must not be exactly
    # delim.
  delimited_data ::= 'data' sp '<<' delim lf
    (data_line lf)*
    delim lf;

     # note: declen indicates the length of binary_data in bytes.
     # declen does not include the lf preceding the binary data.
     #
  exact_data ::= 'data' sp declen lf
    binary_data;

     # note: quoted strings are C-style quoting supporting \c for
     # common escapes of 'c' (e.g. \n, \t, \\, \") or \nnn where nnn
     # is the signed byte value in octal.  Note that the only
     # characters which must actually be escaped to protect the
     # stream formatting is: \, " and LF.  Otherwise these values
     # are UTF8.
     #
  ref_str     ::= ref;
  sha1exp_str ::= sha1exp;
  tag_str     ::= tag;
  path_str    ::= path    | '"' quoted(path)    '"' ;
  mode        ::= '100644' | '644'
                | '100755' | '755'
                | '120000'
                ;

  declen ::= # unsigned 32 bit value, ascii base10 notation;
  bigint ::= # unsigned integer value, ascii base10 notation;
  binary_data ::= # file content, not interpreted;

  when         ::= raw_when | rfc2822_when;
  raw_when     ::= ts sp tz;
  rfc2822_when ::= # Valid RFC 2822 date and time;

  sp ::= # ASCII space character;
  lf ::= # ASCII newline (LF) character;

     # note: a colon (':') must precede the numerical value assigned to
     # an idnum.  This is to distinguish it from a ref or tag name as
     # GIT does not permit ':' in ref or tag strings.
     #
  idnum   ::= ':' bigint;
  path    ::= # GIT style file path, e.g. "a/b/c";
  ref     ::= # GIT ref name, e.g. "refs/heads/MOZ_GECKO_EXPERIMENT";
  tag     ::= # GIT tag name, e.g. "FIREFOX_1_5";
  sha1exp ::= # Any valid GIT SHA1 expression;
  hexsha1 ::= # SHA1 in hexadecimal format;

     # note: name and email are UTF8 strings, however name must not
     # contain '<' or lf and email must not contain any of the
     # following: '<', '>', lf.
     #
  name  ::= # valid GIT author/committer name;
  email ::= # valid GIT author/committer email;
  ts    ::= # time since the epoch in seconds, ascii base10 notation;
  tz    ::= # GIT style timezone;

     # note: comments may appear anywhere in the input, except
     # within a data command.  Any form of the data command
     # always escapes the related input from comment processing.
     #
     # In case it is not clear, the '#' that starts the comment
     # must be the first character on that line (an lf must have
     # preceded it).
     #
  comment ::= '#' not_lf* lf;
  not_lf  ::= # Any byte that is not ASCII newline (LF);
"""


import re
import sys

import commands
import dates
import errors


## Stream parsing ##

class LineBasedParser(object):
    """Line-oriented reader that keeps track of the current line number.

    Lines may be pushed back with push_line(); readline() and next_line()
    return pushed-back lines before reading from the input again, while
    read_bytes() and read_until() always read from the input directly.
    """

    def __init__(self, input):
        """A Parser that keeps track of line numbers.

        :param input: the file-like object to read from
        """
        self.input = input
        self.lineno = 0
        # Lines pushed back onto the input stream
        self._buffer = []

    def abort(self, exception, *args):
        """Raise an exception providing line number information.

        :param exception: the exception class to raise; it is called with
            the current line number followed by args
        """
        raise exception(self.lineno, *args)

    def readline(self):
        """Get the next line including the newline or '' on EOF.

        The line counter is advanced on every call, including at EOF.
        """
        self.lineno += 1
        if self._buffer:
            return self._buffer.pop()
        return self.input.readline()

    def next_line(self):
        """Get the next line without the newline or None on EOF."""
        line = self.readline()
        if not line:
            return None
        # Only strip a real newline: the last line of a stream may not
        # have one and must not lose its final character.
        if line.endswith('\n'):
            return line[:-1]
        return line

    def push_line(self, line):
        """Push line back onto the line buffer.

        :param line: the line with no trailing newline
        """
        self.lineno -= 1
        self._buffer.append(line + "\n")

    def read_bytes(self, count):
        """Read a given number of bytes from the input stream.

        Throws MissingBytes if the bytes are not found.

        Note: This method does not read from the line buffer.

        :param count: the number of bytes to read
        :return: a string
        """
        data = ''
        remaining = count
        while remaining > 0:
            chunk = self.input.readline(remaining)
            if not chunk:
                # EOF before the requested amount was available
                break
            remaining -= len(chunk)
            data += chunk
            # Every complete line consumed advances the line counter
            if chunk.endswith('\n'):
                self.lineno += 1
        found = count - remaining
        if found != count:
            self.abort(errors.MissingBytes, count, found)
        return data

    def read_until(self, terminator):
        """Read the input stream until the terminator is found.

        The terminator must appear on a line of its own, i.e. a line that
        is exactly the terminator followed by a newline.

        Throws MissingTerminator if the terminator is not found.

        Note: This method does not read from the line buffer.

        :param terminator: the delimiter string, without a newline
        :return: the bytes read up to but excluding the terminator.
        """
        end_marker = terminator + '\n'
        lines = []
        while True:
            line = self.input.readline()
            if not line:
                # EOF reached without ever seeing the terminator line
                self.abort(errors.MissingTerminator, terminator)
            if line.endswith('\n'):
                self.lineno += 1
            if line == end_marker:
                return ''.join(lines)
            lines.append(line)


# Regular expression used for parsing "name <email> when" strings.
# Group 1 is the name: any run of characters other than '<', so it may be
# empty (the spec states that the name part should be non-empty but
# git-fast-export doesn't always do that) and may contain several words.
# Because the name group runs right up to the '<', the space before the
# email is optional too (git-fast-import code says it is optional); when
# present it ends up as a trailing space in group 1.
# Group 2 is the email and group 3, after '> ', is the date string.
_WHO_AND_WHEN_RE = re.compile(r'([^<]*)<(.+)> (.+)')


class ImportParser(LineBasedParser):
    """Parser turning a fast-import stream into command objects."""

    def __init__(self, input, verbose=False, output=sys.stdout):
        """A Parser of import commands.

        :param input: the file-like object to read from
        :param verbose: display extra information or not
        :param output: the file-like object to write messages to (YAGNI?)
        """
        LineBasedParser.__init__(self, input)
        self.verbose = verbose
        self.output = output
        # We auto-detect the date format when a date is first encountered
        self.date_parser = None

    def iter_commands(self):
        """Iterator returning ImportCommand objects.

        Blank lines and lines starting with '#' are skipped. A line that
        is not a recognised command aborts with InvalidCommand.
        """
        while True:
            line = self.next_line()
            if line is None:
                break
            elif len(line) == 0 or line.startswith('#'):
                continue
            # Search for commands in order of likelihood
            elif line.startswith('commit '):
                yield self._parse_commit(line[len('commit '):])
            elif line.startswith('blob'):
                yield self._parse_blob()
            elif line.startswith('progress '):
                yield commands.ProgressCommand(line[len('progress '):])
            elif line.startswith('reset '):
                yield self._parse_reset(line[len('reset '):])
            elif line.startswith('tag '):
                yield self._parse_tag(line[len('tag '):])
            elif line.startswith('checkpoint'):
                yield commands.CheckpointCommand()
            else:
                self.abort(errors.InvalidCommand, line)

    def iter_file_commands(self):
        """Iterator returning FileCommand objects.

        If an invalid file command is found, the line is silently
        pushed back and iteration ends.
        """
        while True:
            line = self.next_line()
            if line is None:
                break
            elif len(line) == 0 or line.startswith('#'):
                continue
            # Search for file commands in order of likelihood
            elif line.startswith('M '):
                yield self._parse_file_modify(line[2:])
            elif line.startswith('D '):
                path = self._path(line[2:])
                yield commands.FileDeleteCommand(path)
            elif line.startswith('R '):
                old, new = self._path_pair(line[2:])
                yield commands.FileRenameCommand(old, new)
            elif line.startswith('C '):
                src, dest = self._path_pair(line[2:])
                # NOTE(review): 'C' is a copy in the grammar (file_cpy) yet
                # a rename command is produced here - confirm whether the
                # commands module offers a copy command to use instead.
                yield commands.FileRenameCommand(src, dest)
            elif line.startswith('deleteall'):
                yield commands.FileDeleteAllCommand()
            else:
                self.push_line(line)
                break

    def _parse_blob(self):
        """Parse a blob command.

        The line number of the 'blob' line is passed to the command
        along with the optional mark and the data.
        """
        lineno = self.lineno
        mark = self._get_mark_if_any()
        data = self._get_data('blob')
        return commands.BlobCommand(mark, data, lineno)

    def _parse_commit(self, ref):
        """Parse a commit command.

        :param ref: the text following 'commit ' on the command line

        The file commands are not consumed here: the iter_file_commands
        method itself is handed to the CommitCommand.
        """
        lineno = self.lineno
        mark = self._get_mark_if_any()
        author = self._get_user_info('commit', 'author', False)
        committer = self._get_user_info('commit', 'committer')
        message = self._get_data('commit', 'message').decode('utf_8')
        from_ = self._get_from()
        # Collect every consecutive merge section
        merges = []
        merge = self._get_merge()
        while merge is not None:
            merges.append(merge)
            merge = self._get_merge()
        return commands.CommitCommand(ref, mark, author, committer, message,
            from_, merges, self.iter_file_commands, lineno)

    def _parse_file_modify(self, info):
        """Parse a filemodify command within a commit.

        :param info: a string in the format "mode dataref path"
          (where dataref might be the hard-coded literal 'inline').

        Aborts with BadFormat if info does not have all three fields.
        """
        params = info.split(' ', 2)
        if len(params) != 3:
            self.abort(errors.BadFormat, 'filemodify', 'info', info)
        path = self._path(params[2])
        is_executable, is_symlink = self._mode(params[0])
        if is_symlink:
            kind = commands.SYMLINK_KIND
        else:
            kind = commands.FILE_KIND
        if params[1] == 'inline':
            # The content follows immediately as a data section
            dataref = None
            data = self._get_data('filemodify')
        else:
            dataref = params[1]
            data = None
        return commands.FileModifyCommand(path, kind, is_executable, dataref,
            data)

    def _parse_reset(self, ref):
        """Parse a reset command.

        :param ref: the text following 'reset ' on the command line
        """
        from_ = self._get_from()
        return commands.ResetCommand(ref, from_)

    def _parse_tag(self, name):
        """Parse a tag command.

        :param name: the text following 'tag ' on the command line
        """
        from_ = self._get_from('tag')
        tagger = self._get_user_info('tag', 'tagger')
        message = self._get_data('tag', 'message').decode('utf_8')
        return commands.TagCommand(name, from_, tagger, message)

    def _get_mark_if_any(self):
        """Parse a mark section.

        :return: the text after 'mark :' or None if the next line is not
            a mark section (the line is then pushed back) or on EOF.
        """
        line = self.next_line()
        if line is None:
            # EOF: there is nothing to inspect or push back
            return None
        if line.startswith('mark :'):
            return line[len('mark :'):]
        self.push_line(line)
        return None

    def _get_from(self, required_for=None):
        """Parse a from section.

        :param required_for: if set, the name of the command that needs
            this section; MissingSection is raised if it is absent
        :return: the text after 'from ' or None if the section is absent
            and not required (the line read, if any, is pushed back).
        """
        line = self.next_line()
        if line is not None and line.startswith('from '):
            return line[len('from '):]
        if required_for:
            self.abort(errors.MissingSection, required_for, 'from')
        if line is not None:
            self.push_line(line)
        return None

    def _get_merge(self):
        """Parse a merge section.

        :return: the text after 'merge ' or None if the next line is not
            a merge section (the line is then pushed back) or on EOF.
        """
        line = self.next_line()
        if line is None:
            return None
        if line.startswith('merge '):
            return line[len('merge '):]
        self.push_line(line)
        return None

    def _get_user_info(self, cmd, section, required=True):
        """Parse a user section.

        :param cmd: the name of the command being parsed (for errors)
        :param section: the section keyword, e.g. 'author'
        :param required: if True, MissingSection is raised when the
            section is absent
        :return: the result of _who_when() or None if the section is
            absent and not required (the line read, if any, is pushed
            back).
        """
        line = self.next_line()
        prefix = section + ' '
        if line is not None and line.startswith(prefix):
            return self._who_when(line[len(prefix):], cmd, section)
        if required:
            self.abort(errors.MissingSection, cmd, section)
        if line is not None:
            self.push_line(line)
        return None

    def _get_data(self, required_for, section='data'):
        """Parse a data section.

        :param required_for: the name of the command needing the data
        :param section: the section name reported if the data is missing
        :return: the data read. Raises MissingSection if the next line
            is not a data section or the stream has ended.
        """
        line = self.next_line()
        if line is None or not line.startswith('data '):
            self.abort(errors.MissingSection, required_for, section)
        rest = line[len('data '):]
        if rest.startswith('<<'):
            return self.read_until(rest[2:])
        size = int(rest)
        read_bytes = self.read_bytes(size)
        # optional LF after data.
        trailer = self.input.readline()
        if trailer == "\n":
            # The optional LF was present: consume it
            self.lineno += 1
        elif trailer:
            # Not the optional LF, so the line belongs to whatever comes
            # next. Count it, then push it back (which uncounts it). Only
            # a real newline is stripped so a final unterminated line
            # keeps its last character.
            self.lineno += 1
            if trailer.endswith("\n"):
                trailer = trailer[:-1]
            self.push_line(trailer)
        # else: EOF straight after the data, nothing to push back
        return read_bytes

    def _who_when(self, s, cmd, section):
        """Parse who and when information from a string.

        :param s: the text following the section keyword
        :param cmd: the name of the command being parsed (for errors)
        :param section: the name of the section being parsed (for errors)
        :return: a tuple of (name,email,timestamp,timezone). name may be
            the empty string if only an email address was given.
            Raises BadFormat if s cannot be matched.
        """
        match = _WHO_AND_WHEN_RE.search(s)
        if not match:
            self.abort(errors.BadFormat, cmd, section, s)
        datestr = match.group(3)
        if self.date_parser is None:
            # auto-detect the date format
            if len(datestr.split(' ')) == 2:
                format_name = 'raw'
            elif datestr == 'now':
                format_name = 'now'
            else:
                format_name = 'rfc2822'
            self.date_parser = dates.DATE_PARSERS_BY_NAME[format_name]
        when = self.date_parser(datestr)
        name = match.group(1)
        # NOTE(review): the name is only decoded when a trailing space is
        # stripped; a name directly followed by '<' is returned as read.
        if len(name) > 0:
            if name[-1] == " ":
                name = name[:-1].decode('utf_8')
        return (name,match.group(2),when[0],when[1])

    def _path(self, s):
        """Parse a path.

        A path starting with a double quote must also end with one and is
        C-style unquoted; any other path is decoded as UTF-8.
        Raises BadFormat for a quoted path with no closing quote.
        """
        if s.startswith('"'):
            if s[-1] != '"':
                # The enclosing command is not known here, hence '?'
                self.abort(errors.BadFormat, '?', 'path', s)
            else:
                return _unquote_c_string(s[1:-1])
        return s.decode('utf_8')

    def _path_pair(self, s):
        """Parse two paths separated by a space.

        :return: a list of the two paths, each C-style unquoted.
            Raises BadFormat if there is no space separating two paths.
        """
        # TODO: handle a space in the first path
        parts = s.split(' ', 1)
        if len(parts) != 2:
            self.abort(errors.BadFormat, '?', 'path pair', s)
        return [_unquote_c_string(part) for part in parts]

    def _mode(self, s):
        """Parse a file mode into executable and symlink flags.

        :return (is_executable, is_symlink)
            Raises BadFormat if the mode is not recognised.
        """
        # Note: Output from git-fast-export slightly different to spec
        if s in ('644', '100644', '0100644'):
            return False, False
        elif s in ('755', '100755', '0100755'):
            return True, False
        elif s in ('120000', '0120000'):
            return False, True
        else:
            self.abort(errors.BadFormat, 'filemodify', 'mode', s)


def _unquote_c_string(s):
    """replace C-style escape sequences (\n, \", etc.) with real chars."""
    # HACK: Python strings are close enough
    return s.decode('string_escape', 'replace')

0.64.1 by Ian Clatworthy 1st cut: gfi parser + --info processing method	1	# Copyright (C) 2008 Canonical Ltd
	2	#
	3	# This program is free software; you can redistribute it and/or modify
	4	# it under the terms of the GNU General Public License as published by
	5	# the Free Software Foundation; either version 2 of the License, or
	6	# (at your option) any later version.
	7	#
	8	# This program is distributed in the hope that it will be useful,
	9	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	10	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	11	# GNU General Public License for more details.
	12	#
	13	# You should have received a copy of the GNU General Public License
	14	# along with this program; if not, write to the Free Software
	15	# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
	16
	17	"""Parser of import data into command objects.
	18
	19	In order to reuse existing front-ends, the stream format is a subset of
	20	the one used by git-fast-import (as of the 1.5.4 release of git at least).
	21	The grammar is:
	22
	23	stream ::= cmd*;
	24
	25	cmd ::= new_blob
	26	\| new_commit
	27	\| new_tag
	28	\| reset_branch
	29	\| checkpoint
	30	\| progress
	31	;
	32
	33	new_blob ::= 'blob' lf
	34	mark?
	35	file_content;
	36	file_content ::= data;
	37
	38	new_commit ::= 'commit' sp ref_str lf
	39	mark?
	40	('author' sp name '<' email '>' when lf)?
	41	'committer' sp name '<' email '>' when lf
	42	commit_msg
	43	('from' sp (ref_str \| hexsha1 \| sha1exp_str \| idnum) lf)?
	44	('merge' sp (ref_str \| hexsha1 \| sha1exp_str \| idnum) lf)*
	45	file_change*
	46	lf?;
	47	commit_msg ::= data;
	48
	49	file_change ::= file_clr
	50	\| file_del
	51	\| file_rnm
	52	\| file_cpy
	53	\| file_obm
	54	\| file_inm;
	55	file_clr ::= 'deleteall' lf;
	56	file_del ::= 'D' sp path_str lf;
	57	file_rnm ::= 'R' sp path_str sp path_str lf;
	58	file_cpy ::= 'C' sp path_str sp path_str lf;
	59	file_obm ::= 'M' sp mode sp (hexsha1 \| idnum) sp path_str lf;
	60	file_inm ::= 'M' sp mode sp 'inline' sp path_str lf
	61	data;
	62
	63	new_tag ::= 'tag' sp tag_str lf
	64	'from' sp (ref_str \| hexsha1 \| sha1exp_str \| idnum) lf
65	'tagger' sp name '<' email '>' when lf
66	tag_msg;
67	tag_msg ::= data;
68
69	reset_branch ::= 'reset' sp ref_str lf
70	('from' sp (ref_str \| hexsha1 \| sha1exp_str \| idnum) lf)?
71	lf?;
72
73	checkpoint ::= 'checkpoint' lf
74	lf?;
75
76	progress ::= 'progress' sp not_lf* lf
77	lf?;
78
79	# note: the first idnum in a stream should be 1 and subsequent
80	# idnums should not have gaps between values as this will cause
81	# the stream parser to reserve space for the gapped values. An
82	# idnum can be updated in the future to a new object by issuing
83	# a new mark directive with the old idnum.
84	#
85	mark ::= 'mark' sp idnum lf;
86	data ::= (delimited_data \| exact_data)
87	lf?;
88
89	# note: delim may be any string but must not contain lf.
90	# data_line may contain any data but must not be exactly
91	# delim.
92	delimited_data ::= 'data' sp '<<' delim lf
93	(data_line lf)*
94	delim lf;
95
96	# note: declen indicates the length of binary_data in bytes.
97	# declen does not include the lf preceeding the binary data.
98	#
99	exact_data ::= 'data' sp declen lf
100	binary_data;
101
102	# note: quoted strings are C-style quoting supporting \c for
103	# common escapes of 'c' (e..g \n, \t, \\, \") or \nnn where nnn
104	# is the signed byte value in octal. Note that the only
105	# characters which must actually be escaped to protect the
106	# stream formatting is: \, " and LF. Otherwise these values
107	# are UTF8.
108	#
109	ref_str ::= ref;
110	sha1exp_str ::= sha1exp;
111	tag_str ::= tag;
112	path_str ::= path \| '"' quoted(path) '"' ;
113	mode ::= '100644' \| '644'
114	\| '100755' \| '755'
115	\| '120000'
116	;
117
118	declen ::= # unsigned 32 bit value, ascii base10 notation;
119	bigint ::= # unsigned integer value, ascii base10 notation;
120	binary_data ::= # file content, not interpreted;
121
122	when ::= raw_when \| rfc2822_when;
123	raw_when ::= ts sp tz;
124	rfc2822_when ::= # Valid RFC 2822 date and time;
125
126	sp ::= # ASCII space character;
127	lf ::= # ASCII newline (LF) character;
128
129	# note: a colon (':') must precede the numerical value assigned to
130	# an idnum. This is to distinguish it from a ref or tag name as
131	# GIT does not permit ':' in ref or tag strings.
132	#
133	idnum ::= ':' bigint;
134	path ::= # GIT style file path, e.g. "a/b/c";
135	ref ::= # GIT ref name, e.g. "refs/heads/MOZ_GECKO_EXPERIMENT";
136	tag ::= # GIT tag name, e.g. "FIREFOX_1_5";
137	sha1exp ::= # Any valid GIT SHA1 expression;
138	hexsha1 ::= # SHA1 in hexadecimal format;
139
140	# note: name and email are UTF8 strings, however name must not
141	# contain '<' or lf and email must not contain any of the
142	# following: '<', '>', lf.
143	#
144	name ::= # valid GIT author/committer name;
145	email ::= # valid GIT author/committer email;
146	ts ::= # time since the epoch in seconds, ascii base10 notation;
147	tz ::= # GIT style timezone;
148
149	# note: comments may appear anywhere in the input, except
150	# within a data command. Any form of the data command
151	# always escapes the related input from comment processing.
152	#
153	# In case it is not clear, the '#' that starts the comment
154	# must be the first character on that the line (an lf have
155	# preceeded it).
156	#
157	comment ::= '#' not_lf* lf;
158	not_lf ::= # Any byte that is not ASCII newline (LF);
159	"""
160
161
162	import re
163	import sys
164
165	import commands
166	import dates
167	import errors
168
169
170	## Stream parsing ##
171
172	class LineBasedParser(object):
173
174	def __init__(self, input):
175	"""A Parser that keeps track of line numbers.
176
177	:param input: the file-like object to read from
178	"""
179	self.input = input
180	self.lineno = 0
181	# Lines pushed back onto the input stream
182	self._buffer = []
183
184	def abort(self, exception, *args):
185	"""Raise an exception providing line number information."""
186	raise exception(self.lineno, *args)
187
188	def readline(self):
189	"""Get the next line including the newline or '' on EOF."""
190	self.lineno += 1
191	if self._buffer:
192	return self._buffer.pop()
193	else:
194	return self.input.readline()
195
196	def next_line(self):
197	"""Get the next line without the newline or None on EOF."""
198	line = self.readline()
199	if line:
200	return line[:-1]
201	else:
202	return None
203
204	def push_line(self, line):
205	"""Push line back onto the line buffer.
206
207	:param line: the line with no trailing newline
208	"""
209	self.lineno -= 1
210	self._buffer.append(line + "\n")
211
212	def read_bytes(self, count):
213	"""Read a given number of bytes from the input stream.
214
215	Throws MissingBytes if the bytes are not found.
216
217	Note: This method does not read from the line buffer.
0.64.6 by Ian Clatworthy generic processing method working for one revision in one branch	218
	219	:return: a string
0.64.1 by Ian Clatworthy 1st cut: gfi parser + --info processing method	220	"""
0.64.72 by Ian Clatworthy reduce memory usage while reading large blobs	221	lines = ''
0.64.1 by Ian Clatworthy 1st cut: gfi parser + --info processing method	222	left = count
0.64.6 by Ian Clatworthy generic processing method working for one revision in one branch	223	found = 0
0.64.1 by Ian Clatworthy 1st cut: gfi parser + --info processing method	224	while left > 0:
	225	line = self.input.readline(left)
	226	if line:
0.64.6 by Ian Clatworthy generic processing method working for one revision in one branch	227	line_len = len(line)
	228	left -= line_len
	229	found += line_len
0.64.72 by Ian Clatworthy reduce memory usage while reading large blobs	230	lines += line
0.64.53 by Ian Clatworthy fix line number tracking	231	if line.endswith('\n'):
0.64.53 by Ian Clatworthy fix line number tracking	232	self.lineno += 1
0.64.1 by Ian Clatworthy 1st cut: gfi parser + --info processing method	233	else:
	234	left = 0
	235	if found != count:
	236	self.abort(errors.MissingBytes, count, found)
0.64.72 by Ian Clatworthy reduce memory usage while reading large blobs	237	return lines
0.64.1 by Ian Clatworthy 1st cut: gfi parser + --info processing method	238
	239	def read_until(self, terminator):
	240	"""Read the input stream until the terminator is found.
	241
	242	Throws MissingTerminator if the terminator is not found.
	243
	244	Note: This method does not read from the line buffer.
	245
	246	:return: the bytes read up to but excluding the terminator.
	247	"""
	248	raise NotImplementedError(self.read_until)
	249
	250
0.64.3 by Ian Clatworthy tweak parser for better git-fast-export compatibility	251	# Regular expression used for parsing. (Note: The spec states that the name
	252	# part should be non-empty but git-fast-export doesn't always do that so
0.65.2 by James Westby The space between the author and email is optional in committer.	253	# the first bit is \w*, not \w+.) Also git-fast-import code says the
	254	# space before the email is optional.
0.65.5 by James Westby Make the parser handle multiple words in the committer name.	255	_WHO_AND_WHEN_RE = re.compile(r'([^<]*)<(.+)> (.+)')
0.64.1 by Ian Clatworthy 1st cut: gfi parser + --info processing method	256
	257
	258	class ImportParser(LineBasedParser):
	259
	260	def __init__(self, input, verbose=False, output=sys.stdout):
	261	"""A Parser of import commands.
	262
	263	:param input: the file-like object to read from
	264	:param verbose: display extra information of not
	265	:param output: the file-like object to write messages to (YAGNI?)
	266	"""
	267	LineBasedParser.__init__(self, input)
	268	self.verbose = verbose
	269	self.output = output
	270	# We auto-detect the date format when a date is first encountered
	271	self.date_parser = None
	272
	273	def iter_commands(self):
	274	"""Iterator returning ImportCommand objects."""
	275	while True:
	276	line = self.next_line()
	277	if line is None:
	278	break
	279	elif len(line) == 0 or line.startswith('#'):
	280	continue
	281	# Search for commands in order of likelihood
	282	elif line.startswith('commit '):
	283	yield self._parse_commit(line[len('commit '):])
	284	elif line.startswith('blob'):
	285	yield self._parse_blob()
	286	elif line.startswith('progress '):
	287	yield commands.ProgressCommand(line[len('progress '):])
	288	elif line.startswith('reset '):
	289	yield self._parse_reset(line[len('reset '):])
	290	elif line.startswith('tag '):
	291	yield self._parse_tag(line[len('tag '):])
	292	elif line.startswith('checkpoint'):
	293	yield commands.CheckpointCommand()
	294	else:
	295	self.abort(errors.InvalidCommand, line)
	296
	297	def iter_file_commands(self):
	298	"""Iterator returning FileCommand objects.
	299
	300	If an invalid file command is found, the line is silently
	301	pushed back and iteration ends.
	302	"""
	303	while True:
	304	line = self.next_line()
	305	if line is None:
	306	break
	307	elif len(line) == 0 or line.startswith('#'):
	308	continue
	309	# Search for file commands in order of likelihood
	310	elif line.startswith('M '):
	311	yield self._parse_file_modify(line[2:])
	312	elif line.startswith('D '):
	313	path = self._path(line[2:])
	314	yield commands.FileDeleteCommand(path)
	315	elif line.startswith('R '):
	316	old, new = self._path_pair(line[2:])
	317	yield commands.FileRenameCommand(old, new)
	318	elif line.startswith('C '):
	319	src, dest = self._path_pair(line[2:])
320	yield commands.FileRenameCommand(src, dest)
321	elif line.startswith('deleteall'):
322	yield commands.FileDeleteAllCommand()
323	else:
324	self.push_line(line)
325	break
326
327	def _parse_blob(self):
328	"""Parse a blob command."""
0.64.35 by Ian Clatworthy identify unmarked blobs and commits by line numbers	329	lineno = self.lineno
0.64.1 by Ian Clatworthy 1st cut: gfi parser + --info processing method	330	mark = self._get_mark_if_any()
	331	data = self._get_data('blob')
0.64.35 by Ian Clatworthy identify unmarked blobs and commits by line numbers	332	return commands.BlobCommand(mark, data, lineno)
0.64.1 by Ian Clatworthy 1st cut: gfi parser + --info processing method	333
	334	def _parse_commit(self, ref):
	335	"""Parse a commit command."""
0.64.35 by Ian Clatworthy identify unmarked blobs and commits by line numbers	336	lineno = self.lineno
0.64.1 by Ian Clatworthy 1st cut: gfi parser + --info processing method	337	mark = self._get_mark_if_any()
	338	author = self._get_user_info('commit', 'author', False)
	339	committer = self._get_user_info('commit', 'committer')
0.64.76 by Ian Clatworthy fix utf-8 decoding bugs	340	message = self._get_data('commit', 'message').decode('utf_8')
0.64.1 by Ian Clatworthy 1st cut: gfi parser + --info processing method	341	from_ = self._get_from()
0.64.60 by Ian Clatworthy support merges when from clause implicit	342	merges = []
	343	while True:
	344	merge = self._get_merge()
	345	if merge is not None:
	346	merges.append(merge)
	347	else:
	348	break
0.64.1 by Ian Clatworthy 1st cut: gfi parser + --info processing method	349	return commands.CommitCommand(ref, mark, author, committer, message,
0.64.60 by Ian Clatworthy support merges when from clause implicit	350	from_, merges, self.iter_file_commands, lineno)
0.64.1 by Ian Clatworthy 1st cut: gfi parser + --info processing method	351
	352	def _parse_file_modify(self, info):
	353	"""Parse a filemodify command within a commit.
	354
	355	:param info: a string in the format "mode dataref path"
	356	(where dataref might be the hard-coded literal 'inline').
	357	"""
	358	params = info.split(' ', 2)
	359	path = self._path(params[2])
	360	is_executable, is_symlink = self._mode(params[0])
	361	if is_symlink:
	362	kind = commands.SYMLINK_KIND
	363	else:
	364	kind = commands.FILE_KIND
	365	if params[1] == 'inline':
	366	dataref = None
	367	data = self._get_data('filemodify')
	368	else:
	369	dataref = params[1]
	370	data = None
	371	return commands.FileModifyCommand(path, kind, is_executable, dataref,
	372	data)
	373
	374	def _parse_reset(self, ref):
	375	"""Parse a reset command."""
	376	from_ = self._get_from()
	377	return commands.ResetCommand(ref, from_)
	378
	379	def _parse_tag(self, name):
	380	"""Parse a tag command."""
	381	from_ = self._get_from('tag')
	382	tagger = self._get_user_info('tag', 'tagger')
0.64.76 by Ian Clatworthy fix utf-8 decoding bugs	383	message = self._get_data('tag', 'message').decode('utf_8')
0.64.1 by Ian Clatworthy 1st cut: gfi parser + --info processing method	384	return commands.TagCommand(name, from_, tagger, message)
	385
	386	def _get_mark_if_any(self):
	387	"""Parse a mark section."""
	388	line = self.next_line()
	389	if line.startswith('mark :'):
	390	return line[len('mark :'):]
	391	else:
	392	self.push_line(line)
	393	return None
	394
	395	def _get_from(self, required_for=None):
	396	"""Parse a from section."""
	397	line = self.next_line()
	398	if line.startswith('from '):
	399	return line[len('from '):]
	400	elif required_for:
	401	self.abort(errors.MissingSection, required_for, 'from')
	402	else:
	403	self.push_line(line)
	404	return None
	405
	406	def _get_merge(self):
	407	"""Parse a merge section."""
	408	line = self.next_line()
	409	if line.startswith('merge '):
	410	return line[len('merge '):]
	411	else:
	412	self.push_line(line)
	413	return None
	414
	415	def _get_user_info(self, cmd, section, required=True):
	416	"""Parse a user section."""
	417	line = self.next_line()
	418	if line.startswith(section + ' '):
	419	return self._who_when(line[len(section + ' '):], cmd, section)
	420	elif required:
	421	self.abort(errors.MissingSection, cmd, section)
	422	else:
	423	self.push_line(line)
	424	return None
	425
	426	def _get_data(self, required_for, section='data'):
	427	"""Parse a data section."""
	428	line = self.next_line()
	429	if line.startswith('data '):
	430	rest = line[len('data '):]
	431	if rest.startswith('<<'):
	432	return self.read_until(rest[2:])
	433	else:
	434	size = int(rest)
0.65.1 by James Westby The data sections have an optional LF at the end in the byte count format.	435	read_bytes = self.read_bytes(size)
	436	# optional LF after data.
	437	next = self.input.readline()
	438	self.lineno += 1
	439	if len(next) > 1 or next != "\n":
0.65.5 by James Westby Make the parser handle multiple words in the committer name.	440	self.push_line(next[:-1])
0.65.1 by James Westby The data sections have an optional LF at the end in the byte count format.	441	return read_bytes
0.64.1 by Ian Clatworthy 1st cut: gfi parser + --info processing method	442	else:
	443	self.abort(errors.MissingSection, required_for, section)
	444
	445	def _who_when(self, s, cmd, section):
	446	"""Parse who and when information from a string.
	447
0.65.5 by James Westby Make the parser handle multiple words in the committer name.	448	:return: a tuple of (name,email,timestamp,timezone). name may be
	449	the empty string if only an email address was given.
0.64.1 by Ian Clatworthy 1st cut: gfi parser + --info processing method	450	"""
	451	match = _WHO_AND_WHEN_RE.search(s)
	452	if match:
	453	datestr = match.group(3)
	454	if self.date_parser is None:
	455	# auto-detect the date format
0.64.3 by Ian Clatworthy tweak parser for better git-fast-export compatibility	456	if len(datestr.split(' ')) == 2:
0.64.1 by Ian Clatworthy 1st cut: gfi parser + --info processing method	457	format = 'raw'
	458	elif datestr == 'now':
	459	format = 'now'
	460	else:
	461	format = 'rfc2822'
	462	self.date_parser = dates.DATE_PARSERS_BY_NAME[format]
	463	when = self.date_parser(datestr)
0.65.5 by James Westby Make the parser handle multiple words in the committer name.	464	name = match.group(1)
	465	if len(name) > 0:
	466	if name[-1] == " ":
0.64.76 by Ian Clatworthy fix utf-8 decoding bugs	467	name = name[:-1].decode('utf_8')
0.65.5 by James Westby Make the parser handle multiple words in the committer name.	468	return (name,match.group(2),when[0],when[1])
0.64.1 by Ian Clatworthy 1st cut: gfi parser + --info processing method	469	else:
	470	self.abort(errors.BadFormat, cmd, section, s)
	471
	472	def _path(self, s):
	473	"""Parse a path."""
0.64.10 by Ian Clatworthy 1st cut are dequoting paths	474	if s.startswith('"'):
	475	if s[-1] != '"':
	476	self.abort(errors.BadFormat, cmd, section, s)
	477	else:
	478	return _unquote_c_string(s[1:-1])
0.64.76 by Ian Clatworthy fix utf-8 decoding bugs	479	return s.decode('utf_8')
0.64.1 by Ian Clatworthy 1st cut: gfi parser + --info processing method	480
	481	def _path_pair(self, s):
	482	"""Parse two paths separated by a space."""
0.64.10 by Ian Clatworthy 1st cut are dequoting paths	483	# TODO: handle a space in the first path
	484	parts = s.split(' ', 1)
	485	return map(_unquote_c_string, parts)
0.64.1 by Ian Clatworthy 1st cut: gfi parser + --info processing method	486
	487	def _mode(self, s):
	488	"""Parse a file mode into executable and symlink flags.
	489
	490	:return (is_executable, is_symlink)
	491	"""
	492	# Note: Output from git-fast-export slightly different to spec
	493	if s in ['644', '100644', '0100644']:
	494	return False, False
	495	elif s in ['755', '100755', '0100755']:
	496	return True, False
0.64.3 by Ian Clatworthy tweak parser for better git-fast-export compatibility	497	elif s in ['120000', '0120000']:
0.64.1 by Ian Clatworthy 1st cut: gfi parser + --info processing method	498	return False, True
	499	else:
	500	self.abort(errors.BadFormat, 'filemodify', 'mode', s)
	501
0.64.10 by Ian Clatworthy 1st cut are dequoting paths	502
def _unquote_c_string(s):
    r"""Replace C-style escape sequences (\n, \", etc.) with real chars.

    :param s: the string whose escape sequences are to be expanded
    :return: s decoded with the 'string_escape' codec, using the 'replace'
      error handler
    """
    # HACK: Python strings are close enough
    return s.decode('string_escape', 'replace')