/brz/remove-bazaar : contents of parser.py at revision 0.64.87

: (revision 0.64.87)

To get this branch, use:

bzr branch
http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar

# Copyright (C) 2008 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

"""Parser of import data into command objects.

In order to reuse existing front-ends, the stream format is a subset of
the one used by git-fast-import (as of the 1.5.4 release of git at least).
The grammar is:

  stream ::= cmd*;

  cmd ::= new_blob
        | new_commit
        | new_tag
        | reset_branch
        | checkpoint
        | progress
        ;

  new_blob ::= 'blob' lf
    mark?
    file_content;
  file_content ::= data;

  new_commit ::= 'commit' sp ref_str lf
    mark?
    ('author' sp name '<' email '>' when lf)?
    'committer' sp name '<' email '>' when lf
    commit_msg
    ('from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)?
    ('merge' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)*
    file_change*
    lf?;
  commit_msg ::= data;

  file_change ::= file_clr
    | file_del
    | file_rnm
    | file_cpy
    | file_obm
    | file_inm;
  file_clr ::= 'deleteall' lf;
  file_del ::= 'D' sp path_str lf;
  file_rnm ::= 'R' sp path_str sp path_str lf;
  file_cpy ::= 'C' sp path_str sp path_str lf;
  file_obm ::= 'M' sp mode sp (hexsha1 | idnum) sp path_str lf;
  file_inm ::= 'M' sp mode sp 'inline' sp path_str lf
    data;

  new_tag ::= 'tag' sp tag_str lf
    'from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf
    'tagger' sp name '<' email '>' when lf
    tag_msg;
  tag_msg ::= data;

  reset_branch ::= 'reset' sp ref_str lf
    ('from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)?
    lf?;

  checkpoint ::= 'checkpoint' lf
    lf?;

  progress ::= 'progress' sp not_lf* lf
    lf?;

     # note: the first idnum in a stream should be 1 and subsequent
     # idnums should not have gaps between values as this will cause
     # the stream parser to reserve space for the gapped values.  An
     # idnum can be updated in the future to a new object by issuing
     # a new mark directive with the old idnum.
     #
  mark ::= 'mark' sp idnum lf;
  data ::= (delimited_data | exact_data)
    lf?;

    # note: delim may be any string but must not contain lf.
    # data_line may contain any data but must not be exactly
    # delim.
  delimited_data ::= 'data' sp '<<' delim lf
    (data_line lf)*
    delim lf;

     # note: declen indicates the length of binary_data in bytes.
     # declen does not include the lf preceding the binary data.
     #
  exact_data ::= 'data' sp declen lf
    binary_data;

     # note: quoted strings are C-style quoting supporting \c for
     # common escapes of 'c' (e.g. \n, \t, \\, \") or \nnn where nnn
     # is the signed byte value in octal.  Note that the only
     # characters which must actually be escaped to protect the
     # stream formatting is: \, " and LF.  Otherwise these values
     # are UTF8.
     #
  ref_str     ::= ref;
  sha1exp_str ::= sha1exp;
  tag_str     ::= tag;
  path_str    ::= path    | '"' quoted(path)    '"' ;
  mode        ::= '100644' | '644'
                | '100755' | '755'
                | '120000'
                ;

  declen ::= # unsigned 32 bit value, ascii base10 notation;
  bigint ::= # unsigned integer value, ascii base10 notation;
  binary_data ::= # file content, not interpreted;

  when         ::= raw_when | rfc2822_when;
  raw_when     ::= ts sp tz;
  rfc2822_when ::= # Valid RFC 2822 date and time;

  sp ::= # ASCII space character;
  lf ::= # ASCII newline (LF) character;

     # note: a colon (':') must precede the numerical value assigned to
     # an idnum.  This is to distinguish it from a ref or tag name as
     # GIT does not permit ':' in ref or tag strings.
     #
  idnum   ::= ':' bigint;
  path    ::= # GIT style file path, e.g. "a/b/c";
  ref     ::= # GIT ref name, e.g. "refs/heads/MOZ_GECKO_EXPERIMENT";
  tag     ::= # GIT tag name, e.g. "FIREFOX_1_5";
  sha1exp ::= # Any valid GIT SHA1 expression;
  hexsha1 ::= # SHA1 in hexadecimal format;

     # note: name and email are UTF8 strings, however name must not
     # contain '<' or lf and email must not contain any of the
     # following: '<', '>', lf.
     #
  name  ::= # valid GIT author/committer name;
  email ::= # valid GIT author/committer email;
  ts    ::= # time since the epoch in seconds, ascii base10 notation;
  tz    ::= # GIT style timezone;

     # note: comments may appear anywhere in the input, except
     # within a data command.  Any form of the data command
     # always escapes the related input from comment processing.
     #
     # In case it is not clear, the '#' that starts the comment
     # must be the first character on that line (an lf must have
     # preceded it).
     #
  comment ::= '#' not_lf* lf;
  not_lf  ::= # Any byte that is not ASCII newline (LF);
"""


import re
import sys

import commands
import dates
import errors


## Stream parsing ##

class LineBasedParser(object):
    """A parser that reads a stream line by line and tracks line numbers.

    Lines handed to push_line() are returned again by the next call to
    readline() or next_line().  read_bytes() and read_until() read straight
    from the underlying input and never look at pushed-back lines.
    """

    def __init__(self, input):
        """A Parser that keeps track of line numbers.

        :param input: the file-like object to read from
        """
        self.input = input
        self.lineno = 0
        # Lines pushed back onto the input stream
        self._buffer = []

    def abort(self, exception, *args):
        """Raise an exception providing line number information.

        :param exception: the exception class to raise; it is called with
          the current line number followed by args
        """
        raise exception(self.lineno, *args)

    def readline(self):
        """Get the next line including the newline or '' on EOF."""
        self.lineno += 1
        if self._buffer:
            return self._buffer.pop()
        return self.input.readline()

    def next_line(self):
        """Get the next line without the newline or None on EOF."""
        line = self.readline()
        if not line:
            return None
        # The last line of a stream may lack a trailing newline; only strip
        # the final character when it really is the newline.
        if line.endswith('\n'):
            return line[:-1]
        return line

    def push_line(self, line):
        """Push line back onto the line buffer.

        :param line: the line with no trailing newline
        """
        self.lineno -= 1
        self._buffer.append(line + "\n")

    def read_bytes(self, count):
        """Read a given number of bytes from the input stream.

        Throws MissingBytes if the bytes are not found.

        Note: This method does not read from the line buffer.

        :param count: the number of bytes to read
        :return: a string
        """
        # Collect the pieces and join once rather than growing a string
        # piece by piece.
        chunks = []
        found = 0
        while found < count:
            # readline() is used (rather than read()) so that complete
            # lines can be counted for line number tracking.
            chunk = self.input.readline(count - found)
            if not chunk:
                # EOF before the requested amount was available
                break
            found += len(chunk)
            chunks.append(chunk)
            if chunk.endswith('\n'):
                self.lineno += 1
        if found != count:
            self.abort(errors.MissingBytes, count, found)
        return ''.join(chunks)

    def read_until(self, terminator):
        """Read the input stream until the terminator is found.

        The terminator must appear on a line of its own.

        Throws MissingTerminator if the terminator is not found.

        Note: This method does not read from the line buffer.

        :param terminator: the delimiter line, without its newline
        :return: the bytes read up to but excluding the terminator.
        """
        terminator_line = terminator + '\n'
        lines = []
        while True:
            line = self.input.readline()
            if not line:
                # EOF reached without ever seeing the terminator
                self.abort(errors.MissingTerminator, terminator)
            self.lineno += 1
            # Also accept a terminator that is the very last line of the
            # stream and so has no newline after it.
            if line == terminator_line or line == terminator:
                break
            lines.append(line)
        return ''.join(lines)


# Regular expressions used for parsing who/when sections. (Note: The spec
# states that the name part should be non-empty but git-fast-export doesn't
# always do that, so the name group is [^<]*, which may match nothing.)
# The git-fast-import code also says the space before the email is
# optional. Front-ends can emit an empty email ('<>') as well, so the email
# group is allowed to be empty too.
_WHO_AND_WHEN_RE = re.compile(r'([^<]*)<(.*)> (.+)')
_WHO_RE = re.compile(r'([^<]*)<(.*)>')


class ImportParser(LineBasedParser):
    """Parser turning a fast-import stream into command objects."""

    def __init__(self, input, verbose=False, output=sys.stdout):
        """A Parser of import commands.

        :param input: the file-like object to read from
        :param verbose: display extra information or not
        :param output: the file-like object to write messages to (YAGNI?)
        """
        LineBasedParser.__init__(self, input)
        self.verbose = verbose
        self.output = output
        # We auto-detect the date format when a date is first encountered
        self.date_parser = None

    def iter_commands(self):
        """Iterator returning ImportCommand objects.

        Blank lines and comment lines (starting with '#') are skipped.
        Aborts with InvalidCommand when a line is not a known command.
        """
        while True:
            line = self.next_line()
            if line is None:
                break
            elif len(line) == 0 or line.startswith('#'):
                continue
            # Search for commands in order of likelihood
            elif line.startswith('commit '):
                yield self._parse_commit(line[len('commit '):])
            elif line.startswith('blob'):
                yield self._parse_blob()
            elif line.startswith('progress '):
                yield commands.ProgressCommand(line[len('progress '):])
            elif line.startswith('reset '):
                yield self._parse_reset(line[len('reset '):])
            elif line.startswith('tag '):
                yield self._parse_tag(line[len('tag '):])
            elif line.startswith('checkpoint'):
                yield commands.CheckpointCommand()
            else:
                self.abort(errors.InvalidCommand, line)

    def iter_file_commands(self):
        """Iterator returning FileCommand objects.

        If an invalid file command is found, the line is silently
        pushed back and iteration ends.
        """
        while True:
            line = self.next_line()
            if line is None:
                break
            elif len(line) == 0 or line.startswith('#'):
                continue
            # Search for file commands in order of likelihood
            elif line.startswith('M '):
                yield self._parse_file_modify(line[2:])
            elif line.startswith('D '):
                path = self._path(line[2:])
                yield commands.FileDeleteCommand(path)
            elif line.startswith('R '):
                old, new = self._path_pair(line[2:])
                yield commands.FileRenameCommand(old, new)
            elif line.startswith('C '):
                src, dest = self._path_pair(line[2:])
                # A copy must leave the source in place, so it cannot be
                # reported as a rename.
                yield commands.FileCopyCommand(src, dest)
            elif line.startswith('deleteall'):
                yield commands.FileDeleteAllCommand()
            else:
                self.push_line(line)
                break

    def _parse_blob(self):
        """Parse a blob command."""
        # Remember where the command started before reading its sections
        lineno = self.lineno
        mark = self._get_mark_if_any()
        data = self._get_data('blob')
        return commands.BlobCommand(mark, data, lineno)

    def _parse_commit(self, ref):
        """Parse a commit command.

        :param ref: the text following 'commit ' on the command line
        """
        lineno = self.lineno
        mark = self._get_mark_if_any()
        author = self._get_user_info('commit', 'author', False)
        committer = self._get_user_info('commit', 'committer')
        message = self._get_data('commit', 'message')
        try:
            message = message.decode('utf_8')
        except UnicodeDecodeError:
            # TODO: output a warning here about a broken front-end
            pass
        from_ = self._get_from()
        merges = []
        while True:
            merge = self._get_merge()
            if merge is None:
                break
            merges.append(merge)
        # The file commands are not parsed here: the command is handed the
        # iterator method so they are read from the stream on demand.
        return commands.CommitCommand(ref, mark, author, committer, message,
            from_, merges, self.iter_file_commands, lineno)

    def _parse_file_modify(self, info):
        """Parse a filemodify command within a commit.

        :param info: a string in the format "mode dataref path"
          (where dataref might be the hard-coded literal 'inline').
        """
        params = info.split(' ', 2)
        path = self._path(params[2])
        is_executable, is_symlink = self._mode(params[0])
        if is_symlink:
            kind = commands.SYMLINK_KIND
        else:
            kind = commands.FILE_KIND
        if params[1] == 'inline':
            # The content follows the command as a data section
            dataref = None
            data = self._get_data('filemodify')
        else:
            dataref = params[1]
            data = None
        return commands.FileModifyCommand(path, kind, is_executable, dataref,
            data)

    def _parse_reset(self, ref):
        """Parse a reset command.

        :param ref: the text following 'reset ' on the command line
        """
        from_ = self._get_from()
        return commands.ResetCommand(ref, from_)

    def _parse_tag(self, name):
        """Parse a tag command.

        :param name: the text following 'tag ' on the command line
        """
        from_ = self._get_from('tag')
        tagger = self._get_user_info('tag', 'tagger', accept_just_who=True)
        message = self._get_data('tag', 'message')
        try:
            message = message.decode('utf_8')
        except UnicodeDecodeError:
            # Same tolerance as for commit messages.
            # TODO: output a warning here about a broken front-end
            pass
        return commands.TagCommand(name, from_, tagger, message)

    def _get_mark_if_any(self):
        """Parse a mark section.

        :return: the mark id (the text after 'mark :') or None if the next
          line is not a mark section
        """
        line = self.next_line()
        if line is None:
            # EOF: nothing to parse and nothing to push back
            return None
        if line.startswith('mark :'):
            return line[len('mark :'):]
        self.push_line(line)
        return None

    def _get_from(self, required_for=None):
        """Parse a from section.

        :param required_for: if set, the name of the command that requires
          a from section; MissingSection is raised when it is absent
        :return: the text after 'from ' or None if there is no from section
        """
        line = self.next_line()
        if line is not None and line.startswith('from '):
            return line[len('from '):]
        if required_for:
            self.abort(errors.MissingSection, required_for, 'from')
        if line is not None:
            # Not ours: leave the line for the next reader
            self.push_line(line)
        return None

    def _get_merge(self):
        """Parse a merge section.

        :return: the text after 'merge ' or None if the next line is not a
          merge section
        """
        line = self.next_line()
        if line is None:
            return None
        if line.startswith('merge '):
            return line[len('merge '):]
        self.push_line(line)
        return None

    def _get_user_info(self, cmd, section, required=True,
        accept_just_who=False):
        """Parse a user section.

        :param cmd: the name of the command being parsed (for errors)
        :param section: the section keyword, e.g. 'author'
        :param required: if True, MissingSection is raised when the
          section is absent
        :param accept_just_who: passed on to _who_when()
        :return: the tuple returned by _who_when() or None if the section
          is absent and not required
        """
        prefix = section + ' '
        line = self.next_line()
        if line is not None and line.startswith(prefix):
            return self._who_when(line[len(prefix):], cmd, section,
                accept_just_who=accept_just_who)
        if required:
            self.abort(errors.MissingSection, cmd, section)
        if line is not None:
            self.push_line(line)
        return None

    def _get_data(self, required_for, section='data'):
        """Parse a data section.

        :param required_for: the name of the command being parsed
        :param section: the section name reported if the data is missing
        :return: the content of the data section
        """
        line = self.next_line()
        if line is None or not line.startswith('data '):
            self.abort(errors.MissingSection, required_for, section)
        rest = line[len('data '):]
        if rest.startswith('<<'):
            return self.read_until(rest[2:])
        size = int(rest)
        data = self.read_bytes(size)
        # optional LF after data: a blank line is swallowed, anything else
        # belongs to whatever follows and is pushed back. At EOF there is
        # nothing to push back.
        following = self.next_line()
        if following:
            self.push_line(following)
        return data

    def _who_when(self, s, cmd, section, accept_just_who=False):
        """Parse who and when information from a string.

        :param s: the text following the section keyword
        :param cmd: the name of the command being parsed (for errors)
        :param section: the name of the section being parsed (for errors)
        :param accept_just_who: if True, a missing date is tolerated and
          the 'now' date parser supplies the time
        :return: a tuple of (name,email,timestamp,timezone). name may be
            the empty string if only an email address was given.
        """
        match = _WHO_AND_WHEN_RE.search(s)
        if match:
            datestr = match.group(3)
            if self.date_parser is None:
                # auto-detect the date format
                if len(datestr.split(' ')) == 2:
                    date_format = 'raw'
                elif datestr == 'now':
                    date_format = 'now'
                else:
                    date_format = 'rfc2822'
                self.date_parser = dates.DATE_PARSERS_BY_NAME[date_format]
            when = self.date_parser(datestr)
        else:
            match = _WHO_RE.search(s)
            if accept_just_who and match:
                # HACK around missing time
                # TODO: output a warning here
                when = dates.DATE_PARSERS_BY_NAME['now']('now')
            else:
                self.abort(errors.BadFormat, cmd, section, s)
        name = match.group(1)
        # Drop the space separating the name from the '<' of the email
        if name.endswith(' '):
            name = name[:-1]
        if name:
            # Decode whether or not there was a separating space, so the
            # caller always gets the same kind of string for a valid name.
            try:
                name = name.decode('utf_8')
            except UnicodeDecodeError:
                # TODO: output a warning here about a broken front-end
                pass
        return (name, match.group(2), when[0], when[1])

    def _path(self, s):
        """Parse a path.

        :param s: the path as found in the stream, either bare or
          surrounded by double quotes with C-style escapes
        :return: the path with any quoting removed
        """
        if s.startswith('"'):
            if len(s) < 2 or not s.endswith('"'):
                # The command this path belongs to is not known here
                self.abort(errors.BadFormat, '?', 'path', s)
            return _unquote_c_string(s[1:-1])
        return s.decode('utf_8')

    def _path_pair(self, s):
        """Parse two paths separated by a space.

        The first path may only contain a space if it is quoted; the
        second path runs to the end of the string.

        :return: a list of the two paths with any quoting removed
        """
        if s.startswith('"'):
            # A quoted first path ends at the first double quote that is
            # not escaped by a backslash.
            end = 1
            while end < len(s) and s[end] != '"':
                if s[end] == '\\':
                    # skip the escaped character
                    end += 1
                end += 1
            if end >= len(s) or s[end + 1:end + 2] != ' ':
                self.abort(errors.BadFormat, '?', 'path pair', s)
            parts = [s[:end + 1], s[end + 2:]]
        else:
            parts = s.split(' ', 1)
            if len(parts) != 2:
                self.abort(errors.BadFormat, '?', 'path pair', s)
        paths = []
        for part in parts:
            if part.startswith('"'):
                if len(part) < 2 or not part.endswith('"'):
                    self.abort(errors.BadFormat, '?', 'path pair', s)
                # The surrounding quotes are not part of the path
                part = part[1:-1]
            paths.append(_unquote_c_string(part))
        return paths

    def _mode(self, s):
        """Parse a file mode into executable and symlink flags.

        :return (is_executable, is_symlink)
        """
        # Note: Output from git-fast-export slightly different to spec
        if s in ('644', '100644', '0100644'):
            return False, False
        elif s in ('755', '100755', '0100755'):
            return True, False
        elif s in ('120000', '0120000'):
            return False, True
        else:
            self.abort(errors.BadFormat, 'filemodify', 'mode', s)


def _unquote_c_string(s):
    """replace C-style escape sequences (\n, \", etc.) with real chars."""
    # HACK: Python strings are close enough
    return s.decode('string_escape', 'replace')

0.64.1 by Ian Clatworthy 1st cut: gfi parser + --info processing method	1	# Copyright (C) 2008 Canonical Ltd
	2	#
	3	# This program is free software; you can redistribute it and/or modify
	4	# it under the terms of the GNU General Public License as published by
	5	# the Free Software Foundation; either version 2 of the License, or
	6	# (at your option) any later version.
	7	#
	8	# This program is distributed in the hope that it will be useful,
	9	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	10	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	11	# GNU General Public License for more details.
	12	#
	13	# You should have received a copy of the GNU General Public License
	14	# along with this program; if not, write to the Free Software
	15	# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
	16
	17	"""Parser of import data into command objects.
	18
	19	In order to reuse existing front-ends, the stream format is a subset of
	20	the one used by git-fast-import (as of the 1.5.4 release of git at least).
	21	The grammar is:
	22
	23	stream ::= cmd*;
	24
	25	cmd ::= new_blob
	26	\| new_commit
	27	\| new_tag
	28	\| reset_branch
	29	\| checkpoint
	30	\| progress
	31	;
	32
	33	new_blob ::= 'blob' lf
	34	mark?
	35	file_content;
	36	file_content ::= data;
	37
	38	new_commit ::= 'commit' sp ref_str lf
	39	mark?
	40	('author' sp name '<' email '>' when lf)?
	41	'committer' sp name '<' email '>' when lf
	42	commit_msg
	43	('from' sp (ref_str \| hexsha1 \| sha1exp_str \| idnum) lf)?
	44	('merge' sp (ref_str \| hexsha1 \| sha1exp_str \| idnum) lf)*
	45	file_change*
	46	lf?;
	47	commit_msg ::= data;
	48
	49	file_change ::= file_clr
	50	\| file_del
	51	\| file_rnm
	52	\| file_cpy
	53	\| file_obm
	54	\| file_inm;
	55	file_clr ::= 'deleteall' lf;
	56	file_del ::= 'D' sp path_str lf;
	57	file_rnm ::= 'R' sp path_str sp path_str lf;
	58	file_cpy ::= 'C' sp path_str sp path_str lf;
	59	file_obm ::= 'M' sp mode sp (hexsha1 \| idnum) sp path_str lf;
	60	file_inm ::= 'M' sp mode sp 'inline' sp path_str lf
	61	data;
	62
	63	new_tag ::= 'tag' sp tag_str lf
	64	'from' sp (ref_str \| hexsha1 \| sha1exp_str \| idnum) lf
65	'tagger' sp name '<' email '>' when lf
66	tag_msg;
67	tag_msg ::= data;
68
69	reset_branch ::= 'reset' sp ref_str lf
70	('from' sp (ref_str \| hexsha1 \| sha1exp_str \| idnum) lf)?
71	lf?;
72
73	checkpoint ::= 'checkpoint' lf
74	lf?;
75
76	progress ::= 'progress' sp not_lf* lf
77	lf?;
78
79	# note: the first idnum in a stream should be 1 and subsequent
80	# idnums should not have gaps between values as this will cause
81	# the stream parser to reserve space for the gapped values. An
82	# idnum can be updated in the future to a new object by issuing
83	# a new mark directive with the old idnum.
84	#
85	mark ::= 'mark' sp idnum lf;
86	data ::= (delimited_data \| exact_data)
87	lf?;
88
89	# note: delim may be any string but must not contain lf.
90	# data_line may contain any data but must not be exactly
91	# delim.
92	delimited_data ::= 'data' sp '<<' delim lf
93	(data_line lf)*
94	delim lf;
95
96	# note: declen indicates the length of binary_data in bytes.
97	# declen does not include the lf preceeding the binary data.
98	#
99	exact_data ::= 'data' sp declen lf
100	binary_data;
101
102	# note: quoted strings are C-style quoting supporting \c for
103	# common escapes of 'c' (e..g \n, \t, \\, \") or \nnn where nnn
104	# is the signed byte value in octal. Note that the only
105	# characters which must actually be escaped to protect the
106	# stream formatting is: \, " and LF. Otherwise these values
107	# are UTF8.
108	#
109	ref_str ::= ref;
110	sha1exp_str ::= sha1exp;
111	tag_str ::= tag;
112	path_str ::= path \| '"' quoted(path) '"' ;
113	mode ::= '100644' \| '644'
114	\| '100755' \| '755'
115	\| '120000'
116	;
117
118	declen ::= # unsigned 32 bit value, ascii base10 notation;
119	bigint ::= # unsigned integer value, ascii base10 notation;
120	binary_data ::= # file content, not interpreted;
121
122	when ::= raw_when \| rfc2822_when;
123	raw_when ::= ts sp tz;
124	rfc2822_when ::= # Valid RFC 2822 date and time;
125
126	sp ::= # ASCII space character;
127	lf ::= # ASCII newline (LF) character;
128
129	# note: a colon (':') must precede the numerical value assigned to
130	# an idnum. This is to distinguish it from a ref or tag name as
131	# GIT does not permit ':' in ref or tag strings.
132	#
133	idnum ::= ':' bigint;
134	path ::= # GIT style file path, e.g. "a/b/c";
135	ref ::= # GIT ref name, e.g. "refs/heads/MOZ_GECKO_EXPERIMENT";
136	tag ::= # GIT tag name, e.g. "FIREFOX_1_5";
137	sha1exp ::= # Any valid GIT SHA1 expression;
138	hexsha1 ::= # SHA1 in hexadecimal format;
139
140	# note: name and email are UTF8 strings, however name must not
141	# contain '<' or lf and email must not contain any of the
142	# following: '<', '>', lf.
143	#
144	name ::= # valid GIT author/committer name;
145	email ::= # valid GIT author/committer email;
146	ts ::= # time since the epoch in seconds, ascii base10 notation;
147	tz ::= # GIT style timezone;
148
149	# note: comments may appear anywhere in the input, except
150	# within a data command. Any form of the data command
151	# always escapes the related input from comment processing.
152	#
153	# In case it is not clear, the '#' that starts the comment
154	# must be the first character on that the line (an lf have
155	# preceeded it).
156	#
157	comment ::= '#' not_lf* lf;
158	not_lf ::= # Any byte that is not ASCII newline (LF);
159	"""
160
161
162	import re
163	import sys
164
165	import commands
166	import dates
167	import errors
168
169
170	## Stream parsing ##
171
172	class LineBasedParser(object):
173
174	def __init__(self, input):
175	"""A Parser that keeps track of line numbers.
176
177	:param input: the file-like object to read from
178	"""
179	self.input = input
180	self.lineno = 0
181	# Lines pushed back onto the input stream
182	self._buffer = []
183
184	def abort(self, exception, *args):
185	"""Raise an exception providing line number information."""
186	raise exception(self.lineno, *args)
187
188	def readline(self):
189	"""Get the next line including the newline or '' on EOF."""
190	self.lineno += 1
191	if self._buffer:
192	return self._buffer.pop()
193	else:
194	return self.input.readline()
195
196	def next_line(self):
197	"""Get the next line without the newline or None on EOF."""
198	line = self.readline()
199	if line:
200	return line[:-1]
201	else:
202	return None
203
204	def push_line(self, line):
205	"""Push line back onto the line buffer.
206
207	:param line: the line with no trailing newline
208	"""
209	self.lineno -= 1
210	self._buffer.append(line + "\n")
211
212	def read_bytes(self, count):
213	"""Read a given number of bytes from the input stream.
214
215	Throws MissingBytes if the bytes are not found.
216
217	Note: This method does not read from the line buffer.
0.64.6 by Ian Clatworthy generic processing method working for one revision in one branch	218
	219	:return: a string
0.64.1 by Ian Clatworthy 1st cut: gfi parser + --info processing method	220	"""
0.64.72 by Ian Clatworthy reduce memory usage while reading large blobs	221	lines = ''
0.64.1 by Ian Clatworthy 1st cut: gfi parser + --info processing method	222	left = count
0.64.6 by Ian Clatworthy generic processing method working for one revision in one branch	223	found = 0
0.64.1 by Ian Clatworthy 1st cut: gfi parser + --info processing method	224	while left > 0:
	225	line = self.input.readline(left)
	226	if line:
0.64.6 by Ian Clatworthy generic processing method working for one revision in one branch	227	line_len = len(line)
	228	left -= line_len
	229	found += line_len
0.64.72 by Ian Clatworthy reduce memory usage while reading large blobs	230	lines += line
0.64.53 by Ian Clatworthy fix line number tracking	231	if line.endswith('\n'):
0.64.53 by Ian Clatworthy fix line number tracking	232	self.lineno += 1
0.64.1 by Ian Clatworthy 1st cut: gfi parser + --info processing method	233	else:
	234	left = 0
	235	if found != count:
	236	self.abort(errors.MissingBytes, count, found)
0.64.72 by Ian Clatworthy reduce memory usage while reading large blobs	237	return lines
0.64.1 by Ian Clatworthy 1st cut: gfi parser + --info processing method	238
	239	def read_until(self, terminator):
	240	"""Read the input stream until the terminator is found.
	241
	242	Throws MissingTerminator if the terminator is not found.
	243
	244	Note: This method does not read from the line buffer.
	245
	246	:return: the bytes read up to but excluding the terminator.
	247	"""
	248	raise NotImplementedError(self.read_until)
	249
	250
0.64.3 by Ian Clatworthy tweak parser for better git-fast-export compatibility	251	# Regular expression used for parsing. (Note: The spec states that the name
	252	# part should be non-empty but git-fast-export doesn't always do that so
0.65.2 by James Westby The space between the author and email is optional in committer.	253	# the first bit is \w*, not \w+.) Also git-fast-import code says the
	254	# space before the email is optional.
0.65.5 by James Westby Make the parser handle multiple words in the committer name.	255	_WHO_AND_WHEN_RE = re.compile(r'([^<]*)<(.+)> (.+)')
0.64.78 by Ian Clatworthy fix from Pieter de Bie - hack around broken front-ends	256	_WHO_RE = re.compile(r'([^<]*)<(.+)>')
0.64.1 by Ian Clatworthy 1st cut: gfi parser + --info processing method	257
	258
	259	class ImportParser(LineBasedParser):
	260
	261	def __init__(self, input, verbose=False, output=sys.stdout):
	262	"""A Parser of import commands.
	263
	264	:param input: the file-like object to read from
	265	:param verbose: display extra information of not
	266	:param output: the file-like object to write messages to (YAGNI?)
	267	"""
	268	LineBasedParser.__init__(self, input)
	269	self.verbose = verbose
	270	self.output = output
	271	# We auto-detect the date format when a date is first encountered
	272	self.date_parser = None
	273
	274	def iter_commands(self):
	275	"""Iterator returning ImportCommand objects."""
	276	while True:
	277	line = self.next_line()
	278	if line is None:
	279	break
	280	elif len(line) == 0 or line.startswith('#'):
	281	continue
	282	# Search for commands in order of likelihood
	283	elif line.startswith('commit '):
	284	yield self._parse_commit(line[len('commit '):])
	285	elif line.startswith('blob'):
	286	yield self._parse_blob()
	287	elif line.startswith('progress '):
	288	yield commands.ProgressCommand(line[len('progress '):])
	289	elif line.startswith('reset '):
	290	yield self._parse_reset(line[len('reset '):])
	291	elif line.startswith('tag '):
	292	yield self._parse_tag(line[len('tag '):])
	293	elif line.startswith('checkpoint'):
	294	yield commands.CheckpointCommand()
	295	else:
	296	self.abort(errors.InvalidCommand, line)
	297
	298	def iter_file_commands(self):
	299	"""Iterator returning FileCommand objects.
	300
	301	If an invalid file command is found, the line is silently
	302	pushed back and iteration ends.
	303	"""
	304	while True:
	305	line = self.next_line()
	306	if line is None:
	307	break
	308	elif len(line) == 0 or line.startswith('#'):
	309	continue
	310	# Search for file commands in order of likelihood
	311	elif line.startswith('M '):
	312	yield self._parse_file_modify(line[2:])
	313	elif line.startswith('D '):
	314	path = self._path(line[2:])
	315	yield commands.FileDeleteCommand(path)
	316	elif line.startswith('R '):
	317	old, new = self._path_pair(line[2:])
	318	yield commands.FileRenameCommand(old, new)
	319	elif line.startswith('C '):
	320	src, dest = self._path_pair(line[2:])
321	yield commands.FileRenameCommand(src, dest)
322	elif line.startswith('deleteall'):
323	yield commands.FileDeleteAllCommand()
324	else:
325	self.push_line(line)
326	break
327
328	def _parse_blob(self):
329	"""Parse a blob command."""
0.64.35 by Ian Clatworthy identify unmarked blobs and commits by line numbers	330	lineno = self.lineno
0.64.1 by Ian Clatworthy 1st cut: gfi parser + --info processing method	331	mark = self._get_mark_if_any()
	332	data = self._get_data('blob')
0.64.35 by Ian Clatworthy identify unmarked blobs and commits by line numbers	333	return commands.BlobCommand(mark, data, lineno)
0.64.1 by Ian Clatworthy 1st cut: gfi parser + --info processing method	334
	335	def _parse_commit(self, ref):
	336	"""Parse a commit command."""
0.64.35 by Ian Clatworthy identify unmarked blobs and commits by line numbers	337	lineno = self.lineno
0.64.1 by Ian Clatworthy 1st cut: gfi parser + --info processing method	338	mark = self._get_mark_if_any()
	339	author = self._get_user_info('commit', 'author', False)
	340	committer = self._get_user_info('commit', 'committer')
0.64.78 by Ian Clatworthy fix from Pieter de Bie - hack around broken front-ends	341	message = self._get_data('commit', 'message')
	342	try:
	343	message = message.decode('utf_8')
	344	except UnicodeDecodeError:
	345	# TODO: output a warning here about a broken front-end
	346	pass
0.64.1 by Ian Clatworthy 1st cut: gfi parser + --info processing method	347	from_ = self._get_from()
0.64.60 by Ian Clatworthy support merges when from clause implicit	348	merges = []
	349	while True:
	350	merge = self._get_merge()
	351	if merge is not None:
	352	merges.append(merge)
	353	else:
	354	break
0.64.1 by Ian Clatworthy 1st cut: gfi parser + --info processing method	355	return commands.CommitCommand(ref, mark, author, committer, message,
0.64.60 by Ian Clatworthy support merges when from clause implicit	356	from_, merges, self.iter_file_commands, lineno)
0.64.1 by Ian Clatworthy 1st cut: gfi parser + --info processing method	357
	358	def _parse_file_modify(self, info):
	359	"""Parse a filemodify command within a commit.
	360
	361	:param info: a string in the format "mode dataref path"
	362	(where dataref might be the hard-coded literal 'inline').
	363	"""
	364	params = info.split(' ', 2)
	365	path = self._path(params[2])
	366	is_executable, is_symlink = self._mode(params[0])
	367	if is_symlink:
	368	kind = commands.SYMLINK_KIND
	369	else:
	370	kind = commands.FILE_KIND
	371	if params[1] == 'inline':
	372	dataref = None
	373	data = self._get_data('filemodify')
	374	else:
	375	dataref = params[1]
	376	data = None
	377	return commands.FileModifyCommand(path, kind, is_executable, dataref,
	378	data)
	379
	380	def _parse_reset(self, ref):
	381	"""Parse a reset command."""
	382	from_ = self._get_from()
	383	return commands.ResetCommand(ref, from_)
	384
	385	def _parse_tag(self, name):
	386	"""Parse a tag command."""
	387	from_ = self._get_from('tag')
0.64.78 by Ian Clatworthy fix from Pieter de Bie - hack around broken front-ends	388	tagger = self._get_user_info('tag', 'tagger', accept_just_who=True)
0.64.76 by Ian Clatworthy fix utf-8 decoding bugs	389	message = self._get_data('tag', 'message').decode('utf_8')
0.64.1 by Ian Clatworthy 1st cut: gfi parser + --info processing method	390	return commands.TagCommand(name, from_, tagger, message)
	391
	392	def _get_mark_if_any(self):
	393	"""Parse a mark section."""
	394	line = self.next_line()
	395	if line.startswith('mark :'):
	396	return line[len('mark :'):]
	397	else:
	398	self.push_line(line)
	399	return None
	400
	401	def _get_from(self, required_for=None):
	402	"""Parse a from section."""
	403	line = self.next_line()
	404	if line.startswith('from '):
	405	return line[len('from '):]
	406	elif required_for:
	407	self.abort(errors.MissingSection, required_for, 'from')
	408	else:
	409	self.push_line(line)
	410	return None
	411
	412	def _get_merge(self):
	413	"""Parse a merge section."""
	414	line = self.next_line()
	415	if line.startswith('merge '):
	416	return line[len('merge '):]
	417	else:
	418	self.push_line(line)
	419	return None
	420
def _get_user_info(self, cmd, section, required=True,
    accept_just_who=False):
    """Parse a user section such as author, committer or tagger.

    :param cmd: name of the command being parsed (used when reporting
      errors).
    :param section: the section keyword expected at the start of the
      line, e.g. 'committer'.
    :param required: if true, a missing section is reported as
      errors.MissingSection through self.abort(); otherwise the line is
      handed back via self.push_line() and None is returned.
    :param accept_just_who: passed on to _who_when().
    :return: the result of _who_when() for the text after the keyword.
    """
    prefix = section + ' '
    line = self.next_line()
    if line.startswith(prefix):
        return self._who_when(line[len(prefix):], cmd, section,
            accept_just_who=accept_just_who)
    if not required:
        self.push_line(line)
        return None
    self.abort(errors.MissingSection, cmd, section)
	433
	434	def _get_data(self, required_for, section='data'):
	435	"""Parse a data section."""
	436	line = self.next_line()
	437	if line.startswith('data '):
	438	rest = line[len('data '):]
	439	if rest.startswith('<<'):
	440	return self.read_until(rest[2:])
	441	else:
	442	size = int(rest)
0.65.1 by James Westby The data sections have an optional LF at the end in the byte count format.	443	read_bytes = self.read_bytes(size)
	444	# optional LF after data.
	445	next = self.input.readline()
	446	self.lineno += 1
	447	if len(next) > 1 or next != "\n":
0.65.5 by James Westby Make the parser handle multiple words in the committer name.	448	self.push_line(next[:-1])
0.65.1 by James Westby The data sections have an optional LF at the end in the byte count format.	449	return read_bytes
0.64.1 by Ian Clatworthy 1st cut: gfi parser + --info processing method	450	else:
	451	self.abort(errors.MissingSection, required_for, section)
	452
def _who_when(self, s, cmd, section, accept_just_who=False):
    """Parse who and when information from a string.

    :param s: the text following the section keyword.
    :param cmd: name of the command being parsed (for error reporting).
    :param section: name of the section being parsed (for error
      reporting).
    :param accept_just_who: if true, text matching only _WHO_RE (no
      date) is accepted and the 'now' date parser supplies the time.
    :return: a tuple of (name,email,timestamp,timezone). name may be
      the empty string if only an email address was given.
    """
    match = _WHO_AND_WHEN_RE.search(s)
    if match is not None:
        datestr = match.group(3)
        if self.date_parser is None:
            # auto-detect the date format; the parser chosen here is
            # stored on self and reused by later calls
            if datestr.count(' ') == 1:
                parser_name = 'raw'
            elif datestr == 'now':
                parser_name = 'now'
            else:
                parser_name = 'rfc2822'
            self.date_parser = dates.DATE_PARSERS_BY_NAME[parser_name]
        when = self.date_parser(datestr)
    else:
        match = _WHO_RE.search(s)
        if accept_just_who and match:
            # HACK around missing time
            # TODO: output a warning here
            when = dates.DATE_PARSERS_BY_NAME['now']('now')
        else:
            self.abort(errors.BadFormat, cmd, section, s)
    name = match.group(1)
    # NOTE(review): the name is only decoded when it ends with a space
    # (the separator before '<'); a name without one is returned
    # undecoded -- confirm whether that is intended.
    if name.endswith(" "):
        name = name[:-1].decode('utf_8')
    return (name, match.group(2), when[0], when[1])
0.64.1 by Ian Clatworthy 1st cut: gfi parser + --info processing method	485
	486	def _path(self, s):
	487	"""Parse a path."""
0.64.10 by Ian Clatworthy 1st cut are dequoting paths	488	if s.startswith('"'):
	489	if s[-1] != '"':
	490	self.abort(errors.BadFormat, cmd, section, s)
	491	else:
	492	return _unquote_c_string(s[1:-1])
0.64.76 by Ian Clatworthy fix utf-8 decoding bugs	493	return s.decode('utf_8')
0.64.1 by Ian Clatworthy 1st cut: gfi parser + --info processing method	494
	495	def _path_pair(self, s):
	496	"""Parse two paths separated by a space."""
0.64.10 by Ian Clatworthy 1st cut are dequoting paths	497	# TODO: handle a space in the first path
	498	parts = s.split(' ', 1)
	499	return map(_unquote_c_string, parts)
0.64.1 by Ian Clatworthy 1st cut: gfi parser + --info processing method	500
	501	def _mode(self, s):
	502	"""Parse a file mode into executable and symlink flags.
	503
	504	:return (is_executable, is_symlink)
	505	"""
	506	# Note: Output from git-fast-export slightly different to spec
	507	if s in ['644', '100644', '0100644']:
	508	return False, False
	509	elif s in ['755', '100755', '0100755']:
	510	return True, False
0.64.3 by Ian Clatworthy tweak parser for better git-fast-export compatibility	511	elif s in ['120000', '0120000']:
0.64.1 by Ian Clatworthy 1st cut: gfi parser + --info processing method	512	return False, True
	513	else:
	514	self.abort(errors.BadFormat, 'filemodify', 'mode', s)
	515
0.64.10 by Ian Clatworthy 1st cut are dequoting paths	516
	517	def _unquote_c_string(s):
	518	"""replace C-style escape sequences (\n, \", etc.) with real chars."""
	519	# HACK: Python strings are close enough
	520	return s.decode('string_escape', 'replace')