1
# Copyright (C) 2008 Canonical Ltd
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
# GNU General Public License for more details.
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17
"""Parser of import data into command objects.
19
In order to reuse existing front-ends, the stream format is a subset of
20
the one used by git-fast-import (as of the 1.5.4 release of git at least).
33
new_blob ::= 'blob' lf
36
file_content ::= data;
38
new_commit ::= 'commit' sp ref_str lf
40
('author' sp name '<' email '>' when lf)?
41
'committer' sp name '<' email '>' when lf
43
('from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)?
44
('merge' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)*
49
file_change ::= file_clr
55
file_clr ::= 'deleteall' lf;
56
file_del ::= 'D' sp path_str lf;
57
file_rnm ::= 'R' sp path_str sp path_str lf;
58
file_cpy ::= 'C' sp path_str sp path_str lf;
59
file_obm ::= 'M' sp mode sp (hexsha1 | idnum) sp path_str lf;
60
file_inm ::= 'M' sp mode sp 'inline' sp path_str lf
63
new_tag ::= 'tag' sp tag_str lf
64
'from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf
65
'tagger' sp name '<' email '>' when lf
69
reset_branch ::= 'reset' sp ref_str lf
70
('from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)?
73
checkpoint ::= 'checkpoint' lf
76
progress ::= 'progress' sp not_lf* lf
79
# note: the first idnum in a stream should be 1 and subsequent
80
# idnums should not have gaps between values as this will cause
81
# the stream parser to reserve space for the gapped values. An
82
# idnum can be updated in the future to a new object by issuing
83
# a new mark directive with the old idnum.
85
mark ::= 'mark' sp idnum lf;
86
data ::= (delimited_data | exact_data)
89
# note: delim may be any string but must not contain lf.
90
# data_line may contain any data but must not be exactly
92
delimited_data ::= 'data' sp '<<' delim lf
96
# note: declen indicates the length of binary_data in bytes.
97
# declen does not include the lf preceding the binary data.
99
exact_data ::= 'data' sp declen lf
102
# note: quoted strings are C-style quoting supporting \c for
103
# common escapes of 'c' (e.g. \n, \t, \\, \") or \nnn where nnn
104
# is the signed byte value in octal. Note that the only
105
# characters which must actually be escaped to protect the
106
# stream formatting is: \, " and LF. Otherwise these values
110
sha1exp_str ::= sha1exp;
112
path_str ::= path | '"' quoted(path) '"' ;
113
mode ::= '100644' | '644'
118
declen ::= # unsigned 32 bit value, ascii base10 notation;
119
bigint ::= # unsigned integer value, ascii base10 notation;
120
binary_data ::= # file content, not interpreted;
122
when ::= raw_when | rfc2822_when;
123
raw_when ::= ts sp tz;
124
rfc2822_when ::= # Valid RFC 2822 date and time;
126
sp ::= # ASCII space character;
127
lf ::= # ASCII newline (LF) character;
129
# note: a colon (':') must precede the numerical value assigned to
130
# an idnum. This is to distinguish it from a ref or tag name as
131
# GIT does not permit ':' in ref or tag strings.
133
idnum ::= ':' bigint;
134
path ::= # GIT style file path, e.g. "a/b/c";
135
ref ::= # GIT ref name, e.g. "refs/heads/MOZ_GECKO_EXPERIMENT";
136
tag ::= # GIT tag name, e.g. "FIREFOX_1_5";
137
sha1exp ::= # Any valid GIT SHA1 expression;
138
hexsha1 ::= # SHA1 in hexadecimal format;
140
# note: name and email are UTF8 strings, however name must not
141
# contain '<' or lf and email must not contain any of the
142
# following: '<', '>', lf.
144
name ::= # valid GIT author/committer name;
145
email ::= # valid GIT author/committer email;
146
ts ::= # time since the epoch in seconds, ascii base10 notation;
147
tz ::= # GIT style timezone;
149
# note: comments may appear anywhere in the input, except
150
# within a data command. Any form of the data command
151
# always escapes the related input from comment processing.
153
# In case it is not clear, the '#' that starts the comment
154
# must be the first character on that line (an lf must have
157
comment ::= '#' not_lf* lf;
158
not_lf ::= # Any byte that is not ASCII newline (LF);
172
class LineBasedParser(object):
174
def __init__(self, input):
175
"""A Parser that keeps track of line numbers.
177
:param input: the file-like object to read from
181
# Lines pushed back onto the input stream
184
def abort(self, exception, *args):
    """Stop parsing by raising ``exception`` tagged with the line number.

    :param exception: callable producing the exception to raise; it is
        invoked with ``self.lineno`` followed by ``args``
    :param args: extra positional arguments forwarded after the line number
    """
    current_line = self.lineno
    raise exception(current_line, *args)
189
"""Get the next line including the newline or '' on EOF."""
192
return self._buffer.pop()
194
return self.input.readline()
197
"""Get the next line without the newline or None on EOF."""
198
line = self.readline()
204
def push_line(self, line):
205
"""Push line back onto the line buffer.
207
:param line: the line with no trailing newline
210
self._buffer.append(line + "\n")
212
def read_bytes(self, count):
213
"""Read a given number of bytes from the input stream.
215
Throws MissingBytes if the bytes are not found.
217
Note: This method does not read from the line buffer.
225
line = self.input.readline(left)
231
if line.endswith('\n'):
236
self.abort(errors.MissingBytes, count, found)
239
def read_until(self, terminator):
    """Read the input stream until the terminator is found.

    Lines are consumed directly from the underlying input. The first line
    consisting of exactly ``terminator`` followed by a newline ends the
    read; that line is consumed but not returned.

    Throws MissingTerminator if the terminator is not found.

    Note: This method does not read from the line buffer.

    :param terminator: the delimiter line, without its trailing newline
    :return: the bytes read up to but excluding the terminator.
    """
    end_marker = terminator + "\n"
    collected = []
    while True:
        line = self.input.readline()
        if not line:
            # EOF before the delimiter: report it instead of looping forever.
            # NOTE(review): assumes errors.MissingTerminator exists, as the
            # docstring states - confirm against the errors module.
            self.abort(errors.MissingTerminator, terminator)
        if line.endswith("\n"):
            # Keep the line counter reported by abort() in step with the
            # lines consumed here.
            self.lineno += 1
        if line == end_marker:
            return "".join(collected)
        collected.append(line)
251
# Regular expression used for parsing. (Note: The spec states that the name
252
# part should be non-empty but git-fast-export doesn't always do that so
253
# the first group is [^<]*, not [^<]+.) Also git-fast-import code says the
254
# space before the email is optional.
255
# Matches "name <email> when": group 1 is the text before '<' (may be empty),
# group 2 the email inside the angle brackets, group 3 the date text.
_WHO_AND_WHEN_RE = re.compile(r'([^<]*)<(.*)> (.+)')
256
# Matches "name <email>" with no date part: group 1 is the text before '<'
# (may be empty), group 2 the email inside the angle brackets.
_WHO_RE = re.compile(r'([^<]*)<(.*)>')
259
class ImportParser(LineBasedParser):
261
def __init__(self, input, verbose=False, output=sys.stdout):
262
"""A Parser of import commands.
264
:param input: the file-like object to read from
265
:param verbose: display extra information or not
266
:param output: the file-like object to write messages to (YAGNI?)
268
LineBasedParser.__init__(self, input)
269
self.verbose = verbose
271
# We auto-detect the date format when a date is first encountered
272
self.date_parser = None
274
def iter_commands(self):
275
"""Iterator returning ImportCommand objects."""
277
line = self.next_line()
280
elif len(line) == 0 or line.startswith('#'):
282
# Search for commands in order of likelihood
283
elif line.startswith('commit '):
284
yield self._parse_commit(line[len('commit '):])
285
elif line.startswith('blob'):
286
yield self._parse_blob()
287
elif line.startswith('progress '):
288
yield commands.ProgressCommand(line[len('progress '):])
289
elif line.startswith('reset '):
290
yield self._parse_reset(line[len('reset '):])
291
elif line.startswith('tag '):
292
yield self._parse_tag(line[len('tag '):])
293
elif line.startswith('checkpoint'):
294
yield commands.CheckpointCommand()
296
self.abort(errors.InvalidCommand, line)
298
def iter_file_commands(self):
299
"""Iterator returning FileCommand objects.
301
If an invalid file command is found, the line is silently
302
pushed back and iteration ends.
305
line = self.next_line()
308
elif len(line) == 0 or line.startswith('#'):
310
# Search for file commands in order of likelihood
311
elif line.startswith('M '):
312
yield self._parse_file_modify(line[2:])
313
elif line.startswith('D '):
314
path = self._path(line[2:])
315
yield commands.FileDeleteCommand(path)
316
elif line.startswith('R '):
317
old, new = self._path_pair(line[2:])
318
yield commands.FileRenameCommand(old, new)
319
elif line.startswith('C '):
320
src, dest = self._path_pair(line[2:])
321
yield commands.FileCopyCommand(src, dest)
322
elif line.startswith('deleteall'):
323
yield commands.FileDeleteAllCommand()
328
def _parse_blob(self):
329
"""Parse a blob command."""
331
mark = self._get_mark_if_any()
332
data = self._get_data('blob')
333
return commands.BlobCommand(mark, data, lineno)
335
def _parse_commit(self, ref):
336
"""Parse a commit command."""
338
mark = self._get_mark_if_any()
339
author = self._get_user_info('commit', 'author', False)
340
committer = self._get_user_info('commit', 'committer')
341
message = self._get_data('commit', 'message')
343
message = message.decode('utf_8')
344
except UnicodeDecodeError:
345
# TODO: output a warning here about a broken front-end
347
from_ = self._get_from()
350
merge = self._get_merge()
351
if merge is not None:
352
# while the spec suggests it's illegal, git-fast-export
353
# outputs multiple merges on the one line, e.g.
355
these_merges = merge.split(" ")
356
merges.extend(these_merges)
359
return commands.CommitCommand(ref, mark, author, committer, message,
360
from_, merges, self.iter_file_commands, lineno)
362
def _parse_file_modify(self, info):
363
"""Parse a filemodify command within a commit.
365
:param info: a string in the format "mode dataref path"
366
(where dataref might be the hard-coded literal 'inline').
368
params = info.split(' ', 2)
369
path = self._path(params[2])
370
is_executable, is_symlink = self._mode(params[0])
372
kind = commands.SYMLINK_KIND
374
kind = commands.FILE_KIND
375
if params[1] == 'inline':
377
data = self._get_data('filemodify')
381
return commands.FileModifyCommand(path, kind, is_executable, dataref,
384
def _parse_reset(self, ref):
    """Build a ResetCommand for ``ref``.

    The 'from' section is read with ``_get_from()`` (called without a
    ``required_for`` argument) and its result is passed through unchanged.

    :param ref: the ref name taken from the 'reset' line
    """
    return commands.ResetCommand(ref, self._get_from())
389
def _parse_tag(self, name):
    """Parse a tag command.

    The message is decoded as UTF-8 when possible; a message that is not
    valid UTF-8 is passed through undecoded instead of aborting the parse
    with an unhandled UnicodeDecodeError.

    :param name: the tag name taken from the 'tag' line
    :return: a TagCommand built from the 'from', 'tagger' and message
        sections that follow the 'tag' line
    """
    from_ = self._get_from('tag')
    tagger = self._get_user_info('tag', 'tagger', accept_just_who=True)
    message = self._get_data('tag', 'message')
    try:
        message = message.decode('utf_8')
    except UnicodeDecodeError:
        # A front-end emitted non-UTF-8 bytes; keep them as-is.
        pass
    return commands.TagCommand(name, from_, tagger, message)
396
def _get_mark_if_any(self):
397
"""Parse a mark section."""
398
line = self.next_line()
399
if line.startswith('mark :'):
400
return line[len('mark :'):]
405
def _get_from(self, required_for=None):
406
"""Parse a from section."""
407
line = self.next_line()
408
if line.startswith('from '):
409
return line[len('from '):]
411
self.abort(errors.MissingSection, required_for, 'from')
416
def _get_merge(self):
417
"""Parse a merge section."""
418
line = self.next_line()
419
if line.startswith('merge '):
420
return line[len('merge '):]
425
def _get_user_info(self, cmd, section, required=True,
426
accept_just_who=False):
427
"""Parse a user section."""
428
line = self.next_line()
429
if line.startswith(section + ' '):
430
return self._who_when(line[len(section + ' '):], cmd, section,
431
accept_just_who=accept_just_who)
433
self.abort(errors.MissingSection, cmd, section)
438
def _get_data(self, required_for, section='data'):
439
"""Parse a data section."""
440
line = self.next_line()
441
if line.startswith('data '):
442
rest = line[len('data '):]
443
if rest.startswith('<<'):
444
return self.read_until(rest[2:])
447
read_bytes = self.read_bytes(size)
448
# optional LF after data.
449
next = self.input.readline()
451
if len(next) > 1 or next != "\n":
452
self.push_line(next[:-1])
455
self.abort(errors.MissingSection, required_for, section)
457
def _who_when(self, s, cmd, section, accept_just_who=False):
458
"""Parse who and when information from a string.
460
:return: a tuple of (name,email,timestamp,timezone). name may be
461
the empty string if only an email address was given.
463
match = _WHO_AND_WHEN_RE.search(s)
465
datestr = match.group(3)
466
if self.date_parser is None:
467
# auto-detect the date format
468
if len(datestr.split(' ')) == 2:
470
elif datestr == 'now':
474
self.date_parser = dates.DATE_PARSERS_BY_NAME[format]
475
when = self.date_parser(datestr)
477
match = _WHO_RE.search(s)
478
if accept_just_who and match:
479
# HACK around missing time
480
# TODO: output a warning here
481
when = dates.DATE_PARSERS_BY_NAME['now']('now')
483
self.abort(errors.BadFormat, cmd, section, s)
484
name = match.group(1)
488
name = name[:-1].decode('utf_8')
489
except UnicodeDecodeError:
490
# The spec says names are *typically* utf8 encoded
491
# but that isn't enforced by git-fast-export (at least)
493
return (name,match.group(2),when[0],when[1])
497
if s.startswith('"'):
499
self.abort(errors.BadFormat, cmd, section, s)
501
return _unquote_c_string(s[1:-1])
503
return s.decode('utf_8')
504
except UnicodeDecodeError:
505
# The spec recommends utf8 encoding but that isn't enforced
508
def _path_pair(self, s):
    """Split ``s`` at its first space and unquote both pieces.

    :param s: two paths separated by a space
    :return: the pieces, each passed through ``_unquote_c_string``
    """
    # TODO: handle a space in the first path
    return [_unquote_c_string(piece) for piece in s.split(' ', 1)]
515
"""Parse a file mode into executable and symlink flags.
517
:return (is_executable, is_symlink)
519
# Note: Output from git-fast-export slightly different to spec
520
if s in ['644', '100644', '0100644']:
522
elif s in ['755', '100755', '0100755']:
524
elif s in ['120000', '0120000']:
527
self.abort(errors.BadFormat, 'filemodify', 'mode', s)
530
def _unquote_c_string(s):
531
"""replace C-style escape sequences (\n, \", etc.) with real chars."""
532
# HACK: Python strings are close enough
533
return s.decode('string_escape', 'replace')