1
# Copyright (C) 2008 Canonical Ltd
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
# GNU General Public License for more details.
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17
"""Parser of import data into command objects.
19
In order to reuse existing front-ends, the stream format is a subset of
20
the one used by git-fast-import (as of the 1.5.4 release of git at least).
33
new_blob ::= 'blob' lf
36
file_content ::= data;
38
new_commit ::= 'commit' sp ref_str lf
40
('author' sp name '<' email '>' when lf)?
41
'committer' sp name '<' email '>' when lf
43
('from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)?
44
('merge' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)*
49
file_change ::= file_clr
55
file_clr ::= 'deleteall' lf;
56
file_del ::= 'D' sp path_str lf;
57
file_rnm ::= 'R' sp path_str sp path_str lf;
58
file_cpy ::= 'C' sp path_str sp path_str lf;
59
file_obm ::= 'M' sp mode sp (hexsha1 | idnum) sp path_str lf;
60
file_inm ::= 'M' sp mode sp 'inline' sp path_str lf
63
new_tag ::= 'tag' sp tag_str lf
64
'from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf
65
'tagger' sp name '<' email '>' when lf
69
reset_branch ::= 'reset' sp ref_str lf
70
('from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)?
73
checkpoint ::= 'checkpoint' lf
76
progress ::= 'progress' sp not_lf* lf
79
# note: the first idnum in a stream should be 1 and subsequent
80
# idnums should not have gaps between values as this will cause
81
# the stream parser to reserve space for the gapped values. An
82
# idnum can be updated in the future to a new object by issuing
83
# a new mark directive with the old idnum.
85
mark ::= 'mark' sp idnum lf;
86
data ::= (delimited_data | exact_data)
89
# note: delim may be any string but must not contain lf.
90
# data_line may contain any data but must not be exactly
91
# delim. The lf after the final data_line is included in
93
delimited_data ::= 'data' sp '<<' delim lf
97
# note: declen indicates the length of binary_data in bytes.
98
# declen does not include the lf preceding the binary data.
100
exact_data ::= 'data' sp declen lf
103
# note: quoted strings are C-style quoting supporting \c for
104
# common escapes of 'c' (e.g. \n, \t, \\, \") or \nnn where nnn
105
# is the signed byte value in octal. Note that the only
106
# characters which must actually be escaped to protect the
107
# stream formatting is: \, " and LF. Otherwise these values
111
sha1exp_str ::= sha1exp;
113
path_str ::= path | '"' quoted(path) '"' ;
114
mode ::= '100644' | '644'
119
declen ::= # unsigned 32 bit value, ascii base10 notation;
120
bigint ::= # unsigned integer value, ascii base10 notation;
121
binary_data ::= # file content, not interpreted;
123
when ::= raw_when | rfc2822_when;
124
raw_when ::= ts sp tz;
125
rfc2822_when ::= # Valid RFC 2822 date and time;
127
sp ::= # ASCII space character;
128
lf ::= # ASCII newline (LF) character;
130
# note: a colon (':') must precede the numerical value assigned to
131
# an idnum. This is to distinguish it from a ref or tag name as
132
# GIT does not permit ':' in ref or tag strings.
134
idnum ::= ':' bigint;
135
path ::= # GIT style file path, e.g. "a/b/c";
136
ref ::= # GIT ref name, e.g. "refs/heads/MOZ_GECKO_EXPERIMENT";
137
tag ::= # GIT tag name, e.g. "FIREFOX_1_5";
138
sha1exp ::= # Any valid GIT SHA1 expression;
139
hexsha1 ::= # SHA1 in hexadecimal format;
141
# note: name and email are UTF8 strings, however name must not
142
# contain '<' or lf and email must not contain any of the
143
# following: '<', '>', lf.
145
name ::= # valid GIT author/committer name;
146
email ::= # valid GIT author/committer email;
147
ts ::= # time since the epoch in seconds, ascii base10 notation;
148
tz ::= # GIT style timezone;
150
# note: comments may appear anywhere in the input, except
151
# within a data command. Any form of the data command
152
# always escapes the related input from comment processing.
154
# In case it is not clear, the '#' that starts the comment
155
# must be the first character on that line (an lf must have
158
comment ::= '#' not_lf* lf;
159
not_lf ::= # Any byte that is not ASCII newline (LF);
173
class LineBasedParser(object):
175
def __init__(self, input):
176
"""A Parser that keeps track of line numbers.
178
:param input: the file-like object to read from
182
# Lines pushed back onto the input stream
185
def abort(self, exception, *args):
    """Raise an exception providing line number information.

    :param exception: the exception class (or other callable) to
      instantiate; it is called with the current line number first
    :param args: extra positional arguments passed to ``exception``
      after the line number
    :raises: whatever ``exception(self.lineno, *args)`` constructs
    """
    raise exception(self.lineno, *args)
190
"""Get the next line including the newline or '' on EOF."""
193
return self._buffer.pop()
195
return self.input.readline()
198
"""Get the next line without the newline or None on EOF."""
199
line = self.readline()
205
def push_line(self, line):
206
"""Push line back onto the line buffer.
208
:param line: the line with no trailing newline
211
self._buffer.append(line + "\n")
213
def read_bytes(self, count):
214
"""Read a given number of bytes from the input stream.
216
Throws MissingBytes if the bytes are not found.
218
Note: This method does not read from the line buffer.
222
result = self.input.read(count)
224
self.lineno += result.count("\n")
226
self.abort(errors.MissingBytes, count, found)
229
def read_until(self, terminator):
230
"""Read the input stream until the terminator is found.
232
Throws MissingTerminator if the terminator is not found.
234
Note: This method does not read from the line buffer.
236
:return: the bytes read up to but excluding the terminator.
240
term = terminator + '\n'
242
line = self.input.readline()
247
return ''.join(lines)
250
# Regular expression used for parsing. (Note: The spec states that the name
251
# part should be non-empty but git-fast-export doesn't always do that so
252
# the first bit is \w*, not \w+.) Also git-fast-import code says the
253
# space before the email is optional.
254
_WHO_AND_WHEN_RE = re.compile(r'([^<]*)<(.*)> (.+)')
255
_WHO_RE = re.compile(r'([^<]*)<(.*)>')
258
class ImportParser(LineBasedParser):
260
def __init__(self, input, verbose=False, output=sys.stdout):
261
"""A Parser of import commands.
263
:param input: the file-like object to read from
264
:param verbose: display extra information or not
265
:param output: the file-like object to write messages to (YAGNI?)
267
LineBasedParser.__init__(self, input)
268
self.verbose = verbose
270
# We auto-detect the date format when a date is first encountered
271
self.date_parser = None
273
def warning(self, msg):
    """Write a warning, prefixed with the current line number, to stderr.

    :param msg: the warning text to report
    """
    sys.stderr.write("warning line %d: %s\n" % (self.lineno, msg))
276
def iter_commands(self):
277
"""Iterator returning ImportCommand objects."""
279
line = self.next_line()
282
elif len(line) == 0 or line.startswith('#'):
284
# Search for commands in order of likelihood
285
elif line.startswith('commit '):
286
yield self._parse_commit(line[len('commit '):])
287
elif line.startswith('blob'):
288
yield self._parse_blob()
289
elif line.startswith('progress '):
290
yield commands.ProgressCommand(line[len('progress '):])
291
elif line.startswith('reset '):
292
yield self._parse_reset(line[len('reset '):])
293
elif line.startswith('tag '):
294
yield self._parse_tag(line[len('tag '):])
295
elif line.startswith('checkpoint'):
296
yield commands.CheckpointCommand()
297
elif line.startswith('feature'):
298
yield self._parse_feature(line[len('feature '):])
300
self.abort(errors.InvalidCommand, line)
302
def iter_file_commands(self):
303
"""Iterator returning FileCommand objects.
305
If an invalid file command is found, the line is silently
306
pushed back and iteration ends.
309
line = self.next_line()
312
elif len(line) == 0 or line.startswith('#'):
314
# Search for file commands in order of likelihood
315
elif line.startswith('M '):
316
yield self._parse_file_modify(line[2:])
317
elif line.startswith('D '):
318
path = self._path(line[2:])
319
yield commands.FileDeleteCommand(path)
320
elif line.startswith('R '):
321
old, new = self._path_pair(line[2:])
322
yield commands.FileRenameCommand(old, new)
323
elif line.startswith('C '):
324
src, dest = self._path_pair(line[2:])
325
yield commands.FileCopyCommand(src, dest)
326
elif line.startswith('deleteall'):
327
yield commands.FileDeleteAllCommand()
332
def _parse_blob(self):
333
"""Parse a blob command."""
335
mark = self._get_mark_if_any()
336
data = self._get_data('blob')
337
return commands.BlobCommand(mark, data, lineno)
339
def _parse_commit(self, ref):
340
"""Parse a commit command."""
342
mark = self._get_mark_if_any()
343
author = self._get_user_info('commit', 'author', False)
346
another_author = self._get_user_info('commit', 'author', False)
347
if another_author is not None:
348
more_authors.append(another_author)
351
committer = self._get_user_info('commit', 'committer')
352
message = self._get_data('commit', 'message')
354
message = message.decode('utf_8')
355
except UnicodeDecodeError:
357
"commit message not in utf8 - replacing unknown characters")
358
message = message.decode('utf_8', 'replace')
359
from_ = self._get_from()
362
merge = self._get_merge()
363
if merge is not None:
364
# while the spec suggests it's illegal, git-fast-export
365
# outputs multiple merges on the one line, e.g.
367
these_merges = merge.split(" ")
368
merges.extend(these_merges)
373
name_value = self._get_property()
374
if name_value is not None:
375
name, value = name_value
376
properties[name] = value
379
return commands.CommitCommand(ref, mark, author, committer, message,
380
from_, merges, self.iter_file_commands, lineno=lineno,
381
more_authors=more_authors, properties=properties)
383
def _parse_feature(self, info):
384
"""Parse a feature command."""
385
parts = info.split("=", 1)
388
value = self._path(parts[1])
391
return commands.FeatureCommand(name, value, lineno=self.lineno)
393
def _parse_file_modify(self, info):
394
"""Parse a filemodify command within a commit.
396
:param info: a string in the format "mode dataref path"
397
(where dataref might be the hard-coded literal 'inline').
399
params = info.split(' ', 2)
400
path = self._path(params[2])
401
is_executable, kind = self._mode(params[0])
402
if params[1] == 'inline':
404
data = self._get_data('filemodify')
408
return commands.FileModifyCommand(path, kind, is_executable, dataref,
411
def _parse_reset(self, ref):
    """Parse a reset command.

    :param ref: the ref name, i.e. the text following 'reset ' on the
      command line
    :return: a commands.ResetCommand built from ``ref`` and whatever
      ``self._get_from()`` returns for the following 'from' section
    """
    from_ = self._get_from()
    return commands.ResetCommand(ref, from_)
416
def _parse_tag(self, name):
    """Parse a tag command.

    :param name: the tag name, i.e. the text following 'tag ' on the
      command line
    :return: a commands.TagCommand built from the name, the 'from'
      value, the tagger information and the decoded message
    """
    from_ = self._get_from('tag')
    # accept_just_who=True: a tagger line without a date is tolerated.
    tagger = self._get_user_info('tag', 'tagger', accept_just_who=True)
    message = self._get_data('tag', 'message')
    try:
        message = message.decode('utf_8')
    except UnicodeDecodeError:
        # Mirror the commit-message handling: warn and substitute
        # replacement characters rather than failing the whole import
        # on a single badly encoded tag message.
        self.warning(
            "tag message not in utf8 - replacing unknown characters")
        message = message.decode('utf_8', 'replace')
    return commands.TagCommand(name, from_, tagger, message)
423
def _get_mark_if_any(self):
424
"""Parse a mark section."""
425
line = self.next_line()
426
if line.startswith('mark :'):
427
return line[len('mark :'):]
432
def _get_from(self, required_for=None):
433
"""Parse a from section."""
434
line = self.next_line()
437
elif line.startswith('from '):
438
return line[len('from '):]
440
self.abort(errors.MissingSection, required_for, 'from')
445
def _get_merge(self):
446
"""Parse a merge section."""
447
line = self.next_line()
450
elif line.startswith('merge '):
451
return line[len('merge '):]
456
def _get_property(self):
457
"""Parse a property section."""
458
line = self.next_line()
461
elif line.startswith('property '):
462
return self._name_value(line[len('property '):])
467
def _get_user_info(self, cmd, section, required=True,
468
accept_just_who=False):
469
"""Parse a user section."""
470
line = self.next_line()
471
if line.startswith(section + ' '):
472
return self._who_when(line[len(section + ' '):], cmd, section,
473
accept_just_who=accept_just_who)
475
self.abort(errors.MissingSection, cmd, section)
480
def _get_data(self, required_for, section='data'):
481
"""Parse a data section."""
482
line = self.next_line()
483
if line.startswith('data '):
484
rest = line[len('data '):]
485
if rest.startswith('<<'):
486
return self.read_until(rest[2:])
489
read_bytes = self.read_bytes(size)
490
# optional LF after data.
491
next = self.input.readline()
493
if len(next) > 1 or next != "\n":
494
self.push_line(next[:-1])
497
self.abort(errors.MissingSection, required_for, section)
499
def _who_when(self, s, cmd, section, accept_just_who=False):
500
"""Parse who and when information from a string.
502
:return: a tuple of (name,email,timestamp,timezone). name may be
503
the empty string if only an email address was given.
505
match = _WHO_AND_WHEN_RE.search(s)
507
datestr = match.group(3).lstrip()
508
if self.date_parser is None:
509
# auto-detect the date format
510
if len(datestr.split(' ')) == 2:
512
elif datestr == 'now':
516
self.date_parser = dates.DATE_PARSERS_BY_NAME[format]
518
when = self.date_parser(datestr, self.lineno)
520
print "failed to parse datestr '%s'" % (datestr,)
523
match = _WHO_RE.search(s)
524
if accept_just_who and match:
525
# HACK around missing time
526
# TODO: output a warning here
527
when = dates.DATE_PARSERS_BY_NAME['now']('now')
529
self.abort(errors.BadFormat, cmd, section, s)
530
name = match.group(1)
534
name = name[:-1].decode('utf_8')
535
except UnicodeDecodeError:
536
# The spec says names are *typically* utf8 encoded
537
# but that isn't enforced by git-fast-export (at least)
538
self.warning("%s name not in utf8 - replacing unknown "
539
"characters" % (section,))
540
name = name[:-1].decode('utf_8', 'replace')
541
email = match.group(2)
542
# While it shouldn't happen, some datasets have email addresses
543
# which contain unicode characters. See bug 338186. We sanitize
544
# the data at this level just in case.
546
email = email.decode('utf_8')
547
except UnicodeDecodeError:
548
self.warning("%s email not in utf8 - replacing unknown characters"
550
email = email.decode('utf_8', 'replace')
551
return (name, email, when[0], when[1])
553
def _name_value(self, s):
554
"""Parse a (name,value) tuple from 'name value-length value'."""
555
parts = s.split(' ', 2)
562
still_to_read = size - len(value)
563
if still_to_read == 1:
565
elif still_to_read > 0:
566
read_bytes = self.read_bytes(still_to_read - 1)
567
value += "\n" + read_bytes
568
value = value.decode('utf8')
573
if s.startswith('"'):
575
self.abort(errors.BadFormat, '?', '?', s)
577
return _unquote_c_string(s[1:-1])
579
return s.decode('utf_8')
580
except UnicodeDecodeError:
581
# The spec recommends utf8 encoding but that isn't enforced
584
def _path_pair(self, s):
585
"""Parse two paths separated by a space."""
586
# TODO: handle a space in the first path
587
if s.startswith('"'):
588
parts = s[1:].split('" ', 1)
590
parts = s.split(' ', 1)
592
self.abort(errors.BadFormat, '?', '?', s)
593
elif parts[1].startswith('"') and parts[1].endswith('"'):
594
parts[1] = parts[1][1:-1]
595
elif parts[1].startswith('"') or parts[1].endswith('"'):
596
self.abort(errors.BadFormat, '?', '?', s)
597
return map(_unquote_c_string, parts)
600
"""Parse a file mode into executable and kind.
602
:return (is_executable, kind)
604
# Note: Output from git-fast-export slightly different to spec
605
if s in ['644', '100644', '0100644']:
606
return False, commands.FILE_KIND
607
elif s in ['755', '100755', '0100755']:
608
return True, commands.FILE_KIND
609
elif s in ['040000', '0040000']:
610
return False, commands.DIRECTORY_KIND
611
elif s in ['120000', '0120000']:
612
return False, commands.SYMLINK_KIND
613
elif s in ['160000', '0160000']:
614
return False, commands.TREE_REFERENCE_KIND
616
self.abort(errors.BadFormat, 'filemodify', 'mode', s)
619
def _unquote_c_string(s):
620
"""replace C-style escape sequences (\n, \", etc.) with real chars."""
621
# HACK: Python strings are close enough
622
return s.decode('string_escape', 'replace')