/brz/remove-bazaar

To get this branch, use:
bzr branch http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar
0.64.1 by Ian Clatworthy
1st cut: gfi parser + --info processing method
1
# Copyright (C) 2008 Canonical Ltd
2
#
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
7
#
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11
# GNU General Public License for more details.
12
#
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
16
17
"""Parser of import data into command objects.
18
19
In order to reuse existing front-ends, the stream format is a subset of
20
the one used by git-fast-import (as of the 1.5.4 release of git at least).
21
The grammar is:
22
23
  stream ::= cmd*;
24
25
  cmd ::= new_blob
26
        | new_commit
27
        | new_tag
28
        | reset_branch
29
        | checkpoint
30
        | progress
31
        ;
32
33
  new_blob ::= 'blob' lf
34
    mark?
35
    file_content;
36
  file_content ::= data;
37
38
  new_commit ::= 'commit' sp ref_str lf
39
    mark?
40
    ('author' sp name '<' email '>' when lf)?
41
    'committer' sp name '<' email '>' when lf
42
    commit_msg
43
    ('from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)?
44
    ('merge' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)*
45
    file_change*
46
    lf?;
47
  commit_msg ::= data;
48
49
  file_change ::= file_clr
50
    | file_del
51
    | file_rnm
52
    | file_cpy
53
    | file_obm
54
    | file_inm;
55
  file_clr ::= 'deleteall' lf;
56
  file_del ::= 'D' sp path_str lf;
57
  file_rnm ::= 'R' sp path_str sp path_str lf;
58
  file_cpy ::= 'C' sp path_str sp path_str lf;
59
  file_obm ::= 'M' sp mode sp (hexsha1 | idnum) sp path_str lf;
60
  file_inm ::= 'M' sp mode sp 'inline' sp path_str lf
61
    data;
62
63
  new_tag ::= 'tag' sp tag_str lf
64
    'from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf
65
    'tagger' sp name '<' email '>' when lf
66
    tag_msg;
67
  tag_msg ::= data;
68
69
  reset_branch ::= 'reset' sp ref_str lf
70
    ('from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)?
71
    lf?;
72
73
  checkpoint ::= 'checkpoint' lf
74
    lf?;
75
76
  progress ::= 'progress' sp not_lf* lf
77
    lf?;
78
79
     # note: the first idnum in a stream should be 1 and subsequent
80
     # idnums should not have gaps between values as this will cause
81
     # the stream parser to reserve space for the gapped values.  An
82
     # idnum can be updated in the future to a new object by issuing
83
     # a new mark directive with the old idnum.
84
     #
85
  mark ::= 'mark' sp idnum lf;
86
  data ::= (delimited_data | exact_data)
87
    lf?;
88
89
    # note: delim may be any string but must not contain lf.
90
    # data_line may contain any data but must not be exactly
91
    # delim.
92
  delimited_data ::= 'data' sp '<<' delim lf
93
    (data_line lf)*
94
    delim lf;
95
96
     # note: declen indicates the length of binary_data in bytes.
97
     # declen does not include the lf preceding the binary data.
98
     #
99
  exact_data ::= 'data' sp declen lf
100
    binary_data;
101
102
     # note: quoted strings are C-style quoting supporting \c for
103
     # common escapes of 'c' (e.g. \n, \t, \\, \") or \nnn where nnn
104
     # is the signed byte value in octal.  Note that the only
105
     # characters which must actually be escaped to protect the
106
     # stream formatting is: \, " and LF.  Otherwise these values
107
     # are UTF8.
108
     #
109
  ref_str     ::= ref;
110
  sha1exp_str ::= sha1exp;
111
  tag_str     ::= tag;
112
  path_str    ::= path    | '"' quoted(path)    '"' ;
113
  mode        ::= '100644' | '644'
114
                | '100755' | '755'
115
                | '120000'
116
                ;
117
118
  declen ::= # unsigned 32 bit value, ascii base10 notation;
119
  bigint ::= # unsigned integer value, ascii base10 notation;
120
  binary_data ::= # file content, not interpreted;
121
122
  when         ::= raw_when | rfc2822_when;
123
  raw_when     ::= ts sp tz;
124
  rfc2822_when ::= # Valid RFC 2822 date and time;
125
126
  sp ::= # ASCII space character;
127
  lf ::= # ASCII newline (LF) character;
128
129
     # note: a colon (':') must precede the numerical value assigned to
130
     # an idnum.  This is to distinguish it from a ref or tag name as
131
     # GIT does not permit ':' in ref or tag strings.
132
     #
133
  idnum   ::= ':' bigint;
134
  path    ::= # GIT style file path, e.g. "a/b/c";
135
  ref     ::= # GIT ref name, e.g. "refs/heads/MOZ_GECKO_EXPERIMENT";
136
  tag     ::= # GIT tag name, e.g. "FIREFOX_1_5";
137
  sha1exp ::= # Any valid GIT SHA1 expression;
138
  hexsha1 ::= # SHA1 in hexadecimal format;
139
140
     # note: name and email are UTF8 strings, however name must not
141
     # contain '<' or lf and email must not contain any of the
142
     # following: '<', '>', lf.
143
     #
144
  name  ::= # valid GIT author/committer name;
145
  email ::= # valid GIT author/committer email;
146
  ts    ::= # time since the epoch in seconds, ascii base10 notation;
147
  tz    ::= # GIT style timezone;
148
149
     # note: comments may appear anywhere in the input, except
150
     # within a data command.  Any form of the data command
151
     # always escapes the related input from comment processing.
152
     #
153
     # In case it is not clear, the '#' that starts the comment
154
     # must be the first character on that line (an lf must have
155
     # preceded it).
156
     #
157
  comment ::= '#' not_lf* lf;
158
  not_lf  ::= # Any byte that is not ASCII newline (LF);
159
"""
160
161
162
import re
163
import sys
164
165
import commands
166
import dates
167
import errors
168
169
170
## Stream parsing ##
171
172
class LineBasedParser(object):
    """Base class for parsers that consume a stream line by line.

    It keeps track of the current line number (for error reporting) and
    lets callers push lines back so that they can look ahead.
    """

    def __init__(self, input):
        """A Parser that keeps track of line numbers.

        :param input: the file-like object to read from
        """
        self.input = input
        self.lineno = 0
        # Lines pushed back onto the input stream. Used as a stack: the
        # most recently pushed line is the next one returned by readline().
        self._buffer = []

    def abort(self, exception, *args):
        """Raise an exception providing line number information.

        :param exception: the exception class to raise. It is called with
          the current line number followed by ``args``.
        """
        raise exception(self.lineno, *args)

    def readline(self):
        """Get the next line including the newline or '' on EOF.

        Pushed-back lines are returned before anything new is read from
        the input. The line counter is advanced on every call.
        """
        self.lineno += 1
        if self._buffer:
            return self._buffer.pop()
        else:
            return self.input.readline()

    def next_line(self):
        """Get the next line without the newline or None on EOF."""
        line = self.readline()
        if not line:
            return None
        # Only strip a real newline: the last line of a stream may be
        # unterminated and must not lose its final character.
        if line.endswith('\n'):
            return line[:-1]
        return line

    def push_line(self, line):
        """Push line back onto the line buffer.

        :param line: the line with no trailing newline
        """
        self.lineno -= 1
        self._buffer.append(line + "\n")

    def read_bytes(self, count):
        """Read a given number of bytes from the input stream.

        Throws MissingBytes if the bytes are not found.

        Note: This method does not read from the line buffer.

        :param count: the number of bytes to read
        :return: a string
        """
        chunks = []
        left = count
        while left > 0:
            # Ask for everything still missing in one go; loop only to
            # cope with streams that return short reads before EOF.
            chunk = self.input.read(left)
            if not chunk:
                break
            chunks.append(chunk)
            left -= len(chunk)
        if len(chunks) == 1:
            # Common case: avoid copying a potentially large blob.
            data = chunks[0]
        else:
            data = ''.join(chunks)
        found = len(data)
        # Keep line numbers accurate across the (possibly multi-line) data.
        self.lineno += data.count('\n')
        if found != count:
            self.abort(errors.MissingBytes, count, found)
        return data

    def read_until(self, terminator):
        """Read the input stream until the terminator is found.

        The terminator must appear on a line of its own.

        Throws MissingTerminator if the terminator is not found.

        Note: This method does not read from the line buffer.

        :param terminator: the delimiter line, without its newline
        :return: the bytes read up to but excluding the terminator.
        """
        terminator_line = terminator + "\n"
        lines = []
        while True:
            line = self.input.readline()
            if not line:
                # EOF before the delimiter was seen.
                # NOTE(review): assumes errors.MissingTerminator takes
                # (lineno, terminator) - confirm against the errors module.
                self.abort(errors.MissingTerminator, terminator)
            self.lineno += 1
            # The delimiter may be the very last line and lack a newline.
            if line == terminator_line or line == terminator:
                break
            lines.append(line)
        return ''.join(lines)
249
250
0.64.3 by Ian Clatworthy
tweak parser for better git-fast-export compatibility
251
# Regular expressions used for parsing "name <email> when" sections.
# Notes:
# * The spec states that the name part should be non-empty but
#   git-fast-export doesn't always do that, so the name group may be empty.
# * The git-fast-import code says the space before the email is optional,
#   so the name group simply runs up to the first '<' (this also allows
#   multiple words in the name).
# * The email group may be empty as well: an identity written as
#   'name <>' is accepted rather than rejected.
_WHO_AND_WHEN_RE = re.compile(r'([^<]*)<(.*)> (.+)')
# Same as above but without the date, for front-ends that omit it.
_WHO_RE = re.compile(r'([^<]*)<(.*)>')
0.64.1 by Ian Clatworthy
1st cut: gfi parser + --info processing method
257
258
259
class ImportParser(LineBasedParser):
    """Parser turning a git-fast-import style stream into command objects."""

    def __init__(self, input, verbose=False, output=sys.stdout):
        """A Parser of import commands.

        :param input: the file-like object to read from
        :param verbose: display extra information or not
        :param output: the file-like object to write messages to (YAGNI?)
        """
        LineBasedParser.__init__(self, input)
        self.verbose = verbose
        self.output = output
        # We auto-detect the date format when a date is first encountered
        self.date_parser = None

    def iter_commands(self):
        """Iterator returning ImportCommand objects.

        Blank lines and comment lines are skipped. An unrecognised line
        aborts with InvalidCommand.
        """
        while True:
            line = self.next_line()
            if line is None:
                break
            elif len(line) == 0 or line.startswith('#'):
                continue
            # Search for commands in order of likelihood
            elif line.startswith('commit '):
                yield self._parse_commit(line[len('commit '):])
            elif line.startswith('blob'):
                yield self._parse_blob()
            elif line.startswith('progress '):
                yield commands.ProgressCommand(line[len('progress '):])
            elif line.startswith('reset '):
                yield self._parse_reset(line[len('reset '):])
            elif line.startswith('tag '):
                yield self._parse_tag(line[len('tag '):])
            elif line.startswith('checkpoint'):
                yield commands.CheckpointCommand()
            else:
                self.abort(errors.InvalidCommand, line)

    def iter_file_commands(self):
        """Iterator returning FileCommand objects.

        If an invalid file command is found, the line is silently
        pushed back and iteration ends.
        """
        while True:
            line = self.next_line()
            if line is None:
                break
            elif len(line) == 0 or line.startswith('#'):
                continue
            # Search for file commands in order of likelihood
            elif line.startswith('M '):
                yield self._parse_file_modify(line[2:])
            elif line.startswith('D '):
                path = self._path(line[2:])
                yield commands.FileDeleteCommand(path)
            elif line.startswith('R '):
                old, new = self._path_pair(line[2:])
                yield commands.FileRenameCommand(old, new)
            elif line.startswith('C '):
                src, dest = self._path_pair(line[2:])
                # A copy is not a rename: the source must be kept.
                # NOTE(review): assumes commands.FileCopyCommand(src, dest)
                # mirrors FileRenameCommand - confirm in the commands module.
                yield commands.FileCopyCommand(src, dest)
            elif line.startswith('deleteall'):
                yield commands.FileDeleteAllCommand()
            else:
                self.push_line(line)
                break

    def _parse_blob(self):
        """Parse a blob command."""
        # Remember where the command started: unmarked blobs are
        # identified by their line number.
        lineno = self.lineno
        mark = self._get_mark_if_any()
        data = self._get_data('blob')
        return commands.BlobCommand(mark, data, lineno)

    def _parse_commit(self, ref):
        """Parse a commit command.

        :param ref: the ref named on the commit line
        """
        lineno = self.lineno
        mark = self._get_mark_if_any()
        author = self._get_user_info('commit', 'author', False)
        committer = self._get_user_info('commit', 'committer')
        message = self._decode_message(self._get_data('commit', 'message'))
        from_ = self._get_from()
        merges = []
        while True:
            merge = self._get_merge()
            if merge is None:
                break
            merges.append(merge)
        # The file commands are parsed lazily: the bound method is handed
        # over so the consumer decides when to read them from the stream.
        return commands.CommitCommand(ref, mark, author, committer, message,
            from_, merges, self.iter_file_commands, lineno)

    def _parse_file_modify(self, info):
        """Parse a filemodify command within a commit.

        :param info: a string in the format "mode dataref path"
          (where dataref might be the hard-coded literal 'inline').
        """
        params = info.split(' ', 2)
        if len(params) != 3:
            # Report a malformed line properly instead of an IndexError.
            self.abort(errors.BadFormat, 'filemodify', 'path', info)
        path = self._path(params[2])
        is_executable, is_symlink = self._mode(params[0])
        if is_symlink:
            kind = commands.SYMLINK_KIND
        else:
            kind = commands.FILE_KIND
        if params[1] == 'inline':
            # The content follows the command as a data section.
            dataref = None
            data = self._get_data('filemodify')
        else:
            dataref = params[1]
            data = None
        return commands.FileModifyCommand(path, kind, is_executable, dataref,
            data)

    def _parse_reset(self, ref):
        """Parse a reset command.

        :param ref: the ref named on the reset line
        """
        from_ = self._get_from()
        return commands.ResetCommand(ref, from_)

    def _parse_tag(self, name):
        """Parse a tag command.

        :param name: the tag name given on the tag line
        """
        from_ = self._get_from('tag')
        tagger = self._get_user_info('tag', 'tagger', accept_just_who=True)
        message = self._decode_message(self._get_data('tag', 'message'))
        return commands.TagCommand(name, from_, tagger, message)

    def _decode_message(self, message):
        """Decode a message as UTF-8, tolerating broken front-ends.

        :return: the decoded message, or the message unchanged if it is
          not valid UTF-8.
        """
        try:
            return message.decode('utf_8')
        except UnicodeDecodeError:
            # TODO: output a warning here about a broken front-end
            return message

    def _get_mark_if_any(self):
        """Parse a mark section.

        :return: the mark id (without the leading ':') or None if the next
          line is not a mark section.
        """
        line = self.next_line()
        if line is None:
            # EOF: nothing to parse and nothing to push back.
            return None
        if line.startswith('mark :'):
            return line[len('mark :'):]
        self.push_line(line)
        return None

    def _get_from(self, required_for=None):
        """Parse a from section.

        :param required_for: if set, the name of the command that requires
          the section; MissingSection is raised when it is absent.
        :return: the text after 'from ' or None if there is no from section.
        """
        line = self.next_line()
        if line is not None and line.startswith('from '):
            return line[len('from '):]
        if required_for:
            self.abort(errors.MissingSection, required_for, 'from')
        if line is not None:
            self.push_line(line)
        return None

    def _get_merge(self):
        """Parse a merge section.

        :return: the text after 'merge ' or None if the next line is not a
          merge section.
        """
        line = self.next_line()
        if line is None:
            return None
        if line.startswith('merge '):
            return line[len('merge '):]
        self.push_line(line)
        return None

    def _get_user_info(self, cmd, section, required=True,
        accept_just_who=False):
        """Parse a user section.

        :param cmd: the command being parsed (for error messages)
        :param section: the section keyword, e.g. 'author' or 'committer'
        :param required: if True, MissingSection is raised when absent
        :param accept_just_who: if True, a missing date is tolerated
        :return: the tuple built by _who_when() or None if the section is
          absent and not required.
        """
        prefix = section + ' '
        line = self.next_line()
        if line is not None and line.startswith(prefix):
            return self._who_when(line[len(prefix):], cmd, section,
                accept_just_who=accept_just_who)
        if required:
            self.abort(errors.MissingSection, cmd, section)
        if line is not None:
            self.push_line(line)
        return None

    def _get_data(self, required_for, section='data'):
        """Parse a data section.

        :param required_for: the command the data belongs to (for errors)
        :param section: the name to report if the section is missing
        :return: the content of the data section
        """
        line = self.next_line()
        if line is None or not line.startswith('data '):
            self.abort(errors.MissingSection, required_for, section)
        rest = line[len('data '):]
        if rest.startswith('<<'):
            return self.read_until(rest[2:])
        size = int(rest)
        data = self.read_bytes(size)
        # optional LF after data. Read straight from the input, like
        # read_bytes() does, as the line buffer is bypassed for data.
        following = self.input.readline()
        if following == '':
            # EOF: nothing was consumed, so there is nothing to push back.
            pass
        elif following != "\n":
            # Not the optional LF but the start of whatever comes next:
            # hand it back, taking care not to truncate an unterminated
            # final line.
            if following.endswith("\n"):
                following = following[:-1]
            self.lineno += 1
            self.push_line(following)
        else:
            self.lineno += 1
        return data

    def _who_when(self, s, cmd, section, accept_just_who=False):
        """Parse who and when information from a string.

        :param s: the text after the section keyword
        :param cmd: the command being parsed (for error messages)
        :param section: the section being parsed (for error messages)
        :param accept_just_who: if True, a missing date is replaced by
          the current time instead of being an error
        :return: a tuple of (name,email,timestamp,timezone). name may be
            the empty string if only an email address was given.
        """
        match = _WHO_AND_WHEN_RE.search(s)
        if match:
            datestr = match.group(3)
            if self.date_parser is None:
                # auto-detect the date format
                if len(datestr.split(' ')) == 2:
                    format = 'raw'
                elif datestr == 'now':
                    format = 'now'
                else:
                    format = 'rfc2822'
                self.date_parser = dates.DATE_PARSERS_BY_NAME[format]
            when = self.date_parser(datestr)
        else:
            match = _WHO_RE.search(s)
            if accept_just_who and match:
                # HACK around missing time
                # TODO: output a warning here
                when = dates.DATE_PARSERS_BY_NAME['now']('now')
            else:
                self.abort(errors.BadFormat, cmd, section, s)
        name = match.group(1)
        # Drop the space separating the name from '<email>' (it is
        # optional) and always decode, so the type of the returned name
        # does not depend on whether that space was present.
        if name.endswith(' '):
            name = name[:-1]
        name = name.decode('utf_8')
        return (name, match.group(2), when[0], when[1])

    def _path(self, s):
        """Parse a path.

        :param s: the path, either bare or as a quoted C-style string
        :return: the path with any quoting removed
        """
        if s.startswith('"'):
            if len(s) < 2 or s[-1] != '"':
                # The command and section are not known at this level.
                self.abort(errors.BadFormat, '?', '?', s)
            else:
                return _unquote_c_string(s[1:-1])
        return s.decode('utf_8')

    def _path_pair(self, s):
        """Parse two paths separated by a space.

        The first path must be quoted if it contains a space.

        :return: a list holding the two parsed paths
        """
        if s.startswith('"'):
            # Find the closing quote of the first path, stepping over
            # backslash escapes so an escaped quote does not end it.
            end = 1
            while end < len(s) and s[end] != '"':
                if s[end] == '\\':
                    end += 2
                else:
                    end += 1
            if end >= len(s) or s[end + 1:end + 2] != ' ':
                self.abort(errors.BadFormat, '?', '?', s)
            first = s[:end + 1]
            second = s[end + 2:]
        else:
            parts = s.split(' ', 1)
            if len(parts) != 2:
                self.abort(errors.BadFormat, '?', '?', s)
            first, second = parts
        return [self._path(first), self._path(second)]

    def _mode(self, s):
        """Parse a file mode into executable and symlink flags.

        :return (is_executable, is_symlink)
        """
        # Note: Output from git-fast-export slightly different to spec
        if s in ['644', '100644', '0100644']:
            return False, False
        elif s in ['755', '100755', '0100755']:
            return True, False
        elif s in ['120000', '0120000']:
            return False, True
        else:
            self.abort(errors.BadFormat, 'filemodify', 'mode', s)
515
0.64.10 by Ian Clatworthy
1st cut are dequoting paths
516
517
def _unquote_c_string(s):
518
    """replace C-style escape sequences (\n, \", etc.) with real chars."""
519
    # HACK: Python strings are close enough
520
    return s.decode('string_escape', 'replace')