/brz/remove-bazaar

To get this branch, use:
bzr branch http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar
0.64.1 by Ian Clatworthy
1st cut: gfi parser + --info processing method
1
# Copyright (C) 2008 Canonical Ltd
2
#
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
7
#
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11
# GNU General Public License for more details.
12
#
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
16
17
"""Parser of import data into command objects.
18
19
In order to reuse existing front-ends, the stream format is a subset of
20
the one used by git-fast-import (as of the 1.5.4 release of git at least).
21
The grammar is:
22
23
  stream ::= cmd*;
24
25
  cmd ::= new_blob
26
        | new_commit
27
        | new_tag
28
        | reset_branch
29
        | checkpoint
30
        | progress
31
        ;
32
33
  new_blob ::= 'blob' lf
34
    mark?
35
    file_content;
36
  file_content ::= data;
37
38
  new_commit ::= 'commit' sp ref_str lf
39
    mark?
40
    ('author' sp name '<' email '>' when lf)?
41
    'committer' sp name '<' email '>' when lf
42
    commit_msg
43
    ('from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)?
44
    ('merge' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)*
45
    file_change*
46
    lf?;
47
  commit_msg ::= data;
48
49
  file_change ::= file_clr
50
    | file_del
51
    | file_rnm
52
    | file_cpy
53
    | file_obm
54
    | file_inm;
55
  file_clr ::= 'deleteall' lf;
56
  file_del ::= 'D' sp path_str lf;
57
  file_rnm ::= 'R' sp path_str sp path_str lf;
58
  file_cpy ::= 'C' sp path_str sp path_str lf;
59
  file_obm ::= 'M' sp mode sp (hexsha1 | idnum) sp path_str lf;
60
  file_inm ::= 'M' sp mode sp 'inline' sp path_str lf
61
    data;
62
63
  new_tag ::= 'tag' sp tag_str lf
64
    'from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf
65
    'tagger' sp name '<' email '>' when lf
66
    tag_msg;
67
  tag_msg ::= data;
68
69
  reset_branch ::= 'reset' sp ref_str lf
70
    ('from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)?
71
    lf?;
72
73
  checkpoint ::= 'checkpoint' lf
74
    lf?;
75
76
  progress ::= 'progress' sp not_lf* lf
77
    lf?;
78
79
     # note: the first idnum in a stream should be 1 and subsequent
80
     # idnums should not have gaps between values as this will cause
81
     # the stream parser to reserve space for the gapped values.  An
82
     # idnum can be updated in the future to a new object by issuing
83
     # a new mark directive with the old idnum.
84
     #
85
  mark ::= 'mark' sp idnum lf;
86
  data ::= (delimited_data | exact_data)
87
    lf?;
88
89
    # note: delim may be any string but must not contain lf.
90
    # data_line may contain any data but must not be exactly
0.88.2 by Samuel Bronson
Implement here-document style input data.
91
    # delim. The lf after the final data_line is included in
92
    # the data.
0.64.1 by Ian Clatworthy
1st cut: gfi parser + --info processing method
93
  delimited_data ::= 'data' sp '<<' delim lf
94
    (data_line lf)*
95
    delim lf;
96
97
     # note: declen indicates the length of binary_data in bytes.
98
     # declen does not include the lf preceding the binary data.
99
     #
100
  exact_data ::= 'data' sp declen lf
101
    binary_data;
102
103
     # note: quoted strings are C-style quoting supporting \c for
104
     # common escapes of 'c' (e.g. \n, \t, \\, \") or \nnn where nnn
105
     # is the signed byte value in octal.  Note that the only
106
     # characters which must actually be escaped to protect the
107
     # stream formatting is: \, " and LF.  Otherwise these values
108
     # are UTF8.
109
     #
110
  ref_str     ::= ref;
111
  sha1exp_str ::= sha1exp;
112
  tag_str     ::= tag;
113
  path_str    ::= path    | '"' quoted(path)    '"' ;
114
  mode        ::= '100644' | '644'
115
                | '100755' | '755'
116
                | '120000'
117
                ;
118
119
  declen ::= # unsigned 32 bit value, ascii base10 notation;
120
  bigint ::= # unsigned integer value, ascii base10 notation;
121
  binary_data ::= # file content, not interpreted;
122
123
  when         ::= raw_when | rfc2822_when;
124
  raw_when     ::= ts sp tz;
125
  rfc2822_when ::= # Valid RFC 2822 date and time;
126
127
  sp ::= # ASCII space character;
128
  lf ::= # ASCII newline (LF) character;
129
130
     # note: a colon (':') must precede the numerical value assigned to
131
     # an idnum.  This is to distinguish it from a ref or tag name as
132
     # GIT does not permit ':' in ref or tag strings.
133
     #
134
  idnum   ::= ':' bigint;
135
  path    ::= # GIT style file path, e.g. "a/b/c";
136
  ref     ::= # GIT ref name, e.g. "refs/heads/MOZ_GECKO_EXPERIMENT";
137
  tag     ::= # GIT tag name, e.g. "FIREFOX_1_5";
138
  sha1exp ::= # Any valid GIT SHA1 expression;
139
  hexsha1 ::= # SHA1 in hexadecimal format;
140
141
     # note: name and email are UTF8 strings, however name must not
142
     # contain '<' or lf and email must not contain any of the
143
     # following: '<', '>', lf.
144
     #
145
  name  ::= # valid GIT author/committer name;
146
  email ::= # valid GIT author/committer email;
147
  ts    ::= # time since the epoch in seconds, ascii base10 notation;
148
  tz    ::= # GIT style timezone;
149
150
     # note: comments may appear anywhere in the input, except
151
     # within a data command.  Any form of the data command
152
     # always escapes the related input from comment processing.
153
     #
154
     # In case it is not clear, the '#' that starts the comment
155
     # must be the first character on that line (an lf must have
156
     # preceded it).
157
     #
158
  comment ::= '#' not_lf* lf;
159
  not_lf  ::= # Any byte that is not ASCII newline (LF);
160
"""
161
162
163
import re
164
import sys
165
166
import commands
167
import dates
168
import errors
169
170
171
## Stream parsing ##
172
173
class LineBasedParser(object):
    """Base parser that reads a stream line by line and tracks line numbers.

    Lines can be pushed back with push_line(); readline() and next_line()
    hand pushed-back lines out again before touching the input stream.
    read_bytes() and read_until() read straight from the input stream and
    ignore the push-back buffer.
    """

    def __init__(self, input):
        """A Parser that keeps track of line numbers.

        :param input: the file-like object to read from
        """
        self.input = input
        self.lineno = 0
        # Lines pushed back onto the input stream
        self._buffer = []

    def abort(self, exception, *args):
        """Raise an exception providing line number information.

        :param exception: the exception class to raise; it is called with
          the current line number followed by args
        """
        raise exception(self.lineno, *args)

    def readline(self):
        """Get the next line including the newline or '' on EOF."""
        self.lineno += 1
        if self._buffer:
            return self._buffer.pop()
        else:
            return self.input.readline()

    def next_line(self):
        """Get the next line without the newline or None on EOF."""
        line = self.readline()
        if not line:
            return None
        # Only strip a newline that is really there: the last line of a
        # stream may be unterminated and must not lose its final character.
        if line.endswith("\n"):
            return line[:-1]
        return line

    def push_line(self, line):
        """Push line back onto the line buffer.

        :param line: the line with no trailing newline
        """
        self.lineno -= 1
        self._buffer.append(line + "\n")

    def read_bytes(self, count):
        """Read a given number of bytes from the input stream.

        Throws MissingBytes if the bytes are not found.

        Note: This method does not read from the line buffer.

        :return: a string
        """
        result = self.input.read(count)
        found = len(result)
        # Keep the line counter in step with the newlines just consumed
        self.lineno += result.count("\n")
        if found != count:
            self.abort(errors.MissingBytes, count, found)
        return result

    def read_until(self, terminator):
        """Read the input stream until the terminator is found.

        Throws MissingTerminator if the terminator is not found.

        Note: This method does not read from the line buffer.

        :param terminator: the delimiter line, without its newline
        :return: the bytes read up to but excluding the terminator.
        """
        lines = []
        term = terminator + '\n'
        while True:
            line = self.input.readline()
            if not line:
                # EOF before the terminator: fail instead of looping forever
                self.abort(errors.MissingTerminator, terminator)
            # Count every line consumed (terminator included) so that later
            # diagnostics report the right position, as read_bytes() does.
            self.lineno += 1
            if line == term:
                break
            lines.append(line)
        return ''.join(lines)
0.64.1 by Ian Clatworthy
1st cut: gfi parser + --info processing method
248
249
0.64.3 by Ian Clatworthy
tweak parser for better git-fast-export compatibility
250
# Regular expression used for parsing. (Note: The spec states that the name
251
# part should be non-empty but git-fast-export doesn't always do that so
0.65.2 by James Westby
The space between the author and email is optional in committer.
252
# the first bit is \w*, not \w+.) Also git-fast-import code says the
253
# space before the email is optional.
0.64.103 by Ian Clatworthy
handle empty emails & names/paths that aren't utf8 encoded
254
_WHO_AND_WHEN_RE = re.compile(r'([^<]*)<(.*)> (.+)')
255
_WHO_RE = re.compile(r'([^<]*)<(.*)>')
0.64.1 by Ian Clatworthy
1st cut: gfi parser + --info processing method
256
257
258
class ImportParser(LineBasedParser):
    """Parser turning a fast-import stream into command objects.

    iter_commands() is the entry point. The remaining methods parse the
    individual sections that make up each command.
    """

    def __init__(self, input, verbose=False, output=sys.stdout,
        user_mapper=None):
        """A Parser of import commands.

        :param input: the file-like object to read from
        :param verbose: display extra information or not
        :param output: the file-like object to write messages to (YAGNI?)
        :param user_mapper: if not None, the UserMapper used to adjust
          user-ids for authors, committers and taggers.
        """
        LineBasedParser.__init__(self, input)
        self.verbose = verbose
        self.output = output
        self.user_mapper = user_mapper
        # We auto-detect the date format when a date is first encountered
        self.date_parser = None

    def warning(self, msg):
        """Write a warning, prefixed with the current line number, to stderr."""
        sys.stderr.write("warning line %d: %s\n" % (self.lineno, msg))

    def _decode_utf8(self, s, what):
        """Decode s as utf8, replacing undecodable bytes after a warning.

        :param s: the byte string to decode
        :param what: description of the value, used in the warning text
        :return: the decoded string
        """
        try:
            return s.decode('utf_8')
        except UnicodeDecodeError:
            self.warning("%s not in utf8 - replacing unknown characters"
                % (what,))
            return s.decode('utf_8', 'replace')

    def iter_commands(self):
        """Iterator returning ImportCommand objects.

        Blank lines and comment lines are skipped. An unrecognised line
        aborts with InvalidCommand.
        """
        while True:
            line = self.next_line()
            if line is None:
                break
            elif len(line) == 0 or line.startswith('#'):
                continue
            # Search for commands in order of likelihood
            elif line.startswith('commit '):
                yield self._parse_commit(line[len('commit '):])
            elif line.startswith('blob'):
                yield self._parse_blob()
            elif line.startswith('progress '):
                yield commands.ProgressCommand(line[len('progress '):])
            elif line.startswith('reset '):
                yield self._parse_reset(line[len('reset '):])
            elif line.startswith('tag '):
                yield self._parse_tag(line[len('tag '):])
            elif line.startswith('checkpoint'):
                yield commands.CheckpointCommand()
            # Match the same prefix that is sliced off, so that a line such
            # as 'featurefoo' is rejected rather than mis-parsed.
            elif line.startswith('feature '):
                yield self._parse_feature(line[len('feature '):])
            else:
                self.abort(errors.InvalidCommand, line)

    def iter_file_commands(self):
        """Iterator returning FileCommand objects.

        If an invalid file command is found, the line is silently
        pushed back and iteration ends.
        """
        while True:
            line = self.next_line()
            if line is None:
                break
            elif len(line) == 0 or line.startswith('#'):
                continue
            # Search for file commands in order of likelihood
            elif line.startswith('M '):
                yield self._parse_file_modify(line[2:])
            elif line.startswith('D '):
                path = self._path(line[2:])
                yield commands.FileDeleteCommand(path)
            elif line.startswith('R '):
                old, new = self._path_pair(line[2:])
                yield commands.FileRenameCommand(old, new)
            elif line.startswith('C '):
                src, dest = self._path_pair(line[2:])
                yield commands.FileCopyCommand(src, dest)
            elif line.startswith('deleteall'):
                yield commands.FileDeleteAllCommand()
            else:
                self.push_line(line)
                break

    def _parse_blob(self):
        """Parse a blob command."""
        # Remember where the command started: it identifies unmarked blobs
        lineno = self.lineno
        mark = self._get_mark_if_any()
        data = self._get_data('blob')
        return commands.BlobCommand(mark, data, lineno)

    def _parse_commit(self, ref):
        """Parse a commit command.

        :param ref: the text following 'commit ' on the command line
        """
        # Remember where the command started: it identifies unmarked commits
        lineno = self.lineno
        mark = self._get_mark_if_any()
        author = self._get_user_info('commit', 'author', False)
        more_authors = []
        while True:
            another_author = self._get_user_info('commit', 'author', False)
            if another_author is None:
                break
            more_authors.append(another_author)
        committer = self._get_user_info('commit', 'committer')
        message = self._decode_utf8(self._get_data('commit', 'message'),
            "commit message")
        from_ = self._get_from()
        merges = []
        while True:
            merge = self._get_merge()
            if merge is None:
                break
            # while the spec suggests it's illegal, git-fast-export
            # outputs multiple merges on the one line, e.g.
            # merge :x :y :z
            merges.extend(merge.split(" "))
        properties = {}
        while True:
            name_value = self._get_property()
            if name_value is None:
                break
            name, value = name_value
            properties[name] = value
        return commands.CommitCommand(ref, mark, author, committer, message,
            from_, merges, self.iter_file_commands, lineno=lineno,
            more_authors=more_authors, properties=properties)

    def _parse_feature(self, info):
        """Parse a feature command.

        :param info: a string in the format "name" or "name=value"
        """
        parts = info.split("=", 1)
        name = parts[0]
        if len(parts) > 1:
            value = self._path(parts[1])
        else:
            value = None
        return commands.FeatureCommand(name, value, lineno=self.lineno)

    def _parse_file_modify(self, info):
        """Parse a filemodify command within a commit.

        :param info: a string in the format "mode dataref path"
          (where dataref might be the hard-coded literal 'inline').
        """
        params = info.split(' ', 2)
        if len(params) != 3:
            # Report a parse error rather than failing with an IndexError
            self.abort(errors.BadFormat, 'filemodify', 'params', info)
        path = self._path(params[2])
        is_executable, kind = self._mode(params[0])
        if params[1] == 'inline':
            dataref = None
            data = self._get_data('filemodify')
        else:
            dataref = params[1]
            data = None
        return commands.FileModifyCommand(path, kind, is_executable, dataref,
            data)

    def _parse_reset(self, ref):
        """Parse a reset command.

        :param ref: the text following 'reset ' on the command line
        """
        from_ = self._get_from()
        return commands.ResetCommand(ref, from_)

    def _parse_tag(self, name):
        """Parse a tag command.

        :param name: the text following 'tag ' on the command line
        """
        from_ = self._get_from('tag')
        tagger = self._get_user_info('tag', 'tagger', accept_just_who=True)
        # Tolerate non-utf8 messages the same way commit messages are
        message = self._decode_utf8(self._get_data('tag', 'message'),
            "tag message")
        return commands.TagCommand(name, from_, tagger, message)

    def _get_mark_if_any(self):
        """Parse a mark section.

        :return: the text after 'mark :', or None if the next line is not
          a mark (in which case it is pushed back) or the stream has ended.
        """
        line = self.next_line()
        if line is None:
            return None
        elif line.startswith('mark :'):
            return line[len('mark :'):]
        else:
            self.push_line(line)
            return None

    def _get_from(self, required_for=None):
        """Parse a from section.

        :param required_for: if set, the command name to report in a
          MissingSection error when the next line is not a from section
        :return: the text after 'from ', or None
        """
        line = self.next_line()
        if line is None:
            return None
        elif line.startswith('from '):
            return line[len('from '):]
        elif required_for:
            self.abort(errors.MissingSection, required_for, 'from')
        else:
            self.push_line(line)
            return None

    def _get_merge(self):
        """Parse a merge section.

        :return: the text after 'merge ', or None if there is none
        """
        line = self.next_line()
        if line is None:
            return None
        elif line.startswith('merge '):
            return line[len('merge '):]
        else:
            self.push_line(line)
            return None

    def _get_property(self):
        """Parse a property section.

        :return: a (name, value) tuple, or None if there is none
        """
        line = self.next_line()
        if line is None:
            return None
        elif line.startswith('property '):
            return self._name_value(line[len('property '):])
        else:
            self.push_line(line)
            return None

    def _get_user_info(self, cmd, section, required=True,
        accept_just_who=False):
        """Parse a user section.

        :param cmd: the command name, used in error reports
        :param section: the section keyword, e.g. 'author'
        :param required: if True, abort with MissingSection when the
          section is absent; otherwise return None
        :param accept_just_who: passed on to _who_when
        :return: the tuple built by _who_when, or None
        """
        line = self.next_line()
        prefix = section + ' '
        if line is not None and line.startswith(prefix):
            return self._who_when(line[len(prefix):], cmd, section,
                accept_just_who=accept_just_who)
        elif required:
            self.abort(errors.MissingSection, cmd, section)
        elif line is not None:
            # Not ours: leave the line for the next section parser
            self.push_line(line)
        return None

    def _get_data(self, required_for, section='data'):
        """Parse a data section.

        Both the here-document form ('data <<delim') and the byte-count
        form ('data <count>') are handled.

        :param required_for: the command name, used in error reports
        :param section: the section name, used in error reports
        :return: the data read
        """
        line = self.next_line()
        if line is None or not line.startswith('data '):
            self.abort(errors.MissingSection, required_for, section)
        rest = line[len('data '):]
        if rest.startswith('<<'):
            return self.read_until(rest[2:])
        size = int(rest)
        read_bytes = self.read_bytes(size)
        # optional LF after data. Read from the input directly, as
        # read_bytes() did, rather than through the line buffer.
        trailer = self.input.readline()
        if trailer == "\n":
            self.lineno += 1
        elif trailer:
            # Not the optional LF but the start of whatever comes next:
            # hand it back. push_line() takes the line without its newline
            # and undoes the line count added here.
            self.lineno += 1
            if trailer.endswith("\n"):
                trailer = trailer[:-1]
            self.push_line(trailer)
        # else: EOF, nothing to consume or push back
        return read_bytes

    def _who_when(self, s, cmd, section, accept_just_who=False):
        """Parse who and when information from a string.

        :param s: the text following the section keyword
        :param cmd: the command name, used in error reports
        :param section: the section keyword, used in messages
        :param accept_just_who: if True, accept input with no date and
          use the 'now' date parser instead
        :return: a tuple of (name,email,timestamp,timezone). name may be
            the empty string if only an email address was given.
        """
        match = _WHO_AND_WHEN_RE.search(s)
        if match:
            datestr = match.group(3).lstrip()
            if self.date_parser is None:
                # auto-detect the date format
                if len(datestr.split(' ')) == 2:
                    date_format = 'raw'
                elif datestr == 'now':
                    date_format = 'now'
                else:
                    date_format = 'rfc2822'
                self.date_parser = dates.DATE_PARSERS_BY_NAME[date_format]
            try:
                when = self.date_parser(datestr, self.lineno)
            except ValueError:
                # Report through warning() like every other diagnostic
                self.warning("failed to parse datestr '%s'" % (datestr,))
                raise
        else:
            match = _WHO_RE.search(s)
            if accept_just_who and match:
                # HACK around missing time
                # TODO: output a warning here
                when = dates.DATE_PARSERS_BY_NAME['now']('now')
            else:
                self.abort(errors.BadFormat, cmd, section, s)
        name = match.group(1)
        # The space separating the name from '<' is optional; drop it if
        # present, then always decode so the result type is consistent.
        if name.endswith(" "):
            name = name[:-1]
        # The spec says names are *typically* utf8 encoded
        # but that isn't enforced by git-fast-export (at least)
        name = self._decode_utf8(name, "%s name" % (section,))
        # While it shouldn't happen, some datasets have email addresses
        # which contain unicode characters. See bug 338186. We sanitize
        # the data at this level just in case.
        email = self._decode_utf8(match.group(2), "%s email" % (section,))
        if self.user_mapper:
            name, email = self.user_mapper.map_name_and_email(name, email)
        return (name, email, when[0], when[1])

    def _name_value(self, s):
        """Parse a (name,value) tuple from 'name value-length value'."""
        parts = s.split(' ', 2)
        name = parts[0]
        if len(parts) == 1:
            value = None
        else:
            size = int(parts[1])
            value = parts[2]
            still_to_read = size - len(value)
            if still_to_read > 0:
                # Multi-line value: the newline stripped from the first
                # line is part of the value, and the last byte read here
                # is dropped from it.
                read_bytes = self.read_bytes(still_to_read)
                value += "\n" + read_bytes[:still_to_read - 1]
            value = value.decode('utf8')
        return (name, value)

    def _path(self, s):
        """Parse a path.

        A path starting with a double quote must also end with one and is
        unquoted; any other path is decoded as utf8 where possible.
        """
        if s.startswith('"'):
            if s[-1] != '"':
                self.abort(errors.BadFormat, '?', '?', s)
            else:
                return _unquote_c_string(s[1:-1])
        try:
            return s.decode('utf_8')
        except UnicodeDecodeError:
            # The spec recommends utf8 encoding but that isn't enforced
            return s

    def _path_pair(self, s):
        """Parse two paths separated by a space.

        :return: a list of the two unquoted paths
        """
        # An unquoted first path cannot contain a space; a quoted one can
        if s.startswith('"'):
            parts = s[1:].split('" ', 1)
        else:
            parts = s.split(' ', 1)
        if len(parts) != 2:
            self.abort(errors.BadFormat, '?', '?', s)
        elif parts[1].startswith('"') and parts[1].endswith('"'):
            parts[1] = parts[1][1:-1]
        elif parts[1].startswith('"') or parts[1].endswith('"'):
            self.abort(errors.BadFormat, '?', '?', s)
        return [_unquote_c_string(part) for part in parts]

    def _mode(self, s):
        """Parse a file mode into executable and kind.

        :return (is_executable, kind)
        """
        # Note: Output from git-fast-export slightly different to spec
        if s in ['644', '100644', '0100644']:
            return False, commands.FILE_KIND
        elif s in ['755', '100755', '0100755']:
            return True, commands.FILE_KIND
        elif s in ['040000', '0040000']:
            return False, commands.DIRECTORY_KIND
        elif s in ['120000', '0120000']:
            return False, commands.SYMLINK_KIND
        elif s in ['160000', '0160000']:
            return False, commands.TREE_REFERENCE_KIND
        else:
            self.abort(errors.BadFormat, 'filemodify', 'mode', s)
621
0.64.10 by Ian Clatworthy
1st cut are dequoting paths
622
623
def _unquote_c_string(s):
624
    """replace C-style escape sequences (\n, \", etc.) with real chars."""
625
    # HACK: Python strings are close enough
626
    return s.decode('string_escape', 'replace')