/brz/remove-bazaar

To get this branch, use:
bzr branch http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar
3890.2.7 by John Arbash Meinel
A Pyrex extension is about 5x faster than the fastest python code I could write.
1
# Copyright (C) 2008 Canonical Ltd
2
#
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
7
#
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11
# GNU General Public License for more details.
12
#
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
4183.7.1 by Sabin Iacob
update FSF mailing address
15
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
3890.2.7 by John Arbash Meinel
A Pyrex extension is about 5x faster than the fastest python code I could write.
16
#
17
18
"""Pyrex extensions for converting chunks to lines."""
19
6656.2.2 by Jelmer Vernooij
Use absolute_import.
20
from __future__ import absolute_import
21
6656.2.4 by Jelmer Vernooij
Merge cython-only branch.
22
3890.2.7 by John Arbash Meinel
A Pyrex extension is about 5x faster than the fastest python code I could write.
23
# Pulled in only for its C-level definitions; nothing is declared from it here.
cdef extern from "python-compat.h":
    pass

cdef extern from "stdlib.h":
    ctypedef unsigned size_t

# The subset of the CPython C API that chunks_to_lines() relies on.
cdef extern from "Python.h":
    ctypedef struct PyObject:
        pass
    # Declared "except -1" so a failed append is turned into a Python
    # exception instead of being silently ignored.
    int PyList_Append(object lst, object item) except -1

    # The CheckExact / AS_STRING / GET_SIZE trio gives direct access to the
    # string buffer without triggering any conversion.
    int PyString_CheckExact(object p)
    char *PyString_AS_STRING(object p)
    Py_ssize_t PyString_GET_SIZE(object p)
    object PyString_FromStringAndSize(char *c_str, Py_ssize_t len)

cdef extern from "string.h":
    void *memchr(void *s, int c, size_t n)
41
42
43
def chunks_to_lines(chunks):
    """Re-split chunks into simple lines.

    Each entry in the result should contain a single newline at the end. Except
    for the last entry which may not have a final newline. If chunks is already
    a simple list of lines, we return it directly.

    :param chunks: A list/tuple of strings. If chunks is already a list of
        lines, then we will return it as-is. The sequence is iterated a
        second time when it has to be re-split, so it must be re-iterable.
    :return: A list of strings. This is ``chunks`` itself when no re-splitting
        was needed, otherwise a newly created list.
    :raises TypeError: If any chunk is not exactly a string object
        (subclasses and unicode objects are rejected).
    """
    cdef char *c_str
    cdef char *newline
    cdef char *c_last
    cdef Py_ssize_t the_len
    cdef int last_no_newline

    # Check to see if the chunks are already lines
    last_no_newline = 0
    for chunk in chunks:
        if last_no_newline:
            # We have a chunk which followed a chunk without a newline, so this
            # is not a simple list of lines.
            break
        # Switching from PyString_AsStringAndSize to PyString_CheckExact and
        # then the macros GET_SIZE and AS_STRING saved us 40us / 470us.
        # It seems PyString_AsStringAndSize can actually trigger a conversion,
        # which we don't want anyway.
        if not PyString_CheckExact(chunk):
            raise TypeError('chunk is not a string')
        the_len = PyString_GET_SIZE(chunk)
        if the_len == 0:
            # An empty string is never a valid line
            break
        c_str = PyString_AS_STRING(chunk)
        c_last = c_str + the_len - 1
        # memchr finds the *first* newline; the chunk is a simple line only
        # if that first newline is also the final byte.
        newline = <char *>memchr(c_str, c'\n', the_len)
        if newline != c_last:
            if newline == NULL:
                # Missing a newline. Only valid as the last line
                last_no_newline = 1
            else:
                # There is a newline in the middle, we must resplit
                break
    else:
        # Everything was already a list of lines
        return chunks

    # We know we need to create a new list of lines
    lines = []
    tail = None # Any remainder from the previous chunk
    for chunk in chunks:
        # Validate the chunk itself *before* gluing it onto the pending tail.
        # Concatenating first would let a str subclass slip through (the sum
        # is an exact str) and could raise an unrelated error, e.g. a
        # UnicodeDecodeError for a unicode chunk, instead of the TypeError.
        if not PyString_CheckExact(chunk):
            raise TypeError('chunk is not a string')
        if tail is not None:
            # Both operands are exact strings, so the result is one as well.
            chunk = tail + chunk
            tail = None
        the_len = PyString_GET_SIZE(chunk)
        if the_len == 0:
            # An empty string is never a valid line, and we don't need to
            # append anything
            continue
        c_str = PyString_AS_STRING(chunk)
        c_last = c_str + the_len - 1
        newline = <char *>memchr(c_str, c'\n', the_len)
        if newline == c_last:
            # A simple line
            PyList_Append(lines, chunk)
        elif newline == NULL:
            # A chunk without a newline, if this is the last entry, then we
            # allow it
            tail = chunk
        else:
            # We have a newline in the middle, loop until we've consumed all
            # lines
            while newline != NULL:
                # Copy out everything up to and including the newline.
                line = PyString_FromStringAndSize(c_str, newline - c_str + 1)
                PyList_Append(lines, line)
                c_str = newline + 1
                if c_str > c_last: # We are done
                    break
                the_len = c_last - c_str + 1
                newline = <char *>memchr(c_str, c'\n', the_len)
                if newline == NULL:
                    # The remaining bytes have no newline; carry them over so
                    # they are prepended to the next chunk.
                    tail = PyString_FromStringAndSize(c_str, the_len)
                    break
    if tail is not None:
        # Trailing content without a final newline becomes the last entry.
        PyList_Append(lines, tail)
    return lines