# Copyright (C) 2008, 2009 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

"""Core compression logic for compressing streams of related files."""

from itertools import izip
from cStringIO import StringIO
import struct
import zlib
try:
    import pylzma
except ImportError:
    pylzma = None

from bzrlib import (
    annotate,
    debug,
    diff,
    errors,
    graph as _mod_graph,
    osutils,
    pack,
    patiencediff,
    )
from bzrlib.graph import Graph
from bzrlib.knit import _DirectPackAccess
from bzrlib.osutils import (
    contains_whitespace,
    sha_string,
    split_lines,
    )
from bzrlib.btree_index import BTreeBuilder
from bzrlib.lru_cache import LRUSizeCache
from bzrlib.tsort import topo_sort
from bzrlib.versionedfile import (
    adapter_registry,
    AbsentContentFactory,
    ChunkedContentFactory,
    FulltextContentFactory,
    VersionedFiles,
    )

_USE_LZMA = False and (pylzma is not None)
_NO_LABELS = False
_FAST = False

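# NOTE: the code below also depends on _groupcompress_pyx (the compiled
# DeltaIndex/apply_delta implementation); its import is assumed to happen
# further down in the full module and is not shown here.
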
def encode_base128_int(val):
    """Convert an integer into a 7-bit lsb encoding."""
    bytes = []
    count = 0
    while val >= 0x80:
        bytes.append(chr((val | 0x80) & 0xFF))
        val >>= 7
    bytes.append(chr(val))
    return ''.join(bytes)


def decode_base128_int(bytes):
    """Decode an integer from a 7-bit lsb encoding."""
    offset = 0
    val = 0
    shift = 0
    bval = ord(bytes[offset])
    while bval >= 0x80:
        val |= (bval & 0x7F) << shift
        shift += 7
        offset += 1
        bval = ord(bytes[offset])
    val |= bval << shift
    offset += 1
    return val, offset


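# A worked example of the base128 encoding above (Python 2 byte strings, as
# used throughout bzrlib; values verified by hand):
#   encode_base128_int(1)          -> '\x01'
#   encode_base128_int(130)        -> '\x82\x01'  (low 7 bits plus the
#                                                  continuation bit, then the
#                                                  remaining bits)
#   decode_base128_int('\x82\x01') -> (130, 2)    (value, bytes consumed)

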
def sort_gc_optimal(parent_map):
    """Sort and group the keys in parent_map into groupcompress order.

    groupcompress is defined (currently) as reverse-topological order,
    grouped by the key prefix.

    :return: A sorted-list of keys
    """
    # groupcompress ordering is approximately reverse topological,
    # properly grouped by file-id.
    per_prefix_map = {}
    for item in parent_map.iteritems():
        key = item[0]
        if isinstance(key, str) or len(key) == 1:
            prefix = ''
        else:
            prefix = key[0]
        try:
            per_prefix_map[prefix].append(item)
        except KeyError:
            per_prefix_map[prefix] = [item]

    present_keys = []
    for prefix in sorted(per_prefix_map):
        present_keys.extend(reversed(topo_sort(per_prefix_map[prefix])))
    return present_keys


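# A small illustration of sort_gc_optimal above, using hypothetical keys:
#   parent_map = {('f1', 'b'): (('f1', 'a'),),
#                 ('f1', 'a'): (),
#                 ('f2', 'x'): ()}
# gives [('f1', 'b'), ('f1', 'a'), ('f2', 'x')]: keys are grouped by their
# prefix and each group is emitted in reverse-topological (newest-first)
# order.

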
class GroupCompressBlockEntry(object):
    """Track the information about a single object inside a GC group.

    This is generally just the dumb data structure.
    """

    def __init__(self, key, type, sha1, start, length):
        self.key = key
        self.type = type # delta, fulltext, external?
        self.sha1 = sha1 # Sha1 of content
        self.start = start # Byte offset to start of data
        self.length = length # Length of content

    def __repr__(self):
        return '%s(%s, %s, %s, %s, %s)' % (
            self.__class__.__name__,
            self.key, self.type, self.sha1, self.start, self.length
            )


class GroupCompressBlock(object):
    """An object which maintains the internal structure of the compressed data.

    This tracks the meta info (start of text, length, type, etc.)
    """

    # Group Compress Block v1 Zlib
    GCB_HEADER = 'gcb1z\n'
    GCB_LZ_HEADER = 'gcb1l\n'

    def __init__(self):
        # map by key? or just order in file?
        self._entries = {}
        self._content = None
        self._size = 0

    def _parse_header(self):
        """Parse the meta-info from the stream."""

    def __len__(self):
        return self._size

    def _parse_header_bytes(self, header_bytes):
        """Parse the header part of the block."""
        if _NO_LABELS:
            # Don't parse the label structure if we aren't going to use it
            return
        lines = header_bytes.split('\n')
        info_dict = {}
        for line in lines:
            if not line: # End of record
                if not info_dict:
                    break
                self.add_entry(**info_dict)
                info_dict = {}
                continue
            key, value = line.split(':', 1)
            if key == 'key':
                value = tuple(map(intern, value.split('\x00')))
            elif key in ('start', 'length'):
                value = int(value)
            elif key == 'type':
                value = intern(value)
            info_dict[key] = value

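    # The serialised layout that from_bytes() parses and to_bytes() writes
    # (a summary of the code below, not a separately specified format):
    #   bytes 0-5            : 'gcb1z\n' (zlib) or 'gcb1l\n' (lzma)
    #   ascii decimal + '\n' : length of the compressed label header
    #   ascii decimal + '\n' : length of the uncompressed label header
    #   z_header_length bytes: compressed label header (absent if length 0)
    #   remainder            : compressed content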
    @classmethod
    def from_bytes(cls, bytes):
        out = cls()
        if bytes[:6] not in (cls.GCB_HEADER, cls.GCB_LZ_HEADER):
            raise ValueError('bytes did not start with %r' % (cls.GCB_HEADER,))
        if bytes[4] == 'z':
            decomp = zlib.decompress
        elif bytes[4] == 'l':
            decomp = pylzma.decompress
        else:
            raise ValueError('unknown compressor: %r' % (bytes,))
        pos = bytes.index('\n', 6)
        z_header_length = int(bytes[6:pos])
        pos += 1
        pos2 = bytes.index('\n', pos)
        header_length = int(bytes[pos:pos2])
        if z_header_length == 0:
            if header_length != 0:
                raise ValueError('z_header_length 0, but header length != 0')
            zcontent = bytes[pos2+1:]
            if zcontent:
                out._content = decomp(zcontent)
                out._size = len(out._content)
            return out
        pos = pos2 + 1
        pos2 = pos + z_header_length
        z_header_bytes = bytes[pos:pos2]
        if len(z_header_bytes) != z_header_length:
            raise ValueError('Wrong length of compressed header. %s != %s'
                             % (len(z_header_bytes), z_header_length))
        header_bytes = decomp(z_header_bytes)
        if len(header_bytes) != header_length:
            raise ValueError('Wrong length of header. %s != %s'
                             % (len(header_bytes), header_length))
        del z_header_bytes
        out._parse_header_bytes(header_bytes)
        del header_bytes
        zcontent = bytes[pos2:]
        if zcontent:
            out._content = decomp(zcontent)
            out._size = header_length + len(out._content)
        return out

    def extract(self, key, index_memo, sha1=None):
        """Extract the text for a specific key.

        :param key: The label used for this content
        :param sha1: TODO (should we validate only when sha1 is supplied?)
        :return: The bytes for the content
        """
        if _NO_LABELS or not self._entries:
            start, end = index_memo[3:5]
            # The bytes are 'f' or 'd' for the type, then a variable-length
            # base128 integer for the content size, then the actual content.
            # We know that the variable-length integer won't be longer than 10
            # bytes (it only takes 5 bytes to encode 2^32)
            c = self._content[start]
            if c == 'f':
                type = 'fulltext'
            else:
                if c != 'd':
                    raise ValueError('Unknown content control code: %s'
                                     % (c,))
                type = 'delta'
            entry = GroupCompressBlockEntry(key, type, sha1=None,
                                            start=start, length=end-start)
        else:
            entry = self._entries[key]
            c = self._content[entry.start]
            if entry.type == 'fulltext':
                if c != 'f':
                    raise ValueError('Label claimed fulltext, byte claims: %s'
                                     % (c,))
            elif entry.type == 'delta':
                if c != 'd':
                    raise ValueError('Label claimed delta, byte claims: %s'
                                     % (c,))
            start = entry.start
        content_len, len_len = decode_base128_int(
            self._content[entry.start + 1:entry.start + 11])
        content_start = entry.start + 1 + len_len
        end = entry.start + entry.length
        content = self._content[content_start:end]
        if c == 'f':
            bytes = content
        elif c == 'd':
            bytes = _groupcompress_pyx.apply_delta(self._content, content)
        if entry.sha1 is None:
            entry.sha1 = sha_string(bytes)
        return entry, bytes

    def add_entry(self, key, type, sha1, start, length):
        """Add new meta info about an entry.

        :param key: The key for the new content
        :param type: Whether this is a delta or fulltext entry (external?)
        :param sha1: sha1sum of the fulltext of this entry
        :param start: where the encoded bytes start
        :param length: total number of bytes in the encoded form
        :return: The entry?
        """
        entry = GroupCompressBlockEntry(key, type, sha1, start, length)
        if key in self._entries:
            raise ValueError('Duplicate key found: %s' % (key,))
        self._entries[key] = entry
        return entry

    def to_bytes(self, content=''):
        """Encode the information into a byte stream."""
        compress = zlib.compress
        if _USE_LZMA:
            compress = pylzma.compress
        chunks = []
        for key in sorted(self._entries):
            entry = self._entries[key]
            chunk = ('key:%s\n'
                     'sha1:%s\n'
                     'type:%s\n'
                     'start:%s\n'
                     'length:%s\n'
                     '\n'
                     ) % ('\x00'.join(entry.key),
                          entry.sha1,
                          entry.type,
                          entry.start,
                          entry.length,
                          )
            chunks.append(chunk)
        bytes = ''.join(chunks)
        info_len = len(bytes)
        z_bytes = []
        z_bytes.append(compress(bytes))
        del bytes
        # TODO: we may want to have the header compressed in the same chain
        #       as the data, or we may not; evaluate it.
        #       Having them compressed together is probably a win for
        #       revisions and the 'inv' portion of chk inventories, as the
        #       label in the header is duplicated in the text.
        #       For chk pages and real bytes, I would guess this is not
        #       true.
        z_len = sum(map(len, z_bytes))
        c_len = len(content)
        if _NO_LABELS:
            z_bytes = []
            z_len = 0
            info_len = 0
        z_bytes.append(compress(content))
        if _USE_LZMA:
            header = self.GCB_LZ_HEADER
        else:
            header = self.GCB_HEADER
        chunks = [header,
                  '%d\n' % (z_len,),
                  '%d\n' % (info_len,),
                  #'%d\n' % (c_len,),
                  ]
        chunks.extend(z_bytes)
        return ''.join(chunks)


class GroupCompressor(object):
    """Produce a serialised group of compressed texts.

    It contains code very similar to SequenceMatcher because of having a
    similar task. However some key differences apply:
    - there is no junk, we want a minimal edit not a human readable diff.
    - we don't filter very common lines (because we don't know where a good
      range will start, and after the first text we want to be emitting
      minimal edits only).
    - we chain the left side, not the right side
    - we incrementally update the adjacency matrix as new lines are provided.
    - we look for matches in all of the left side, so the routine which does
      the analogous task of find_longest_match does not need to filter on the
      left side.
    """

    def __init__(self, delta=True):
        """Create a GroupCompressor.

        :param delta: If False, do not compress records.
        """
        # Consider seeding the lines with some sort of GC Start flag, or
        # putting it as part of the output stream, rather than in the
        # compressed bytes.
        self.lines = []
        self.endpoint = 0
        self.input_bytes = 0
        self.num_keys = 0
        self.labels_deltas = {}
        self._last = None
        self._delta_index = _groupcompress_pyx.DeltaIndex()
        self._block = GroupCompressBlock()

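    # For reference: every text stored by compress() below is written into
    # the group as a one-byte type code ('f' fulltext, 'd' delta), a
    # base128-encoded payload length, then the payload itself; see the
    # matching parsing in GroupCompressBlock.extract().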
    def compress(self, key, bytes, expected_sha, nostore_sha=None, soft=False):
        """Compress lines with label key.

        :param key: A key tuple. It is stored in the output
            for identification of the text during decompression. If the last
            element is 'None' it is replaced with the sha1 of the text -
            e.g. sha1:xxxxxxx.
        :param bytes: The bytes to be compressed
        :param expected_sha: If non-None, the sha the lines are believed to
            have. During compression the sha is calculated; a mismatch will
            cause an error.
        :param nostore_sha: If the computed sha1 sum matches, we will raise
            ExistingContent rather than adding the text.
        :param soft: Do a 'soft' compression. This means that we require larger
            ranges to match to be considered for a copy command.
        :return: The sha1 of lines, and the number of bytes accumulated in
            the group output so far.
        :seealso VersionedFiles.add_lines:
        """
        if not _FAST or expected_sha is None:
            sha1 = sha_string(bytes)
        else:
            sha1 = expected_sha
        if sha1 == nostore_sha:
            raise errors.ExistingContent()
        if key[-1] is None:
            key = key[:-1] + ('sha1:' + sha1,)
        input_len = len(bytes)
        # By having action/label/sha1/len, we can parse the group if the index
        # was ever destroyed, we have the key in 'label', we know the final
        # bytes are valid from sha1, and we know where to find the end of this
        # record because of 'len'. (the delta record itself will store the
        # total length for the expanded record)
        # 'len: %d\n' costs approximately 1% increase in total data
        # Having the labels at all costs us 9-10% increase, 38% increase for
        # inventory pages, and 5.8% increase for text pages
        # new_chunks = ['label:%s\nsha1:%s\n' % (label, sha1)]
        if self._delta_index._source_offset != self.endpoint:
            raise AssertionError('_source_offset != endpoint:'
                ' somehow the DeltaIndex got out of sync with'
                ' the output lines')
        max_delta_size = len(bytes) / 2
        delta = self._delta_index.make_delta(bytes, max_delta_size)
        if delta is None:
            type = 'fulltext'
            enc_length = encode_base128_int(len(bytes))
            len_mini_header = 1 + len(enc_length)
            length = len(bytes) + len_mini_header
            self._delta_index.add_source(bytes, len_mini_header)
            new_chunks = ['f', enc_length, bytes]
        else:
            type = 'delta'
            enc_length = encode_base128_int(len(delta))
            len_mini_header = 1 + len(enc_length)
            length = len(delta) + len_mini_header
            new_chunks = ['d', enc_length, delta]
            if _FAST:
                self._delta_index._source_offset += length
            else:
                self._delta_index.add_delta_source(delta, len_mini_header)
        self._block.add_entry(key, type=type, sha1=sha1,
                              start=self.endpoint, length=length)
        delta_start = (self.endpoint, len(self.lines))
        self.num_keys += 1
        self.output_chunks(new_chunks)
        self.input_bytes += input_len
        delta_end = (self.endpoint, len(self.lines))
        self.labels_deltas[key] = (delta_start, delta_end)
        if not self._delta_index._source_offset == self.endpoint:
            raise AssertionError('the delta index is out of sync'
                ' with the output lines %s != %s'
                % (self._delta_index._source_offset, self.endpoint))
        return sha1, self.endpoint, type, length

    def extract(self, key):
        """Extract a key previously added to the compressor.

        :param key: The key to extract.
        :return: An iterable over bytes and the sha1.
        """
        delta_details = self.labels_deltas[key]
        delta_chunks = self.lines[delta_details[0][1]:delta_details[1][1]]
        stored_bytes = ''.join(delta_chunks)
        # TODO: Fix this, we shouldn't really be peeking here
        entry = self._block._entries[key]
        if entry.type == 'fulltext':
            if stored_bytes[0] != 'f':
                raise ValueError('Index claimed fulltext, but stored bytes'
                                 ' indicate %s' % (stored_bytes[0],))
            fulltext_len, offset = decode_base128_int(stored_bytes[1:10])
            if fulltext_len + 1 + offset != len(stored_bytes):
                raise ValueError('Index claimed fulltext len, but stored bytes'
                                 ' claim %s != %s'
                                 % (len(stored_bytes),
                                    fulltext_len + 1 + offset))
            bytes = stored_bytes[offset + 1:]
        else:
            if entry.type != 'delta':
                raise ValueError('Unknown entry type: %s' % (entry.type,))
            # XXX: This is inefficient at best
            source = ''.join(self.lines)
            if stored_bytes[0] != 'd':
                raise ValueError('Entry type claims delta, bytes claim %s'
                                 % (stored_bytes[0],))
            delta_len, offset = decode_base128_int(stored_bytes[1:10])
            if delta_len + 1 + offset != len(stored_bytes):
                raise ValueError('Index claimed delta len, but stored bytes'
                                 ' claim %s != %s'
                                 % (len(stored_bytes),
                                    delta_len + 1 + offset))
            bytes = _groupcompress_pyx.apply_delta(source,
                                                   stored_bytes[offset + 1:])
        bytes_sha1 = sha_string(bytes)
        if entry.sha1 != bytes_sha1:
            raise ValueError('Recorded sha1 != measured %s != %s'
                             % (entry.sha1, bytes_sha1))
        return bytes, entry.sha1

    def output_chunks(self, new_chunks):
        """Output some chunks.

        :param new_chunks: The chunks to output.
        """
        self._last = (len(self.lines), self.endpoint)
        endpoint = self.endpoint
        self.lines.extend(new_chunks)
        endpoint += sum(map(len, new_chunks))
        self.endpoint = endpoint

    def pop_last(self):
        """Call this if you want to 'revoke' the last compression.

        After this, the data structures will be rolled back, but you cannot do
        more compression.
        """
        self._delta_index = None
        del self.lines[self._last[0]:]
        self.endpoint = self._last[1]
        self._last = None

    def ratio(self):
        """Return the overall compression ratio."""
        return float(self.input_bytes) / float(self.endpoint)


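# A minimal usage sketch of the factory below (hypothetical names: the
# transport object and the stored key are illustrative only):
#   factory = make_pack_factory(graph=True, delta=True, keylength=1)
#   vf = factory(transport)   # transport: a writable bzrlib transport
#   vf.add_lines(('rev-1',), (), ['some\n', 'content\n'])
#   cleanup_pack_group(vf)    # ends the pack writer and closes the stream

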
def make_pack_factory(graph, delta, keylength):
    """Create a factory for creating a pack based groupcompress.

    This is only functional enough to run interface tests, it doesn't try to
    provide a full pack environment.

    :param graph: Store a graph.
    :param delta: Delta compress contents.
    :param keylength: How long should keys be.
    """
    def factory(transport):
        parents = graph
        ref_length = 0
        if graph:
            ref_length = 1
        graph_index = BTreeBuilder(reference_lists=ref_length,
            key_elements=keylength)
        stream = transport.open_write_stream('newpack')
        writer = pack.ContainerWriter(stream.write)
        writer.begin()
        index = _GCGraphIndex(graph_index, lambda:True, parents=parents,
            add_callback=graph_index.add_nodes)
        access = _DirectPackAccess({})
        access.set_writer(writer, graph_index, (transport, 'newpack'))
        result = GroupCompressVersionedFiles(index, access, delta)
        result.stream = stream
        result.writer = writer
        return result
    return factory


def cleanup_pack_group(versioned_files):
    versioned_files.writer.end()
    versioned_files.stream.close()


class GroupCompressVersionedFiles(VersionedFiles):
    """A group-compress based VersionedFiles implementation."""

    def __init__(self, index, access, delta=True):
        """Create a GroupCompressVersionedFiles object.

        :param index: The index object storing access and graph data.
        :param access: The access object storing raw data.
        :param delta: Whether to delta compress or just entropy compress.
        """
        self._index = index
        self._access = access
        self._delta = delta
        self._unadded_refs = {}
        self._group_cache = LRUSizeCache(max_size=50*1024*1024)
        self._fallback_vfs = []

    def add_lines(self, key, parents, lines, parent_texts=None,
        left_matching_blocks=None, nostore_sha=None, random_id=False,
        check_content=True):
        """Add a text to the store.

        :param key: The key tuple of the text to add.
        :param parents: The parents key tuples of the text to add.
        :param lines: A list of lines. Each line must be a bytestring. And all
            of them except the last must be terminated with \n and contain no
            other \n's. The last line may either contain no \n's or a single
            terminating \n. If the lines list does meet this constraint the
            add routine may error or may succeed - but you will be unable to
            read the data back accurately. (Checking the lines have been split
            correctly is expensive and extremely unlikely to catch bugs so it
            is not done at runtime unless check_content is True.)
        :param parent_texts: An optional dictionary containing the opaque
            representations of some or all of the parents of version_id to
            allow delta optimisations. VERY IMPORTANT: the texts must be those
            returned by add_lines or data corruption can be caused.
        :param left_matching_blocks: a hint about which areas are common
            between the text and its left-hand-parent. The format is
            the SequenceMatcher.get_matching_blocks format.
        :param nostore_sha: Raise ExistingContent and do not add the lines to
            the versioned file if the digest of the lines matches this.
        :param random_id: If True a random id has been selected rather than
            an id determined by some deterministic process such as a converter
            from a foreign VCS. When True the backend may choose not to check
            for uniqueness of the resulting key within the versioned file, so
            this should only be done when the result is expected to be unique
            anyway.
        :param check_content: If True, the lines supplied are verified to be
            bytestrings that are correctly formed lines.
        :return: The text sha1, the number of bytes in the text, and an opaque
            representation of the inserted version which can be provided
            back to future add_lines calls in the parent_texts dictionary.
        """
        self._index._check_write_ok()
        self._check_add(key, lines, random_id, check_content)
        if parents is None:
            # The caller might pass None if there is no graph data, but kndx
            # indexes can't directly store that, so we give them
            # an empty tuple instead.
            parents = ()
        # double handling for now. Make it work until then.
        length = sum(map(len, lines))
        record = ChunkedContentFactory(key, parents, None, lines)
        sha1 = list(self._insert_record_stream([record], random_id=random_id,
                                               nostore_sha=nostore_sha))[0]
        return sha1, length, None

    def add_fallback_versioned_files(self, a_versioned_files):
        """Add a source of texts for texts not present in this knit.

        :param a_versioned_files: A VersionedFiles object.
        """
        self._fallback_vfs.append(a_versioned_files)

    def annotate(self, key):
        """See VersionedFiles.annotate."""
        graph = Graph(self)
        parent_map = self.get_parent_map([key])
        if not parent_map:
            raise errors.RevisionNotPresent(key, self)
        if parent_map[key] is not None:
            search = graph._make_breadth_first_searcher([key])
            keys = set()
            while True:
                try:
                    present, ghosts = search.next_with_ghosts()
                except StopIteration:
                    break
                keys.update(present)
            parent_map = self.get_parent_map(keys)
        else:
            keys = [key]
            parent_map = {key:()}
        head_cache = _mod_graph.FrozenHeadsCache(graph)
        parent_cache = {}
        reannotate = annotate.reannotate
        for record in self.get_record_stream(keys, 'topological', True):
            key = record.key
            chunks = osutils.chunks_to_lines(record.get_bytes_as('chunked'))
            parent_lines = [parent_cache[parent] for parent in parent_map[key]]
            parent_cache[key] = list(
                reannotate(parent_lines, chunks, key, None, head_cache))
        return parent_cache[key]

    def check(self, progress_bar=None):
        """See VersionedFiles.check()."""
        keys = self.keys()
        for record in self.get_record_stream(keys, 'unordered', True):
            record.get_bytes_as('fulltext')

    def _check_add(self, key, lines, random_id, check_content):
        """check that version_id and lines are safe to add."""
        version_id = key[-1]
        if version_id is not None:
            if contains_whitespace(version_id):
                raise errors.InvalidRevisionId(version_id, self)
        self.check_not_reserved_id(version_id)
        # TODO: If random_id==False and the key is already present, we should
        # probably check that the existing content is identical to what is
        # being inserted, and otherwise raise an exception. This would make
        # the bundle code simpler.
        if check_content:
            self._check_lines_not_unicode(lines)
            self._check_lines_are_lines(lines)

    def get_parent_map(self, keys):
        """Get a map of the graph parents of keys.

        :param keys: The keys to look up parents for.
        :return: A mapping from keys to parents. Absent keys are absent from
            the mapping.
        """
        return self._get_parent_map_with_sources(keys)[0]

    def _get_parent_map_with_sources(self, keys):
        """Get a map of the parents of keys.

        :param keys: The keys to look up parents for.
        :return: A tuple. The first element is a mapping from keys to parents.
            Absent keys are absent from the mapping. The second element is a
            list with the locations each key was found in. The first element
            is the in-this-knit parents, the second the first fallback source,
            and so on.
        """
        result = {}
        sources = [self._index] + self._fallback_vfs
        source_results = []
        missing = set(keys)
        for source in sources:
            if not missing:
                break
            new_result = source.get_parent_map(missing)
            source_results.append(new_result)
            result.update(new_result)
            missing.difference_update(set(new_result))
        return result, source_results

    def _get_block(self, index_memo):
        read_memo = index_memo[0:3]
        # get the group:
        try:
            block = self._group_cache[read_memo]
        except KeyError:
            # read the group
            zdata = self._access.get_raw_records([read_memo]).next()
            # decompress - whole thing - this is not a bug, as it
            # permits caching. We might want to store the partially
            # decompressed group and decompress object, so that recent
            # texts are not penalised by big groups.
            block = GroupCompressBlock.from_bytes(zdata)
            self._group_cache[read_memo] = block
        # cheapo debugging:
        # print len(zdata), len(plain)
        # parse - requires split_lines, better to have byte offsets
        # here (but not by much - we only split the region for the
        # recipe, and we often want to end up with lines anyway).
        return block

    def get_missing_compression_parent_keys(self):
        """Return the keys of missing compression parents.

        Missing compression parents occur when a record stream was missing
        basis texts, or an index was scanned that had missing basis texts.
        """
        # GroupCompress cannot currently reference texts that are not in the
        # group, so this is valid for now
        return frozenset()

    def get_record_stream(self, keys, ordering, include_delta_closure):
        """Get a stream of records for keys.

        :param keys: The keys to include.
        :param ordering: Either 'unordered' or 'topological'. A topologically
            sorted stream has compression parents strictly before their
            children.
        :param include_delta_closure: If True then the closure across any
            compression parents will be included (in the opaque data).
        :return: An iterator of ContentFactory objects, each of which is only
            valid until the iterator is advanced.
        """
        # keys might be a generator
        orig_keys = list(keys)
        keys = set(keys)
        if not keys:
            return
        if (not self._index.has_graph
            and ordering in ('topological', 'groupcompress')):
            # Cannot topological order when no graph has been stored.
            # but we allow 'as-requested' or 'unordered'
            ordering = 'unordered'

        remaining_keys = keys
        while True:
            try:
                keys = set(remaining_keys)
                for content_factory in self._get_remaining_record_stream(keys,
                        orig_keys, ordering, include_delta_closure):
                    remaining_keys.discard(content_factory.key)
                    yield content_factory
                return
            except errors.RetryWithNewPacks, e:
                self._access.reload_or_raise(e)

    def _find_from_fallback(self, missing):
        """Find whatever keys you can from the fallbacks.

        :param missing: A set of missing keys. This set will be mutated as
            keys are found from a fallback_vfs
        :return: (parent_map, key_to_source_map, source_results)
            parent_map  the overall key => parent_keys
            key_to_source_map  a dict from {key: source}
            source_results  a list of (source: keys)
        """
        parent_map = {}
        key_to_source_map = {}
        source_results = []
        for source in self._fallback_vfs:
            if not missing:
                break
            source_parents = source.get_parent_map(missing)
            parent_map.update(source_parents)
            source_parents = list(source_parents)
            source_results.append((source, source_parents))
            key_to_source_map.update((key, source) for key in source_parents)
            missing.difference_update(source_parents)
        return parent_map, key_to_source_map, source_results

801 |
def _get_ordered_source_keys(self, ordering, parent_map, key_to_source_map): |
|
802 |
"""Get the (source, [keys]) list. |
|
803 |
||
804 |
The returned objects should be in the order defined by 'ordering',
|
|
805 |
which can weave between different sources.
|
|
806 |
:param ordering: Must be one of 'topological' or 'groupcompress'
|
|
807 |
:return: List of [(source, [keys])] tuples, such that all keys are in
|
|
808 |
the defined order, regardless of source.
|
|
809 |
"""
|
|
810 |
if ordering == 'topological': |
|
811 |
present_keys = topo_sort(parent_map) |
|
812 |
else: |
|
813 |
# ordering == 'groupcompress'
|
|
814 |
# XXX: This only optimizes for the target ordering. We may need
|
|
815 |
# to balance that with the time it takes to extract
|
|
816 |
# ordering, by somehow grouping based on
|
|
817 |
# locations[key][0:3]
|
|
818 |
present_keys = sort_gc_optimal(parent_map) |
|
819 |
# Now group by source:
|
|
820 |
source_keys = [] |
|
821 |
current_source = None |
|
822 |
for key in present_keys: |
|
823 |
source = key_to_source_map.get(key, self) |
|
824 |
if source is not current_source: |
|
825 |
source_keys.append((source, [])) |
|
|
3735.2.151
by John Arbash Meinel
A the source grouping code needs to update current_source |
826 |
current_source = source |
|
3735.31.18
by John Arbash Meinel
Implement stacking support across all ordering implementations. |
827 |
source_keys[-1][1].append(key) |
828 |
return source_keys |
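# Grouping sketch (hypothetical keys): with present_keys ordered as
# [('a',), ('b',), ('c',)] and key_to_source_map == {('b',): fb}, the
# loop above emits one (source, keys) run per consecutive stretch:
#
#   [(self, [('a',)]), (fb, [('b',)]), (self, [('c',)])]
#
# so the requested ordering is preserved while each switch between
# sources starts a new run.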
|
829 |
||
830 |
def _get_as_requested_source_keys(self, orig_keys, locations, unadded_keys, |
|
831 |
key_to_source_map): |
|
832 |
source_keys = [] |
|
833 |
current_source = None |
|
834 |
for key in orig_keys: |
|
835 |
if key in locations or key in unadded_keys: |
|
836 |
source = self |
|
837 |
elif key in key_to_source_map: |
|
838 |
source = key_to_source_map[key] |
|
839 |
else: # absent |
|
840 |
continue
|
|
841 |
if source is not current_source: |
|
842 |
source_keys.append((source, [])) |
|
|
3735.2.151
by John Arbash Meinel
A the source grouping code needs to update current_source |
843 |
current_source = source |
|
3735.31.18
by John Arbash Meinel
Implement stacking support across all ordering implementations. |
844 |
source_keys[-1][1].append(key) |
845 |
return source_keys |
|
846 |
||
847 |
def _get_io_ordered_source_keys(self, locations, unadded_keys, |
|
848 |
source_result): |
|
849 |
def get_group(key): |
|
850 |
# This is the group the bytes are stored in, followed by the
|
|
851 |
# location in the group
|
|
852 |
return locations[key][0] |
|
853 |
present_keys = sorted(locations.iterkeys(), key=get_group) |
|
854 |
# We don't have an ordering for keys in the in-memory object, but
|
|
855 |
# let's process the in-memory ones first.
|
|
856 |
present_keys = list(unadded_keys) + present_keys |
|
857 |
# Now grab all of the ones from other sources
|
|
858 |
source_keys = [(self, present_keys)] |
|
859 |
source_keys.extend(source_result) |
|
860 |
return source_keys |
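# Ordering sketch (hypothetical index memos): if locations were
# {('a',): (memo_for_block2, ...), ('b',): (memo_for_block1, ...)},
# sorting on locations[key][0] clusters keys by the group they live in,
# e.g. present_keys == [('b',), ('a',)], so each block is fetched and
# decompressed once rather than thrashing the _group_cache.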
|
861 |
||
862 |
def _get_remaining_record_stream(self, keys, orig_keys, ordering, |
|
863 |
include_delta_closure): |
|
864 |
"""Get a stream of records for keys. |
|
865 |
||
866 |
:param keys: The keys to include.
:param orig_keys: The keys in their originally requested order (used
    when ordering is 'as-requested').
|
|
867 |
:param ordering: one of 'unordered', 'topological', 'groupcompress' or
|
|
868 |
'as-requested'
|
|
869 |
:param include_delta_closure: If True then the closure across any
|
|
870 |
compression parents will be included (in the opaque data).
|
|
871 |
:return: An iterator of ContentFactory objects, each of which is only
|
|
872 |
valid until the iterator is advanced.
|
|
873 |
"""
|
|
|
0.17.5
by Robert Collins
nograph tests completely passing. |
874 |
# Cheap: iterate
|
875 |
locations = self._index.get_build_details(keys) |
|
|
3735.31.18
by John Arbash Meinel
Implement stacking support across all ordering implementations. |
876 |
unadded_keys = set(self._unadded_refs).intersection(keys) |
877 |
missing = keys.difference(locations) |
|
878 |
missing.difference_update(unadded_keys) |
|
879 |
(fallback_parent_map, key_to_source_map, |
|
880 |
source_result) = self._find_from_fallback(missing) |
|
881 |
if ordering in ('topological', 'groupcompress'): |
|
|
0.17.5
by Robert Collins
nograph tests completely passing. |
882 |
# would be better to not globally sort initially but instead
|
883 |
# start with one key, recurse to its oldest parent, then grab
|
|
884 |
# everything in the same group, etc.
|
|
885 |
parent_map = dict((key, details[2]) for key, details in |
|
886 |
locations.iteritems()) |
|
|
3735.31.18
by John Arbash Meinel
Implement stacking support across all ordering implementations. |
887 |
for key in unadded_keys: |
888 |
parent_map[key] = self._unadded_refs[key] |
|
889 |
parent_map.update(fallback_parent_map) |
|
890 |
source_keys = self._get_ordered_source_keys(ordering, parent_map, |
|
891 |
key_to_source_map) |
|
|
0.22.6
by John Arbash Meinel
Clustering chk pages properly makes a big difference. |
892 |
elif ordering == 'as-requested': |
|
3735.31.18
by John Arbash Meinel
Implement stacking support across all ordering implementations. |
893 |
source_keys = self._get_as_requested_source_keys(orig_keys, |
894 |
locations, unadded_keys, key_to_source_map) |
|
|
0.17.5
by Robert Collins
nograph tests completely passing. |
895 |
else: |
|
0.20.10
by John Arbash Meinel
Change the extraction ordering for 'unordered'. |
896 |
# We want to yield the keys in a semi-optimal (read-wise) ordering.
|
897 |
# Otherwise we thrash the _group_cache and destroy performance
|
|
|
3735.31.18
by John Arbash Meinel
Implement stacking support across all ordering implementations. |
898 |
source_keys = self._get_io_ordered_source_keys(locations, |
899 |
unadded_keys, source_result) |
|
900 |
for key in missing: |
|
|
0.17.5
by Robert Collins
nograph tests completely passing. |
901 |
yield AbsentContentFactory(key) |
|
3735.31.18
by John Arbash Meinel
Implement stacking support across all ordering implementations. |
902 |
for source, keys in source_keys: |
903 |
if source is self: |
|
904 |
for key in keys: |
|
905 |
if key in self._unadded_refs: |
|
906 |
bytes, sha1 = self._compressor.extract(key) |
|
907 |
parents = self._unadded_refs[key] |
|
908 |
else: |
|
909 |
index_memo, _, parents, (method, _) = locations[key] |
|
910 |
block = self._get_block(index_memo) |
|
911 |
entry, bytes = block.extract(key, index_memo) |
|
912 |
sha1 = entry.sha1 |
|
913 |
# TODO: If we don't have labels, then the sha1 here is computed
|
|
914 |
# from the data, so we don't want to re-sha the string.
|
|
915 |
if not _FAST and sha_string(bytes) != sha1: |
|
916 |
raise AssertionError('sha1 sum did not match') |
|
917 |
yield FulltextContentFactory(key, parents, sha1, bytes) |
|
|
0.17.11
by Robert Collins
Add extraction of just-compressed texts to support converting from knits. |
918 |
else: |
|
3735.31.18
by John Arbash Meinel
Implement stacking support across all ordering implementations. |
919 |
for record in source.get_record_stream(keys, ordering, |
920 |
include_delta_closure): |
|
921 |
yield record |
|
|
0.20.5
by John Arbash Meinel
Finish the Fulltext => Chunked conversions so that we work in the more-efficient Chunks. |
922 |
|
|
0.17.5
by Robert Collins
nograph tests completely passing. |
923 |
def get_sha1s(self, keys): |
924 |
"""See VersionedFiles.get_sha1s().""" |
|
925 |
result = {} |
|
926 |
for record in self.get_record_stream(keys, 'unordered', True): |
|
927 |
if record.sha1 is not None: |
|
928 |
result[record.key] = record.sha1 |
|
929 |
else: |
|
930 |
if record.storage_kind != 'absent': |
|
931 |
result[record.key] = sha_string(record.get_bytes_as( |
|
932 |
'fulltext')) |
|
933 |
return result |
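# Usage sketch (hypothetical key; `vf` assumed populated):
#
#   sha1s = vf.get_sha1s([('rev-1',)])
#   # => {('rev-1',): '<40-char hex sha1>'}; absent keys are omitted.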
|
934 |
||
|
0.17.2
by Robert Collins
Core proof of concept working. |
935 |
def insert_record_stream(self, stream): |
936 |
"""Insert a record stream into this container. |
|
937 |
||
|
3735.31.2
by John Arbash Meinel
Cleanup trailing whitespace, get test_source to pass by removing asserts. |
938 |
:param stream: A stream of records to insert.
|
|
0.17.2
by Robert Collins
Core proof of concept working. |
939 |
:return: None
|
940 |
:seealso: VersionedFiles.get_record_stream
|
|
941 |
"""
|
|
|
0.17.5
by Robert Collins
nograph tests completely passing. |
942 |
for _ in self._insert_record_stream(stream): |
943 |
pass
|
|
|
0.17.2
by Robert Collins
Core proof of concept working. |
944 |
|
|
3735.31.12
by John Arbash Meinel
Push nostore_sha down through the stack. |
945 |
def _insert_record_stream(self, stream, random_id=False, nostore_sha=None): |
|
0.17.2
by Robert Collins
Core proof of concept working. |
946 |
"""Internal core to insert a record stream into this container. |
947 |
||
948 |
This helper function has a different interface from insert_record_stream
|
|
949 |
to allow add_lines to be minimal, but still return the needed data.
|
|
950 |
||
|
3735.31.2
by John Arbash Meinel
Cleanup trailing whitespace, get test_source to pass by removing asserts. |
951 |
:param stream: A stream of records to insert.
|
|
3735.31.12
by John Arbash Meinel
Push nostore_sha down through the stack. |
952 |
:param nostore_sha: If the sha1 of a given text matches nostore_sha,
|
953 |
raise ExistingContent, rather than committing the new text.
|
|
|
0.17.2
by Robert Collins
Core proof of concept working. |
954 |
:return: An iterator over the sha1 of the inserted records.
|
955 |
:seealso: insert_record_stream
|
|
956 |
:seealso: add_lines
|
|
957 |
"""
|
|
|
0.20.29
by Ian Clatworthy
groupcompress.py code cleanups |
958 |
adapters = {} |
|
0.17.5
by Robert Collins
nograph tests completely passing. |
959 |
def get_adapter(adapter_key): |
960 |
try: |
|
961 |
return adapters[adapter_key] |
|
962 |
except KeyError: |
|
963 |
adapter_factory = adapter_registry.get(adapter_key) |
|
964 |
adapter = adapter_factory(self) |
|
965 |
adapters[adapter_key] = adapter |
|
966 |
return adapter |
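# Memoisation sketch: get_adapter builds at most one adapter per
# (storage_kind, 'fulltext') pair and reuses it for every later record
# in the stream, e.g. (hypothetical storage kind):
#
#   adapter = get_adapter(('knit-ft-gz', 'fulltext'))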
|
|
0.17.2
by Robert Collins
Core proof of concept working. |
967 |
# This will go up to fulltexts for gc to gc fetching, which isn't
|
968 |
# ideal.
|
|
|
0.17.11
by Robert Collins
Add extraction of just-compressed texts to support converting from knits. |
969 |
self._compressor = GroupCompressor(self._delta) |
970 |
self._unadded_refs = {} |
|
|
0.17.5
by Robert Collins
nograph tests completely passing. |
971 |
keys_to_add = [] |
972 |
basis_end = 0 |
|
|
0.17.6
by Robert Collins
Cap group size at 20MB internal buffer. (Probably way too big). |
973 |
def flush(): |
|
0.25.7
by John Arbash Meinel
Have the GroupCompressBlock decide how to compress the header and content. |
974 |
bytes = self._compressor._block.to_bytes( |
975 |
''.join(self._compressor.lines)) |
|
|
0.17.6
by Robert Collins
Cap group size at 20MB internal buffer. (Probably way too big). |
976 |
index, start, length = self._access.add_raw_records( |
|
0.25.7
by John Arbash Meinel
Have the GroupCompressBlock decide how to compress the header and content. |
977 |
[(None, len(bytes))], bytes)[0] |
|
0.17.6
by Robert Collins
Cap group size at 20MB internal buffer. (Probably way too big). |
978 |
nodes = [] |
979 |
for key, reads, refs in keys_to_add: |
|
980 |
nodes.append((key, "%d %d %s" % (start, length, reads), refs)) |
|
981 |
self._index.add_records(nodes, random_id=random_id) |
|
|
0.25.10
by John Arbash Meinel
Play around with detecting compression breaks. |
982 |
self._unadded_refs = {} |
983 |
del keys_to_add[:] |
|
984 |
self._compressor = GroupCompressor(self._delta) |
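# Index value sketch: each node's value packs four integers,
# "start length basis_end end_point".  A group written at pack offset 0
# occupying 4096 compressed bytes, containing a record that spans
# [0, 1024) of the uncompressed stream, would (hypothetically) store
# the value '0 4096 0 1024', which _node_to_position later re-parses.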
|
985 |
||
|
0.20.15
by John Arbash Meinel
Change so that regions that have lots of copies get converted back |
986 |
last_prefix = None |
|
0.25.10
by John Arbash Meinel
Play around with detecting compression breaks. |
987 |
last_fulltext_len = None |
988 |
max_fulltext_len = 0 |
|
|
0.25.11
by John Arbash Meinel
Slightly different handling of large texts. |
989 |
max_fulltext_prefix = None |
|
0.17.2
by Robert Collins
Core proof of concept working. |
990 |
for record in stream: |
|
0.17.5
by Robert Collins
nograph tests completely passing. |
991 |
# Raise an error when a record is missing.
|
992 |
if record.storage_kind == 'absent': |
|
|
0.20.29
by Ian Clatworthy
groupcompress.py code cleanups |
993 |
raise errors.RevisionNotPresent(record.key, self) |
|
0.20.18
by John Arbash Meinel
Implement new handling of get_bytes_as(), and get_missing_compression_parent_keys() |
994 |
try: |
|
0.23.52
by John Arbash Meinel
Use the max_delta flag. |
995 |
bytes = record.get_bytes_as('fulltext') |
|
0.20.18
by John Arbash Meinel
Implement new handling of get_bytes_as(), and get_missing_compression_parent_keys() |
996 |
except errors.UnavailableRepresentation: |
|
0.17.5
by Robert Collins
nograph tests completely passing. |
997 |
adapter_key = record.storage_kind, 'fulltext' |
998 |
adapter = get_adapter(adapter_key) |
|
|
0.20.21
by John Arbash Meinel
Merge the chk sorting code. |
999 |
bytes = adapter.get_bytes(record) |
|
0.20.13
by John Arbash Meinel
Play around a bit. |
1000 |
if len(record.key) > 1: |
1001 |
prefix = record.key[0] |
|
|
0.25.11
by John Arbash Meinel
Slightly different handling of large texts. |
1002 |
soft = (prefix == last_prefix) |
|
0.25.10
by John Arbash Meinel
Play around with detecting compression breaks. |
1003 |
else: |
1004 |
prefix = None |
|
|
0.25.11
by John Arbash Meinel
Slightly different handling of large texts. |
1005 |
soft = False |
1006 |
if max_fulltext_len < len(bytes): |
|
1007 |
max_fulltext_len = len(bytes) |
|
1008 |
max_fulltext_prefix = prefix |
|
|
0.25.10
by John Arbash Meinel
Play around with detecting compression breaks. |
1009 |
(found_sha1, end_point, type, |
1010 |
length) = self._compressor.compress(record.key, |
|
|
3735.31.12
by John Arbash Meinel
Push nostore_sha down through the stack. |
1011 |
bytes, record.sha1, soft=soft, |
1012 |
nostore_sha=nostore_sha) |
|
|
0.25.11
by John Arbash Meinel
Slightly different handling of large texts. |
1013 |
# delta_ratio = float(len(bytes)) / length
|
|
0.25.10
by John Arbash Meinel
Play around with detecting compression breaks. |
1014 |
# Check if we want to continue to include that text
|
|
0.25.11
by John Arbash Meinel
Slightly different handling of large texts. |
1015 |
if (prefix == max_fulltext_prefix |
1016 |
and end_point < 2 * max_fulltext_len): |
|
1017 |
# As long as we are on the same file_id, we will fill at least
|
|
1018 |
# 2 * max_fulltext_len
|
|
1019 |
start_new_block = False |
|
1020 |
elif end_point > 4*1024*1024: |
|
1021 |
start_new_block = True |
|
1022 |
elif (prefix is not None and prefix != last_prefix |
|
1023 |
and end_point > 2*1024*1024): |
|
1024 |
start_new_block = True |
|
1025 |
else: |
|
1026 |
start_new_block = False |
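# Worked example of the heuristic above (hypothetical sizes): with
# max_fulltext_len == 3MB and an unchanged prefix, end_point == 5MB
# still appends, because the first branch fires (5MB < 2 * 3MB) before
# the 4MB cap is consulted; whereas a prefix (file-id) change at
# end_point == 2.5MB starts a new block early.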
|
|
0.25.10
by John Arbash Meinel
Play around with detecting compression breaks. |
1027 |
# if type == 'fulltext':
|
1028 |
# # If this is the first text, we don't do anything
|
|
1029 |
# if self._compressor.num_keys > 1:
|
|
1030 |
# if prefix is not None and prefix != last_prefix:
|
|
1031 |
# # We just inserted a fulltext for a different prefix
|
|
1032 |
# # (aka file-id).
|
|
1033 |
# if end_point > 512 * 1024:
|
|
1034 |
# start_new_block = True
|
|
1035 |
# # TODO: Consider packing several small texts together
|
|
1036 |
# # maybe only flush if end_point > some threshold
|
|
1037 |
# # if end_point > 512 * 1024 or len(bytes) <
|
|
1038 |
# # start_new_block = true
|
|
1039 |
# else:
|
|
1040 |
# # We just added a fulltext, part of the same file-id
|
|
1041 |
# if (end_point > 2*1024*1024
|
|
1042 |
# and end_point > 5*max_fulltext_len):
|
|
1043 |
# start_new_block = True
|
|
1044 |
# last_fulltext_len = len(bytes)
|
|
1045 |
# else:
|
|
1046 |
# delta_ratio = float(len(bytes)) / length
|
|
1047 |
# if delta_ratio < 3: # Not much compression
|
|
1048 |
# if end_point > 1*1024*1024:
|
|
1049 |
# start_new_block = True
|
|
1050 |
# elif delta_ratio < 10: # 10:1 compression
|
|
1051 |
# if end_point > 4*1024*1024:
|
|
1052 |
# start_new_block = True
|
|
1053 |
last_prefix = prefix |
|
1054 |
if start_new_block: |
|
1055 |
self._compressor.pop_last() |
|
1056 |
flush() |
|
1057 |
basis_end = 0 |
|
1058 |
max_fulltext_len = len(bytes) |
|
1059 |
(found_sha1, end_point, type, |
|
1060 |
length) = self._compressor.compress(record.key, |
|
1061 |
bytes, record.sha1) |
|
1062 |
last_fulltext_len = length |
|
|
0.17.26
by Robert Collins
Working better --gc-plain-chk. |
1063 |
if record.key[-1] is None: |
1064 |
key = record.key[:-1] + ('sha1:' + found_sha1,) |
|
1065 |
else: |
|
1066 |
key = record.key |
|
1067 |
self._unadded_refs[key] = record.parents |
|
|
0.17.3
by Robert Collins
new encoder, allows non monotonically increasing sequence matches for moar compression. |
1068 |
yield found_sha1 |
|
0.17.26
by Robert Collins
Working better --gc-plain-chk. |
1069 |
keys_to_add.append((key, '%d %d' % (basis_end, end_point), |
|
0.17.5
by Robert Collins
nograph tests completely passing. |
1070 |
(record.parents,))) |
1071 |
basis_end = end_point |
|
|
0.17.8
by Robert Collins
Flush pending updates at the end of _insert_record_stream |
1072 |
if len(keys_to_add): |
1073 |
flush() |
|
|
0.17.11
by Robert Collins
Add extraction of just-compressed texts to support converting from knits. |
1074 |
self._compressor = None |
|
0.17.5
by Robert Collins
nograph tests completely passing. |
1075 |
|
1076 |
def iter_lines_added_or_present_in_keys(self, keys, pb=None): |
|
1077 |
"""Iterate over the lines in the versioned files from keys. |
|
1078 |
||
1079 |
This may return lines from other keys. Each item the returned
|
|
1080 |
iterator yields is a tuple of a line and a text version that that line
|
|
1081 |
is present in (not introduced in).
|
|
1082 |
||
1083 |
Ordering of results is in whatever order is most suitable for the
|
|
1084 |
underlying storage format.
|
|
1085 |
||
1086 |
If a progress bar is supplied, it may be used to indicate progress.
|
|
1087 |
The caller is responsible for cleaning up progress bars (because this
|
|
1088 |
is an iterator).
|
|
1089 |
||
1090 |
NOTES:
|
|
1091 |
* Lines are normalised by the underlying store: they will all have \n
|
|
1092 |
terminators.
|
|
1093 |
* Lines are returned in arbitrary order.
|
|
1094 |
||
1095 |
:return: An iterator over (line, key).
|
|
1096 |
"""
|
|
1097 |
if pb is None: |
|
1098 |
pb = progress.DummyProgress() |
|
1099 |
keys = set(keys) |
|
1100 |
total = len(keys) |
|
1101 |
# we don't care about inclusions, the caller cares.
|
|
1102 |
# but we need to setup a list of records to visit.
|
|
1103 |
# we need key, position, length
|
|
1104 |
for key_idx, record in enumerate(self.get_record_stream(keys, |
|
1105 |
'unordered', True)): |
|
1106 |
# XXX: todo - optimise to use less than full texts.
|
|
1107 |
key = record.key |
|
|
3735.32.1
by John Arbash Meinel
Fix the VF WalkingContent checks. |
1108 |
pb.update('Walking content', key_idx, total) |
|
0.17.5
by Robert Collins
nograph tests completely passing. |
1109 |
if record.storage_kind == 'absent': |
|
0.20.29
by Ian Clatworthy
groupcompress.py code cleanups |
1110 |
raise errors.RevisionNotPresent(key, self) |
|
0.17.5
by Robert Collins
nograph tests completely passing. |
1111 |
lines = split_lines(record.get_bytes_as('fulltext')) |
1112 |
for line in lines: |
|
1113 |
yield line, key |
|
|
3735.32.1
by John Arbash Meinel
Fix the VF WalkingContent checks. |
1114 |
pb.update('Walking content', total, total) |
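# Usage sketch (hypothetical keys; `vf` assumed populated):
#
#   for line, key in vf.iter_lines_added_or_present_in_keys(
#           [('rev-1',), ('rev-2',)]):
#       pass  # lines arrive '\n'-terminated, in arbitrary order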
|
0.17.5
by Robert Collins
nograph tests completely passing. |
1115 |
|
1116 |
def keys(self): |
|
1117 |
"""See VersionedFiles.keys.""" |
|
1118 |
if 'evil' in debug.debug_flags: |
|
1119 |
trace.mutter_callsite(2, "keys scales with size of history") |
|
|
3735.31.7
by John Arbash Meinel
Start bringing in stacking support for Groupcompress repos. |
1120 |
sources = [self._index] + self._fallback_vfs |
|
0.17.5
by Robert Collins
nograph tests completely passing. |
1121 |
result = set() |
1122 |
for source in sources: |
|
1123 |
result.update(source.keys()) |
|
1124 |
return result |
|
1125 |
||
|
0.17.1
by Robert Collins
Starting point. Interface tests hooked up and failing. |
1126 |
|
1127 |
class _GCGraphIndex(object): |
|
1128 |
"""Mapper from GroupCompressVersionedFiles needs into GraphIndex storage.""" |
|
1129 |
||
|
0.17.9
by Robert Collins
Initial stab at repository format support. |
1130 |
def __init__(self, graph_index, is_locked, parents=True, |
|
0.17.1
by Robert Collins
Starting point. Interface tests hooked up and failing. |
1131 |
add_callback=None): |
1132 |
"""Construct a _GCGraphIndex on a graph_index. |
|
1133 |
||
1134 |
:param graph_index: An implementation of bzrlib.index.GraphIndex.
|
|
|
0.20.29
by Ian Clatworthy
groupcompress.py code cleanups |
1135 |
:param is_locked: A callback, returns True if the index is locked and
|
1136 |
thus usable.
|
|
|
3735.31.2
by John Arbash Meinel
Cleanup trailing whitespace, get test_source to pass by removing asserts. |
1137 |
:param parents: If True, record parents; if False, do not record
|
|
0.17.1
by Robert Collins
Starting point. Interface tests hooked up and failing. |
1138 |
parents.
|
1139 |
:param add_callback: If not None, allow additions to the index and call
|
|
1140 |
this callback with a list of added GraphIndex nodes:
|
|
1141 |
[(node, value, node_refs), ...]
|
|
1142 |
"""
|
|
1143 |
self._add_callback = add_callback |
|
1144 |
self._graph_index = graph_index |
|
1145 |
self._parents = parents |
|
1146 |
self.has_graph = parents |
|
1147 |
self._is_locked = is_locked |
|
1148 |
||
|
0.17.5
by Robert Collins
nograph tests completely passing. |
1149 |
def add_records(self, records, random_id=False): |
1150 |
"""Add multiple records to the index. |
|
|
3735.31.2
by John Arbash Meinel
Cleanup trailing whitespace, get test_source to pass by removing asserts. |
1151 |
|
|
0.17.5
by Robert Collins
nograph tests completely passing. |
1152 |
This function does not insert data into the Immutable GraphIndex
|
1153 |
backing the _GCGraphIndex; instead it prepares data for insertion by
|
|
1154 |
the caller, checks that it is safe to insert, and then calls
|
|
1155 |
self._add_callback with the prepared GraphIndex nodes.
|
|
1156 |
||
1157 |
:param records: a list of tuples:
|
|
1158 |
(key, value, refs).
|
|
1159 |
:param random_id: If True the ids being added were randomly generated
|
|
1160 |
and no check for existence will be performed.
|
|
1161 |
"""
|
|
1162 |
if not self._add_callback: |
|
1163 |
raise errors.ReadOnlyError(self) |
|
1164 |
# we hope there are no repositories with inconsistent parentage
|
|
1165 |
# anymore.
|
|
1166 |
||
1167 |
changed = False |
|
1168 |
keys = {} |
|
1169 |
for (key, value, refs) in records: |
|
1170 |
if not self._parents: |
|
1171 |
if refs: |
|
1172 |
for ref in refs: |
|
1173 |
if ref: |
|
1174 |
raise errors.KnitCorrupt(self, |
|
1175 |
"attempt to add node with parents "
|
|
1176 |
"in parentless index.") |
|
1177 |
refs = () |
|
1178 |
changed = True |
|
1179 |
keys[key] = (value, refs) |
|
1180 |
# check for dups
|
|
1181 |
if not random_id: |
|
1182 |
present_nodes = self._get_entries(keys) |
|
1183 |
for (index, key, value, node_refs) in present_nodes: |
|
1184 |
if node_refs != keys[key][1]: |
|
1185 |
raise errors.KnitCorrupt(self, "inconsistent details in add_records" |
|
1186 |
": %s %s" % ((value, node_refs), keys[key])) |
|
1187 |
del keys[key] |
|
1188 |
changed = True |
|
1189 |
if changed: |
|
1190 |
result = [] |
|
1191 |
if self._parents: |
|
1192 |
for key, (value, node_refs) in keys.iteritems(): |
|
1193 |
result.append((key, value, node_refs)) |
|
1194 |
else: |
|
1195 |
for key, (value, node_refs) in keys.iteritems(): |
|
1196 |
result.append((key, value)) |
|
1197 |
records = result |
|
1198 |
self._add_callback(records) |
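# Record-tuple sketch (hypothetical key/value): a parented index might
# receive
#
#   [(('rev-2',), '0 4096 0 1024', ((('rev-1',),),))]
#
# i.e. (key, value, refs) with refs nesting one tuple of parent keys;
# a parentless index is given refs == () and stores (key, value) nodes.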
|
|
3735.31.2
by John Arbash Meinel
Cleanup trailing whitespace, get test_source to pass by removing asserts. |
1199 |
|
|
0.17.5
by Robert Collins
nograph tests completely passing. |
1200 |
def _check_read(self): |
|
0.20.29
by Ian Clatworthy
groupcompress.py code cleanups |
1201 |
"""Raise an exception if reads are not permitted.""" |
|
0.17.5
by Robert Collins
nograph tests completely passing. |
1202 |
if not self._is_locked(): |
1203 |
raise errors.ObjectNotLocked(self) |
|
1204 |
||
|
0.17.2
by Robert Collins
Core proof of concept working. |
1205 |
def _check_write_ok(self): |
|
0.20.29
by Ian Clatworthy
groupcompress.py code cleanups |
1206 |
"""Raise an exception if writes are not permitted.""" |
|
0.17.2
by Robert Collins
Core proof of concept working. |
1207 |
if not self._is_locked(): |
1208 |
raise errors.ObjectNotLocked(self) |
|
1209 |
||
|
0.17.5
by Robert Collins
nograph tests completely passing. |
1210 |
def _get_entries(self, keys, check_present=False): |
1211 |
"""Get the entries for keys. |
|
|
0.20.29
by Ian Clatworthy
groupcompress.py code cleanups |
1212 |
|
1213 |
Note: Callers are responsible for checking that the index is locked
|
|
1214 |
before calling this method.
|
|
1215 |
||
|
0.17.5
by Robert Collins
nograph tests completely passing. |
1216 |
:param keys: An iterable of index key tuples.
|
1217 |
"""
|
|
1218 |
keys = set(keys) |
|
1219 |
found_keys = set() |
|
1220 |
if self._parents: |
|
1221 |
for node in self._graph_index.iter_entries(keys): |
|
1222 |
yield node |
|
1223 |
found_keys.add(node[1]) |
|
1224 |
else: |
|
1225 |
# adapt parentless index to the rest of the code.
|
|
1226 |
for node in self._graph_index.iter_entries(keys): |
|
1227 |
yield node[0], node[1], node[2], () |
|
1228 |
found_keys.add(node[1]) |
|
1229 |
if check_present: |
|
1230 |
missing_keys = keys.difference(found_keys) |
|
1231 |
if missing_keys: |
|
1232 |
raise errors.RevisionNotPresent(missing_keys.pop(), self) |
|
1233 |
||
1234 |
def get_parent_map(self, keys): |
|
1235 |
"""Get a map of the parents of keys. |
|
1236 |
||
1237 |
:param keys: The keys to look up parents for.
|
|
1238 |
:return: A mapping from keys to parents. Absent keys are absent from
|
|
1239 |
the mapping.
|
|
1240 |
"""
|
|
1241 |
self._check_read() |
|
1242 |
nodes = self._get_entries(keys) |
|
1243 |
result = {} |
|
1244 |
if self._parents: |
|
1245 |
for node in nodes: |
|
1246 |
result[node[1]] = node[3][0] |
|
1247 |
else: |
|
1248 |
for node in nodes: |
|
1249 |
result[node[1]] = None |
|
1250 |
return result |
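# Shape sketch (hypothetical keys): with parents recorded, each present
# key maps to its parent tuple, e.g. {('rev-2',): (('rev-1',),)}; a
# parentless index maps every present key to None.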
|
1251 |
||
1252 |
def get_build_details(self, keys): |
|
1253 |
"""Get the various build details for keys. |
|
1254 |
||
1255 |
Ghosts are omitted from the result.
|
|
1256 |
||
1257 |
:param keys: An iterable of keys.
|
|
1258 |
:return: A dict of key:
|
|
1259 |
(index_memo, compression_parent, parents, record_details).
|
|
1260 |
index_memo
|
|
1261 |
opaque structure to pass to read_records to extract the raw
|
|
1262 |
data
|
|
1263 |
compression_parent
|
|
1264 |
Content that this record is built upon, may be None
|
|
1265 |
parents
|
|
1266 |
Logical parents of this node
|
|
1267 |
record_details
|
|
1268 |
extra information about the content which needs to be passed to
|
|
1269 |
Factory.parse_record
|
|
1270 |
"""
|
|
1271 |
self._check_read() |
|
1272 |
result = {} |
|
|
0.20.29
by Ian Clatworthy
groupcompress.py code cleanups |
1273 |
entries = self._get_entries(keys) |
|
0.17.5
by Robert Collins
nograph tests completely passing. |
1274 |
for entry in entries: |
1275 |
key = entry[1] |
|
1276 |
if not self._parents: |
|
1277 |
parents = None |
|
1278 |
else: |
|
1279 |
parents = entry[3][0] |
|
1280 |
method = 'group' |
|
1281 |
result[key] = (self._node_to_position(entry), |
|
1282 |
None, parents, (method, None)) |
|
1283 |
return result |
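# Shape sketch: every groupcompress record is a full member of its
# group, so each returned value has no compression parent and a fixed
# 'group' method, e.g. (hypothetical key):
#
#   {('rev-2',): (index_memo, None, (('rev-1',),), ('group', None))}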
|
|
3735.31.2
by John Arbash Meinel
Cleanup trailing whitespace, get test_source to pass by removing asserts. |
1284 |
|
|
0.17.5
by Robert Collins
nograph tests completely passing. |
1285 |
def keys(self): |
1286 |
"""Get all the keys in the collection. |
|
|
3735.31.2
by John Arbash Meinel
Cleanup trailing whitespace, get test_source to pass by removing asserts. |
1287 |
|
|
0.17.5
by Robert Collins
nograph tests completely passing. |
1288 |
The keys are not ordered.
|
1289 |
"""
|
|
1290 |
self._check_read() |
|
1291 |
return [node[1] for node in self._graph_index.iter_all_entries()] |
|
|
3735.31.2
by John Arbash Meinel
Cleanup trailing whitespace, get test_source to pass by removing asserts. |
1292 |
|
|
0.17.5
by Robert Collins
nograph tests completely passing. |
1293 |
def _node_to_position(self, node): |
1294 |
"""Convert an index value to position details.""" |
|
1295 |
bits = node[2].split(' ') |
|
1296 |
# It would be nice not to read the entire gzip.
|
|
1297 |
start = int(bits[0]) |
|
1298 |
stop = int(bits[1]) |
|
1299 |
basis_end = int(bits[2]) |
|
1300 |
delta_end = int(bits[3]) |
|
1301 |
return node[0], start, stop, basis_end, delta_end |
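# Worked example (hypothetical node value): '0 4096 0 1024' parses to
# (index, 0, 4096, 0, 1024) -- the group's byte range within the pack
# plus this record's span inside the uncompressed group content.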
|
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
1302 |
|
1303 |
||
1304 |
try: |
|
|
3735.31.1
by John Arbash Meinel
Bring the groupcompress plugin into the brisbane-core branch. |
1305 |
from bzrlib import _groupcompress_pyx |
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
1306 |
except ImportError: |
1307 |
pass
|