# groupcompress, a bzr plugin providing new compression logic.
# Copyright (C) 2008 Canonical Limited.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as published
# by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
#

"""Core compression logic for compressing streams of related files."""
from itertools import izip
from cStringIO import StringIO
import zlib

from bzrlib import (
    annotate,
    debug,
    diff,
    errors,
    graph as _mod_graph,
    osutils,
    pack,
    patiencediff,
    progress,
    trace,
    )
from bzrlib.graph import Graph
from bzrlib.knit import _DirectPackAccess
from bzrlib.osutils import (
    contains_whitespace,
    contains_linebreaks,
    sha_string,
    sha_strings,
    split_lines,
    )
from bzrlib.btree_index import BTreeBuilder
from bzrlib.lru_cache import LRUSizeCache
from bzrlib.plugins.groupcompress import equivalence_table
from bzrlib.tsort import topo_sort
from bzrlib.versionedfile import (
    adapter_registry,
    AbsentContentFactory,
    ChunkedContentFactory,
    FulltextContentFactory,
    VersionedFiles,
    )

def parse(line_list):
    """Parse a serialised text record into (label, sha1, instructions)."""
    result = []
    lines = iter(line_list)
    next = lines.next
    label_line = lines.next()
    sha1_line = lines.next()
    if (not label_line.startswith('label: ') or
        not sha1_line.startswith('sha1: ')):
        raise AssertionError("bad text record %r" % lines)
    label = tuple(label_line[7:-1].split('\x00'))
    sha1 = sha1_line[6:-1]
    for header in lines:
        op = header[0]
        numbers = [int(n) for n in header[2:].split(',')]
        if op == 'c':
            result.append((op, numbers[0], numbers[1], None))
        else:
            contents = [next() for i in xrange(numbers[0])]
            result.append((op, None, numbers[0], contents))
    return label, sha1, result
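
# A worked sketch of the record layout parse() consumes (the label, sha1 and
# offsets below are invented for illustration). A record is a label line, a
# sha1 line, then instruction headers: an 'i,<lines>' header is followed by
# that many content lines, while a 'c,<start>,<bytes>' header stands alone.
#
#   record = [
#       'label: file-id\x00rev-id\n',
#       'sha1: 2fd4e1c67a2d28fced849ee1bb76e7391b93eb12\n',
#       'i,2\n',
#       'hello\n',
#       'world\n',
#       'c,0,11\n',
#   ]
#   label, sha1, ops = parse(record)
#   # label == ('file-id', 'rev-id')
#   # ops == [('i', None, 2, ['hello\n', 'world\n']), ('c', 0, 11, None)]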


def apply_delta(basis, delta):
    """Apply a delta to a basis string, returning a list of chunks."""
    lines = []
    # eq ranges occur where gaps occur
    # start, end refer to offsets in basis
    for op, start, count, delta_lines in delta:
        if op == 'c':
            lines.append(basis[start:start+count])
        else:
            lines.extend(delta_lines)
    trim_encoding_newline(lines)
    return lines


def trim_encoding_newline(lines):
    """Remove the trailing newline that compress() appends for encoding."""
    if lines[-1] == '\n':
        del lines[-1]
    else:
        lines[-1] = lines[-1][:-1]
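
# Round-trip sketch (contents hypothetical): a delta that copies the first
# six bytes of the basis and then inserts one new line. Note the result is a
# list of chunks, not necessarily complete lines, and the encoding newline
# is trimmed off the final chunk.
#
#   basis = 'hello\nworld\n'
#   delta = [('c', 0, 6, None), ('i', None, 1, ['there\n'])]
#   apply_delta(basis, delta)  # -> ['hello\n', 'there']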


def sort_gc_optimal(parent_map):
    """Sort and group the keys in parent_map into gc-optimal order.

    gc-optimal is defined (currently) as reverse-topological order, grouped
    by the key prefix.

    :return: A sorted list of keys.
    """
    # gc-optimal ordering is approximately reverse topological,
    # properly grouped by file-id.
    per_prefix_map = {}
    present_keys = []
    for item in parent_map.iteritems():
        key = item[0]
        if isinstance(key, str) or len(key) == 1:
            prefix = ''
        else:
            prefix = key[0]
        try:
            per_prefix_map[prefix].append(item)
        except KeyError:
            per_prefix_map[prefix] = [item]

    for prefix in sorted(per_prefix_map):
        present_keys.extend(reversed(topo_sort(per_prefix_map[prefix])))
    return present_keys
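
# Ordering sketch (keys hypothetical): keys are grouped by their first
# element (the file-id prefix) and each group is emitted newest-first.
#
#   parent_map = {
#       ('f1', 'rev1'): (),
#       ('f1', 'rev2'): (('f1', 'rev1'),),
#       ('f2', 'rev1'): (),
#   }
#   sort_gc_optimal(parent_map)
#   # -> [('f1', 'rev2'), ('f1', 'rev1'), ('f2', 'rev1')]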


class GroupCompressor(object):
    """Produce a serialised group of compressed texts.

    It contains code very similar to SequenceMatcher because of having a
    similar task. However some key differences apply:

    - there is no junk, we want a minimal edit not a human readable diff.
    - we don't filter very common lines (because we don't know where a good
      range will start, and after the first text we want to be emitting
      minimal edits only).
    - we chain the left side, not the right side.
    - we incrementally update the adjacency matrix as new lines are provided.
    - we look for matches in all of the left side, so the routine which does
      the analogous task of find_longest_match does not need to filter on the
      left side.
    """

    _equivalence_table_class = equivalence_table.EquivalenceTable

    def __init__(self, delta=True):
        """Create a GroupCompressor.

        :param delta: If False, do not compress records.
        """
        self._delta = delta
        self.line_offsets = []
        self.endpoint = 0
        self.input_bytes = 0
        self.line_locations = self._equivalence_table_class([])
        self.lines = self.line_locations.lines
        self.labels_deltas = {}
        self._present_prefixes = set()

    def get_matching_blocks(self, lines, soft=False):
        """Return the ranges in lines which match self.lines.

        :param lines: lines to compress
        :return: A list of (old_start, new_start, length) tuples which reflect
            a region in self.lines that is present in lines. The last element
            of the list is always (old_len, new_len, 0) to provide an end
            point for generating instructions from the matching blocks list.
        """
        result = []
        pos = 0
        line_locations = self.line_locations
        line_locations.set_right_lines(lines)
        # We either copy a range (while there are reusable lines) or we
        # insert new lines. To find reusable lines we traverse the new
        # lines, looking up each position in the equivalence table.
        locations = None
        max_pos = len(lines)
        result_append = result.append
        min_match_bytes = 10
        if soft:
            min_match_bytes = 200
        while pos < max_pos:
            block, pos, locations = _get_longest_match(line_locations, pos,
                                                       max_pos, locations)
            if block is not None:
                # Check to see if we are matching fewer than min_match_bytes
                # characters, which is turned into a simple 'insert', rather
                # than a copy. If we have more than 5 lines, we definitely
                # have more than 5 chars.
                if block[-1] < min_match_bytes:
                    # This block may be a 'short' block, check
                    old_start, new_start, range_len = block
                    matched_bytes = sum(map(len,
                        lines[new_start:new_start + range_len]))
                    if matched_bytes < min_match_bytes:
                        block = None
            if block is not None:
                result_append(block)
        result_append((len(self.lines), len(lines), 0))
        return result
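
    # Contract sketch (positions hypothetical): if self.lines already holds
    # a run matching the first two of three new lines at left positions 5-6,
    # the result would be [(5, 0, 2), (len(self.lines), 3, 0)] - the match,
    # then the closing sentinel.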

    def compress(self, key, lines, expected_sha, soft=False):
        """Compress lines with label key.

        :param key: A key tuple. It is stored in the output
            for identification of the text during decompression. If the last
            element is 'None' it is replaced with the sha1 of the text -
            e.g. sha1:xxxxxxx.
        :param lines: The lines to be compressed. Must be split
            on \n, with the \n preserved.
        :param expected_sha: If non-None, the sha the lines are believed to
            have. During compression the sha is calculated; a mismatch will
            cause an error.
        :param soft: Do a 'soft' compression. This means that we require
            larger ranges to match to be considered for a copy command.
        :return: The sha1 of lines, and the number of bytes accumulated in
            the group output so far.
        """
        sha1 = sha_strings(lines)
        if key[-1] is None:
            key = key[:-1] + ('sha1:' + sha1,)
        label = '\x00'.join(key)
        new_lines = ['label: %s\n' % label,
                     'sha1: %s\n' % sha1,
                    ]
        index_lines = [False, False]
        # setup good encoding for trailing \n support.
        if not lines or lines[-1].endswith('\n'):
            lines.append('\n')
        else:
            lines[-1] = lines[-1] + '\n'
        flush_range = self.flush_range
        blocks = self.get_matching_blocks(lines, soft=soft)
        current_pos = 0
        # We either copy a range (while there are reusable lines) or we
        # insert new lines.
        for old_start, new_start, range_len in blocks:
            if new_start != current_pos:
                # non-matching region
                flush_range(current_pos, None, new_start - current_pos,
                    lines, new_lines, index_lines)
            current_pos = new_start + range_len
            if not range_len:
                continue
            flush_range(new_start, old_start, range_len,
                lines, new_lines, index_lines)
        delta_start = (self.endpoint, len(self.lines))
        self.output_lines(new_lines, index_lines)
        trim_encoding_newline(lines)
        self.input_bytes += sum(map(len, lines))
        delta_end = (self.endpoint, len(self.lines))
        self.labels_deltas[key] = (delta_start, delta_end)
        return sha1, self.endpoint

    def extract(self, key):
        """Extract a key previously added to the compressor.

        :param key: The key to extract.
        :return: A list of chunks and the sha1 of the text.
        """
        delta_details = self.labels_deltas[key]
        delta_lines = self.lines[delta_details[0][1]:delta_details[1][1]]
        label, sha1, delta = parse(delta_lines)
        if label != key:
            raise AssertionError("wrong key: %r, wanted %r" % (label, key))
        # Perhaps we want to keep the line offsets too in memory at least?
        chunks = apply_delta(''.join(self.lines), delta)
        sha1 = sha_strings(chunks)
        return chunks, sha1
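
    # Usage sketch (keys hypothetical): compress two related texts into the
    # group, then pull the second back out before the group is serialised.
    #
    #   compressor = GroupCompressor()
    #   compressor.compress(('rev1',), ['common\n', 'lines\n'], None)
    #   compressor.compress(('rev2',), ['common\n', 'changed\n'], None)
    #   chunks, sha1 = compressor.extract(('rev2',))
    #   # ''.join(chunks) == 'common\nchanged\n'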

    def flush_multi(self, instructions, lines, new_lines, index_lines):
        """Flush a bunch of different ranges out.

        This should only be called with data that are "pure" copies.
        """
        flush_range = self.flush_range
        if len(instructions) > 2:
            # This is the number of lines to be copied
            total_copy_range = sum(i[2] for i in instructions)
            if len(instructions) > 0.5 * total_copy_range:
                # We are copying N lines, but taking more than N/2
                # copy instructions to do so. We will go ahead and expand this
                # text so that other code is able to match against it
                flush_range(instructions[0][0], None, total_copy_range,
                            lines, new_lines, index_lines)
                return
        for ns, os, rl in instructions:
            flush_range(ns, os, rl, lines, new_lines, index_lines)

    def flush_range(self, range_start, copy_start, range_len, lines,
                    new_lines, index_lines):
        insert_instruction = "i,%d\n" % range_len
        if copy_start is not None:
            # range stops, flush and start a new copy range
            stop_byte = self.line_offsets[copy_start + range_len - 1]
            if copy_start == 0:
                start_byte = 0
            else:
                start_byte = self.line_offsets[copy_start - 1]
            bytes = stop_byte - start_byte
            copy_control_instruction = "c,%d,%d\n" % (start_byte, bytes)
            if (bytes + len(insert_instruction) >
                len(copy_control_instruction)):
                # The copy is cheaper to encode than a fresh insert of the
                # same bytes, so emit the copy instruction.
                new_lines.append(copy_control_instruction)
                index_lines.append(False)
                return
        # not copying, or inserting is shorter than copying, so insert.
        new_lines.append(insert_instruction)
        new_lines.extend(lines[range_start:range_start+range_len])
        index_lines.append(False)
        index_lines.extend([copy_start is None]*range_len)
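
    # Encoding-cost sketch (offsets hypothetical): copying 120 bytes that
    # start at byte 4096 costs len('c,4096,120\n') == 11 bytes, while
    # inserting the same three lines fresh would cost 120 + len('i,3\n')
    # bytes, so the copy instruction wins; a 3-byte match loses the same
    # comparison and is folded into a plain insert instead.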
|
|
    def output_lines(self, new_lines, index_lines):
        """Output some lines.

        :param new_lines: The lines to output.
        :param index_lines: A boolean flag for each line - when True, index
            that line.
        """
        endpoint = self.endpoint
        self.line_locations.extend_lines(new_lines, index_lines)
        for line in new_lines:
            endpoint += len(line)
            self.line_offsets.append(endpoint)
        self.endpoint = endpoint

    def ratio(self):
        """Return the overall compression ratio."""
        return float(self.input_bytes) / float(self.endpoint)


def make_pack_factory(graph, delta, keylength):
    """Create a factory for creating a pack based groupcompress.

    This is only functional enough to run interface tests, it doesn't try to
    provide a full pack environment.

    :param graph: Store a graph.
    :param delta: Delta compress contents.
    :param keylength: How long should keys be.
    """
    def factory(transport):
        parents = graph or delta
        ref_length = 0
        if graph:
            ref_length += 1
        graph_index = BTreeBuilder(reference_lists=ref_length,
            key_elements=keylength)
        stream = transport.open_write_stream('newpack')
        writer = pack.ContainerWriter(stream.write)
        writer.begin()
        index = _GCGraphIndex(graph_index, lambda:True, parents=parents,
            add_callback=graph_index.add_nodes)
        access = _DirectPackAccess({})
        access.set_writer(writer, graph_index, (transport, 'newpack'))
        result = GroupCompressVersionedFiles(index, access, delta)
        result.stream = stream
        result.writer = writer
        return result
    return factory


def cleanup_pack_group(versioned_files):
    versioned_files.writer.end()
    versioned_files.stream.close()
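

# Usage sketch (transport choice hypothetical): the factory is aimed at the
# interface tests, but a minimal round trip would look something like this.
#
#   from bzrlib.transport.memory import MemoryTransport
#   factory = make_pack_factory(graph=True, delta=True, keylength=1)
#   vf = factory(MemoryTransport())
#   vf.add_lines(('key1',), (), ['some text\n'])
#   cleanup_pack_group(vf)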


class GroupCompressVersionedFiles(VersionedFiles):
    """A group-compress based VersionedFiles implementation."""

    def __init__(self, index, access, delta=True):
        """Create a GroupCompressVersionedFiles object.

        :param index: The index object storing access and graph data.
        :param access: The access object storing raw data.
        :param delta: Whether to delta compress or just entropy compress.
        """
        self._index = index
        self._access = access
        self._delta = delta
        self._unadded_refs = {}
        self._group_cache = LRUSizeCache(max_size=50*1024*1024)
|
    def add_lines(self, key, parents, lines, parent_texts=None,
        left_matching_blocks=None, nostore_sha=None, random_id=False,
        check_content=True):
        """Add a text to the store.

        :param key: The key tuple of the text to add.
        :param parents: The parents key tuples of the text to add.
        :param lines: A list of lines. Each line must be a bytestring. And
            all of them except the last must be terminated with \n and
            contain no other \n's. The last line may either contain no \n's
            or a single terminating \n. If the lines list does not meet this
            constraint the add routine may error or may succeed - but you
            will be unable to read the data back accurately. (Checking the
            lines have been split correctly is expensive and extremely
            unlikely to catch bugs so it is not done at runtime unless
            check_content is True.)
        :param parent_texts: An optional dictionary containing the opaque
            representations of some or all of the parents of version_id to
            allow delta optimisations. VERY IMPORTANT: the texts must be
            those returned by add_lines or data corruption can be caused.
        :param left_matching_blocks: a hint about which areas are common
            between the text and its left-hand-parent. The format is
            the SequenceMatcher.get_matching_blocks format.
        :param nostore_sha: Raise ExistingContent and do not add the lines to
            the versioned file if the digest of the lines matches this.
        :param random_id: If True a random id has been selected rather than
            an id determined by some deterministic process such as a
            converter from a foreign VCS. When True the backend may choose
            not to check for uniqueness of the resulting key within the
            versioned file, so this should only be done when the result is
            expected to be unique anyway.
        :param check_content: If True, the lines supplied are verified to be
            bytestrings that are correctly formed lines.
        :return: The text sha1, the number of bytes in the text, and an
            opaque representation of the inserted version which can be
            provided back to future add_lines calls in the parent_texts
            dictionary.
        """
        self._index._check_write_ok()
        self._check_add(key, lines, random_id, check_content)
        if parents is None:
            # The caller might pass None if there is no graph data, but kndx
            # indexes can't directly store that, so we give them
            # an empty tuple instead.
            parents = ()
        # double handling for now. Make it work until then.
        length = sum(map(len, lines))
        record = ChunkedContentFactory(key, parents, None, lines)
        sha1 = list(self._insert_record_stream([record],
            random_id=random_id))[0]
        return sha1, length, None
|
|
    def annotate(self, key):
        """See VersionedFiles.annotate."""
        graph = Graph(self)
        parent_map = self.get_parent_map([key])
        if not parent_map:
            raise errors.RevisionNotPresent(key, self)
        if parent_map[key] is not None:
            search = graph._make_breadth_first_searcher([key])
            keys = set()
            while True:
                try:
                    present, ghosts = search.next_with_ghosts()
                except StopIteration:
                    break
                keys.update(present)
                parent_map = self.get_parent_map(keys)
        else:
            keys = [key]
            parent_map = {key:()}
        head_cache = _mod_graph.FrozenHeadsCache(graph)
        parent_cache = {}
        reannotate = annotate.reannotate
        for record in self.get_record_stream(keys, 'topological', True):
            key = record.key
            chunks = osutils.chunks_to_lines(record.get_bytes_as('chunked'))
            parent_lines = [parent_cache[parent]
                            for parent in parent_map[key]]
            parent_cache[key] = list(
                reannotate(parent_lines, chunks, key, None, head_cache))
        return parent_cache[key]

    def check(self, progress_bar=None):
        """See VersionedFiles.check()."""
        keys = self.keys()
        for record in self.get_record_stream(keys, 'unordered', True):
            record.get_bytes_as('fulltext')

    def _check_add(self, key, lines, random_id, check_content):
        """check that version_id and lines are safe to add."""
        version_id = key[-1]
        if version_id is not None:
            if contains_whitespace(version_id):
                raise errors.InvalidRevisionId(version_id, self)
            self.check_not_reserved_id(version_id)
        # TODO: If random_id==False and the key is already present, we should
        # probably check that the existing content is identical to what is
        # being inserted, and otherwise raise an exception. This would make
        # the bundle code simpler.
        if check_content:
            self._check_lines_not_unicode(lines)
            self._check_lines_are_lines(lines)

    def get_parent_map(self, keys):
        """Get a map of the parents of keys.

        :param keys: The keys to look up parents for.
        :return: A mapping from keys to parents. Absent keys are absent from
            the mapping.
        """
        result = {}
        sources = [self._index]
        source_results = []
        missing = set(keys)
        for source in sources:
            if not missing:
                break
            new_result = source.get_parent_map(missing)
            source_results.append(new_result)
            result.update(new_result)
            missing.difference_update(set(new_result))
        if self._unadded_refs:
            for key in missing:
                if key in self._unadded_refs:
                    result[key] = self._unadded_refs[key]
        return result

    def _get_delta_lines(self, key):
        """Helper function for manual debugging.

        This is a convenience function that shouldn't be used in production
        code.
        """
        build_details = self._index.get_build_details([key])[key]
        index_memo = build_details[0]
        group, delta_lines = self._get_group_and_delta_lines(index_memo)
        return delta_lines

    def _get_group_and_delta_lines(self, index_memo):
        read_memo = index_memo[0:3]
        # get the group:
        try:
            plain = self._group_cache[read_memo]
        except KeyError:
            # read the group
            zdata = self._access.get_raw_records([read_memo]).next()
            # decompress - whole thing - this is not a bug, as it
            # permits caching. We might want to store the partially
            # decompressed group and decompress object, so that recent
            # texts are not penalised by big groups.
            plain = zlib.decompress(zdata) #, index_memo[4])
            self._group_cache[read_memo] = plain
        # parse - requires split_lines, better to have byte offsets
        # here (but not by much - we only split the region for the
        # recipe, and we often want to end up with lines anyway).
        return plain, split_lines(plain[index_memo[3]:index_memo[4]])

    def get_missing_compression_parent_keys(self):
        """Return the keys of missing compression parents.

        Missing compression parents occur when a record stream was missing
        basis texts, or an index was scanned that had missing basis texts.
        """
        # GroupCompress cannot currently reference texts that are not in the
        # group, so this is valid for now.
        return frozenset()

    def get_record_stream(self, keys, ordering, include_delta_closure):
        """Get a stream of records for keys.

        :param keys: The keys to include.
        :param ordering: Either 'unordered' or 'topological'. A topologically
            sorted stream has compression parents strictly before their
            children.
        :param include_delta_closure: If True then the closure across any
            compression parents will be included (in the opaque data).
        :return: An iterator of ContentFactory objects, each of which is only
            valid until the iterator is advanced.
        """
        # keys might be a generator
        orig_keys = list(keys)
        keys = set(orig_keys)
        if not keys:
            return
        if (not self._index.has_graph
            and ordering in ('topological', 'gc-optimal')):
            # Cannot topological order when no graph has been stored.
            ordering = 'unordered'
        # Cheap: iterate
        locations = self._index.get_build_details(keys)
        local_keys = frozenset(keys).intersection(set(self._unadded_refs))
        locations.update((key, None) for key in local_keys)
        if ordering == 'topological':
            # would be better to not globally sort initially but instead
            # start with one key, recurse to its oldest parent, then grab
            # everything in the same group, etc.
            parent_map = dict((key, details[2]) for key, details in
                locations.iteritems())
            for key in local_keys:
                parent_map[key] = self._unadded_refs[key]
            present_keys = topo_sort(parent_map)
            # Now group by source:
        elif ordering == 'gc-optimal':
            parent_map = dict((key, details[2]) for key, details in
                locations.iteritems())
            for key in local_keys:
                parent_map[key] = self._unadded_refs[key]
            # XXX: This only optimizes for the target ordering. We may need
            # to balance that with the time it takes to extract ordering, by
            # somehow grouping based on locations[key][0:3]
            present_keys = sort_gc_optimal(parent_map)
        elif ordering == 'as-requested':
            present_keys = [key for key in orig_keys if key in locations]
        else:
            # We want to yield the keys in a semi-optimal (read-wise)
            # ordering. Otherwise we thrash the _group_cache and destroy
            # performance.
            def get_group(key):
                # This is the group the bytes are stored in, followed by the
                # location in the group
                return locations[key][0]
            present_keys = sorted(locations.iterkeys(), key=get_group)
            # We don't have an ordering for keys in the in-memory object, but
            # lets process the in-memory ones first.
            present_keys = list(local_keys) + present_keys
        absent_keys = keys.difference(set(locations))
        for key in absent_keys:
            yield AbsentContentFactory(key)
        for key in present_keys:
            if key in self._unadded_refs:
                chunks, sha1 = self._compressor.extract(key)
                parents = self._unadded_refs[key]
            else:
                index_memo, _, parents, (method, _) = locations[key]
                plain, delta_lines = self._get_group_and_delta_lines(
                    index_memo)
                label, sha1, delta = parse(delta_lines)
                if label != key:
                    raise AssertionError("wrong key: %r, wanted %r"
                                         % (label, key))
                chunks = apply_delta(plain, delta)
                if sha_strings(chunks) != sha1:
                    raise AssertionError('sha1 sum did not match')
            yield ChunkedContentFactory(key, parents, sha1, chunks)
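
    # Ordering sketch (keys hypothetical): 'topological' yields parents
    # before children; 'as-requested' preserves the caller's order, e.g.
    #   vf.get_record_stream([('b',), ('a',)], 'as-requested', True)
    # yields ('b',) then ('a',); 'unordered' and 'gc-optimal' are free to
    # regroup keys to minimise group decompression and cache thrash.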
|
|
    def get_sha1s(self, keys):
        """See VersionedFiles.get_sha1s()."""
        result = {}
        for record in self.get_record_stream(keys, 'unordered', True):
            if record.sha1 is not None:
                result[record.key] = record.sha1
            else:
                if record.storage_kind != 'absent':
                    result[record.key] = sha_string(record.get_bytes_as(
                        'fulltext'))
        return result

    def insert_record_stream(self, stream):
        """Insert a record stream into this container.

        :param stream: A stream of records to insert.
        :return: None
        :seealso VersionedFiles.get_record_stream:
        """
        for _ in self._insert_record_stream(stream):
            pass
|
|
    def _insert_record_stream(self, stream, random_id=False):
        """Internal core to insert a record stream into this container.

        This helper function has a different interface than
        insert_record_stream to allow add_lines to be minimal, but still
        return the needed data.

        :param stream: A stream of records to insert.
        :return: An iterator over the sha1 of the inserted records.
        :seealso insert_record_stream:
        :seealso add_lines:
        """
        def get_adapter(adapter_key):
            try:
                return adapters[adapter_key]
            except KeyError:
                adapter_factory = adapter_registry.get(adapter_key)
                adapter = adapter_factory(self)
                adapters[adapter_key] = adapter
                return adapter
        adapters = {}
        # This will go up to fulltexts for gc to gc fetching, which isn't
        # ideal.
        self._compressor = GroupCompressor(self._delta)
        self._unadded_refs = {}
        keys_to_add = []
        basis_end = 0
        groups = 1
        def flush():
            compressed = zlib.compress(''.join(self._compressor.lines))
            index, start, length = self._access.add_raw_records(
                [(None, len(compressed))], compressed)[0]
            nodes = []
            for key, reads, refs in keys_to_add:
                nodes.append((key, "%d %d %s" % (start, length, reads), refs))
            self._index.add_records(nodes, random_id=random_id)
        last_prefix = None
        for record in stream:
            # Raise an error when a record is missing.
            if record.storage_kind == 'absent':
                raise errors.RevisionNotPresent([record.key], self)
            try:
                lines = osutils.chunks_to_lines(
                    record.get_bytes_as('chunked'))
            except errors.UnavailableRepresentation:
                adapter_key = record.storage_kind, 'fulltext'
                adapter = get_adapter(adapter_key)
                bytes = adapter.get_bytes(record)
                lines = osutils.split_lines(bytes)
            soft = False
            if len(record.key) > 1:
                prefix = record.key[0]
                if (last_prefix is not None and prefix != last_prefix):
                    soft = True
                    if basis_end > 1024 * 1024 * 4:
                        flush()
                        self._compressor = GroupCompressor(self._delta)
                        self._unadded_refs = {}
                        keys_to_add = []
                        basis_end = 0
                        groups += 1
                last_prefix = prefix
            found_sha1, end_point = self._compressor.compress(record.key,
                lines, record.sha1, soft=soft)
            if record.key[-1] is None:
                key = record.key[:-1] + ('sha1:' + found_sha1,)
            else:
                key = record.key
            self._unadded_refs[key] = record.parents
            yield found_sha1
            keys_to_add.append((key, '%d %d' % (basis_end, end_point),
                (record.parents,)))
            basis_end = end_point
            if basis_end > 1024 * 1024 * 8:
                flush()
                self._compressor = GroupCompressor(self._delta)
                self._unadded_refs = {}
                keys_to_add = []
                basis_end = 0
                groups += 1
        if len(keys_to_add):
            flush()
        self._compressor = None
        self._unadded_refs = {}

    def iter_lines_added_or_present_in_keys(self, keys, pb=None):
        """Iterate over the lines in the versioned files from keys.

        This may return lines from other keys. Each item the returned
        iterator yields is a tuple of a line and a text version that that
        line is present in (not introduced in).

        Ordering of results is in whatever order is most suitable for the
        underlying storage format.

        If a progress bar is supplied, it may be used to indicate progress.
        The caller is responsible for cleaning up progress bars (because this
        is an iterator).

        NOTES:
         * Lines are normalised by the underlying store: they will all have
           \n terminators.
         * Lines are returned in arbitrary order.

        :return: An iterator over (line, key).
        """
        if pb is None:
            pb = progress.DummyProgress()
        keys = set(keys)
        total = len(keys)
        # we don't care about inclusions, the caller cares.
        # but we need to setup a list of records to visit.
        # we need key, position, length
        for key_idx, record in enumerate(self.get_record_stream(keys,
            'unordered', True)):
            # XXX: todo - optimise to use less than full texts.
            key = record.key
            pb.update('Walking content.', key_idx, total)
            if record.storage_kind == 'absent':
                raise errors.RevisionNotPresent(record.key, self)
            lines = split_lines(record.get_bytes_as('fulltext'))
            for line in lines:
                yield line, key
        pb.update('Walking content.', total, total)

    def keys(self):
        """See VersionedFiles.keys."""
        if 'evil' in debug.debug_flags:
            trace.mutter_callsite(2, "keys scales with size of history")
        sources = [self._index]
        result = set()
        for source in sources:
            result.update(source.keys())
        return result


class _GCGraphIndex(object):
    """Mapper from GroupCompressVersionedFiles needs into GraphIndex storage."""

    def __init__(self, graph_index, is_locked, parents=True,
        add_callback=None):
        """Construct a _GCGraphIndex on a graph_index.

        :param graph_index: An implementation of bzrlib.index.GraphIndex.
        :param is_locked: A callback, returns True if the index is locked and
            thus usable.
        :param parents: If True, record knits parents, if not do not record
            parents.
        :param add_callback: If not None, allow additions to the index and
            call this callback with a list of added GraphIndex nodes:
            [(node, value, node_refs), ...]
        """
        self._add_callback = add_callback
        self._graph_index = graph_index
        self._parents = parents
        self.has_graph = parents
        self._is_locked = is_locked

    def add_records(self, records, random_id=False):
        """Add multiple records to the index.

        This function does not insert data into the Immutable GraphIndex
        backing the KnitGraphIndex, instead it prepares data for insertion by
        the caller and checks that it is safe to insert then calls
        self._add_callback with the prepared GraphIndex nodes.

        :param records: a list of tuples:
            (key, options, access_memo, parents).
        :param random_id: If True the ids being added were randomly generated
            and no check for existence will be performed.
        """
        if not self._add_callback:
            raise errors.ReadOnlyError(self)
        # we hope there are no repositories with inconsistent parentage
        # anymore.

        changed = False
        keys = {}
        for (key, value, refs) in records:
            if not self._parents:
                if refs:
                    for ref in refs:
                        if ref:
                            raise errors.KnitCorrupt(self,
                                "attempt to add node with parents "
                                "in parentless index.")
                refs = ()
                changed = True
            keys[key] = (value, refs)
        # check for dups
        if not random_id:
            present_nodes = self._get_entries(keys)
            for (index, key, value, node_refs) in present_nodes:
                if node_refs != keys[key][1]:
                    raise errors.KnitCorrupt(self,
                        "inconsistent details in add_records"
                        ": %s %s" % ((value, node_refs), keys[key]))
                del keys[key]
                changed = True
        if changed:
            result = []
            if self._parents:
                for key, (value, node_refs) in keys.iteritems():
                    result.append((key, value, node_refs))
            else:
                for key, (value, node_refs) in keys.iteritems():
                    result.append((key, value))
            records = result
        self._add_callback(records)

    def _check_read(self):
        """Raise if reads are not permitted."""
        if not self._is_locked():
            raise errors.ObjectNotLocked(self)

    def _check_write_ok(self):
        """Raise if writes are not permitted."""
        if not self._is_locked():
            raise errors.ObjectNotLocked(self)

    def _get_entries(self, keys, check_present=False):
        """Get the entries for keys.

        :param keys: An iterable of index key tuples.
        """
        keys = set(keys)
        found_keys = set()
        if self._parents:
            for node in self._graph_index.iter_entries(keys):
                yield node
                found_keys.add(node[1])
        else:
            # adapt parentless index to the rest of the code.
            for node in self._graph_index.iter_entries(keys):
                yield node[0], node[1], node[2], ()
                found_keys.add(node[1])
        if check_present:
            missing_keys = keys.difference(found_keys)
            if missing_keys:
                raise errors.RevisionNotPresent(missing_keys.pop(), self)
912 |
def get_parent_map(self, keys): |
|
913 |
"""Get a map of the parents of keys. |
|
914 |
||
915 |
:param keys: The keys to look up parents for.
|
|
916 |
:return: A mapping from keys to parents. Absent keys are absent from
|
|
917 |
the mapping.
|
|
918 |
"""
|
|
919 |
self._check_read() |
|
920 |
nodes = self._get_entries(keys) |
|
921 |
result = {} |
|
922 |
if self._parents: |
|
923 |
for node in nodes: |
|
924 |
result[node[1]] = node[3][0] |
|
925 |
else: |
|
926 |
for node in nodes: |
|
927 |
result[node[1]] = None |
|
928 |
return result |
|
929 |
||
930 |
def get_build_details(self, keys): |
|
931 |
"""Get the various build details for keys. |
|
932 |
||
933 |
Ghosts are omitted from the result.
|
|
934 |
||
935 |
:param keys: An iterable of keys.
|
|
936 |
:return: A dict of key:
|
|
937 |
(index_memo, compression_parent, parents, record_details).
|
|
938 |
index_memo
|
|
939 |
opaque structure to pass to read_records to extract the raw
|
|
940 |
data
|
|
941 |
compression_parent
|
|
942 |
Content that this record is built upon, may be None
|
|
943 |
parents
|
|
944 |
Logical parents of this node
|
|
945 |
record_details
|
|
946 |
extra information about the content which needs to be passed to
|
|
947 |
Factory.parse_record
|
|
948 |
"""
|
|
949 |
self._check_read() |
|
950 |
result = {} |
|
951 |
entries = self._get_entries(keys, False) |
|
952 |
for entry in entries: |
|
953 |
key = entry[1] |
|
954 |
if not self._parents: |
|
955 |
parents = None |
|
956 |
else: |
|
957 |
parents = entry[3][0] |
|
958 |
value = entry[2] |
|
959 |
method = 'group' |
|
960 |
result[key] = (self._node_to_position(entry), |
|
961 |
None, parents, (method, None)) |
|
962 |
return result |
|
963 |
||
964 |
def keys(self): |
|
965 |
"""Get all the keys in the collection. |
|
966 |
|
|
967 |
The keys are not ordered.
|
|
968 |
"""
|
|
969 |
self._check_read() |
|
970 |
return [node[1] for node in self._graph_index.iter_all_entries()] |

    def _node_to_position(self, node):
        """Convert an index value to position details."""
        bits = node[2].split(' ')
        # It would be nice not to read the entire gzip.
        start = int(bits[0])
        stop = int(bits[1])
        basis_end = int(bits[2])
        delta_end = int(bits[3])
        return node[0], start, stop, basis_end, delta_end
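
    # Index value sketch (numbers hypothetical): the flush() closure in
    # _insert_record_stream stores "start length basis_end delta_end", e.g.
    # "0 4096 10 250", which unpacks here to (node_index, 0, 4096, 10, 250).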


def _get_longest_match(equivalence_table, pos, max_pos, locations):
    """Get the longest possible match for the current position."""
    range_start = pos
    range_len = 0
    copy_ends = None
    while pos < max_pos:
        if locations is None:
            locations = equivalence_table.get_idx_matches(pos)
        if locations is None:
            # No more matches, just return whatever we have, but we know
            # that this last position is not going to match anything
            pos += 1
            break
        else:
            if copy_ends is None:
                # We are starting a new range
                copy_ends = [loc + 1 for loc in locations]
                range_len = 1
                locations = None # Consumed
            else:
                # We are currently in the middle of a match
                next_locations = set(copy_ends).intersection(locations)
                if len(next_locations):
                    # range continues
                    copy_ends = [loc + 1 for loc in next_locations]
                    range_len += 1
                    locations = None # Consumed
                else:
                    # But we are done with this match, we should be
                    # starting a new one, though. We will pass back
                    # 'locations' so that we don't have to do another lookup.
                    break
        pos += 1
    if copy_ends is None:
        return None, pos, locations
    return (min(copy_ends) - range_len, range_start, range_len), pos, locations
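

# Contract sketch (indices hypothetical): if right line 0 matches left lines
# {3, 7} and right line 1 matches left line 4, the intersection of {4, 8}
# with {4} extends the range, and the call returns ((3, 0, 2), 2, None):
# a two-line block copying left lines 3-4.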


try:
    from bzrlib.plugins.groupcompress import _groupcompress_c
except ImportError:
    pass
else:
    GroupCompressor._equivalence_table_class = \
        _groupcompress_c.EquivalenceTable
    _get_longest_match = _groupcompress_c._get_longest_match