/brz/remove-bazaar

To get this branch, use:
bzr branch http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
1
/*
2
 * diff-delta.c: generate a delta between two buffers
3
 *
4
 * This code was greatly inspired by parts of LibXDiff from Davide Libenzi
5
 * http://www.xmailserver.org/xdiff-lib.html
6
 *
7
 * Rewritten for GIT by Nicolas Pitre <nico@cam.org>, (C) 2005-2007
0.17.31 by John Arbash Meinel
Bring in the 'rabin' experiment.
8
 * Adapted for Bazaar by John Arbash Meinel <john@arbash-meinel.com> (C) 2009
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
9
 *
10
 * This code is free software; you can redistribute it and/or modify
11
 * it under the terms of the GNU General Public License version 2 as
12
 * published by the Free Software Foundation.
13
 */
14
3735.33.4 by John Arbash Meinel
The new layout is working.
15
#include <stdio.h>
16
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
17
#include "delta.h"
0.17.31 by John Arbash Meinel
Bring in the 'rabin' experiment.
18
#include <stdlib.h>
19
#include <string.h>
0.23.5 by John Arbash Meinel
Minor changes to get diff-delta.c and patch-delta.c to compile.
20
#include <assert.h>
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
21
22
/* maximum hash entry list for the same hash bucket */
23
#define HASH_LIMIT 64
24
25
#define RABIN_SHIFT 23
26
#define RABIN_WINDOW 16
27
3735.33.13 by John Arbash Meinel
Tweak the number of blank spaces up just a tad.
28
/* The hash map is sized to put 4 entries per bucket, this gives us ~even room
29
 * for more data. Tweaking this number above 4 doesn't seem to help much,
30
 * anyway.
3735.33.6 by John Arbash Meinel
Increasing EXTRA_NULLS to 2 from 1 ups the hit rate
31
 */
3735.33.13 by John Arbash Meinel
Tweak the number of blank spaces up just a tad.
32
#define EXTRA_NULLS 4
3735.33.3 by John Arbash Meinel
include_entries_from_hash wasn't properly skipping NULL records.
33
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
34
static const unsigned int T[256] = {
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
35
    0x00000000, 0xab59b4d1, 0x56b369a2, 0xfdeadd73, 0x063f6795, 0xad66d344,
36
    0x508c0e37, 0xfbd5bae6, 0x0c7ecf2a, 0xa7277bfb, 0x5acda688, 0xf1941259,
37
    0x0a41a8bf, 0xa1181c6e, 0x5cf2c11d, 0xf7ab75cc, 0x18fd9e54, 0xb3a42a85,
38
    0x4e4ef7f6, 0xe5174327, 0x1ec2f9c1, 0xb59b4d10, 0x48719063, 0xe32824b2,
39
    0x1483517e, 0xbfdae5af, 0x423038dc, 0xe9698c0d, 0x12bc36eb, 0xb9e5823a,
40
    0x440f5f49, 0xef56eb98, 0x31fb3ca8, 0x9aa28879, 0x6748550a, 0xcc11e1db,
41
    0x37c45b3d, 0x9c9defec, 0x6177329f, 0xca2e864e, 0x3d85f382, 0x96dc4753,
42
    0x6b369a20, 0xc06f2ef1, 0x3bba9417, 0x90e320c6, 0x6d09fdb5, 0xc6504964,
43
    0x2906a2fc, 0x825f162d, 0x7fb5cb5e, 0xd4ec7f8f, 0x2f39c569, 0x846071b8,
44
    0x798aaccb, 0xd2d3181a, 0x25786dd6, 0x8e21d907, 0x73cb0474, 0xd892b0a5,
45
    0x23470a43, 0x881ebe92, 0x75f463e1, 0xdeadd730, 0x63f67950, 0xc8afcd81,
46
    0x354510f2, 0x9e1ca423, 0x65c91ec5, 0xce90aa14, 0x337a7767, 0x9823c3b6,
47
    0x6f88b67a, 0xc4d102ab, 0x393bdfd8, 0x92626b09, 0x69b7d1ef, 0xc2ee653e,
48
    0x3f04b84d, 0x945d0c9c, 0x7b0be704, 0xd05253d5, 0x2db88ea6, 0x86e13a77,
49
    0x7d348091, 0xd66d3440, 0x2b87e933, 0x80de5de2, 0x7775282e, 0xdc2c9cff,
50
    0x21c6418c, 0x8a9ff55d, 0x714a4fbb, 0xda13fb6a, 0x27f92619, 0x8ca092c8,
51
    0x520d45f8, 0xf954f129, 0x04be2c5a, 0xafe7988b, 0x5432226d, 0xff6b96bc,
52
    0x02814bcf, 0xa9d8ff1e, 0x5e738ad2, 0xf52a3e03, 0x08c0e370, 0xa39957a1,
53
    0x584ced47, 0xf3155996, 0x0eff84e5, 0xa5a63034, 0x4af0dbac, 0xe1a96f7d,
54
    0x1c43b20e, 0xb71a06df, 0x4ccfbc39, 0xe79608e8, 0x1a7cd59b, 0xb125614a,
55
    0x468e1486, 0xedd7a057, 0x103d7d24, 0xbb64c9f5, 0x40b17313, 0xebe8c7c2,
56
    0x16021ab1, 0xbd5bae60, 0x6cb54671, 0xc7ecf2a0, 0x3a062fd3, 0x915f9b02,
57
    0x6a8a21e4, 0xc1d39535, 0x3c394846, 0x9760fc97, 0x60cb895b, 0xcb923d8a,
58
    0x3678e0f9, 0x9d215428, 0x66f4eece, 0xcdad5a1f, 0x3047876c, 0x9b1e33bd,
59
    0x7448d825, 0xdf116cf4, 0x22fbb187, 0x89a20556, 0x7277bfb0, 0xd92e0b61,
60
    0x24c4d612, 0x8f9d62c3, 0x7836170f, 0xd36fa3de, 0x2e857ead, 0x85dcca7c,
61
    0x7e09709a, 0xd550c44b, 0x28ba1938, 0x83e3ade9, 0x5d4e7ad9, 0xf617ce08,
62
    0x0bfd137b, 0xa0a4a7aa, 0x5b711d4c, 0xf028a99d, 0x0dc274ee, 0xa69bc03f,
63
    0x5130b5f3, 0xfa690122, 0x0783dc51, 0xacda6880, 0x570fd266, 0xfc5666b7,
64
    0x01bcbbc4, 0xaae50f15, 0x45b3e48d, 0xeeea505c, 0x13008d2f, 0xb85939fe,
65
    0x438c8318, 0xe8d537c9, 0x153feaba, 0xbe665e6b, 0x49cd2ba7, 0xe2949f76,
66
    0x1f7e4205, 0xb427f6d4, 0x4ff24c32, 0xe4abf8e3, 0x19412590, 0xb2189141,
67
    0x0f433f21, 0xa41a8bf0, 0x59f05683, 0xf2a9e252, 0x097c58b4, 0xa225ec65,
68
    0x5fcf3116, 0xf49685c7, 0x033df00b, 0xa86444da, 0x558e99a9, 0xfed72d78,
69
    0x0502979e, 0xae5b234f, 0x53b1fe3c, 0xf8e84aed, 0x17bea175, 0xbce715a4,
70
    0x410dc8d7, 0xea547c06, 0x1181c6e0, 0xbad87231, 0x4732af42, 0xec6b1b93,
71
    0x1bc06e5f, 0xb099da8e, 0x4d7307fd, 0xe62ab32c, 0x1dff09ca, 0xb6a6bd1b,
72
    0x4b4c6068, 0xe015d4b9, 0x3eb80389, 0x95e1b758, 0x680b6a2b, 0xc352defa,
73
    0x3887641c, 0x93ded0cd, 0x6e340dbe, 0xc56db96f, 0x32c6cca3, 0x999f7872,
74
    0x6475a501, 0xcf2c11d0, 0x34f9ab36, 0x9fa01fe7, 0x624ac294, 0xc9137645,
75
    0x26459ddd, 0x8d1c290c, 0x70f6f47f, 0xdbaf40ae, 0x207afa48, 0x8b234e99,
76
    0x76c993ea, 0xdd90273b, 0x2a3b52f7, 0x8162e626, 0x7c883b55, 0xd7d18f84,
77
    0x2c043562, 0x875d81b3, 0x7ab75cc0, 0xd1eee811
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
78
};
79
80
static const unsigned int U[256] = {
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
81
    0x00000000, 0x7eb5200d, 0x5633f4cb, 0x2886d4c6, 0x073e5d47, 0x798b7d4a,
82
    0x510da98c, 0x2fb88981, 0x0e7cba8e, 0x70c99a83, 0x584f4e45, 0x26fa6e48,
83
    0x0942e7c9, 0x77f7c7c4, 0x5f711302, 0x21c4330f, 0x1cf9751c, 0x624c5511,
84
    0x4aca81d7, 0x347fa1da, 0x1bc7285b, 0x65720856, 0x4df4dc90, 0x3341fc9d,
85
    0x1285cf92, 0x6c30ef9f, 0x44b63b59, 0x3a031b54, 0x15bb92d5, 0x6b0eb2d8,
86
    0x4388661e, 0x3d3d4613, 0x39f2ea38, 0x4747ca35, 0x6fc11ef3, 0x11743efe,
87
    0x3eccb77f, 0x40799772, 0x68ff43b4, 0x164a63b9, 0x378e50b6, 0x493b70bb,
88
    0x61bda47d, 0x1f088470, 0x30b00df1, 0x4e052dfc, 0x6683f93a, 0x1836d937,
89
    0x250b9f24, 0x5bbebf29, 0x73386bef, 0x0d8d4be2, 0x2235c263, 0x5c80e26e,
90
    0x740636a8, 0x0ab316a5, 0x2b7725aa, 0x55c205a7, 0x7d44d161, 0x03f1f16c,
91
    0x2c4978ed, 0x52fc58e0, 0x7a7a8c26, 0x04cfac2b, 0x73e5d470, 0x0d50f47d,
92
    0x25d620bb, 0x5b6300b6, 0x74db8937, 0x0a6ea93a, 0x22e87dfc, 0x5c5d5df1,
93
    0x7d996efe, 0x032c4ef3, 0x2baa9a35, 0x551fba38, 0x7aa733b9, 0x041213b4,
94
    0x2c94c772, 0x5221e77f, 0x6f1ca16c, 0x11a98161, 0x392f55a7, 0x479a75aa,
95
    0x6822fc2b, 0x1697dc26, 0x3e1108e0, 0x40a428ed, 0x61601be2, 0x1fd53bef,
96
    0x3753ef29, 0x49e6cf24, 0x665e46a5, 0x18eb66a8, 0x306db26e, 0x4ed89263,
97
    0x4a173e48, 0x34a21e45, 0x1c24ca83, 0x6291ea8e, 0x4d29630f, 0x339c4302,
98
    0x1b1a97c4, 0x65afb7c9, 0x446b84c6, 0x3adea4cb, 0x1258700d, 0x6ced5000,
99
    0x4355d981, 0x3de0f98c, 0x15662d4a, 0x6bd30d47, 0x56ee4b54, 0x285b6b59,
100
    0x00ddbf9f, 0x7e689f92, 0x51d01613, 0x2f65361e, 0x07e3e2d8, 0x7956c2d5,
101
    0x5892f1da, 0x2627d1d7, 0x0ea10511, 0x7014251c, 0x5facac9d, 0x21198c90,
102
    0x099f5856, 0x772a785b, 0x4c921c31, 0x32273c3c, 0x1aa1e8fa, 0x6414c8f7,
103
    0x4bac4176, 0x3519617b, 0x1d9fb5bd, 0x632a95b0, 0x42eea6bf, 0x3c5b86b2,
104
    0x14dd5274, 0x6a687279, 0x45d0fbf8, 0x3b65dbf5, 0x13e30f33, 0x6d562f3e,
105
    0x506b692d, 0x2ede4920, 0x06589de6, 0x78edbdeb, 0x5755346a, 0x29e01467,
106
    0x0166c0a1, 0x7fd3e0ac, 0x5e17d3a3, 0x20a2f3ae, 0x08242768, 0x76910765,
107
    0x59298ee4, 0x279caee9, 0x0f1a7a2f, 0x71af5a22, 0x7560f609, 0x0bd5d604,
108
    0x235302c2, 0x5de622cf, 0x725eab4e, 0x0ceb8b43, 0x246d5f85, 0x5ad87f88,
109
    0x7b1c4c87, 0x05a96c8a, 0x2d2fb84c, 0x539a9841, 0x7c2211c0, 0x029731cd,
110
    0x2a11e50b, 0x54a4c506, 0x69998315, 0x172ca318, 0x3faa77de, 0x411f57d3,
111
    0x6ea7de52, 0x1012fe5f, 0x38942a99, 0x46210a94, 0x67e5399b, 0x19501996,
112
    0x31d6cd50, 0x4f63ed5d, 0x60db64dc, 0x1e6e44d1, 0x36e89017, 0x485db01a,
113
    0x3f77c841, 0x41c2e84c, 0x69443c8a, 0x17f11c87, 0x38499506, 0x46fcb50b,
114
    0x6e7a61cd, 0x10cf41c0, 0x310b72cf, 0x4fbe52c2, 0x67388604, 0x198da609,
115
    0x36352f88, 0x48800f85, 0x6006db43, 0x1eb3fb4e, 0x238ebd5d, 0x5d3b9d50,
116
    0x75bd4996, 0x0b08699b, 0x24b0e01a, 0x5a05c017, 0x728314d1, 0x0c3634dc,
117
    0x2df207d3, 0x534727de, 0x7bc1f318, 0x0574d315, 0x2acc5a94, 0x54797a99,
118
    0x7cffae5f, 0x024a8e52, 0x06852279, 0x78300274, 0x50b6d6b2, 0x2e03f6bf,
119
    0x01bb7f3e, 0x7f0e5f33, 0x57888bf5, 0x293dabf8, 0x08f998f7, 0x764cb8fa,
120
    0x5eca6c3c, 0x207f4c31, 0x0fc7c5b0, 0x7172e5bd, 0x59f4317b, 0x27411176,
121
    0x1a7c5765, 0x64c97768, 0x4c4fa3ae, 0x32fa83a3, 0x1d420a22, 0x63f72a2f,
122
    0x4b71fee9, 0x35c4dee4, 0x1400edeb, 0x6ab5cde6, 0x42331920, 0x3c86392d,
123
    0x133eb0ac, 0x6d8b90a1, 0x450d4467, 0x3bb8646a
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
124
};
125
126
struct index_entry {
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
127
    const unsigned char *ptr;
128
    const struct source_info *src;
129
    unsigned int val;
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
130
};
131
3735.33.9 by John Arbash Meinel
When expanding an index put the entries into a hash.
132
struct index_entry_linked_list {
133
    struct index_entry *p_entry;
134
    struct index_entry_linked_list *next;
135
};
136
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
137
struct unpacked_index_entry {
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
138
    struct index_entry entry;
139
    struct unpacked_index_entry *next;
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
140
};
141
142
struct delta_index {
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
143
    unsigned long memsize; /* Total bytes pointed to by this index */
144
    const struct source_info *last_src; /* Information about the referenced source */
145
    unsigned int hash_mask; /* val & hash_mask gives the hash index for a given
146
                               entry */
147
    unsigned int num_entries; /* The total number of entries in this index */
148
    struct index_entry *last_entry; /* Pointer to the last valid entry */
149
    struct index_entry *hash[];
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
150
};
151
0.23.32 by John Arbash Meinel
Refactor the code a bit, so that I can re-use bits for a create_delta_index_from_delta.
152
static unsigned int
153
limit_hash_buckets(struct unpacked_index_entry **hash,
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
154
                   unsigned int *hash_count, unsigned int hsize,
155
                   unsigned int entries)
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
156
{
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
157
    struct unpacked_index_entry *entry;
158
    unsigned int i;
159
    /*
160
     * Determine a limit on the number of entries in the same hash
161
     * bucket.  This guards us against pathological data sets causing
162
     * really bad hash distribution with most entries in the same hash
163
     * bucket that would bring us to O(m*n) computing costs (m and n
164
     * corresponding to reference and target buffer sizes).
165
     *
166
     * Make sure none of the hash buckets has more entries than
167
     * we're willing to test.  Otherwise we cull the entry list
168
     * uniformly to still preserve a good repartition across
169
     * the reference buffer.
170
     */
171
    for (i = 0; i < hsize; i++) {
172
        int acc;
173
174
        if (hash_count[i] <= HASH_LIMIT)
175
            continue;
176
177
        /* We leave exactly HASH_LIMIT entries in the bucket */
178
        entries -= hash_count[i] - HASH_LIMIT;
179
180
        entry = hash[i];
181
        acc = 0;
182
183
        /*
184
         * Assume that this loop is gone through exactly
185
         * HASH_LIMIT times and is entered and left with
186
         * acc==0.  So the first statement in the loop
187
         * contributes (hash_count[i]-HASH_LIMIT)*HASH_LIMIT
188
         * to the accumulator, and the inner loop consequently
189
         * is run (hash_count[i]-HASH_LIMIT) times, removing
190
         * one element from the list each time.  Since acc
191
         * balances out to 0 at the final run, the inner loop
192
         * body can't be left with entry==NULL.  So we indeed
193
         * encounter entry==NULL in the outer loop only.
194
         */
195
        do {
196
            acc += hash_count[i] - HASH_LIMIT;
197
            if (acc > 0) {
198
                struct unpacked_index_entry *keep = entry;
199
                do {
200
                    entry = entry->next;
201
                    acc -= HASH_LIMIT;
202
                } while (acc > 0);
203
                keep->next = entry->next;
204
            }
205
            entry = entry->next;
206
        } while (entry);
207
    }
208
    return entries;
0.23.32 by John Arbash Meinel
Refactor the code a bit, so that I can re-use bits for a create_delta_index_from_delta.
209
}
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
210
0.23.32 by John Arbash Meinel
Refactor the code a bit, so that I can re-use bits for a create_delta_index_from_delta.
211
static struct delta_index *
212
pack_delta_index(struct unpacked_index_entry **hash, unsigned int hsize,
3735.33.12 by John Arbash Meinel
Now we are able to weave 'add_source' into the existing index.
213
                 unsigned int num_entries, struct delta_index *old_index)
0.23.32 by John Arbash Meinel
Refactor the code a bit, so that I can re-use bits for a create_delta_index_from_delta.
214
{
3735.33.14 by John Arbash Meinel
Simplify the code a bit. We don't repack as often, so go with a
215
    unsigned int i, j, hmask, memsize, fit_in_old, copied_count;
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
216
    struct unpacked_index_entry *entry;
217
    struct delta_index *index;
3735.33.12 by John Arbash Meinel
Now we are able to weave 'add_source' into the existing index.
218
    struct index_entry *packed_entry, **packed_hash, *old_entry, *copy_from;
3735.33.1 by John Arbash Meinel
(broken, in progress), change the data structures to allow for inserting small deltas.
219
    struct index_entry null_entry = {0};
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
220
    void *mem;
3735.33.12 by John Arbash Meinel
Now we are able to weave 'add_source' into the existing index.
221
222
    hmask = hsize - 1;
223
3735.33.15 by John Arbash Meinel
Remove the noisy debugging code. (down to 23.1s)
224
    // if (old_index) {
225
    //     fprintf(stderr, "Packing %d entries into %d for total of %d entries"
226
    //                     " %x => %x\n",
227
    //                     num_entries - old_index->num_entries,
228
    //                     old_index->num_entries, num_entries,
229
    //                     old_index->hash_mask, hmask);
230
    // } else {
231
    //     fprintf(stderr, "Packing %d entries into a new index\n",
232
    //                     num_entries);
233
    // }
3735.33.12 by John Arbash Meinel
Now we are able to weave 'add_source' into the existing index.
234
    /* First, see if we can squeeze the new items into the existing structure.
235
     */
236
    fit_in_old = 0;
237
    copied_count = 0;
238
    if (old_index && old_index->hash_mask == hmask) {
239
        fit_in_old = 1;
240
        for (i = 0; i < hsize; ++i) {
241
            packed_entry = NULL;
242
            for (entry = hash[i]; entry; entry = entry->next) {
243
                if (packed_entry == NULL) {
244
                    /* Find the last open spot */
245
                    packed_entry = old_index->hash[i + 1];
246
                    --packed_entry;
247
                    while (packed_entry >= old_index->hash[i]
248
                           && packed_entry->ptr == NULL) {
249
                        --packed_entry;
250
                    }
251
                    ++packed_entry;
252
                }
253
                if (packed_entry >= old_index->hash[i+1]
254
                    || packed_entry->ptr != NULL) {
255
                    /* There are no free spots here :( */
256
                    fit_in_old = 0;
257
                    break;
258
                }
259
                /* We found an empty spot to put this entry
260
                 * Copy it over, and remove it from the linked list, just in
261
                 * case we end up running out of room later.
262
                 */
263
                *packed_entry++ = entry->entry;
264
                assert(entry == hash[i]);
265
                hash[i] = entry->next;
266
                copied_count += 1;
267
                old_index->num_entries++;
268
            }
269
            if (!fit_in_old) {
270
                break;
271
            }
272
        }
273
    }
274
    if (old_index) {
275
        if (fit_in_old) {
3735.33.15 by John Arbash Meinel
Remove the noisy debugging code. (down to 23.1s)
276
            // fprintf(stderr, "Fit all %d entries into old index\n",
277
            //                 copied_count);
3735.33.14 by John Arbash Meinel
Simplify the code a bit. We don't repack as often, so go with a
278
            /* No need to allocate a new buffer */
3735.33.12 by John Arbash Meinel
Now we are able to weave 'add_source' into the existing index.
279
            return NULL;
280
        } else {
3735.33.15 by John Arbash Meinel
Remove the noisy debugging code. (down to 23.1s)
281
            // fprintf(stderr, "Fit only %d entries into old index,"
282
            //                 " reallocating\n", copied_count);
3735.33.12 by John Arbash Meinel
Now we are able to weave 'add_source' into the existing index.
283
        }
284
    }
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
285
    /*
286
     * Now create the packed index in array form
287
     * rather than linked lists.
3735.33.1 by John Arbash Meinel
(broken, in progress), change the data structures to allow for inserting small deltas.
288
     * Leave a 2-entry gap for inserting more entries between the groups
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
289
     */
290
    memsize = sizeof(*index)
291
        + sizeof(*packed_hash) * (hsize+1)
3735.33.3 by John Arbash Meinel
include_entries_from_hash wasn't properly skipping NULL records.
292
        + sizeof(*packed_entry) * (num_entries + hsize * EXTRA_NULLS);
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
293
    mem = malloc(memsize);
294
    if (!mem) {
295
        return NULL;
296
    }
297
298
    index = mem;
299
    index->memsize = memsize;
3735.33.11 by John Arbash Meinel
Shave 5s->3.3s in add_source by copying old entries across directly.
300
    index->hash_mask = hmask;
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
301
    index->num_entries = num_entries;
3735.33.11 by John Arbash Meinel
Shave 5s->3.3s in add_source by copying old entries across directly.
302
    if (old_index) {
3735.33.16 by John Arbash Meinel
Handle when our current packing is sub-optimal.
303
        if (hmask < old_index->hash_mask) {
304
            fprintf(stderr, "hash mask was shrunk %x => %x\n",
305
                            old_index->hash_mask, hmask);
306
        }
3735.33.11 by John Arbash Meinel
Shave 5s->3.3s in add_source by copying old entries across directly.
307
        assert(hmask >= old_index->hash_mask);
308
    }
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
309
310
    mem = index->hash;
311
    packed_hash = mem;
312
    mem = packed_hash + (hsize+1);
313
    packed_entry = mem;
314
315
    for (i = 0; i < hsize; i++) {
316
        /*
317
         * Coalesce all entries belonging to one linked list
318
         * into consecutive array entries.
319
         */
320
        packed_hash[i] = packed_entry;
3735.33.11 by John Arbash Meinel
Shave 5s->3.3s in add_source by copying old entries across directly.
321
        /* Old comes earlier as a source, so it always comes first in a given
322
         * hash bucket.
323
         */
324
        if (old_index) {
325
            /* Could we optimize this to use memcpy when hmask ==
326
             * old_index->hash_mask? Would it make any real difference?
327
             */
328
            j = i & old_index->hash_mask;
3735.33.12 by John Arbash Meinel
Now we are able to weave 'add_source' into the existing index.
329
            copy_from = old_index->hash[j];
3735.33.11 by John Arbash Meinel
Shave 5s->3.3s in add_source by copying old entries across directly.
330
            for (old_entry = old_index->hash[j];
331
                 old_entry < old_index->hash[j + 1] && old_entry->ptr != NULL;
332
                 old_entry++) {
333
                if ((old_entry->val & hmask) == i) {
3735.33.14 by John Arbash Meinel
Simplify the code a bit. We don't repack as often, so go with a
334
                    *packed_entry++ = *old_entry;
3735.33.11 by John Arbash Meinel
Shave 5s->3.3s in add_source by copying old entries across directly.
335
                }
336
            }
337
        }
3735.33.1 by John Arbash Meinel
(broken, in progress), change the data structures to allow for inserting small deltas.
338
        for (entry = hash[i]; entry; entry = entry->next) {
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
339
            *packed_entry++ = entry->entry;
3735.33.1 by John Arbash Meinel
(broken, in progress), change the data structures to allow for inserting small deltas.
340
        }
3735.33.14 by John Arbash Meinel
Simplify the code a bit. We don't repack as often, so go with a
341
        /* TODO: At this point packed_entry - packed_hash[i] is the number of
342
         *       records that we have inserted into this hash bucket.
343
         *       We should *really* consider doing some limiting along the
344
         *       lines of limit_hash_buckets() to avoid pathological behavior.
345
         */
3735.33.3 by John Arbash Meinel
include_entries_from_hash wasn't properly skipping NULL records.
346
        /* Now add extra 'NULL' entries that we can use for future expansion. */
347
        for (j = 0; j < EXTRA_NULLS; ++j ) {
348
            *packed_entry++ = null_entry;
349
        }
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
350
    }
351
352
    /* Sentinel value to indicate the length of the last hash bucket */
353
    packed_hash[hsize] = packed_entry;
354
3735.33.4 by John Arbash Meinel
The new layout is working.
355
    if (packed_entry - (struct index_entry *)mem
356
        != num_entries + hsize*EXTRA_NULLS) {
3735.33.5 by John Arbash Meinel
Restoring the debugging for now.
357
        fprintf(stderr, "We expected %d entries, but created %d\n",
358
                num_entries + hsize*EXTRA_NULLS,
359
                (int)(packed_entry - (struct index_entry*)mem));
3735.33.4 by John Arbash Meinel
The new layout is working.
360
    }
3735.33.3 by John Arbash Meinel
include_entries_from_hash wasn't properly skipping NULL records.
361
    assert(packed_entry - (struct index_entry *)mem
362
            == num_entries + hsize*EXTRA_NULLS);
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
363
    index->last_entry = (packed_entry - 1);
364
    return index;
0.23.32 by John Arbash Meinel
Refactor the code a bit, so that I can re-use bits for a create_delta_index_from_delta.
365
}
366
3735.33.4 by John Arbash Meinel
The new layout is working.
367
0.23.49 by John Arbash Meinel
When adding new entries to the delta index, use memcpy
368
struct delta_index *
3735.33.9 by John Arbash Meinel
When expanding an index put the entries into a hash.
369
create_delta_index(const struct source_info *src,
3735.33.12 by John Arbash Meinel
Now we are able to weave 'add_source' into the existing index.
370
                   struct delta_index *old)
0.23.43 by John Arbash Meinel
Change the internals to allow delta indexes to be expanded with new source data.
371
{
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
372
    unsigned int i, hsize, hmask, num_entries, prev_val, *hash_count;
373
    unsigned int total_num_entries;
374
    const unsigned char *data, *buffer;
375
    struct delta_index *index;
376
    struct unpacked_index_entry *entry, **hash;
377
    void *mem;
378
    unsigned long memsize;
379
380
    if (!src->buf || !src->size)
381
        return NULL;
382
    buffer = src->buf;
383
384
    /* Determine index hash size.  Note that indexing skips the
385
       first byte to allow for optimizing the Rabin's polynomial
386
       initialization in create_delta(). */
387
    num_entries = (src->size - 1)  / RABIN_WINDOW;
388
    if (old != NULL)
389
        total_num_entries = num_entries + old->num_entries;
390
    else
391
        total_num_entries = num_entries;
3735.33.8 by John Arbash Meinel
Reverted back to the same hash width, and bumped EXTRA_NULLS to 3.
392
    hsize = total_num_entries / 4;
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
393
    for (i = 4; (1u << i) < hsize && i < 31; i++);
394
    hsize = 1 << i;
395
    hmask = hsize - 1;
3735.33.18 by John Arbash Meinel
*grow* the local hmask if it is smaller than expected, don't *shrink* it.
396
    if (old && old->hash_mask > hmask) {
3735.33.16 by John Arbash Meinel
Handle when our current packing is sub-optimal.
397
        hmask = old->hash_mask;
398
        hsize = hmask + 1;
399
    }
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
400
401
    /* allocate lookup index */
402
    memsize = sizeof(*hash) * hsize +
403
          sizeof(*entry) * total_num_entries;
404
    mem = malloc(memsize);
405
    if (!mem)
406
        return NULL;
407
    hash = mem;
408
    mem = hash + hsize;
409
    entry = mem;
410
411
    memset(hash, 0, hsize * sizeof(*hash));
412
413
    /* allocate an array to count hash num_entries */
414
    hash_count = calloc(hsize, sizeof(*hash_count));
415
    if (!hash_count) {
416
        free(hash);
417
        return NULL;
418
    }
419
420
    /* then populate the index for the new data */
421
    prev_val = ~0;
422
    for (data = buffer + num_entries * RABIN_WINDOW - RABIN_WINDOW;
423
         data >= buffer;
424
         data -= RABIN_WINDOW) {
425
        unsigned int val = 0;
426
        for (i = 1; i <= RABIN_WINDOW; i++)
427
            val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];
428
        if (val == prev_val) {
429
            /* keep the lowest of consecutive identical blocks */
430
            entry[-1].entry.ptr = data + RABIN_WINDOW;
431
            --num_entries;
432
            --total_num_entries;
433
        } else {
434
            prev_val = val;
435
            i = val & hmask;
436
            entry->entry.ptr = data + RABIN_WINDOW;
437
            entry->entry.val = val;
438
            entry->entry.src = src;
439
            entry->next = hash[i];
440
            hash[i] = entry++;
441
            hash_count[i]++;
442
        }
443
    }
3735.33.11 by John Arbash Meinel
Shave 5s->3.3s in add_source by copying old entries across directly.
444
    /* TODO: It would be nice to limit_hash_buckets at a better time. */
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
445
    total_num_entries = limit_hash_buckets(hash, hash_count, hsize,
446
                                           total_num_entries);
447
    free(hash_count);
3735.33.12 by John Arbash Meinel
Now we are able to weave 'add_source' into the existing index.
448
    if (old) {
449
        old->last_src = src;
450
    }
3735.33.11 by John Arbash Meinel
Shave 5s->3.3s in add_source by copying old entries across directly.
451
    index = pack_delta_index(hash, hsize, total_num_entries, old);
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
452
    free(hash);
453
    if (!index) {
454
        return NULL;
455
    }
3735.33.12 by John Arbash Meinel
Now we are able to weave 'add_source' into the existing index.
456
    index->last_src = src;
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
457
    return index;
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
458
}
459
3735.33.9 by John Arbash Meinel
When expanding an index put the entries into a hash.
460
/* Take some entries, and put them into a custom hash.
461
 * @param entries   A list of entries, sorted by position in file
462
 * @param num_entries   Length of entries
463
 * @param out_hsize     The maximum size of the hash, the final size will be
464
 *                      returned here
465
 */
466
struct index_entry_linked_list **
467
_put_entries_into_hash(struct index_entry *entries, unsigned int num_entries,
3735.33.10 by John Arbash Meinel
Fix a bug when there is more than one entry (increment out_entry).
468
                       unsigned int hsize)
3735.33.9 by John Arbash Meinel
When expanding an index put the entries into a hash.
469
{
3735.33.11 by John Arbash Meinel
Shave 5s->3.3s in add_source by copying old entries across directly.
470
    unsigned int hash_offset, hmask, memsize;
3735.33.9 by John Arbash Meinel
When expanding an index put the entries into a hash.
471
    struct index_entry *entry, *last_entry;
472
    struct index_entry_linked_list *out_entry, **hash;
473
    void *mem;
474
475
    hmask = hsize - 1;
476
477
    memsize = sizeof(*hash) * hsize +
478
          sizeof(*out_entry) * num_entries;
479
    mem = malloc(memsize);
480
    if (!mem)
481
        return NULL;
482
    hash = mem;
483
    mem = hash + hsize;
484
    out_entry = mem;
485
486
    memset(hash, 0, sizeof(*hash)*(hsize+1));
487
488
    /* We know that entries are in the order we want in the output, but they
489
     * aren't "grouped" by hash bucket yet.
490
     */
491
    last_entry = entries + num_entries;
492
    for (entry = entries + num_entries - 1; entry >= entries; --entry) {
3735.33.10 by John Arbash Meinel
Fix a bug when there is more than one entry (increment out_entry).
493
        hash_offset = entry->val & hmask;
3735.33.9 by John Arbash Meinel
When expanding an index put the entries into a hash.
494
        out_entry->p_entry = entry;
3735.33.10 by John Arbash Meinel
Fix a bug when there is more than one entry (increment out_entry).
495
        out_entry->next = hash[hash_offset];
3735.33.9 by John Arbash Meinel
When expanding an index put the entries into a hash.
496
        /* TODO: Remove entries that have identical vals, or at least filter
497
         *       the map a little bit.
498
         * if (hash[i] != NULL) {
499
         * }
500
         */
3735.33.10 by John Arbash Meinel
Fix a bug when there is more than one entry (increment out_entry).
501
        hash[hash_offset] = out_entry;
502
        ++out_entry;
3735.33.9 by John Arbash Meinel
When expanding an index put the entries into a hash.
503
    }
504
    return hash;
505
}
506
3735.33.4 by John Arbash Meinel
The new layout is working.
507
508
struct delta_index *
509
create_index_from_old_and_new_entries(const struct delta_index *old_index,
510
                                      struct index_entry *entries,
511
                                      unsigned int num_entries)
512
{
3735.33.9 by John Arbash Meinel
When expanding an index put the entries into a hash.
513
    unsigned int i, j, hsize, hmask, total_num_entries;
3735.33.4 by John Arbash Meinel
The new layout is working.
514
    struct delta_index *index;
515
    struct index_entry *entry, *packed_entry, **packed_hash;
516
    struct index_entry *last_entry, null_entry = {0};
517
    void *mem;
518
    unsigned long memsize;
3735.33.9 by John Arbash Meinel
When expanding an index put the entries into a hash.
519
    struct index_entry_linked_list *unpacked_entry, **mini_hash;
3735.33.4 by John Arbash Meinel
The new layout is working.
520
521
    /* Determine index hash size.  Note that indexing skips the
522
       first byte to allow for optimizing the Rabin's polynomial
523
       initialization in create_delta(). */
524
    total_num_entries = num_entries + old_index->num_entries;
3735.33.8 by John Arbash Meinel
Reverted back to the same hash width, and bumped EXTRA_NULLS to 3.
525
    hsize = total_num_entries / 4;
3735.33.4 by John Arbash Meinel
The new layout is working.
526
    for (i = 4; (1u << i) < hsize && i < 31; i++);
527
    hsize = 1 << i;
528
    if (hsize < old_index->hash_mask) {
529
        /* For some reason, there was a code path that would actually *shrink*
530
         * the hash size. This screws with some later code, and in general, I
531
         * think it better to make the hash bigger, rather than smaller. So
532
         * we'll just force the size here.
533
         * Possibly done by create_delta_index running into a
534
         * limit_hash_buckets call, that ended up transitioning across a
535
         * power-of-2. The cause isn't 100% clear, though.
536
         */
537
        hsize = old_index->hash_mask + 1;
538
    }
539
    hmask = hsize - 1;
3735.33.15 by John Arbash Meinel
Remove the noisy debugging code. (down to 23.1s)
540
    // fprintf(stderr, "resizing index to insert %d entries into array"
541
    //                 " with %d entries: %x => %x\n",
542
    //         num_entries, old_index->num_entries, old_index->hash_mask, hmask);
3735.33.4 by John Arbash Meinel
The new layout is working.
543
544
    memsize = sizeof(*index)
545
        + sizeof(*packed_hash) * (hsize+1)
546
        + sizeof(*packed_entry) * (total_num_entries + hsize*EXTRA_NULLS);
547
    mem = malloc(memsize);
548
    if (!mem) {
549
        return NULL;
550
    }
551
    index = mem;
552
    index->memsize = memsize;
553
    index->hash_mask = hmask;
554
    index->num_entries = total_num_entries;
555
    index->last_src = old_index->last_src;
556
557
    mem = index->hash;
558
    packed_hash = mem;
559
    mem = packed_hash + (hsize+1);
560
    packed_entry = mem;
561
3735.33.10 by John Arbash Meinel
Fix a bug when there is more than one entry (increment out_entry).
562
    mini_hash = _put_entries_into_hash(entries, num_entries, hsize);
3735.33.9 by John Arbash Meinel
When expanding an index put the entries into a hash.
563
    if (mini_hash == NULL) {
564
        free(index);
565
        return NULL;
566
    }
3735.33.4 by John Arbash Meinel
The new layout is working.
567
    last_entry = entries + num_entries;
568
    for (i = 0; i < hsize; i++) {
569
        /*
570
         * Coalesce all entries belonging in one hash bucket
571
         * into consecutive array entries.
572
         * The entries in old_index all come before 'entries'.
573
         */
574
        packed_hash[i] = packed_entry;
575
        /* Copy any of the old entries across */
576
        /* Would we rather use memcpy? */
577
        if (hmask == old_index->hash_mask) {
578
            for (entry = old_index->hash[i];
3735.33.6 by John Arbash Meinel
Increasing EXTRA_NULLS to 2 from 1 ups the hit rate
579
                 entry < old_index->hash[i+1] && entry->ptr != NULL;
3735.33.4 by John Arbash Meinel
The new layout is working.
580
                 ++entry) {
581
                assert((entry->val & hmask) == i);
582
                *packed_entry++ = *entry;
583
            }
584
        } else {
585
            /* If we resized the index from this action, all of the old values
586
             * will be found in the previous location, but they will end up
587
             * spread across the new locations.
588
             */
589
            j = i & old_index->hash_mask;
590
            for (entry = old_index->hash[j];
3735.33.6 by John Arbash Meinel
Increasing EXTRA_NULLS to 2 from 1 ups the hit rate
591
                 entry < old_index->hash[j+1] && entry->ptr != NULL;
3735.33.4 by John Arbash Meinel
The new layout is working.
592
                 ++entry) {
593
                assert((entry->val & old_index->hash_mask) == j);
594
                if ((entry->val & hmask) == i) {
595
                    /* Any entries not picked up here will be picked up on the
596
                     * next pass.
597
                     */
598
                    *packed_entry++ = *entry;
599
                }
600
            }
601
        }
602
        /* Now see if we need to insert any of the new entries.
603
         * Note that loop ends up O(hsize*num_entries), so we expect that
604
         * num_entries is always small.
605
         * We also help a little bit by collapsing the entry range when the
606
         * endpoints are inserted. However, an alternative would be to build a
607
         * quick hash lookup for just the new entries.
608
         * Testing shows that this list can easily get up to about 100
609
         * entries, the tradeoff is a malloc, 1 pass over the entries, copying
610
         * them into a sorted buffer, and a free() when done,
611
         */
3735.33.10 by John Arbash Meinel
Fix a bug when there is more than one entry (increment out_entry).
612
        for (unpacked_entry = mini_hash[i];
3735.33.9 by John Arbash Meinel
When expanding an index put the entries into a hash.
613
             unpacked_entry;
614
             unpacked_entry = unpacked_entry->next) {
3735.33.10 by John Arbash Meinel
Fix a bug when there is more than one entry (increment out_entry).
615
            assert((unpacked_entry->p_entry->val & hmask) == i);
616
            *packed_entry++ = *(unpacked_entry->p_entry);
3735.33.4 by John Arbash Meinel
The new layout is working.
617
        }
618
        /* Now insert some extra nulls */
619
        for (j = 0; j < EXTRA_NULLS; ++j) {
620
            *packed_entry++ = null_entry;
621
        }
622
    }
3735.33.9 by John Arbash Meinel
When expanding an index put the entries into a hash.
623
    free(mini_hash);
3735.33.4 by John Arbash Meinel
The new layout is working.
624
625
    /* Sentinel value to indicate the length of the last hash bucket */
626
    packed_hash[hsize] = packed_entry;
627
628
    if ((packed_entry - (struct index_entry *)mem)
629
        != (total_num_entries + hsize*EXTRA_NULLS)) {
630
        fprintf(stderr, "We expected %d entries, but created %d\n",
631
                total_num_entries + hsize*EXTRA_NULLS,
632
                (int)(packed_entry - (struct index_entry*)mem));
633
        fflush(stderr);
634
    }
635
    assert((packed_entry - (struct index_entry *)mem)
636
           == (total_num_entries + hsize * EXTRA_NULLS));
637
    index->last_entry = (packed_entry - 1);
638
    return index;
639
}
640
641
642
void
643
get_text(char buff[128], const unsigned char *ptr)
644
{
645
    unsigned int i;
646
    const unsigned char *start;
647
    unsigned char cmd;
648
    start = (ptr-RABIN_WINDOW-1);
649
    cmd = *(start);
650
    if (cmd < 0x80) {// This is likely to be an insert instruction
651
        if (cmd < RABIN_WINDOW) {
652
            cmd = RABIN_WINDOW;
653
        }
654
    } else {
655
        /* This was either a copy [should never be] or it
656
         * was a longer insert so the insert start happened at 16 more
657
         * bytes back.
658
         */
659
        cmd = RABIN_WINDOW + 1;
660
    }
661
    if (cmd > 60) {
662
        cmd = 60; /* Be friendly to 80char terms */
663
    }
664
    /* Copy the 1 byte command, and 4 bytes after the insert */
665
    cmd += 5;
666
    memcpy(buff, start, cmd);
667
    buff[cmd] = 0;
668
    for (i = 0; i < cmd; ++i) {
669
        if (buff[i] == '\n') {
670
            buff[i] = 'N';
671
        } else if (buff[i] == '\t') {
672
            buff[i] = 'T';
673
        }
674
    }
675
}
676
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
677
struct delta_index *
678
create_delta_index_from_delta(const struct source_info *src,
3735.33.4 by John Arbash Meinel
The new layout is working.
679
                              struct delta_index *old_index)
0.23.45 by John Arbash Meinel
Add a function that updates the index for delta bytes.
680
{
3735.33.4 by John Arbash Meinel
The new layout is working.
681
    unsigned int i, num_entries, max_num_entries, prev_val, num_inserted;
682
    unsigned int hash_offset;
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
683
    const unsigned char *data, *buffer, *top;
684
    unsigned char cmd;
3735.33.4 by John Arbash Meinel
The new layout is working.
685
    struct delta_index *new_index;
686
    struct index_entry *entry, *entries, *old_entry;
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
687
688
    if (!src->buf || !src->size)
689
        return NULL;
690
    buffer = src->buf;
691
    top = buffer + src->size;
692
693
    /* Determine index hash size.  Note that indexing skips the
694
       first byte to allow for optimizing the Rabin's polynomial
695
       initialization in create_delta().
696
       This computes the maximum number of entries that could be held. The
697
       actual number will be recomputed during processing.
698
       */
3735.31.2 by John Arbash Meinel
Cleanup trailing whitespace, get test_source to pass by removing asserts.
699
3735.33.4 by John Arbash Meinel
The new layout is working.
700
    max_num_entries = (src->size - 1)  / RABIN_WINDOW;
701
702
    /* allocate an array to hold whatever entries we find */
703
    entries = malloc(sizeof(*entry) * max_num_entries);
704
    if (!entries) /* malloc failure */
705
        return NULL;
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
706
707
    /* then populate the index for the new data */
708
    prev_val = ~0;
709
    data = buffer;
710
    /* target size */
711
    get_delta_hdr_size(&data, top);
3735.33.4 by John Arbash Meinel
The new layout is working.
712
    entry = entries; /* start at the first slot */
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
713
    num_entries = 0; /* calculate the real number of entries */
714
    while (data < top) {
715
        cmd = *data++;
716
        if (cmd & 0x80) {
717
            /* Copy instruction, skip it */
718
            if (cmd & 0x01) data++;
719
            if (cmd & 0x02) data++;
720
            if (cmd & 0x04) data++;
721
            if (cmd & 0x08) data++;
722
            if (cmd & 0x10) data++;
723
            if (cmd & 0x20) data++;
724
            if (cmd & 0x40) data++;
725
        } else if (cmd) {
726
            /* Insert instruction, we want to index these bytes */
727
            if (data + cmd > top) {
728
                /* Invalid insert, not enough bytes in the delta */
729
                break;
730
            }
3735.33.5 by John Arbash Meinel
Restoring the debugging for now.
731
            /* The create_delta code requires a match at least 4 characters
732
             * (including only the last char of the RABIN_WINDOW) before it
733
             * will consider it something worth copying rather than inserting.
734
             * So we don't want to index anything that we know won't ever be a
735
             * match.
736
             */
737
            for (; cmd > RABIN_WINDOW + 3; cmd -= RABIN_WINDOW,
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
738
                                       data += RABIN_WINDOW) {
739
                unsigned int val = 0;
740
                for (i = 1; i <= RABIN_WINDOW; i++)
741
                    val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];
742
                if (val != prev_val) {
743
                    /* Only keep the first of consecutive data */
744
                    prev_val = val;
3735.33.4 by John Arbash Meinel
The new layout is working.
745
                    num_entries++;
746
                    entry->ptr = data + RABIN_WINDOW;
747
                    entry->val = val;
748
                    entry->src = src;
3735.33.2 by John Arbash Meinel
Revert some of the previous work.
749
                    entry++;
3735.33.4 by John Arbash Meinel
The new layout is working.
750
                    if (num_entries > max_num_entries) {
751
                        /* We ran out of entry room, something is really wrong
752
                         */
753
                        break;
754
                    }
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
755
                }
756
            }
757
            /* Move the data pointer by whatever remainder is left */
758
            data += cmd;
759
        } else {
760
            /*
761
             * cmd == 0 is reserved for future encoding
762
             * extensions. In the mean time we must fail when
763
             * encountering them (might be data corruption).
764
             */
765
            break;
766
        }
767
    }
768
    if (data != top) {
769
        /* Something was wrong with this delta */
3735.33.4 by John Arbash Meinel
The new layout is working.
770
        free(entries);
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
771
        return NULL;
772
    }
773
    if (num_entries == 0) {
774
        /** Nothing to index **/
3735.33.4 by John Arbash Meinel
The new layout is working.
775
        free(entries);
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
776
        return NULL;
777
    }
3735.33.4 by John Arbash Meinel
The new layout is working.
778
    assert(old_index != NULL);
779
    old_index->last_src = src;
780
    /* See if we can fill in these values into the holes in the array */
781
    entry = entries;
782
    num_inserted = 0;
783
    for (; num_entries > 0; --num_entries, ++entry) {
784
        hash_offset = (entry->val & old_index->hash_mask);
785
        /* The basic structure is a hash => packed_entries that fit in that
786
         * hash bucket. Things are structured such that the hash-pointers are
787
         * strictly ordered. So we start by pointing to the next pointer, and
788
         * walk back until we stop getting NULL targets, and then go back
789
         * forward. If there are no NULL targets, then we know because
790
         * entry->ptr will not be NULL.
791
         */
792
        old_entry = old_index->hash[hash_offset + 1];
793
        old_entry--;
794
        while (old_entry->ptr == NULL
795
               && old_entry >= old_index->hash[hash_offset]) {
796
            old_entry--;
797
        }
798
        old_entry++;
799
        if (old_entry->ptr != NULL
800
            || old_entry >= old_index->hash[hash_offset + 1]) {
3735.33.6 by John Arbash Meinel
Increasing EXTRA_NULLS to 2 from 1 ups the hit rate
801
            /* There is no room for this entry, we have to resize */
802
            // char buff[128];
803
            // get_text(buff, entry->ptr);
804
            // fprintf(stderr, "Failed to find an opening @%x for %8x:\n '%s'\n",
805
            //         hash_offset, entry->val, buff);
806
            // for (old_entry = old_index->hash[hash_offset];
807
            //      old_entry < old_index->hash[hash_offset+1];
808
            //      ++old_entry) {
809
            //     get_text(buff, old_entry->ptr);
810
            //     fprintf(stderr, "  [%2d] %8x %8x: '%s'\n",
811
            //             (int)(old_entry - old_index->hash[hash_offset]),
812
            //             old_entry->val, old_entry->ptr, buff);
813
            // }
3735.33.4 by John Arbash Meinel
The new layout is working.
814
            break;
815
        }
816
        num_inserted++;
817
        *old_entry = *entry;
818
        /* For entries which we *do* manage to insert into old_index, we don't
819
         * want them double copied into the final output.
820
         */
821
        old_index->num_entries++;
822
    }
823
    if (num_entries > 0) {
824
        /* We couldn't fit the new entries into the old index, so allocate a
825
         * new one, and fill it with stuff.
826
         */
3735.33.6 by John Arbash Meinel
Increasing EXTRA_NULLS to 2 from 1 ups the hit rate
827
        // fprintf(stderr, "inserted %d before resize\n", num_inserted);
3735.33.4 by John Arbash Meinel
The new layout is working.
828
        new_index = create_index_from_old_and_new_entries(old_index,
829
            entry, num_entries);
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
830
    } else {
3735.33.4 by John Arbash Meinel
The new layout is working.
831
        new_index = NULL;
3735.33.15 by John Arbash Meinel
Remove the noisy debugging code. (down to 23.1s)
832
        // fprintf(stderr, "inserted %d without resizing\n", num_inserted);
3735.33.4 by John Arbash Meinel
The new layout is working.
833
    }
834
    free(entries);
835
    return new_index;
0.23.45 by John Arbash Meinel
Add a function that updates the index for delta bytes.
836
}
837
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
838
void free_delta_index(struct delta_index *index)
839
{
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
840
    free(index);
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
841
}
842
843
unsigned long sizeof_delta_index(struct delta_index *index)
844
{
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
845
    if (index)
846
        return index->memsize;
847
    else
848
        return 0;
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
849
}
850
851
/*
852
 * The maximum size for any opcode sequence, including the initial header
853
 * plus Rabin window plus biggest copy.
854
 */
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
855
#define MAX_OP_SIZE (5 + 5 + 1 + RABIN_WINDOW + 7)
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
856
857
void *
0.23.44 by John Arbash Meinel
Remove the multi-index handling now that we have index combining instead.
858
create_delta(const struct delta_index *index,
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
859
             const void *trg_buf, unsigned long trg_size,
860
             unsigned long *delta_size, unsigned long max_size)
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
861
{
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
862
    unsigned int i, outpos, outsize, moff, msize, val;
863
    const struct source_info *msource;
864
    int inscnt;
865
    const unsigned char *ref_data, *ref_top, *data, *top;
866
    unsigned char *out;
867
    unsigned long source_size;
868
869
    if (!trg_buf || !trg_size)
870
        return NULL;
871
    if (index == NULL)
872
        return NULL;
873
874
    outpos = 0;
875
    outsize = 8192;
876
    if (max_size && outsize >= max_size)
877
        outsize = max_size + MAX_OP_SIZE + 1;
878
    out = malloc(outsize);
879
    if (!out)
880
        return NULL;
881
882
    source_size = index->last_src->size + index->last_src->agg_offset;
883
884
    /* store target buffer size */
885
    i = trg_size;
886
    while (i >= 0x80) {
887
        out[outpos++] = i | 0x80;
888
        i >>= 7;
889
    }
890
    out[outpos++] = i;
891
892
    data = trg_buf;
893
    top = (const unsigned char *) trg_buf + trg_size;
894
895
    /* Start the matching by filling out with a simple 'insert' instruction, of
896
     * the first RABIN_WINDOW bytes of the input.
897
     */
898
    outpos++; /* leave a byte for the insert command */
899
    val = 0;
900
    for (i = 0; i < RABIN_WINDOW && data < top; i++, data++) {
901
        out[outpos++] = *data;
902
        val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
903
    }
904
    /* we are now setup with an insert of 'i' bytes and val contains the RABIN
905
     * hash for those bytes, and data points to the RABIN_WINDOW+1 byte of
906
     * input.
907
     */
908
    inscnt = i;
909
910
    moff = 0;
911
    msize = 0;
912
    msource = NULL;
913
    while (data < top) {
914
        if (msize < 4096) {
915
            /* we don't have a 'worthy enough' match yet, so let's look for
916
             * one.
917
             */
918
            struct index_entry *entry;
919
            /* Shift the window by one byte. */
920
            val ^= U[data[-RABIN_WINDOW]];
921
            val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
922
            i = val & index->hash_mask;
923
            /* TODO: When using multiple indexes like this, the hash tables
924
             *       mapping val => index_entry become less efficient.
925
             *       You end up getting a lot more collisions in the hash,
926
             *       which doesn't actually lead to a entry->val match.
927
             */
3735.33.1 by John Arbash Meinel
(broken, in progress), change the data structures to allow for inserting small deltas.
928
            for (entry = index->hash[i];
929
                 entry < index->hash[i+1] && entry->src != NULL;
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
930
                 entry++) {
931
                const unsigned char *ref;
932
                const unsigned char *src;
933
                unsigned int ref_size;
934
                if (entry->val != val)
935
                    continue;
936
                ref = entry->ptr;
937
                src = data;
938
                ref_data = entry->src->buf;
939
                ref_top = ref_data + entry->src->size;
940
                ref_size = ref_top - ref;
941
                /* ref_size is the longest possible match that we could make
942
                 * here. If ref_size <= msize, then we know that we cannot
943
                 * match more bytes with this location that we have already
944
                 * matched.
945
                 */
946
                if (ref_size > top - src)
947
                    ref_size = top - src;
948
                if (ref_size <= msize)
949
                    break;
950
                /* See how many bytes actually match at this location. */
951
                while (ref_size-- && *src++ == *ref)
952
                    ref++;
953
                if (msize < ref - entry->ptr) {
954
                    /* this is our best match so far */
955
                    msize = ref - entry->ptr;
956
                    msource = entry->src;
957
                    moff = entry->ptr - ref_data;
958
                    if (msize >= 4096) /* good enough */
959
                        break;
960
                }
961
            }
962
        }
963
964
        if (msize < 4) {
965
            /* The best match right now is less than 4 bytes long. So just add
966
             * the current byte to the insert instruction. Increment the insert
967
             * counter, and copy the byte of data into the output buffer.
968
             */
969
            if (!inscnt)
970
                outpos++;
971
            out[outpos++] = *data++;
972
            inscnt++;
973
            if (inscnt == 0x7f) {
974
                /* We have a max length insert instruction, finalize it in the
975
                 * output.
976
                 */
977
                out[outpos - inscnt - 1] = inscnt;
978
                inscnt = 0;
979
            }
980
            msize = 0;
981
        } else {
982
            unsigned int left;
983
            unsigned char *op;
984
985
            if (inscnt) {
986
                ref_data = msource->buf;
987
                while (moff && ref_data[moff-1] == data[-1]) {
988
                    /* we can match one byte back */
989
                    msize++;
990
                    moff--;
991
                    data--;
992
                    outpos--;
993
                    if (--inscnt)
994
                        continue;
995
                    outpos--;  /* remove count slot */
996
                    inscnt--;  /* make it -1 */
997
                    break;
998
                }
999
                out[outpos - inscnt - 1] = inscnt;
1000
                inscnt = 0;
1001
            }
1002
1003
            /* A copy op is currently limited to 64KB (pack v2) */
1004
            left = (msize < 0x10000) ? 0 : (msize - 0x10000);
1005
            msize -= left;
1006
1007
            op = out + outpos++;
1008
            i = 0x80;
1009
1010
            /* moff is the offset in the local structure, for encoding, we need
1011
             * to push it into the global offset
1012
             */
1013
            assert(moff < msource->size);
1014
            moff += msource->agg_offset;
1015
            assert(moff + msize <= source_size);
1016
            if (moff & 0x000000ff)
1017
                out[outpos++] = moff >> 0,  i |= 0x01;
1018
            if (moff & 0x0000ff00)
1019
                out[outpos++] = moff >> 8,  i |= 0x02;
1020
            if (moff & 0x00ff0000)
1021
                out[outpos++] = moff >> 16, i |= 0x04;
1022
            if (moff & 0xff000000)
1023
                out[outpos++] = moff >> 24, i |= 0x08;
1024
            /* Put it back into local coordinates, in case we have multiple
1025
             * copies in a row.
1026
             */
1027
            moff -= msource->agg_offset;
1028
1029
            if (msize & 0x00ff)
1030
                out[outpos++] = msize >> 0, i |= 0x10;
1031
            if (msize & 0xff00)
1032
                out[outpos++] = msize >> 8, i |= 0x20;
1033
1034
            *op = i;
1035
1036
            data += msize;
1037
            moff += msize;
1038
            msize = left;
1039
1040
            if (msize < 4096) {
1041
                int j;
1042
                val = 0;
1043
                for (j = -RABIN_WINDOW; j < 0; j++)
1044
                    val = ((val << 8) | data[j])
1045
                          ^ T[val >> RABIN_SHIFT];
1046
            }
1047
        }
1048
1049
        if (outpos >= outsize - MAX_OP_SIZE) {
1050
            void *tmp = out;
1051
            outsize = outsize * 3 / 2;
1052
            if (max_size && outsize >= max_size)
1053
                outsize = max_size + MAX_OP_SIZE + 1;
1054
            if (max_size && outpos > max_size)
1055
                break;
1056
            out = realloc(out, outsize);
1057
            if (!out) {
1058
                free(tmp);
1059
                return NULL;
1060
            }
1061
        }
1062
    }
1063
1064
    if (inscnt)
1065
        out[outpos - inscnt - 1] = inscnt;
1066
1067
    if (max_size && outpos > max_size) {
1068
        free(out);
1069
        return NULL;
1070
    }
1071
1072
    *delta_size = outpos;
1073
    return out;
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
1074
}
0.23.24 by John Arbash Meinel
Change the code so that we can pass in multiple sources to match against.
1075
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
1076
/* vim: et ts=4 sw=4 sts=4
0.23.36 by John Arbash Meinel
Track down a memory leak in the refactored diff-delta.c code.
1077
 */