/brz/remove-bazaar

To get this branch, use:
bzr branch http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
1
/*
2
 * diff-delta.c: generate a delta between two buffers
3
 *
4
 * This code was greatly inspired by parts of LibXDiff from Davide Libenzi
5
 * http://www.xmailserver.org/xdiff-lib.html
6
 *
5698.2.1 by Martin
Update Nicolas Pitre's email address
7
 * Rewritten for GIT by Nicolas Pitre <nico@fluxnic.net>, (C) 2005-2007
0.17.31 by John Arbash Meinel
Bring in the 'rabin' experiment.
8
 * Adapted for Bazaar by John Arbash Meinel <john@arbash-meinel.com> (C) 2009
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
9
 *
4241.6.6 by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core.
10
 * This program is free software; you can redistribute it and/or modify
11
 * it under the terms of the GNU General Public License as published by
12
 * the Free Software Foundation; either version 2 of the License, or
13
 * (at your option) any later version.
14
 *
15
 * NB: The version in GIT is 'version 2 of the Licence only', however Nicolas
16
 * has granted permission for use under 'version 2 or later' in private email
17
 * to Robert Collins and Karl Fogel on the 6th April 2009.
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
18
 */
19
3735.33.4 by John Arbash Meinel
The new layout is working.
20
#include <stdio.h>
21
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
22
#include "delta.h"
0.17.31 by John Arbash Meinel
Bring in the 'rabin' experiment.
23
#include <stdlib.h>
24
#include <string.h>
0.23.5 by John Arbash Meinel
Minor changes to get diff-delta.c and patch-delta.c to compile.
25
#include <assert.h>
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
26
27
/* maximum hash entry list for the same hash bucket */
28
#define HASH_LIMIT 64
29
30
#define RABIN_SHIFT 23
31
#define RABIN_WINDOW 16
5755.2.1 by John Arbash Meinel
Set a maximum number of delta entries for a fulltext.
32
#define MAX_NUM_ENTRIES 10000
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
33
3735.33.13 by John Arbash Meinel
Tweak the number of blank spaces up just a tad.
34
/* The hash map is sized to put 4 entries per bucket, this gives us ~even room
35
 * for more data. Tweaking this number above 4 doesn't seem to help much,
36
 * anyway.
3735.33.6 by John Arbash Meinel
Increasing EXTRA_NULLS to 2 from 1 ups the hit rate
37
 */
3735.33.13 by John Arbash Meinel
Tweak the number of blank spaces up just a tad.
38
#define EXTRA_NULLS 4
3735.33.3 by John Arbash Meinel
include_entries_from_hash wasn't properly skipping NULL records.
39
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
40
static const unsigned int T[256] = {
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
41
    0x00000000, 0xab59b4d1, 0x56b369a2, 0xfdeadd73, 0x063f6795, 0xad66d344,
42
    0x508c0e37, 0xfbd5bae6, 0x0c7ecf2a, 0xa7277bfb, 0x5acda688, 0xf1941259,
43
    0x0a41a8bf, 0xa1181c6e, 0x5cf2c11d, 0xf7ab75cc, 0x18fd9e54, 0xb3a42a85,
44
    0x4e4ef7f6, 0xe5174327, 0x1ec2f9c1, 0xb59b4d10, 0x48719063, 0xe32824b2,
45
    0x1483517e, 0xbfdae5af, 0x423038dc, 0xe9698c0d, 0x12bc36eb, 0xb9e5823a,
46
    0x440f5f49, 0xef56eb98, 0x31fb3ca8, 0x9aa28879, 0x6748550a, 0xcc11e1db,
47
    0x37c45b3d, 0x9c9defec, 0x6177329f, 0xca2e864e, 0x3d85f382, 0x96dc4753,
48
    0x6b369a20, 0xc06f2ef1, 0x3bba9417, 0x90e320c6, 0x6d09fdb5, 0xc6504964,
49
    0x2906a2fc, 0x825f162d, 0x7fb5cb5e, 0xd4ec7f8f, 0x2f39c569, 0x846071b8,
50
    0x798aaccb, 0xd2d3181a, 0x25786dd6, 0x8e21d907, 0x73cb0474, 0xd892b0a5,
51
    0x23470a43, 0x881ebe92, 0x75f463e1, 0xdeadd730, 0x63f67950, 0xc8afcd81,
52
    0x354510f2, 0x9e1ca423, 0x65c91ec5, 0xce90aa14, 0x337a7767, 0x9823c3b6,
53
    0x6f88b67a, 0xc4d102ab, 0x393bdfd8, 0x92626b09, 0x69b7d1ef, 0xc2ee653e,
54
    0x3f04b84d, 0x945d0c9c, 0x7b0be704, 0xd05253d5, 0x2db88ea6, 0x86e13a77,
55
    0x7d348091, 0xd66d3440, 0x2b87e933, 0x80de5de2, 0x7775282e, 0xdc2c9cff,
56
    0x21c6418c, 0x8a9ff55d, 0x714a4fbb, 0xda13fb6a, 0x27f92619, 0x8ca092c8,
57
    0x520d45f8, 0xf954f129, 0x04be2c5a, 0xafe7988b, 0x5432226d, 0xff6b96bc,
58
    0x02814bcf, 0xa9d8ff1e, 0x5e738ad2, 0xf52a3e03, 0x08c0e370, 0xa39957a1,
59
    0x584ced47, 0xf3155996, 0x0eff84e5, 0xa5a63034, 0x4af0dbac, 0xe1a96f7d,
60
    0x1c43b20e, 0xb71a06df, 0x4ccfbc39, 0xe79608e8, 0x1a7cd59b, 0xb125614a,
61
    0x468e1486, 0xedd7a057, 0x103d7d24, 0xbb64c9f5, 0x40b17313, 0xebe8c7c2,
62
    0x16021ab1, 0xbd5bae60, 0x6cb54671, 0xc7ecf2a0, 0x3a062fd3, 0x915f9b02,
63
    0x6a8a21e4, 0xc1d39535, 0x3c394846, 0x9760fc97, 0x60cb895b, 0xcb923d8a,
64
    0x3678e0f9, 0x9d215428, 0x66f4eece, 0xcdad5a1f, 0x3047876c, 0x9b1e33bd,
65
    0x7448d825, 0xdf116cf4, 0x22fbb187, 0x89a20556, 0x7277bfb0, 0xd92e0b61,
66
    0x24c4d612, 0x8f9d62c3, 0x7836170f, 0xd36fa3de, 0x2e857ead, 0x85dcca7c,
67
    0x7e09709a, 0xd550c44b, 0x28ba1938, 0x83e3ade9, 0x5d4e7ad9, 0xf617ce08,
68
    0x0bfd137b, 0xa0a4a7aa, 0x5b711d4c, 0xf028a99d, 0x0dc274ee, 0xa69bc03f,
69
    0x5130b5f3, 0xfa690122, 0x0783dc51, 0xacda6880, 0x570fd266, 0xfc5666b7,
70
    0x01bcbbc4, 0xaae50f15, 0x45b3e48d, 0xeeea505c, 0x13008d2f, 0xb85939fe,
71
    0x438c8318, 0xe8d537c9, 0x153feaba, 0xbe665e6b, 0x49cd2ba7, 0xe2949f76,
72
    0x1f7e4205, 0xb427f6d4, 0x4ff24c32, 0xe4abf8e3, 0x19412590, 0xb2189141,
73
    0x0f433f21, 0xa41a8bf0, 0x59f05683, 0xf2a9e252, 0x097c58b4, 0xa225ec65,
74
    0x5fcf3116, 0xf49685c7, 0x033df00b, 0xa86444da, 0x558e99a9, 0xfed72d78,
75
    0x0502979e, 0xae5b234f, 0x53b1fe3c, 0xf8e84aed, 0x17bea175, 0xbce715a4,
76
    0x410dc8d7, 0xea547c06, 0x1181c6e0, 0xbad87231, 0x4732af42, 0xec6b1b93,
77
    0x1bc06e5f, 0xb099da8e, 0x4d7307fd, 0xe62ab32c, 0x1dff09ca, 0xb6a6bd1b,
78
    0x4b4c6068, 0xe015d4b9, 0x3eb80389, 0x95e1b758, 0x680b6a2b, 0xc352defa,
79
    0x3887641c, 0x93ded0cd, 0x6e340dbe, 0xc56db96f, 0x32c6cca3, 0x999f7872,
80
    0x6475a501, 0xcf2c11d0, 0x34f9ab36, 0x9fa01fe7, 0x624ac294, 0xc9137645,
81
    0x26459ddd, 0x8d1c290c, 0x70f6f47f, 0xdbaf40ae, 0x207afa48, 0x8b234e99,
82
    0x76c993ea, 0xdd90273b, 0x2a3b52f7, 0x8162e626, 0x7c883b55, 0xd7d18f84,
83
    0x2c043562, 0x875d81b3, 0x7ab75cc0, 0xd1eee811
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
84
};
85
86
static const unsigned int U[256] = {
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
87
    0x00000000, 0x7eb5200d, 0x5633f4cb, 0x2886d4c6, 0x073e5d47, 0x798b7d4a,
88
    0x510da98c, 0x2fb88981, 0x0e7cba8e, 0x70c99a83, 0x584f4e45, 0x26fa6e48,
89
    0x0942e7c9, 0x77f7c7c4, 0x5f711302, 0x21c4330f, 0x1cf9751c, 0x624c5511,
90
    0x4aca81d7, 0x347fa1da, 0x1bc7285b, 0x65720856, 0x4df4dc90, 0x3341fc9d,
91
    0x1285cf92, 0x6c30ef9f, 0x44b63b59, 0x3a031b54, 0x15bb92d5, 0x6b0eb2d8,
92
    0x4388661e, 0x3d3d4613, 0x39f2ea38, 0x4747ca35, 0x6fc11ef3, 0x11743efe,
93
    0x3eccb77f, 0x40799772, 0x68ff43b4, 0x164a63b9, 0x378e50b6, 0x493b70bb,
94
    0x61bda47d, 0x1f088470, 0x30b00df1, 0x4e052dfc, 0x6683f93a, 0x1836d937,
95
    0x250b9f24, 0x5bbebf29, 0x73386bef, 0x0d8d4be2, 0x2235c263, 0x5c80e26e,
96
    0x740636a8, 0x0ab316a5, 0x2b7725aa, 0x55c205a7, 0x7d44d161, 0x03f1f16c,
97
    0x2c4978ed, 0x52fc58e0, 0x7a7a8c26, 0x04cfac2b, 0x73e5d470, 0x0d50f47d,
98
    0x25d620bb, 0x5b6300b6, 0x74db8937, 0x0a6ea93a, 0x22e87dfc, 0x5c5d5df1,
99
    0x7d996efe, 0x032c4ef3, 0x2baa9a35, 0x551fba38, 0x7aa733b9, 0x041213b4,
100
    0x2c94c772, 0x5221e77f, 0x6f1ca16c, 0x11a98161, 0x392f55a7, 0x479a75aa,
101
    0x6822fc2b, 0x1697dc26, 0x3e1108e0, 0x40a428ed, 0x61601be2, 0x1fd53bef,
102
    0x3753ef29, 0x49e6cf24, 0x665e46a5, 0x18eb66a8, 0x306db26e, 0x4ed89263,
103
    0x4a173e48, 0x34a21e45, 0x1c24ca83, 0x6291ea8e, 0x4d29630f, 0x339c4302,
104
    0x1b1a97c4, 0x65afb7c9, 0x446b84c6, 0x3adea4cb, 0x1258700d, 0x6ced5000,
105
    0x4355d981, 0x3de0f98c, 0x15662d4a, 0x6bd30d47, 0x56ee4b54, 0x285b6b59,
106
    0x00ddbf9f, 0x7e689f92, 0x51d01613, 0x2f65361e, 0x07e3e2d8, 0x7956c2d5,
107
    0x5892f1da, 0x2627d1d7, 0x0ea10511, 0x7014251c, 0x5facac9d, 0x21198c90,
108
    0x099f5856, 0x772a785b, 0x4c921c31, 0x32273c3c, 0x1aa1e8fa, 0x6414c8f7,
109
    0x4bac4176, 0x3519617b, 0x1d9fb5bd, 0x632a95b0, 0x42eea6bf, 0x3c5b86b2,
110
    0x14dd5274, 0x6a687279, 0x45d0fbf8, 0x3b65dbf5, 0x13e30f33, 0x6d562f3e,
111
    0x506b692d, 0x2ede4920, 0x06589de6, 0x78edbdeb, 0x5755346a, 0x29e01467,
112
    0x0166c0a1, 0x7fd3e0ac, 0x5e17d3a3, 0x20a2f3ae, 0x08242768, 0x76910765,
113
    0x59298ee4, 0x279caee9, 0x0f1a7a2f, 0x71af5a22, 0x7560f609, 0x0bd5d604,
114
    0x235302c2, 0x5de622cf, 0x725eab4e, 0x0ceb8b43, 0x246d5f85, 0x5ad87f88,
115
    0x7b1c4c87, 0x05a96c8a, 0x2d2fb84c, 0x539a9841, 0x7c2211c0, 0x029731cd,
116
    0x2a11e50b, 0x54a4c506, 0x69998315, 0x172ca318, 0x3faa77de, 0x411f57d3,
117
    0x6ea7de52, 0x1012fe5f, 0x38942a99, 0x46210a94, 0x67e5399b, 0x19501996,
118
    0x31d6cd50, 0x4f63ed5d, 0x60db64dc, 0x1e6e44d1, 0x36e89017, 0x485db01a,
119
    0x3f77c841, 0x41c2e84c, 0x69443c8a, 0x17f11c87, 0x38499506, 0x46fcb50b,
120
    0x6e7a61cd, 0x10cf41c0, 0x310b72cf, 0x4fbe52c2, 0x67388604, 0x198da609,
121
    0x36352f88, 0x48800f85, 0x6006db43, 0x1eb3fb4e, 0x238ebd5d, 0x5d3b9d50,
122
    0x75bd4996, 0x0b08699b, 0x24b0e01a, 0x5a05c017, 0x728314d1, 0x0c3634dc,
123
    0x2df207d3, 0x534727de, 0x7bc1f318, 0x0574d315, 0x2acc5a94, 0x54797a99,
124
    0x7cffae5f, 0x024a8e52, 0x06852279, 0x78300274, 0x50b6d6b2, 0x2e03f6bf,
125
    0x01bb7f3e, 0x7f0e5f33, 0x57888bf5, 0x293dabf8, 0x08f998f7, 0x764cb8fa,
126
    0x5eca6c3c, 0x207f4c31, 0x0fc7c5b0, 0x7172e5bd, 0x59f4317b, 0x27411176,
127
    0x1a7c5765, 0x64c97768, 0x4c4fa3ae, 0x32fa83a3, 0x1d420a22, 0x63f72a2f,
128
    0x4b71fee9, 0x35c4dee4, 0x1400edeb, 0x6ab5cde6, 0x42331920, 0x3c86392d,
129
    0x133eb0ac, 0x6d8b90a1, 0x450d4467, 0x3bb8646a
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
130
};
131
132
struct index_entry {
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
133
    const unsigned char *ptr;
134
    const struct source_info *src;
135
    unsigned int val;
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
136
};
137
3735.33.9 by John Arbash Meinel
When expanding an index put the entries into a hash.
138
struct index_entry_linked_list {
139
    struct index_entry *p_entry;
140
    struct index_entry_linked_list *next;
141
};
142
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
143
struct unpacked_index_entry {
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
144
    struct index_entry entry;
145
    struct unpacked_index_entry *next;
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
146
};
147
148
struct delta_index {
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
149
    unsigned long memsize; /* Total bytes pointed to by this index */
150
    const struct source_info *last_src; /* Information about the referenced source */
151
    unsigned int hash_mask; /* val & hash_mask gives the hash index for a given
152
                               entry */
153
    unsigned int num_entries; /* The total number of entries in this index */
154
    struct index_entry *last_entry; /* Pointer to the last valid entry */
155
    struct index_entry *hash[];
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
156
};
157
0.23.32 by John Arbash Meinel
Refactor the code a bit, so that I can re-use bits for a create_delta_index_from_delta.
158
static unsigned int
159
limit_hash_buckets(struct unpacked_index_entry **hash,
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
160
                   unsigned int *hash_count, unsigned int hsize,
161
                   unsigned int entries)
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
162
{
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
163
    struct unpacked_index_entry *entry;
164
    unsigned int i;
165
    /*
166
     * Determine a limit on the number of entries in the same hash
167
     * bucket.  This guards us against pathological data sets causing
168
     * really bad hash distribution with most entries in the same hash
169
     * bucket that would bring us to O(m*n) computing costs (m and n
170
     * corresponding to reference and target buffer sizes).
171
     *
172
     * Make sure none of the hash buckets has more entries than
173
     * we're willing to test.  Otherwise we cull the entry list
174
     * uniformly to still preserve a good repartition across
175
     * the reference buffer.
176
     */
177
    for (i = 0; i < hsize; i++) {
178
        int acc;
179
180
        if (hash_count[i] <= HASH_LIMIT)
181
            continue;
182
183
        /* We leave exactly HASH_LIMIT entries in the bucket */
184
        entries -= hash_count[i] - HASH_LIMIT;
185
186
        entry = hash[i];
187
        acc = 0;
188
189
        /*
190
         * Assume that this loop is gone through exactly
191
         * HASH_LIMIT times and is entered and left with
192
         * acc==0.  So the first statement in the loop
193
         * contributes (hash_count[i]-HASH_LIMIT)*HASH_LIMIT
194
         * to the accumulator, and the inner loop consequently
195
         * is run (hash_count[i]-HASH_LIMIT) times, removing
196
         * one element from the list each time.  Since acc
197
         * balances out to 0 at the final run, the inner loop
198
         * body can't be left with entry==NULL.  So we indeed
199
         * encounter entry==NULL in the outer loop only.
200
         */
201
        do {
202
            acc += hash_count[i] - HASH_LIMIT;
203
            if (acc > 0) {
204
                struct unpacked_index_entry *keep = entry;
205
                do {
206
                    entry = entry->next;
207
                    acc -= HASH_LIMIT;
208
                } while (acc > 0);
209
                keep->next = entry->next;
210
            }
211
            entry = entry->next;
212
        } while (entry);
213
    }
214
    return entries;
0.23.32 by John Arbash Meinel
Refactor the code a bit, so that I can re-use bits for a create_delta_index_from_delta.
215
}
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
216
0.23.32 by John Arbash Meinel
Refactor the code a bit, so that I can re-use bits for a create_delta_index_from_delta.
217
static struct delta_index *
218
pack_delta_index(struct unpacked_index_entry **hash, unsigned int hsize,
3735.33.12 by John Arbash Meinel
Now we are able to weave 'add_source' into the existing index.
219
                 unsigned int num_entries, struct delta_index *old_index)
0.23.32 by John Arbash Meinel
Refactor the code a bit, so that I can re-use bits for a create_delta_index_from_delta.
220
{
3735.33.14 by John Arbash Meinel
Simplify the code a bit. We don't repack as often, so go with a
221
    unsigned int i, j, hmask, memsize, fit_in_old, copied_count;
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
222
    struct unpacked_index_entry *entry;
223
    struct delta_index *index;
3735.33.12 by John Arbash Meinel
Now we are able to weave 'add_source' into the existing index.
224
    struct index_entry *packed_entry, **packed_hash, *old_entry, *copy_from;
3735.33.1 by John Arbash Meinel
(broken, in progress), change the data structures to allow for inserting small deltas.
225
    struct index_entry null_entry = {0};
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
226
    void *mem;
3735.33.12 by John Arbash Meinel
Now we are able to weave 'add_source' into the existing index.
227
228
    hmask = hsize - 1;
229
3735.33.15 by John Arbash Meinel
Remove the noisy debugging code. (down to 23.1s)
230
    // if (old_index) {
231
    //     fprintf(stderr, "Packing %d entries into %d for total of %d entries"
232
    //                     " %x => %x\n",
233
    //                     num_entries - old_index->num_entries,
234
    //                     old_index->num_entries, num_entries,
235
    //                     old_index->hash_mask, hmask);
236
    // } else {
237
    //     fprintf(stderr, "Packing %d entries into a new index\n",
238
    //                     num_entries);
239
    // }
3735.33.12 by John Arbash Meinel
Now we are able to weave 'add_source' into the existing index.
240
    /* First, see if we can squeeze the new items into the existing structure.
241
     */
242
    fit_in_old = 0;
243
    copied_count = 0;
244
    if (old_index && old_index->hash_mask == hmask) {
245
        fit_in_old = 1;
246
        for (i = 0; i < hsize; ++i) {
247
            packed_entry = NULL;
248
            for (entry = hash[i]; entry; entry = entry->next) {
249
                if (packed_entry == NULL) {
250
                    /* Find the last open spot */
251
                    packed_entry = old_index->hash[i + 1];
252
                    --packed_entry;
253
                    while (packed_entry >= old_index->hash[i]
254
                           && packed_entry->ptr == NULL) {
255
                        --packed_entry;
256
                    }
257
                    ++packed_entry;
258
                }
259
                if (packed_entry >= old_index->hash[i+1]
260
                    || packed_entry->ptr != NULL) {
261
                    /* There are no free spots here :( */
262
                    fit_in_old = 0;
263
                    break;
264
                }
265
                /* We found an empty spot to put this entry
266
                 * Copy it over, and remove it from the linked list, just in
267
                 * case we end up running out of room later.
268
                 */
269
                *packed_entry++ = entry->entry;
270
                assert(entry == hash[i]);
271
                hash[i] = entry->next;
272
                copied_count += 1;
273
                old_index->num_entries++;
274
            }
275
            if (!fit_in_old) {
276
                break;
277
            }
278
        }
279
    }
280
    if (old_index) {
281
        if (fit_in_old) {
3735.33.15 by John Arbash Meinel
Remove the noisy debugging code. (down to 23.1s)
282
            // fprintf(stderr, "Fit all %d entries into old index\n",
283
            //                 copied_count);
5698.2.2 by Martin
Change create_delta_index signature so callers can treat NULL returns as MemoryError
284
            /*
285
             * No need to allocate a new buffer, but return old_index ptr so
286
             * callers can distinguish this from an OOM failure.
287
             */
288
            return old_index;
3735.33.12 by John Arbash Meinel
Now we are able to weave 'add_source' into the existing index.
289
        } else {
3735.33.15 by John Arbash Meinel
Remove the noisy debugging code. (down to 23.1s)
290
            // fprintf(stderr, "Fit only %d entries into old index,"
291
            //                 " reallocating\n", copied_count);
3735.33.12 by John Arbash Meinel
Now we are able to weave 'add_source' into the existing index.
292
        }
293
    }
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
294
    /*
295
     * Now create the packed index in array form
296
     * rather than linked lists.
3735.33.1 by John Arbash Meinel
(broken, in progress), change the data structures to allow for inserting small deltas.
297
     * Leave a 2-entry gap for inserting more entries between the groups
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
298
     */
299
    memsize = sizeof(*index)
300
        + sizeof(*packed_hash) * (hsize+1)
3735.33.3 by John Arbash Meinel
include_entries_from_hash wasn't properly skipping NULL records.
301
        + sizeof(*packed_entry) * (num_entries + hsize * EXTRA_NULLS);
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
302
    mem = malloc(memsize);
303
    if (!mem) {
304
        return NULL;
305
    }
306
307
    index = mem;
308
    index->memsize = memsize;
3735.33.11 by John Arbash Meinel
Shave 5s->3.3s in add_source by copying old entries across directly.
309
    index->hash_mask = hmask;
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
310
    index->num_entries = num_entries;
3735.33.11 by John Arbash Meinel
Shave 5s->3.3s in add_source by copying old entries across directly.
311
    if (old_index) {
3735.33.16 by John Arbash Meinel
Handle when our current packing is sub-optimal.
312
        if (hmask < old_index->hash_mask) {
313
            fprintf(stderr, "hash mask was shrunk %x => %x\n",
314
                            old_index->hash_mask, hmask);
315
        }
3735.33.11 by John Arbash Meinel
Shave 5s->3.3s in add_source by copying old entries across directly.
316
        assert(hmask >= old_index->hash_mask);
317
    }
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
318
319
    mem = index->hash;
320
    packed_hash = mem;
321
    mem = packed_hash + (hsize+1);
322
    packed_entry = mem;
323
324
    for (i = 0; i < hsize; i++) {
325
        /*
326
         * Coalesce all entries belonging to one linked list
327
         * into consecutive array entries.
328
         */
329
        packed_hash[i] = packed_entry;
3735.33.11 by John Arbash Meinel
Shave 5s->3.3s in add_source by copying old entries across directly.
330
        /* Old comes earlier as a source, so it always comes first in a given
331
         * hash bucket.
332
         */
333
        if (old_index) {
334
            /* Could we optimize this to use memcpy when hmask ==
335
             * old_index->hash_mask? Would it make any real difference?
336
             */
337
            j = i & old_index->hash_mask;
3735.33.12 by John Arbash Meinel
Now we are able to weave 'add_source' into the existing index.
338
            copy_from = old_index->hash[j];
3735.33.11 by John Arbash Meinel
Shave 5s->3.3s in add_source by copying old entries across directly.
339
            for (old_entry = old_index->hash[j];
340
                 old_entry < old_index->hash[j + 1] && old_entry->ptr != NULL;
341
                 old_entry++) {
342
                if ((old_entry->val & hmask) == i) {
3735.33.14 by John Arbash Meinel
Simplify the code a bit. We don't repack as often, so go with a
343
                    *packed_entry++ = *old_entry;
3735.33.11 by John Arbash Meinel
Shave 5s->3.3s in add_source by copying old entries across directly.
344
                }
345
            }
346
        }
3735.33.1 by John Arbash Meinel
(broken, in progress), change the data structures to allow for inserting small deltas.
347
        for (entry = hash[i]; entry; entry = entry->next) {
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
348
            *packed_entry++ = entry->entry;
3735.33.1 by John Arbash Meinel
(broken, in progress), change the data structures to allow for inserting small deltas.
349
        }
3735.33.14 by John Arbash Meinel
Simplify the code a bit. We don't repack as often, so go with a
350
        /* TODO: At this point packed_entry - packed_hash[i] is the number of
351
         *       records that we have inserted into this hash bucket.
352
         *       We should *really* consider doing some limiting along the
353
         *       lines of limit_hash_buckets() to avoid pathological behavior.
354
         */
3735.33.3 by John Arbash Meinel
include_entries_from_hash wasn't properly skipping NULL records.
355
        /* Now add extra 'NULL' entries that we can use for future expansion. */
356
        for (j = 0; j < EXTRA_NULLS; ++j ) {
357
            *packed_entry++ = null_entry;
358
        }
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
359
    }
360
361
    /* Sentinel value to indicate the length of the last hash bucket */
362
    packed_hash[hsize] = packed_entry;
363
3735.33.4 by John Arbash Meinel
The new layout is working.
364
    if (packed_entry - (struct index_entry *)mem
365
        != num_entries + hsize*EXTRA_NULLS) {
3735.33.5 by John Arbash Meinel
Restoring the debugging for now.
366
        fprintf(stderr, "We expected %d entries, but created %d\n",
367
                num_entries + hsize*EXTRA_NULLS,
368
                (int)(packed_entry - (struct index_entry*)mem));
3735.33.4 by John Arbash Meinel
The new layout is working.
369
    }
3735.33.3 by John Arbash Meinel
include_entries_from_hash wasn't properly skipping NULL records.
370
    assert(packed_entry - (struct index_entry *)mem
371
            == num_entries + hsize*EXTRA_NULLS);
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
372
    index->last_entry = (packed_entry - 1);
373
    return index;
0.23.32 by John Arbash Meinel
Refactor the code a bit, so that I can re-use bits for a create_delta_index_from_delta.
374
}
375
3735.33.4 by John Arbash Meinel
The new layout is working.
376
5698.2.5 by Martin
Switch approach to delta function interfaces and use a return code and outparam
377
delta_result
3735.33.9 by John Arbash Meinel
When expanding an index put the entries into a hash.
378
create_delta_index(const struct source_info *src,
5698.2.5 by Martin
Switch approach to delta function interfaces and use a return code and outparam
379
                   struct delta_index *old,
380
                   struct delta_index **fresh)
0.23.43 by John Arbash Meinel
Change the internals to allow delta indexes to be expanded with new source data.
381
{
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
382
    unsigned int i, hsize, hmask, num_entries, prev_val, *hash_count;
5755.2.1 by John Arbash Meinel
Set a maximum number of delta entries for a fulltext.
383
    unsigned int total_num_entries, stride;
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
384
    const unsigned char *data, *buffer;
385
    struct delta_index *index;
386
    struct unpacked_index_entry *entry, **hash;
387
    void *mem;
388
    unsigned long memsize;
389
390
    if (!src->buf || !src->size)
5698.2.5 by Martin
Switch approach to delta function interfaces and use a return code and outparam
391
        return DELTA_SOURCE_EMPTY;
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
392
    buffer = src->buf;
393
394
    /* Determine index hash size.  Note that indexing skips the
395
       first byte to allow for optimizing the Rabin's polynomial
396
       initialization in create_delta(). */
5755.2.1 by John Arbash Meinel
Set a maximum number of delta entries for a fulltext.
397
    stride = RABIN_WINDOW;
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
398
    num_entries = (src->size - 1)  / RABIN_WINDOW;
5755.2.1 by John Arbash Meinel
Set a maximum number of delta entries for a fulltext.
399
    if (num_entries > MAX_NUM_ENTRIES) {
400
        /* Limit the max number of matching entries. This reduces the 'best'
401
         * possible match, but means we don't consume all of ram.
402
         */
403
        // fprintf(stderr, "limiting num_entries to %d\n", MAX_NUM_ENTRIES);
404
        num_entries = MAX_NUM_ENTRIES;
405
        stride = (src->size) / num_entries;
406
    }
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
407
    if (old != NULL)
408
        total_num_entries = num_entries + old->num_entries;
409
    else
410
        total_num_entries = num_entries;
3735.33.8 by John Arbash Meinel
Reverted back to the same hash width, and bumped EXTRA_NULLS to 3.
411
    hsize = total_num_entries / 4;
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
412
    for (i = 4; (1u << i) < hsize && i < 31; i++);
413
    hsize = 1 << i;
414
    hmask = hsize - 1;
3735.33.18 by John Arbash Meinel
*grow* the local hmask if it is smaller than expected, don't *shrink* it.
415
    if (old && old->hash_mask > hmask) {
3735.33.16 by John Arbash Meinel
Handle when our current packing is sub-optimal.
416
        hmask = old->hash_mask;
417
        hsize = hmask + 1;
418
    }
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
419
420
    /* allocate lookup index */
421
    memsize = sizeof(*hash) * hsize +
422
          sizeof(*entry) * total_num_entries;
423
    mem = malloc(memsize);
424
    if (!mem)
5698.2.5 by Martin
Switch approach to delta function interfaces and use a return code and outparam
425
        return DELTA_OUT_OF_MEMORY;
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
426
    hash = mem;
427
    mem = hash + hsize;
428
    entry = mem;
429
430
    memset(hash, 0, hsize * sizeof(*hash));
431
432
    /* allocate an array to count hash num_entries */
433
    hash_count = calloc(hsize, sizeof(*hash_count));
434
    if (!hash_count) {
435
        free(hash);
5698.2.5 by Martin
Switch approach to delta function interfaces and use a return code and outparam
436
        return DELTA_OUT_OF_MEMORY;
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
437
    }
438
439
    /* then populate the index for the new data */
440
    prev_val = ~0;
5755.2.1 by John Arbash Meinel
Set a maximum number of delta entries for a fulltext.
441
    for (data = buffer + num_entries * stride - RABIN_WINDOW;
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
442
         data >= buffer;
5755.2.1 by John Arbash Meinel
Set a maximum number of delta entries for a fulltext.
443
         data -= stride) {
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
444
        unsigned int val = 0;
445
        for (i = 1; i <= RABIN_WINDOW; i++)
446
            val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];
447
        if (val == prev_val) {
448
            /* keep the lowest of consecutive identical blocks */
449
            entry[-1].entry.ptr = data + RABIN_WINDOW;
450
            --num_entries;
451
            --total_num_entries;
452
        } else {
453
            prev_val = val;
454
            i = val & hmask;
455
            entry->entry.ptr = data + RABIN_WINDOW;
456
            entry->entry.val = val;
457
            entry->entry.src = src;
458
            entry->next = hash[i];
459
            hash[i] = entry++;
460
            hash_count[i]++;
461
        }
462
    }
3735.33.11 by John Arbash Meinel
Shave 5s->3.3s in add_source by copying old entries across directly.
463
    /* TODO: It would be nice to limit_hash_buckets at a better time. */
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
464
    total_num_entries = limit_hash_buckets(hash, hash_count, hsize,
465
                                           total_num_entries);
466
    free(hash_count);
3735.33.11 by John Arbash Meinel
Shave 5s->3.3s in add_source by copying old entries across directly.
467
    index = pack_delta_index(hash, hsize, total_num_entries, old);
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
468
    free(hash);
5698.2.5 by Martin
Switch approach to delta function interfaces and use a return code and outparam
469
    /* pack_delta_index only returns NULL on malloc failure */
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
470
    if (!index) {
5698.2.5 by Martin
Switch approach to delta function interfaces and use a return code and outparam
471
        return DELTA_OUT_OF_MEMORY;
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
472
    }
3735.33.12 by John Arbash Meinel
Now we are able to weave 'add_source' into the existing index.
473
    index->last_src = src;
5698.2.5 by Martin
Switch approach to delta function interfaces and use a return code and outparam
474
    *fresh = index;
475
    return DELTA_OK;
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
476
}
477
3735.33.9 by John Arbash Meinel
When expanding an index put the entries into a hash.
478
/* Take some entries, and put them into a custom hash.
479
 * @param entries   A list of entries, sorted by position in file
480
 * @param num_entries   Length of entries
481
 * @param out_hsize     The maximum size of the hash, the final size will be
482
 *                      returned here
483
 */
484
struct index_entry_linked_list **
485
_put_entries_into_hash(struct index_entry *entries, unsigned int num_entries,
3735.33.10 by John Arbash Meinel
Fix a bug when there is more than one entry (increment out_entry).
486
                       unsigned int hsize)
3735.33.9 by John Arbash Meinel
When expanding an index put the entries into a hash.
487
{
3735.33.11 by John Arbash Meinel
Shave 5s->3.3s in add_source by copying old entries across directly.
488
    unsigned int hash_offset, hmask, memsize;
3735.33.9 by John Arbash Meinel
When expanding an index put the entries into a hash.
489
    struct index_entry *entry, *last_entry;
490
    struct index_entry_linked_list *out_entry, **hash;
491
    void *mem;
492
493
    hmask = hsize - 1;
494
495
    memsize = sizeof(*hash) * hsize +
496
          sizeof(*out_entry) * num_entries;
497
    mem = malloc(memsize);
498
    if (!mem)
499
        return NULL;
500
    hash = mem;
501
    mem = hash + hsize;
502
    out_entry = mem;
503
504
    memset(hash, 0, sizeof(*hash)*(hsize+1));
505
506
    /* We know that entries are in the order we want in the output, but they
507
     * aren't "grouped" by hash bucket yet.
508
     */
509
    last_entry = entries + num_entries;
510
    for (entry = entries + num_entries - 1; entry >= entries; --entry) {
3735.33.10 by John Arbash Meinel
Fix a bug when there is more than one entry (increment out_entry).
511
        hash_offset = entry->val & hmask;
3735.33.9 by John Arbash Meinel
When expanding an index put the entries into a hash.
512
        out_entry->p_entry = entry;
3735.33.10 by John Arbash Meinel
Fix a bug when there is more than one entry (increment out_entry).
513
        out_entry->next = hash[hash_offset];
3735.33.9 by John Arbash Meinel
When expanding an index put the entries into a hash.
514
        /* TODO: Remove entries that have identical vals, or at least filter
515
         *       the map a little bit.
516
         * if (hash[i] != NULL) {
517
         * }
518
         */
3735.33.10 by John Arbash Meinel
Fix a bug when there is more than one entry (increment out_entry).
519
        hash[hash_offset] = out_entry;
520
        ++out_entry;
3735.33.9 by John Arbash Meinel
When expanding an index put the entries into a hash.
521
    }
522
    return hash;
523
}
524
3735.33.4 by John Arbash Meinel
The new layout is working.
525
526
struct delta_index *
527
create_index_from_old_and_new_entries(const struct delta_index *old_index,
528
                                      struct index_entry *entries,
529
                                      unsigned int num_entries)
530
{
3735.33.9 by John Arbash Meinel
When expanding an index put the entries into a hash.
531
    unsigned int i, j, hsize, hmask, total_num_entries;
3735.33.4 by John Arbash Meinel
The new layout is working.
532
    struct delta_index *index;
533
    struct index_entry *entry, *packed_entry, **packed_hash;
534
    struct index_entry *last_entry, null_entry = {0};
535
    void *mem;
536
    unsigned long memsize;
3735.33.9 by John Arbash Meinel
When expanding an index put the entries into a hash.
537
    struct index_entry_linked_list *unpacked_entry, **mini_hash;
3735.33.4 by John Arbash Meinel
The new layout is working.
538
539
    /* Determine index hash size.  Note that indexing skips the
540
       first byte to allow for optimizing the Rabin's polynomial
541
       initialization in create_delta(). */
542
    total_num_entries = num_entries + old_index->num_entries;
3735.33.8 by John Arbash Meinel
Reverted back to the same hash width, and bumped EXTRA_NULLS to 3.
543
    hsize = total_num_entries / 4;
3735.33.4 by John Arbash Meinel
The new layout is working.
544
    for (i = 4; (1u << i) < hsize && i < 31; i++);
545
    hsize = 1 << i;
546
    if (hsize < old_index->hash_mask) {
547
        /* For some reason, there was a code path that would actually *shrink*
548
         * the hash size. This screws with some later code, and in general, I
549
         * think it better to make the hash bigger, rather than smaller. So
550
         * we'll just force the size here.
551
         * Possibly done by create_delta_index running into a
552
         * limit_hash_buckets call, that ended up transitioning across a
553
         * power-of-2. The cause isn't 100% clear, though.
554
         */
555
        hsize = old_index->hash_mask + 1;
556
    }
557
    hmask = hsize - 1;
3735.33.15 by John Arbash Meinel
Remove the noisy debugging code. (down to 23.1s)
558
    // fprintf(stderr, "resizing index to insert %d entries into array"
559
    //                 " with %d entries: %x => %x\n",
560
    //         num_entries, old_index->num_entries, old_index->hash_mask, hmask);
3735.33.4 by John Arbash Meinel
The new layout is working.
561
562
    memsize = sizeof(*index)
563
        + sizeof(*packed_hash) * (hsize+1)
564
        + sizeof(*packed_entry) * (total_num_entries + hsize*EXTRA_NULLS);
565
    mem = malloc(memsize);
566
    if (!mem) {
567
        return NULL;
568
    }
569
    index = mem;
570
    index->memsize = memsize;
571
    index->hash_mask = hmask;
572
    index->num_entries = total_num_entries;
573
    index->last_src = old_index->last_src;
574
575
    mem = index->hash;
576
    packed_hash = mem;
577
    mem = packed_hash + (hsize+1);
578
    packed_entry = mem;
579
3735.33.10 by John Arbash Meinel
Fix a bug when there is more than one entry (increment out_entry).
580
    mini_hash = _put_entries_into_hash(entries, num_entries, hsize);
3735.33.9 by John Arbash Meinel
When expanding an index put the entries into a hash.
581
    if (mini_hash == NULL) {
582
        free(index);
583
        return NULL;
584
    }
3735.33.4 by John Arbash Meinel
The new layout is working.
585
    last_entry = entries + num_entries;
586
    for (i = 0; i < hsize; i++) {
587
        /*
588
         * Coalesce all entries belonging in one hash bucket
589
         * into consecutive array entries.
590
         * The entries in old_index all come before 'entries'.
591
         */
592
        packed_hash[i] = packed_entry;
593
        /* Copy any of the old entries across */
594
        /* Would we rather use memcpy? */
595
        if (hmask == old_index->hash_mask) {
596
            for (entry = old_index->hash[i];
3735.33.6 by John Arbash Meinel
Increasing EXTRA_NULLS to 2 from 1 ups the hit rate
597
                 entry < old_index->hash[i+1] && entry->ptr != NULL;
3735.33.4 by John Arbash Meinel
The new layout is working.
598
                 ++entry) {
599
                assert((entry->val & hmask) == i);
600
                *packed_entry++ = *entry;
601
            }
602
        } else {
603
            /* If we resized the index from this action, all of the old values
604
             * will be found in the previous location, but they will end up
605
             * spread across the new locations.
606
             */
607
            j = i & old_index->hash_mask;
608
            for (entry = old_index->hash[j];
3735.33.6 by John Arbash Meinel
Increasing EXTRA_NULLS to 2 from 1 ups the hit rate
609
                 entry < old_index->hash[j+1] && entry->ptr != NULL;
3735.33.4 by John Arbash Meinel
The new layout is working.
610
                 ++entry) {
611
                assert((entry->val & old_index->hash_mask) == j);
612
                if ((entry->val & hmask) == i) {
613
                    /* Any entries not picked up here will be picked up on the
614
                     * next pass.
615
                     */
616
                    *packed_entry++ = *entry;
617
                }
618
            }
619
        }
620
        /* Now see if we need to insert any of the new entries.
621
         * Note that loop ends up O(hsize*num_entries), so we expect that
622
         * num_entries is always small.
623
         * We also help a little bit by collapsing the entry range when the
624
         * endpoints are inserted. However, an alternative would be to build a
625
         * quick hash lookup for just the new entries.
626
         * Testing shows that this list can easily get up to about 100
627
         * entries, the tradeoff is a malloc, 1 pass over the entries, copying
628
         * them into a sorted buffer, and a free() when done,
629
         */
3735.33.10 by John Arbash Meinel
Fix a bug when there is more than one entry (increment out_entry).
630
        for (unpacked_entry = mini_hash[i];
3735.33.9 by John Arbash Meinel
When expanding an index put the entries into a hash.
631
             unpacked_entry;
632
             unpacked_entry = unpacked_entry->next) {
3735.33.10 by John Arbash Meinel
Fix a bug when there is more than one entry (increment out_entry).
633
            assert((unpacked_entry->p_entry->val & hmask) == i);
634
            *packed_entry++ = *(unpacked_entry->p_entry);
3735.33.4 by John Arbash Meinel
The new layout is working.
635
        }
636
        /* Now insert some extra nulls */
637
        for (j = 0; j < EXTRA_NULLS; ++j) {
638
            *packed_entry++ = null_entry;
639
        }
640
    }
3735.33.9 by John Arbash Meinel
When expanding an index put the entries into a hash.
641
    free(mini_hash);
3735.33.4 by John Arbash Meinel
The new layout is working.
642
643
    /* Sentinel value to indicate the length of the last hash bucket */
644
    packed_hash[hsize] = packed_entry;
645
646
    if ((packed_entry - (struct index_entry *)mem)
647
        != (total_num_entries + hsize*EXTRA_NULLS)) {
648
        fprintf(stderr, "We expected %d entries, but created %d\n",
649
                total_num_entries + hsize*EXTRA_NULLS,
650
                (int)(packed_entry - (struct index_entry*)mem));
651
        fflush(stderr);
652
    }
653
    assert((packed_entry - (struct index_entry *)mem)
654
           == (total_num_entries + hsize * EXTRA_NULLS));
655
    index->last_entry = (packed_entry - 1);
656
    return index;
657
}
658
659
660
void
661
get_text(char buff[128], const unsigned char *ptr)
662
{
663
    unsigned int i;
664
    const unsigned char *start;
665
    unsigned char cmd;
666
    start = (ptr-RABIN_WINDOW-1);
667
    cmd = *(start);
668
    if (cmd < 0x80) {// This is likely to be an insert instruction
669
        if (cmd < RABIN_WINDOW) {
670
            cmd = RABIN_WINDOW;
671
        }
672
    } else {
673
        /* This was either a copy [should never be] or it
674
         * was a longer insert so the insert start happened at 16 more
675
         * bytes back.
676
         */
677
        cmd = RABIN_WINDOW + 1;
678
    }
679
    if (cmd > 60) {
680
        cmd = 60; /* Be friendly to 80char terms */
681
    }
682
    /* Copy the 1 byte command, and 4 bytes after the insert */
683
    cmd += 5;
684
    memcpy(buff, start, cmd);
685
    buff[cmd] = 0;
686
    for (i = 0; i < cmd; ++i) {
687
        if (buff[i] == '\n') {
688
            buff[i] = 'N';
689
        } else if (buff[i] == '\t') {
690
            buff[i] = 'T';
691
        }
692
    }
693
}
694
5698.2.5 by Martin
Switch approach to delta function interfaces and use a return code and outparam
695
delta_result
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
696
create_delta_index_from_delta(const struct source_info *src,
5698.2.5 by Martin
Switch approach to delta function interfaces and use a return code and outparam
697
                              struct delta_index *old_index,
698
                              struct delta_index **fresh)
0.23.45 by John Arbash Meinel
Add a function that updates the index for delta bytes.
699
{
3735.33.4 by John Arbash Meinel
The new layout is working.
700
    unsigned int i, num_entries, max_num_entries, prev_val, num_inserted;
701
    unsigned int hash_offset;
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
702
    const unsigned char *data, *buffer, *top;
703
    unsigned char cmd;
3735.33.4 by John Arbash Meinel
The new layout is working.
704
    struct delta_index *new_index;
4634.106.2 by John Arbash Meinel
Restructure the internals to be a bit more understandable.
705
    struct index_entry *entry, *entries;
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
706
5698.2.5 by Martin
Switch approach to delta function interfaces and use a return code and outparam
707
    if (!old_index)
708
        return DELTA_INDEX_NEEDED;
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
709
    if (!src->buf || !src->size)
5698.2.5 by Martin
Switch approach to delta function interfaces and use a return code and outparam
710
        return DELTA_SOURCE_EMPTY;
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
711
    buffer = src->buf;
712
    top = buffer + src->size;
713
714
    /* Determine index hash size.  Note that indexing skips the
715
       first byte to allow for optimizing the Rabin's polynomial
716
       initialization in create_delta().
717
       This computes the maximum number of entries that could be held. The
718
       actual number will be recomputed during processing.
719
       */
3735.31.2 by John Arbash Meinel
Cleanup trailing whitespace, get test_source to pass by removing asserts.
720
3735.33.4 by John Arbash Meinel
The new layout is working.
721
    max_num_entries = (src->size - 1)  / RABIN_WINDOW;
722
723
    /* allocate an array to hold whatever entries we find */
724
    entries = malloc(sizeof(*entry) * max_num_entries);
725
    if (!entries) /* malloc failure */
5698.2.5 by Martin
Switch approach to delta function interfaces and use a return code and outparam
726
        return DELTA_OUT_OF_MEMORY;
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
727
728
    /* then populate the index for the new data */
729
    prev_val = ~0;
730
    data = buffer;
731
    /* target size */
4582.2.1 by John Arbash Meinel
Use a cast to improve compiler conformance.
732
    /* get_delta_hdr_size doesn't mutate the content, just moves the
733
     * start-of-data pointer, so it is safe to do the cast.
734
     */
735
    get_delta_hdr_size((unsigned char**)&data, top);
3735.33.4 by John Arbash Meinel
The new layout is working.
736
    entry = entries; /* start at the first slot */
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
737
    num_entries = 0; /* calculate the real number of entries */
738
    while (data < top) {
739
        cmd = *data++;
740
        if (cmd & 0x80) {
741
            /* Copy instruction, skip it */
742
            if (cmd & 0x01) data++;
743
            if (cmd & 0x02) data++;
744
            if (cmd & 0x04) data++;
745
            if (cmd & 0x08) data++;
746
            if (cmd & 0x10) data++;
747
            if (cmd & 0x20) data++;
748
            if (cmd & 0x40) data++;
749
        } else if (cmd) {
750
            /* Insert instruction, we want to index these bytes */
751
            if (data + cmd > top) {
752
                /* Invalid insert, not enough bytes in the delta */
753
                break;
754
            }
3735.33.5 by John Arbash Meinel
Restoring the debugging for now.
755
            /* The create_delta code requires a match at least 4 characters
756
             * (including only the last char of the RABIN_WINDOW) before it
757
             * will consider it something worth copying rather than inserting.
758
             * So we don't want to index anything that we know won't ever be a
759
             * match.
760
             */
761
            for (; cmd > RABIN_WINDOW + 3; cmd -= RABIN_WINDOW,
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
762
                                       data += RABIN_WINDOW) {
763
                unsigned int val = 0;
764
                for (i = 1; i <= RABIN_WINDOW; i++)
765
                    val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];
766
                if (val != prev_val) {
767
                    /* Only keep the first of consecutive data */
768
                    prev_val = val;
3735.33.4 by John Arbash Meinel
The new layout is working.
769
                    num_entries++;
770
                    entry->ptr = data + RABIN_WINDOW;
771
                    entry->val = val;
772
                    entry->src = src;
3735.33.2 by John Arbash Meinel
Revert some of the previous work.
773
                    entry++;
3735.33.4 by John Arbash Meinel
The new layout is working.
774
                    if (num_entries > max_num_entries) {
775
                        /* We ran out of entry room, something is really wrong
776
                         */
777
                        break;
778
                    }
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
779
                }
780
            }
781
            /* Move the data pointer by whatever remainder is left */
782
            data += cmd;
783
        } else {
784
            /*
785
             * cmd == 0 is reserved for future encoding
786
             * extensions. In the mean time we must fail when
787
             * encountering them (might be data corruption).
788
             */
789
            break;
790
        }
791
    }
792
    if (data != top) {
5698.2.5 by Martin
Switch approach to delta function interfaces and use a return code and outparam
793
        /* The source_info data passed was corrupted or otherwise invalid */
3735.33.4 by John Arbash Meinel
The new layout is working.
794
        free(entries);
5698.2.5 by Martin
Switch approach to delta function interfaces and use a return code and outparam
795
        return DELTA_SOURCE_BAD;
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
796
    }
797
    if (num_entries == 0) {
798
        /** Nothing to index **/
3735.33.4 by John Arbash Meinel
The new layout is working.
799
        free(entries);
5698.2.5 by Martin
Switch approach to delta function interfaces and use a return code and outparam
800
        *fresh = old_index;
801
        return DELTA_OK;
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
802
    }
3735.33.4 by John Arbash Meinel
The new layout is working.
803
    old_index->last_src = src;
804
    /* See if we can fill in these values into the holes in the array */
805
    entry = entries;
806
    num_inserted = 0;
807
    for (; num_entries > 0; --num_entries, ++entry) {
4634.106.2 by John Arbash Meinel
Restructure the internals to be a bit more understandable.
808
        struct index_entry *next_bucket_entry, *cur_entry, *bucket_first_entry;
3735.33.4 by John Arbash Meinel
The new layout is working.
809
        hash_offset = (entry->val & old_index->hash_mask);
810
        /* The basic structure is a hash => packed_entries that fit in that
811
         * hash bucket. Things are structured such that the hash-pointers are
812
         * strictly ordered. So we start by pointing to the next pointer, and
813
         * walk back until we stop getting NULL targets, and then go back
814
         * forward. If there are no NULL targets, then we know because
815
         * entry->ptr will not be NULL.
816
         */
4634.106.2 by John Arbash Meinel
Restructure the internals to be a bit more understandable.
817
        // The start of the next bucket, this may point past the end of the
818
        // entry table if hash_offset is the last bucket.
819
        next_bucket_entry = old_index->hash[hash_offset + 1];
820
        // First entry in this bucket
821
        bucket_first_entry = old_index->hash[hash_offset];
822
        cur_entry = next_bucket_entry - 1;
823
        while (cur_entry->ptr == NULL && cur_entry >= bucket_first_entry) {
824
            cur_entry--;
3735.33.4 by John Arbash Meinel
The new layout is working.
825
        }
4634.106.2 by John Arbash Meinel
Restructure the internals to be a bit more understandable.
826
        // cur_entry now either points at the first NULL, or it points to
827
        // next_bucket_entry if there were no blank spots.
828
        cur_entry++;
829
        if (cur_entry >= next_bucket_entry || cur_entry->ptr != NULL) {
3735.33.6 by John Arbash Meinel
Increasing EXTRA_NULLS to 2 from 1 ups the hit rate
830
            /* There is no room for this entry, we have to resize */
831
            // char buff[128];
832
            // get_text(buff, entry->ptr);
833
            // fprintf(stderr, "Failed to find an opening @%x for %8x:\n '%s'\n",
834
            //         hash_offset, entry->val, buff);
835
            // for (old_entry = old_index->hash[hash_offset];
836
            //      old_entry < old_index->hash[hash_offset+1];
837
            //      ++old_entry) {
838
            //     get_text(buff, old_entry->ptr);
839
            //     fprintf(stderr, "  [%2d] %8x %8x: '%s'\n",
840
            //             (int)(old_entry - old_index->hash[hash_offset]),
841
            //             old_entry->val, old_entry->ptr, buff);
842
            // }
3735.33.4 by John Arbash Meinel
The new layout is working.
843
            break;
844
        }
845
        num_inserted++;
4634.106.2 by John Arbash Meinel
Restructure the internals to be a bit more understandable.
846
        *cur_entry = *entry;
3735.33.4 by John Arbash Meinel
The new layout is working.
847
        /* For entries which we *do* manage to insert into old_index, we don't
848
         * want them double copied into the final output.
849
         */
850
        old_index->num_entries++;
851
    }
852
    if (num_entries > 0) {
853
        /* We couldn't fit the new entries into the old index, so allocate a
854
         * new one, and fill it with stuff.
855
         */
3735.33.6 by John Arbash Meinel
Increasing EXTRA_NULLS to 2 from 1 ups the hit rate
856
        // fprintf(stderr, "inserted %d before resize\n", num_inserted);
3735.33.4 by John Arbash Meinel
The new layout is working.
857
        new_index = create_index_from_old_and_new_entries(old_index,
858
            entry, num_entries);
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
859
    } else {
5698.2.3 by Martin
Change create_delta_index_from_delta too so NULL can be treated as MemoryError
860
        new_index = old_index;
3735.33.15 by John Arbash Meinel
Remove the noisy debugging code. (down to 23.1s)
861
        // fprintf(stderr, "inserted %d without resizing\n", num_inserted);
3735.33.4 by John Arbash Meinel
The new layout is working.
862
    }
863
    free(entries);
5698.2.5 by Martin
Switch approach to delta function interfaces and use a return code and outparam
864
    /* create_index_from_old_and_new_entries returns NULL on malloc failure */
865
    if (!new_index)
866
        return DELTA_OUT_OF_MEMORY;
867
    *fresh = new_index;
868
    return DELTA_OK;
0.23.45 by John Arbash Meinel
Add a function that updates the index for delta bytes.
869
}
870
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
871
void free_delta_index(struct delta_index *index)
872
{
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
873
    free(index);
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
874
}
875
5361.2.2 by John Arbash Meinel
Fix some signed integer warnings in diff-delta.c
876
unsigned long
877
sizeof_delta_index(struct delta_index *index)
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
878
{
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
879
    if (index)
880
        return index->memsize;
881
    else
882
        return 0;
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
883
}
884
885
/*
886
 * The maximum size for any opcode sequence, including the initial header
887
 * plus Rabin window plus biggest copy.
888
 */
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
889
#define MAX_OP_SIZE (5 + 5 + 1 + RABIN_WINDOW + 7)
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
890
5698.2.6 by Martin
Also adapt create_delta to the return code interface as it uses malloc
891
delta_result
0.23.44 by John Arbash Meinel
Remove the multi-index handling now that we have index combining instead.
892
create_delta(const struct delta_index *index,
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
893
             const void *trg_buf, unsigned long trg_size,
5698.2.6 by Martin
Also adapt create_delta to the return code interface as it uses malloc
894
             unsigned long *delta_size, unsigned long max_size,
895
             void **delta_data)
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
896
{
5361.2.4 by John Arbash Meinel
Use signed ints to avoid the compiler warning.
897
    unsigned int i, outpos, outsize, moff, val;
898
    int msize;
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
899
    const struct source_info *msource;
900
    int inscnt;
901
    const unsigned char *ref_data, *ref_top, *data, *top;
902
    unsigned char *out;
903
    unsigned long source_size;
904
905
    if (!trg_buf || !trg_size)
5698.2.6 by Martin
Also adapt create_delta to the return code interface as it uses malloc
906
        return DELTA_BUFFER_EMPTY;
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
907
    if (index == NULL)
5698.2.6 by Martin
Also adapt create_delta to the return code interface as it uses malloc
908
        return DELTA_INDEX_NEEDED;
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
909
910
    outpos = 0;
911
    outsize = 8192;
912
    if (max_size && outsize >= max_size)
913
        outsize = max_size + MAX_OP_SIZE + 1;
914
    out = malloc(outsize);
915
    if (!out)
5698.2.6 by Martin
Also adapt create_delta to the return code interface as it uses malloc
916
        return DELTA_OUT_OF_MEMORY;
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
917
918
    source_size = index->last_src->size + index->last_src->agg_offset;
919
920
    /* store target buffer size */
921
    i = trg_size;
922
    while (i >= 0x80) {
923
        out[outpos++] = i | 0x80;
924
        i >>= 7;
925
    }
926
    out[outpos++] = i;
927
928
    data = trg_buf;
929
    top = (const unsigned char *) trg_buf + trg_size;
930
931
    /* Start the matching by filling out with a simple 'insert' instruction, of
932
     * the first RABIN_WINDOW bytes of the input.
933
     */
934
    outpos++; /* leave a byte for the insert command */
935
    val = 0;
936
    for (i = 0; i < RABIN_WINDOW && data < top; i++, data++) {
937
        out[outpos++] = *data;
938
        val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
939
    }
940
    /* we are now setup with an insert of 'i' bytes and val contains the RABIN
941
     * hash for those bytes, and data points to the RABIN_WINDOW+1 byte of
942
     * input.
943
     */
944
    inscnt = i;
945
946
    moff = 0;
947
    msize = 0;
948
    msource = NULL;
949
    while (data < top) {
950
        if (msize < 4096) {
951
            /* we don't have a 'worthy enough' match yet, so let's look for
952
             * one.
953
             */
954
            struct index_entry *entry;
955
            /* Shift the window by one byte. */
956
            val ^= U[data[-RABIN_WINDOW]];
957
            val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
958
            i = val & index->hash_mask;
959
            /* TODO: When using multiple indexes like this, the hash tables
960
             *       mapping val => index_entry become less efficient.
961
             *       You end up getting a lot more collisions in the hash,
962
             *       which doesn't actually lead to a entry->val match.
963
             */
3735.33.1 by John Arbash Meinel
(broken, in progress), change the data structures to allow for inserting small deltas.
964
            for (entry = index->hash[i];
965
                 entry < index->hash[i+1] && entry->src != NULL;
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
966
                 entry++) {
967
                const unsigned char *ref;
968
                const unsigned char *src;
5361.2.4 by John Arbash Meinel
Use signed ints to avoid the compiler warning.
969
                int ref_size;
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
970
                if (entry->val != val)
971
                    continue;
972
                ref = entry->ptr;
973
                src = data;
974
                ref_data = entry->src->buf;
975
                ref_top = ref_data + entry->src->size;
976
                ref_size = ref_top - ref;
977
                /* ref_size is the longest possible match that we could make
978
                 * here. If ref_size <= msize, then we know that we cannot
979
                 * match more bytes with this location that we have already
980
                 * matched.
981
                 */
5361.2.4 by John Arbash Meinel
Use signed ints to avoid the compiler warning.
982
                if (ref_size > (top - src))
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
983
                    ref_size = top - src;
984
                if (ref_size <= msize)
985
                    break;
986
                /* See how many bytes actually match at this location. */
987
                while (ref_size-- && *src++ == *ref)
988
                    ref++;
5361.2.4 by John Arbash Meinel
Use signed ints to avoid the compiler warning.
989
                if (msize < (ref - entry->ptr)) {
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
990
                    /* this is our best match so far */
991
                    msize = ref - entry->ptr;
992
                    msource = entry->src;
993
                    moff = entry->ptr - ref_data;
994
                    if (msize >= 4096) /* good enough */
995
                        break;
996
                }
997
            }
998
        }
999
1000
        if (msize < 4) {
1001
            /* The best match right now is less than 4 bytes long. So just add
1002
             * the current byte to the insert instruction. Increment the insert
1003
             * counter, and copy the byte of data into the output buffer.
1004
             */
1005
            if (!inscnt)
1006
                outpos++;
1007
            out[outpos++] = *data++;
1008
            inscnt++;
1009
            if (inscnt == 0x7f) {
1010
                /* We have a max length insert instruction, finalize it in the
1011
                 * output.
1012
                 */
1013
                out[outpos - inscnt - 1] = inscnt;
1014
                inscnt = 0;
1015
            }
1016
            msize = 0;
1017
        } else {
1018
            unsigned int left;
1019
            unsigned char *op;
1020
1021
            if (inscnt) {
1022
                ref_data = msource->buf;
1023
                while (moff && ref_data[moff-1] == data[-1]) {
1024
                    /* we can match one byte back */
1025
                    msize++;
1026
                    moff--;
1027
                    data--;
1028
                    outpos--;
1029
                    if (--inscnt)
1030
                        continue;
1031
                    outpos--;  /* remove count slot */
1032
                    inscnt--;  /* make it -1 */
1033
                    break;
1034
                }
1035
                out[outpos - inscnt - 1] = inscnt;
1036
                inscnt = 0;
1037
            }
1038
1039
            /* A copy op is currently limited to 64KB (pack v2) */
1040
            left = (msize < 0x10000) ? 0 : (msize - 0x10000);
1041
            msize -= left;
1042
1043
            op = out + outpos++;
1044
            i = 0x80;
1045
1046
            /* moff is the offset in the local structure, for encoding, we need
1047
             * to push it into the global offset
1048
             */
1049
            assert(moff < msource->size);
1050
            moff += msource->agg_offset;
1051
            assert(moff + msize <= source_size);
1052
            if (moff & 0x000000ff)
1053
                out[outpos++] = moff >> 0,  i |= 0x01;
1054
            if (moff & 0x0000ff00)
1055
                out[outpos++] = moff >> 8,  i |= 0x02;
1056
            if (moff & 0x00ff0000)
1057
                out[outpos++] = moff >> 16, i |= 0x04;
1058
            if (moff & 0xff000000)
1059
                out[outpos++] = moff >> 24, i |= 0x08;
1060
            /* Put it back into local coordinates, in case we have multiple
1061
             * copies in a row.
1062
             */
1063
            moff -= msource->agg_offset;
1064
1065
            if (msize & 0x00ff)
1066
                out[outpos++] = msize >> 0, i |= 0x10;
1067
            if (msize & 0xff00)
1068
                out[outpos++] = msize >> 8, i |= 0x20;
1069
1070
            *op = i;
1071
1072
            data += msize;
1073
            moff += msize;
1074
            msize = left;
1075
1076
            if (msize < 4096) {
1077
                int j;
1078
                val = 0;
1079
                for (j = -RABIN_WINDOW; j < 0; j++)
1080
                    val = ((val << 8) | data[j])
1081
                          ^ T[val >> RABIN_SHIFT];
1082
            }
1083
        }
1084
1085
        if (outpos >= outsize - MAX_OP_SIZE) {
1086
            void *tmp = out;
1087
            outsize = outsize * 3 / 2;
1088
            if (max_size && outsize >= max_size)
1089
                outsize = max_size + MAX_OP_SIZE + 1;
1090
            if (max_size && outpos > max_size)
1091
                break;
1092
            out = realloc(out, outsize);
1093
            if (!out) {
1094
                free(tmp);
5698.2.6 by Martin
Also adapt create_delta to the return code interface as it uses malloc
1095
                return DELTA_OUT_OF_MEMORY;
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
1096
            }
1097
        }
1098
    }
1099
1100
    if (inscnt)
1101
        out[outpos - inscnt - 1] = inscnt;
1102
1103
    if (max_size && outpos > max_size) {
1104
        free(out);
5698.2.6 by Martin
Also adapt create_delta to the return code interface as it uses malloc
1105
        return DELTA_SIZE_TOO_BIG;
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
1106
    }
1107
1108
    *delta_size = outpos;
5698.2.6 by Martin
Also adapt create_delta to the return code interface as it uses malloc
1109
    *delta_data = out;
1110
    return DELTA_OK;
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
1111
}
0.23.24 by John Arbash Meinel
Change the code so that we can pass in multiple sources to match against.
1112
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
1113
/* vim: et ts=4 sw=4 sts=4
0.23.36 by John Arbash Meinel
Track down a memory leak in the refactored diff-delta.c code.
1114
 */