git
/
diffcore-delta.c
236 строк · 5.6 Кб
1#include "git-compat-util.h"2#include "diffcore.h"3
4/*
5* Idea here is very simple.
6*
7* Almost all data we are interested in are text, but sometimes we have
8* to deal with binary data. So we cut them into chunks delimited by
9* LF byte, or 64-byte sequence, whichever comes first, and hash them.
10*
11* For those chunks, if the source buffer has more instances of it
12* than the destination buffer, that means the difference are the
13* number of bytes not copied from source to destination. If the
14* counts are the same, everything was copied from source to
15* destination. If the destination has more, everything was copied,
16* and destination added more.
17*
18* We are doing an approximation so we do not really have to waste
19* memory by actually storing the sequence. We just hash them into
20* somewhere around 2^16 hashbuckets and count the occurrences.
21*/
22
23/* Wild guess at the initial hash size */
24#define INITIAL_HASH_SIZE 925
26/* We leave more room in smaller hash but do not let it
27* grow to have unused hole too much.
28*/
29#define INITIAL_FREE(sz_log2) ((1<<(sz_log2))*(sz_log2-3)/(sz_log2))30
31/* A prime rather carefully chosen between 2^16..2^17, so that
32* HASHBASE < INITIAL_FREE(17). We want to keep the maximum hashtable
33* size under the current 2<<17 maximum, which can hold this many
34* different values before overflowing to hashtable of size 2<<18.
35*/
36#define HASHBASE 10792737
38struct spanhash {39unsigned int hashval;40unsigned int cnt;41};42struct spanhash_top {43int alloc_log2;44int free;45struct spanhash data[FLEX_ARRAY];46};47
48static struct spanhash_top *spanhash_rehash(struct spanhash_top *orig)49{
50struct spanhash_top *new_spanhash;51int i;52int osz = 1 << orig->alloc_log2;53int sz = osz << 1;54
55new_spanhash = xmalloc(st_add(sizeof(*orig),56st_mult(sizeof(struct spanhash), sz)));57new_spanhash->alloc_log2 = orig->alloc_log2 + 1;58new_spanhash->free = INITIAL_FREE(new_spanhash->alloc_log2);59memset(new_spanhash->data, 0, sizeof(struct spanhash) * sz);60for (i = 0; i < osz; i++) {61struct spanhash *o = &(orig->data[i]);62int bucket;63if (!o->cnt)64continue;65bucket = o->hashval & (sz - 1);66while (1) {67struct spanhash *h = &(new_spanhash->data[bucket++]);68if (!h->cnt) {69h->hashval = o->hashval;70h->cnt = o->cnt;71new_spanhash->free--;72break;73}74if (sz <= bucket)75bucket = 0;76}77}78free(orig);79return new_spanhash;80}
81
82static struct spanhash_top *add_spanhash(struct spanhash_top *top,83unsigned int hashval, int cnt)84{
85int bucket, lim;86struct spanhash *h;87
88lim = (1 << top->alloc_log2);89bucket = hashval & (lim - 1);90while (1) {91h = &(top->data[bucket++]);92if (!h->cnt) {93h->hashval = hashval;94h->cnt = cnt;95top->free--;96if (top->free < 0)97return spanhash_rehash(top);98return top;99}100if (h->hashval == hashval) {101h->cnt += cnt;102return top;103}104if (lim <= bucket)105bucket = 0;106}107}
108
109static int spanhash_cmp(const void *a_, const void *b_)110{
111const struct spanhash *a = a_;112const struct spanhash *b = b_;113
114/* A count of zero compares at the end.. */115if (!a->cnt)116return !b->cnt ? 0 : 1;117if (!b->cnt)118return -1;119return a->hashval < b->hashval ? -1 :120a->hashval > b->hashval ? 1 : 0;121}
122
123static struct spanhash_top *hash_chars(struct repository *r,124struct diff_filespec *one)125{
126int i, n;127unsigned int accum1, accum2, hashval;128struct spanhash_top *hash;129unsigned char *buf = one->data;130unsigned int sz = one->size;131int is_text = !diff_filespec_is_binary(r, one);132
133i = INITIAL_HASH_SIZE;134hash = xmalloc(st_add(sizeof(*hash),135st_mult(sizeof(struct spanhash), (size_t)1 << i)));136hash->alloc_log2 = i;137hash->free = INITIAL_FREE(i);138memset(hash->data, 0, sizeof(struct spanhash) * ((size_t)1 << i));139
140n = 0;141accum1 = accum2 = 0;142while (sz) {143unsigned int c = *buf++;144unsigned int old_1 = accum1;145sz--;146
147/* Ignore CR in CRLF sequence if text */148if (is_text && c == '\r' && sz && *buf == '\n')149continue;150
151accum1 = (accum1 << 7) ^ (accum2 >> 25);152accum2 = (accum2 << 7) ^ (old_1 >> 25);153accum1 += c;154if (++n < 64 && c != '\n')155continue;156hashval = (accum1 + accum2 * 0x61) % HASHBASE;157hash = add_spanhash(hash, hashval, n);158n = 0;159accum1 = accum2 = 0;160}161if (n > 0) {162hashval = (accum1 + accum2 * 0x61) % HASHBASE;163hash = add_spanhash(hash, hashval, n);164}165QSORT(hash->data, (size_t)1ul << hash->alloc_log2, spanhash_cmp);166return hash;167}
168
169int diffcore_count_changes(struct repository *r,170struct diff_filespec *src,171struct diff_filespec *dst,172void **src_count_p,173void **dst_count_p,174unsigned long *src_copied,175unsigned long *literal_added)176{
177struct spanhash *s, *d;178struct spanhash_top *src_count, *dst_count;179unsigned long sc, la;180
181src_count = dst_count = NULL;182if (src_count_p)183src_count = *src_count_p;184if (!src_count) {185src_count = hash_chars(r, src);186if (src_count_p)187*src_count_p = src_count;188}189if (dst_count_p)190dst_count = *dst_count_p;191if (!dst_count) {192dst_count = hash_chars(r, dst);193if (dst_count_p)194*dst_count_p = dst_count;195}196sc = la = 0;197
198s = src_count->data;199d = dst_count->data;200for (;;) {201unsigned dst_cnt, src_cnt;202if (!s->cnt)203break; /* we checked all in src */204while (d->cnt) {205if (d->hashval >= s->hashval)206break;207la += d->cnt;208d++;209}210src_cnt = s->cnt;211dst_cnt = 0;212if (d->cnt && d->hashval == s->hashval) {213dst_cnt = d->cnt;214d++;215}216if (src_cnt < dst_cnt) {217la += dst_cnt - src_cnt;218sc += src_cnt;219}220else221sc += dst_cnt;222s++;223}224while (d->cnt) {225la += d->cnt;226d++;227}228
229if (!src_count_p)230free(src_count);231if (!dst_count_p)232free(dst_count);233*src_copied = sc;234*literal_added = la;235return 0;236}
237