git

diffcore-delta.c
236 строк · 5.6 Кб
Перенос по словам
1
#include "git-compat-util.h"
2
#include "diffcore.h"
3

4
/*
5
 * Idea here is very simple.
6
 *
7
 * Almost all data we are interested in are text, but sometimes we have
8
 * to deal with binary data.  So we cut them into chunks delimited by
9
 * LF byte, or 64-byte sequence, whichever comes first, and hash them.
10
 *
11
 * For those chunks, if the source buffer has more instances of it
12
 * than the destination buffer, that means the difference are the
13
 * number of bytes not copied from source to destination.  If the
14
 * counts are the same, everything was copied from source to
15
 * destination.  If the destination has more, everything was copied,
16
 * and destination added more.
17
 *
18
 * We are doing an approximation so we do not really have to waste
19
 * memory by actually storing the sequence.  We just hash them into
20
 * somewhere around 2^16 hashbuckets and count the occurrences.
21
 */
22

23
/* Wild guess at the initial hash size */
24
#define INITIAL_HASH_SIZE 9
25

26
/* We leave more room in smaller hash but do not let it
27
 * grow to have unused hole too much.
28
 */
29
#define INITIAL_FREE(sz_log2) ((1<<(sz_log2))*(sz_log2-3)/(sz_log2))
30

31
/* A prime rather carefully chosen between 2^16..2^17, so that
32
 * HASHBASE < INITIAL_FREE(17).  We want to keep the maximum hashtable
33
 * size under the current 2<<17 maximum, which can hold this many
34
 * different values before overflowing to hashtable of size 2<<18.
35
 */
36
#define HASHBASE 107927
37

38
struct spanhash {
39
	unsigned int hashval;
40
	unsigned int cnt;
41
};
42
struct spanhash_top {
43
	int alloc_log2;
44
	int free;
45
	struct spanhash data[FLEX_ARRAY];
46
};
47

48
static struct spanhash_top *spanhash_rehash(struct spanhash_top *orig)
49
{
50
	struct spanhash_top *new_spanhash;
51
	int i;
52
	int osz = 1 << orig->alloc_log2;
53
	int sz = osz << 1;
54

55
	new_spanhash = xmalloc(st_add(sizeof(*orig),
56
			     st_mult(sizeof(struct spanhash), sz)));
57
	new_spanhash->alloc_log2 = orig->alloc_log2 + 1;
58
	new_spanhash->free = INITIAL_FREE(new_spanhash->alloc_log2);
59
	memset(new_spanhash->data, 0, sizeof(struct spanhash) * sz);
60
	for (i = 0; i < osz; i++) {
61
		struct spanhash *o = &(orig->data[i]);
62
		int bucket;
63
		if (!o->cnt)
64
			continue;
65
		bucket = o->hashval & (sz - 1);
66
		while (1) {
67
			struct spanhash *h = &(new_spanhash->data[bucket++]);
68
			if (!h->cnt) {
69
				h->hashval = o->hashval;
70
				h->cnt = o->cnt;
71
				new_spanhash->free--;
72
				break;
73
			}
74
			if (sz <= bucket)
75
				bucket = 0;
76
		}
77
	}
78
	free(orig);
79
	return new_spanhash;
80
}
81

82
static struct spanhash_top *add_spanhash(struct spanhash_top *top,
83
					 unsigned int hashval, int cnt)
84
{
85
	int bucket, lim;
86
	struct spanhash *h;
87

88
	lim = (1 << top->alloc_log2);
89
	bucket = hashval & (lim - 1);
90
	while (1) {
91
		h = &(top->data[bucket++]);
92
		if (!h->cnt) {
93
			h->hashval = hashval;
94
			h->cnt = cnt;
95
			top->free--;
96
			if (top->free < 0)
97
				return spanhash_rehash(top);
98
			return top;
99
		}
100
		if (h->hashval == hashval) {
101
			h->cnt += cnt;
102
			return top;
103
		}
104
		if (lim <= bucket)
105
			bucket = 0;
106
	}
107
}
108

109
static int spanhash_cmp(const void *a_, const void *b_)
110
{
111
	const struct spanhash *a = a_;
112
	const struct spanhash *b = b_;
113

114
	/* A count of zero compares at the end.. */
115
	if (!a->cnt)
116
		return !b->cnt ? 0 : 1;
117
	if (!b->cnt)
118
		return -1;
119
	return a->hashval < b->hashval ? -1 :
120
		a->hashval > b->hashval ? 1 : 0;
121
}
122

123
static struct spanhash_top *hash_chars(struct repository *r,
124
				       struct diff_filespec *one)
125
{
126
	int i, n;
127
	unsigned int accum1, accum2, hashval;
128
	struct spanhash_top *hash;
129
	unsigned char *buf = one->data;
130
	unsigned int sz = one->size;
131
	int is_text = !diff_filespec_is_binary(r, one);
132

133
	i = INITIAL_HASH_SIZE;
134
	hash = xmalloc(st_add(sizeof(*hash),
135
			      st_mult(sizeof(struct spanhash), (size_t)1 << i)));
136
	hash->alloc_log2 = i;
137
	hash->free = INITIAL_FREE(i);
138
	memset(hash->data, 0, sizeof(struct spanhash) * ((size_t)1 << i));
139

140
	n = 0;
141
	accum1 = accum2 = 0;
142
	while (sz) {
143
		unsigned int c = *buf++;
144
		unsigned int old_1 = accum1;
145
		sz--;
146

147
		/* Ignore CR in CRLF sequence if text */
148
		if (is_text && c == '\r' && sz && *buf == '\n')
149
			continue;
150

151
		accum1 = (accum1 << 7) ^ (accum2 >> 25);
152
		accum2 = (accum2 << 7) ^ (old_1 >> 25);
153
		accum1 += c;
154
		if (++n < 64 && c != '\n')
155
			continue;
156
		hashval = (accum1 + accum2 * 0x61) % HASHBASE;
157
		hash = add_spanhash(hash, hashval, n);
158
		n = 0;
159
		accum1 = accum2 = 0;
160
	}
161
	if (n > 0) {
162
		hashval = (accum1 + accum2 * 0x61) % HASHBASE;
163
		hash = add_spanhash(hash, hashval, n);
164
	}
165
	QSORT(hash->data, (size_t)1ul << hash->alloc_log2, spanhash_cmp);
166
	return hash;
167
}
168

169
int diffcore_count_changes(struct repository *r,
170
			   struct diff_filespec *src,
171
			   struct diff_filespec *dst,
172
			   void **src_count_p,
173
			   void **dst_count_p,
174
			   unsigned long *src_copied,
175
			   unsigned long *literal_added)
176
{
177
	struct spanhash *s, *d;
178
	struct spanhash_top *src_count, *dst_count;
179
	unsigned long sc, la;
180

181
	src_count = dst_count = NULL;
182
	if (src_count_p)
183
		src_count = *src_count_p;
184
	if (!src_count) {
185
		src_count = hash_chars(r, src);
186
		if (src_count_p)
187
			*src_count_p = src_count;
188
	}
189
	if (dst_count_p)
190
		dst_count = *dst_count_p;
191
	if (!dst_count) {
192
		dst_count = hash_chars(r, dst);
193
		if (dst_count_p)
194
			*dst_count_p = dst_count;
195
	}
196
	sc = la = 0;
197

198
	s = src_count->data;
199
	d = dst_count->data;
200
	for (;;) {
201
		unsigned dst_cnt, src_cnt;
202
		if (!s->cnt)
203
			break; /* we checked all in src */
204
		while (d->cnt) {
205
			if (d->hashval >= s->hashval)
206
				break;
207
			la += d->cnt;
208
			d++;
209
		}
210
		src_cnt = s->cnt;
211
		dst_cnt = 0;
212
		if (d->cnt && d->hashval == s->hashval) {
213
			dst_cnt = d->cnt;
214
			d++;
215
		}
216
		if (src_cnt < dst_cnt) {
217
			la += dst_cnt - src_cnt;
218
			sc += src_cnt;
219
		}
220
		else
221
			sc += dst_cnt;
222
		s++;
223
	}
224
	while (d->cnt) {
225
		la += d->cnt;
226
		d++;
227
	}
228

229
	if (!src_count_p)
230
		free(src_count);
231
	if (!dst_count_p)
232
		free(dst_count);
233
	*src_copied = sc;
234
	*literal_added = la;
235
	return 0;
236
}
237
git

Использование cookies