git
/
pack-revindex.c
583 строки · 15.1 Кб
1#define USE_THE_REPOSITORY_VARIABLE2
3#include "git-compat-util.h"4#include "gettext.h"5#include "pack-revindex.h"6#include "object-file.h"7#include "object-store-ll.h"8#include "packfile.h"9#include "strbuf.h"10#include "trace2.h"11#include "parse.h"12#include "midx.h"13#include "csum-file.h"14
15struct revindex_entry {16off_t offset;17unsigned int nr;18};19
20/*
21* Pack index for existing packs give us easy access to the offsets into
22* corresponding pack file where each object's data starts, but the entries
23* do not store the size of the compressed representation (uncompressed
24* size is easily available by examining the pack entry header). It is
25* also rather expensive to find the sha1 for an object given its offset.
26*
27* The pack index file is sorted by object name mapping to offset;
28* this revindex array is a list of offset/index_nr pairs
29* ordered by offset, so if you know the offset of an object, next offset
30* is where its packed representation ends and the index_nr can be used to
31* get the object sha1 from the main index.
32*/
33
34/*
35* This is a least-significant-digit radix sort.
36*
37* It sorts each of the "n" items in "entries" by its offset field. The "max"
38* parameter must be at least as large as the largest offset in the array,
39* and lets us quit the sort early.
40*/
41static void sort_revindex(struct revindex_entry *entries, unsigned n, off_t max)42{
43/*44* We use a "digit" size of 16 bits. That keeps our memory
45* usage reasonable, and we can generally (for a 4G or smaller
46* packfile) quit after two rounds of radix-sorting.
47*/
48#define DIGIT_SIZE (16)49#define BUCKETS (1 << DIGIT_SIZE)50/*51* We want to know the bucket that a[i] will go into when we are using
52* the digit that is N bits from the (least significant) end.
53*/
54#define BUCKET_FOR(a, i, bits) (((a)[(i)].offset >> (bits)) & (BUCKETS-1))55
56/*57* We need O(n) temporary storage. Rather than do an extra copy of the
58* partial results into "entries", we sort back and forth between the
59* real array and temporary storage. In each iteration of the loop, we
60* keep track of them with alias pointers, always sorting from "from"
61* to "to".
62*/
63struct revindex_entry *tmp, *from, *to;64int bits;65unsigned *pos;66
67ALLOC_ARRAY(pos, BUCKETS);68ALLOC_ARRAY(tmp, n);69from = entries;70to = tmp;71
72/*73* If (max >> bits) is zero, then we know that the radix digit we are
74* on (and any higher) will be zero for all entries, and our loop will
75* be a no-op, as everybody lands in the same zero-th bucket.
76*/
77for (bits = 0; max >> bits; bits += DIGIT_SIZE) {78unsigned i;79
80memset(pos, 0, BUCKETS * sizeof(*pos));81
82/*83* We want pos[i] to store the index of the last element that
84* will go in bucket "i" (actually one past the last element).
85* To do this, we first count the items that will go in each
86* bucket, which gives us a relative offset from the last
87* bucket. We can then cumulatively add the index from the
88* previous bucket to get the true index.
89*/
90for (i = 0; i < n; i++)91pos[BUCKET_FOR(from, i, bits)]++;92for (i = 1; i < BUCKETS; i++)93pos[i] += pos[i-1];94
95/*96* Now we can drop the elements into their correct buckets (in
97* our temporary array). We iterate the pos counter backwards
98* to avoid using an extra index to count up. And since we are
99* going backwards there, we must also go backwards through the
100* array itself, to keep the sort stable.
101*
102* Note that we use an unsigned iterator to make sure we can
103* handle 2^32-1 objects, even on a 32-bit system. But this
104* means we cannot use the more obvious "i >= 0" loop condition
105* for counting backwards, and must instead check for
106* wrap-around with UINT_MAX.
107*/
108for (i = n - 1; i != UINT_MAX; i--)109to[--pos[BUCKET_FOR(from, i, bits)]] = from[i];110
111/*112* Now "to" contains the most sorted list, so we swap "from" and
113* "to" for the next iteration.
114*/
115SWAP(from, to);116}117
118/*119* If we ended with our data in the original array, great. If not,
120* we have to move it back from the temporary storage.
121*/
122if (from != entries)123COPY_ARRAY(entries, tmp, n);124free(tmp);125free(pos);126
127#undef BUCKET_FOR128#undef BUCKETS129#undef DIGIT_SIZE130}
131
132/*
133* Ordered list of offsets of objects in the pack.
134*/
135static void create_pack_revindex(struct packed_git *p)136{
137const unsigned num_ent = p->num_objects;138unsigned i;139const char *index = p->index_data;140const unsigned hashsz = the_hash_algo->rawsz;141
142ALLOC_ARRAY(p->revindex, num_ent + 1);143index += 4 * 256;144
145if (p->index_version > 1) {146const uint32_t *off_32 =147(uint32_t *)(index + 8 + (size_t)p->num_objects * (hashsz + 4));148const uint32_t *off_64 = off_32 + p->num_objects;149for (i = 0; i < num_ent; i++) {150const uint32_t off = ntohl(*off_32++);151if (!(off & 0x80000000)) {152p->revindex[i].offset = off;153} else {154p->revindex[i].offset = get_be64(off_64);155off_64 += 2;156}157p->revindex[i].nr = i;158}159} else {160for (i = 0; i < num_ent; i++) {161const uint32_t hl = *((uint32_t *)(index + (hashsz + 4) * i));162p->revindex[i].offset = ntohl(hl);163p->revindex[i].nr = i;164}165}166
167/*168* This knows the pack format -- the hash trailer
169* follows immediately after the last object data.
170*/
171p->revindex[num_ent].offset = p->pack_size - hashsz;172p->revindex[num_ent].nr = -1;173sort_revindex(p->revindex, num_ent, p->pack_size);174}
175
176static int create_pack_revindex_in_memory(struct packed_git *p)177{
178if (git_env_bool(GIT_TEST_REV_INDEX_DIE_IN_MEMORY, 0))179die("dying as requested by '%s'",180GIT_TEST_REV_INDEX_DIE_IN_MEMORY);181if (open_pack_index(p))182return -1;183create_pack_revindex(p);184return 0;185}
186
187static char *pack_revindex_filename(struct packed_git *p)188{
189size_t len;190if (!strip_suffix(p->pack_name, ".pack", &len))191BUG("pack_name does not end in .pack");192return xstrfmt("%.*s.rev", (int)len, p->pack_name);193}
194
195#define RIDX_HEADER_SIZE (12)196#define RIDX_MIN_SIZE (RIDX_HEADER_SIZE + (2 * the_hash_algo->rawsz))197
198struct revindex_header {199uint32_t signature;200uint32_t version;201uint32_t hash_id;202};203
204static int load_revindex_from_disk(char *revindex_name,205uint32_t num_objects,206const uint32_t **data_p, size_t *len_p)207{
208int fd, ret = 0;209struct stat st;210void *data = NULL;211size_t revindex_size;212struct revindex_header *hdr;213
214if (git_env_bool(GIT_TEST_REV_INDEX_DIE_ON_DISK, 0))215die("dying as requested by '%s'", GIT_TEST_REV_INDEX_DIE_ON_DISK);216
217fd = git_open(revindex_name);218
219if (fd < 0) {220/* "No file" means return 1. */221ret = 1;222goto cleanup;223}224if (fstat(fd, &st)) {225ret = error_errno(_("failed to read %s"), revindex_name);226goto cleanup;227}228
229revindex_size = xsize_t(st.st_size);230
231if (revindex_size < RIDX_MIN_SIZE) {232ret = error(_("reverse-index file %s is too small"), revindex_name);233goto cleanup;234}235
236if (revindex_size - RIDX_MIN_SIZE != st_mult(sizeof(uint32_t), num_objects)) {237ret = error(_("reverse-index file %s is corrupt"), revindex_name);238goto cleanup;239}240
241data = xmmap(NULL, revindex_size, PROT_READ, MAP_PRIVATE, fd, 0);242hdr = data;243
244if (ntohl(hdr->signature) != RIDX_SIGNATURE) {245ret = error(_("reverse-index file %s has unknown signature"), revindex_name);246goto cleanup;247}248if (ntohl(hdr->version) != 1) {249ret = error(_("reverse-index file %s has unsupported version %"PRIu32),250revindex_name, ntohl(hdr->version));251goto cleanup;252}253if (!(ntohl(hdr->hash_id) == 1 || ntohl(hdr->hash_id) == 2)) {254ret = error(_("reverse-index file %s has unsupported hash id %"PRIu32),255revindex_name, ntohl(hdr->hash_id));256goto cleanup;257}258
259cleanup:260if (ret) {261if (data)262munmap(data, revindex_size);263} else {264*len_p = revindex_size;265*data_p = (const uint32_t *)data;266}267
268if (fd >= 0)269close(fd);270return ret;271}
272
273int load_pack_revindex_from_disk(struct packed_git *p)274{
275char *revindex_name;276int ret;277if (open_pack_index(p))278return -1;279
280revindex_name = pack_revindex_filename(p);281
282ret = load_revindex_from_disk(revindex_name,283p->num_objects,284&p->revindex_map,285&p->revindex_size);286if (ret)287goto cleanup;288
289p->revindex_data = (const uint32_t *)((const char *)p->revindex_map + RIDX_HEADER_SIZE);290
291cleanup:292free(revindex_name);293return ret;294}
295
296int load_pack_revindex(struct repository *r, struct packed_git *p)297{
298if (p->revindex || p->revindex_data)299return 0;300
301prepare_repo_settings(r);302
303if (r->settings.pack_read_reverse_index &&304!load_pack_revindex_from_disk(p))305return 0;306else if (!create_pack_revindex_in_memory(p))307return 0;308return -1;309}
310
311/*
312* verify_pack_revindex verifies that the on-disk rev-index for the given
313* pack-file is the same that would be created if written from scratch.
314*
315* A negative number is returned on error.
316*/
317int verify_pack_revindex(struct packed_git *p)318{
319int res = 0;320
321/* Do not bother checking if not initialized. */322if (!p->revindex_map || !p->revindex_data)323return res;324
325if (!hashfile_checksum_valid((const unsigned char *)p->revindex_map, p->revindex_size)) {326error(_("invalid checksum"));327res = -1;328}329
330/* This may fail due to a broken .idx. */331if (create_pack_revindex_in_memory(p))332return res;333
334for (size_t i = 0; i < p->num_objects; i++) {335uint32_t nr = p->revindex[i].nr;336uint32_t rev_val = get_be32(p->revindex_data + i);337
338if (nr != rev_val) {339error(_("invalid rev-index position at %"PRIu64": %"PRIu32" != %"PRIu32""),340(uint64_t)i, nr, rev_val);341res = -1;342}343}344
345return res;346}
347
348static int can_use_midx_ridx_chunk(struct multi_pack_index *m)349{
350if (!m->chunk_revindex)351return 0;352if (m->chunk_revindex_len != st_mult(sizeof(uint32_t), m->num_objects)) {353error(_("multi-pack-index reverse-index chunk is the wrong size"));354return 0;355}356return 1;357}
358
359int load_midx_revindex(struct multi_pack_index *m)360{
361struct strbuf revindex_name = STRBUF_INIT;362int ret;363
364if (m->revindex_data)365return 0;366
367if (can_use_midx_ridx_chunk(m)) {368/*369* If the MIDX `m` has a `RIDX` chunk, then use its contents for
370* the reverse index instead of trying to load a separate `.rev`
371* file.
372*
373* Note that we do *not* set `m->revindex_map` here, since we do
374* not want to accidentally call munmap() in the middle of the
375* MIDX.
376*/
377trace2_data_string("load_midx_revindex", the_repository,378"source", "midx");379m->revindex_data = (const uint32_t *)m->chunk_revindex;380return 0;381}382
383trace2_data_string("load_midx_revindex", the_repository,384"source", "rev");385
386get_midx_filename_ext(&revindex_name, m->object_dir,387get_midx_checksum(m), MIDX_EXT_REV);388
389ret = load_revindex_from_disk(revindex_name.buf,390m->num_objects,391&m->revindex_map,392&m->revindex_len);393if (ret)394goto cleanup;395
396m->revindex_data = (const uint32_t *)((const char *)m->revindex_map + RIDX_HEADER_SIZE);397
398cleanup:399strbuf_release(&revindex_name);400return ret;401}
402
403int close_midx_revindex(struct multi_pack_index *m)404{
405if (!m || !m->revindex_map)406return 0;407
408munmap((void*)m->revindex_map, m->revindex_len);409
410m->revindex_map = NULL;411m->revindex_data = NULL;412m->revindex_len = 0;413
414return 0;415}
416
417int offset_to_pack_pos(struct packed_git *p, off_t ofs, uint32_t *pos)418{
419unsigned lo, hi;420
421if (load_pack_revindex(the_repository, p) < 0)422return -1;423
424lo = 0;425hi = p->num_objects + 1;426
427do {428const unsigned mi = lo + (hi - lo) / 2;429off_t got = pack_pos_to_offset(p, mi);430
431if (got == ofs) {432*pos = mi;433return 0;434} else if (ofs < got)435hi = mi;436else437lo = mi + 1;438} while (lo < hi);439
440error("bad offset for revindex");441return -1;442}
443
444uint32_t pack_pos_to_index(struct packed_git *p, uint32_t pos)445{
446if (!(p->revindex || p->revindex_data))447BUG("pack_pos_to_index: reverse index not yet loaded");448if (p->num_objects <= pos)449BUG("pack_pos_to_index: out-of-bounds object at %"PRIu32, pos);450
451if (p->revindex)452return p->revindex[pos].nr;453else454return get_be32(p->revindex_data + pos);455}
456
457off_t pack_pos_to_offset(struct packed_git *p, uint32_t pos)458{
459if (!(p->revindex || p->revindex_data))460BUG("pack_pos_to_index: reverse index not yet loaded");461if (p->num_objects < pos)462BUG("pack_pos_to_offset: out-of-bounds object at %"PRIu32, pos);463
464if (p->revindex)465return p->revindex[pos].offset;466else if (pos == p->num_objects)467return p->pack_size - the_hash_algo->rawsz;468else469return nth_packed_object_offset(p, pack_pos_to_index(p, pos));470}
471
472uint32_t pack_pos_to_midx(struct multi_pack_index *m, uint32_t pos)473{
474if (!m->revindex_data)475BUG("pack_pos_to_midx: reverse index not yet loaded");476if (m->num_objects <= pos)477BUG("pack_pos_to_midx: out-of-bounds object at %"PRIu32, pos);478return get_be32(m->revindex_data + pos);479}
480
481struct midx_pack_key {482uint32_t pack;483off_t offset;484
485uint32_t preferred_pack;486struct multi_pack_index *midx;487};488
489static int midx_pack_order_cmp(const void *va, const void *vb)490{
491const struct midx_pack_key *key = va;492struct multi_pack_index *midx = key->midx;493
494uint32_t versus = pack_pos_to_midx(midx, (uint32_t*)vb - (const uint32_t *)midx->revindex_data);495uint32_t versus_pack = nth_midxed_pack_int_id(midx, versus);496off_t versus_offset;497
498uint32_t key_preferred = key->pack == key->preferred_pack;499uint32_t versus_preferred = versus_pack == key->preferred_pack;500
501/*502* First, compare the preferred-ness, noting that the preferred pack
503* comes first.
504*/
505if (key_preferred && !versus_preferred)506return -1;507else if (!key_preferred && versus_preferred)508return 1;509
510/* Then, break ties first by comparing the pack IDs. */511if (key->pack < versus_pack)512return -1;513else if (key->pack > versus_pack)514return 1;515
516/* Finally, break ties by comparing offsets within a pack. */517versus_offset = nth_midxed_offset(midx, versus);518if (key->offset < versus_offset)519return -1;520else if (key->offset > versus_offset)521return 1;522
523return 0;524}
525
526static int midx_key_to_pack_pos(struct multi_pack_index *m,527struct midx_pack_key *key,528uint32_t *pos)529{
530uint32_t *found;531
532if (key->pack >= m->num_packs)533BUG("MIDX pack lookup out of bounds (%"PRIu32" >= %"PRIu32")",534key->pack, m->num_packs);535/*536* The preferred pack sorts first, so determine its identifier by
537* looking at the first object in pseudo-pack order.
538*
539* Note that if no --preferred-pack is explicitly given when writing a
540* multi-pack index, then whichever pack has the lowest identifier
541* implicitly is preferred (and includes all its objects, since ties are
542* broken first by pack identifier).
543*/
544if (midx_preferred_pack(key->midx, &key->preferred_pack) < 0)545return error(_("could not determine preferred pack"));546
547found = bsearch(key, m->revindex_data, m->num_objects,548sizeof(*m->revindex_data),549midx_pack_order_cmp);550
551if (!found)552return -1;553
554*pos = found - m->revindex_data;555return 0;556}
557
558int midx_to_pack_pos(struct multi_pack_index *m, uint32_t at, uint32_t *pos)559{
560struct midx_pack_key key;561
562if (!m->revindex_data)563BUG("midx_to_pack_pos: reverse index not yet loaded");564if (m->num_objects <= at)565BUG("midx_to_pack_pos: out-of-bounds object at %"PRIu32, at);566
567key.pack = nth_midxed_pack_int_id(m, at);568key.offset = nth_midxed_offset(m, at);569key.midx = m;570
571return midx_key_to_pack_pos(m, &key, pos);572}
573
574int midx_pair_to_pack_pos(struct multi_pack_index *m, uint32_t pack_int_id,575off_t ofs, uint32_t *pos)576{
577struct midx_pack_key key = {578.pack = pack_int_id,579.offset = ofs,580.midx = m,581};582return midx_key_to_pack_pos(m, &key, pos);583}
584