/*
 * blame.c — core machinery for "git blame" / "git annotate".
 */
#define USE_THE_REPOSITORY_VARIABLE

#include "git-compat-util.h"
#include "refs.h"
#include "object-store-ll.h"
#include "cache-tree.h"
#include "mergesort.h"
#include "commit.h"
#include "convert.h"
#include "diff.h"
#include "diffcore.h"
#include "gettext.h"
#include "hex.h"
#include "path.h"
#include "read-cache.h"
#include "revision.h"
#include "setup.h"
#include "tag.h"
#include "trace2.h"
#include "blame.h"
#include "alloc.h"
#include "commit-slab.h"
#include "bloom.h"
#include "commit-graph.h"

/*
 * Per-commit slab associating each commit with the head of its list of
 * blame_origin "suspects" (paths in that commit that may still carry blame).
 */
define_commit_slab(blame_suspects, struct blame_origin *);
static struct blame_suspects blame_suspects;

29struct blame_origin *get_blame_suspects(struct commit *commit)30{
31struct blame_origin **result;32
33result = blame_suspects_peek(&blame_suspects, commit);34
35return result ? *result : NULL;36}
37
/* Record "origin" as the head of the blame suspect list for "commit". */
static void set_blame_suspects(struct commit *commit, struct blame_origin *origin)
{
	*blame_suspects_at(&blame_suspects, commit) = origin;
}

/*
 * Drop one reference from "o"; when the refcount reaches zero, release
 * the reference it holds on o->previous, free the file buffer, unlink it
 * from its commit's suspect list, and free it.
 */
void blame_origin_decref(struct blame_origin *o)
{
	if (o && --o->refcnt <= 0) {
		struct blame_origin *p, *l = NULL;
		if (o->previous)
			blame_origin_decref(o->previous);
		free(o->file.ptr);
		/* Should be present exactly once in commit chain */
		for (p = get_blame_suspects(o->commit); p; l = p, p = p->next) {
			if (p == o) {
				if (l)
					l->next = p->next;
				else
					set_blame_suspects(o->commit, p->next);
				free(o);
				return;
			}
		}
		/* Not finding "o" on its own commit's list is a logic error. */
		die("internal error in blame_origin_decref");
	}
}

/*
 * Given a commit and a path in it, create a new origin structure.
 * The callers that add blame to the scoreboard should use
 * get_origin() to obtain shared, refcounted copy instead of calling
 * this function directly.
 */
static struct blame_origin *make_origin(struct commit *commit, const char *path)
{
	struct blame_origin *o;
	/* Allocate the origin with "path" copied into its flex array. */
	FLEX_ALLOC_STR(o, path, path);
	o->commit = commit;
	o->refcnt = 1;
	/* Push the new origin at the front of the commit's suspect list. */
	o->next = get_blame_suspects(commit);
	set_blame_suspects(commit, o);
	return o;
}

82/*
83* Locate an existing origin or create a new one.
84* This moves the origin to front position in the commit util list.
85*/
86static struct blame_origin *get_origin(struct commit *commit, const char *path)87{
88struct blame_origin *o, *l;89
90for (o = get_blame_suspects(commit), l = NULL; o; l = o, o = o->next) {91if (!strcmp(o->path, path)) {92/* bump to front */93if (l) {94l->next = o->next;95o->next = get_blame_suspects(commit);96set_blame_suspects(commit, o);97}98return blame_origin_incref(o);99}100}101return make_origin(commit, path);102}
103
/*
 * Sanity check that "path" exists either in one of the parents of the
 * fake work-tree commit or in the index; dies otherwise.
 */
static void verify_working_tree_path(struct repository *r,
				     struct commit *work_tree, const char *path)
{
	struct commit_list *parents;
	int pos;

	for (parents = work_tree->parents; parents; parents = parents->next) {
		const struct object_id *commit_oid = &parents->item->object.oid;
		struct object_id blob_oid;
		unsigned short mode;

		/* Found as a blob in this parent's tree: good enough. */
		if (!get_tree_entry(r, commit_oid, path, &blob_oid, &mode) &&
		    oid_object_info(r, &blob_oid, NULL) == OBJ_BLOB)
			return;
	}

	/* A negative pos encodes the insertion point as -1 - pos. */
	pos = index_name_pos(r->index, path, strlen(path));
	if (pos >= 0)
		; /* path is in the index */
	else if (-1 - pos < r->index->cache_nr &&
		 !strcmp(r->index->cache[-1 - pos]->name, path))
		; /* path is in the index, unmerged */
	else
		die("no such path '%s' in HEAD", path);
}

132static struct commit_list **append_parent(struct repository *r,133struct commit_list **tail,134const struct object_id *oid)135{
136struct commit *parent;137
138parent = lookup_commit_reference(r, oid);139if (!parent)140die("no such commit %s", oid_to_hex(oid));141return &commit_list_insert(parent, tail)->next;142}
143
/*
 * If a merge is in progress, append every commit listed in MERGE_HEAD
 * to the list at "tail". A missing MERGE_HEAD (ENOENT) simply means no
 * merge is in progress; any other open failure is fatal.
 */
static void append_merge_parents(struct repository *r,
				 struct commit_list **tail)
{
	int merge_head;
	struct strbuf line = STRBUF_INIT;

	merge_head = open(git_path_merge_head(r), O_RDONLY);
	if (merge_head < 0) {
		if (errno == ENOENT)
			return;
		die("cannot open '%s' for reading",
		    git_path_merge_head(r));
	}

	/* One full hex object id per line. */
	while (!strbuf_getwholeline_fd(&line, merge_head, '\n')) {
		struct object_id oid;
		if (get_oid_hex(line.buf, &oid))
			die("unknown line in '%s': %s",
			    git_path_merge_head(r), line.buf);
		tail = append_parent(r, tail, &oid);
	}
	close(merge_head);
	strbuf_release(&line);
}

/*
 * Attach the contents of "sb" as the commit buffer of "c". We cannot
 * simply pass sb->buf and sb->len: ownership of the memory must move to
 * the commit, so the buffer is detached from the strbuf first.
 */
static void set_commit_buffer_from_strbuf(struct repository *r,
					  struct commit *c,
					  struct strbuf *sb)
{
	size_t size;
	void *contents;

	contents = strbuf_detach(sb, &size);
	set_commit_buffer(r, c, contents, size);
}

/*
 * Prepare a dummy commit that represents the work tree (or staged) item.
 * Note that annotating work tree item never works in the reverse.
 */
static struct commit *fake_working_tree_commit(struct repository *r,
					       struct diff_options *opt,
					       const char *path,
					       const char *contents_from,
					       struct object_id *oid)
{
	struct commit *commit;
	struct blame_origin *origin;
	struct commit_list **parent_tail, *parent;
	struct strbuf buf = STRBUF_INIT;
	const char *ident;
	time_t now;
	int len;
	struct cache_entry *ce;
	unsigned mode;
	struct strbuf msg = STRBUF_INIT;

	repo_read_index(r);
	time(&now);
	commit = alloc_commit_node(r);
	commit->object.parsed = 1;
	commit->date = now;
	parent_tail = &commit->parents;

	/* HEAD is the first parent; in-progress merge heads follow. */
	parent_tail = append_parent(r, parent_tail, oid);
	append_merge_parents(r, parent_tail);
	verify_working_tree_path(r, commit, path);

	origin = make_origin(commit, path);

	if (contents_from)
		ident = fmt_ident("External file (--contents)", "external.file",
				  WANT_BLANK_IDENT, NULL, 0);
	else
		ident = fmt_ident("Not Committed Yet", "not.committed.yet",
				  WANT_BLANK_IDENT, NULL, 0);
	/*
	 * Synthesize a commit buffer. The tree line is a dummy all-zero id;
	 * nothing ever parses this buffer as a real tree pointer.
	 * NOTE(review): the literal is 40 hex digits — presumably SHA-1-sized
	 * on purpose; confirm whether this matters for SHA-256 repos.
	 */
	strbuf_addstr(&msg, "tree 0000000000000000000000000000000000000000\n");
	for (parent = commit->parents; parent; parent = parent->next)
		strbuf_addf(&msg, "parent %s\n",
			    oid_to_hex(&parent->item->object.oid));
	strbuf_addf(&msg,
		    "author %s\n"
		    "committer %s\n\n"
		    "Version of %s from %s\n",
		    ident, ident, path,
		    (!contents_from ? path :
		     (!strcmp(contents_from, "-") ? "standard input" : contents_from)));
	set_commit_buffer_from_strbuf(r, commit, &msg);

	/* Read the contents from a file, unless --contents=- asked for stdin. */
	if (!contents_from || strcmp("-", contents_from)) {
		struct stat st;
		const char *read_from;
		char *buf_ptr;
		unsigned long buf_len;

		if (contents_from) {
			if (stat(contents_from, &st) < 0)
				die_errno("Cannot stat '%s'", contents_from);
			read_from = contents_from;
		}
		else {
			if (lstat(path, &st) < 0)
				die_errno("Cannot lstat '%s'", path);
			read_from = path;
		}
		mode = canon_mode(st.st_mode);

		switch (st.st_mode & S_IFMT) {
		case S_IFREG:
			if (opt->flags.allow_textconv &&
			    textconv_object(r, read_from, mode, null_oid(), 0, &buf_ptr, &buf_len))
				strbuf_attach(&buf, buf_ptr, buf_len, buf_len + 1);
			else if (strbuf_read_file(&buf, read_from, st.st_size) != st.st_size)
				die_errno("cannot open or read '%s'", read_from);
			break;
		case S_IFLNK:
			if (strbuf_readlink(&buf, read_from, st.st_size) < 0)
				die_errno("cannot readlink '%s'", read_from);
			break;
		default:
			die("unsupported file type %s", read_from);
		}
	}
	else {
		/* Reading from stdin */
		mode = 0;
		if (strbuf_read(&buf, 0, 0) < 0)
			die_errno("failed to read from stdin");
	}
	convert_to_git(r->index, path, buf.buf, buf.len, &buf, 0);
	origin->file.ptr = buf.buf;
	origin->file.size = buf.len;
	/* Register the buffer as an in-core blob so it has an object id. */
	pretend_object_file(buf.buf, buf.len, OBJ_BLOB, &origin->blob_oid);

	/*
	 * Read the current index, replace the path entry with
	 * origin->blob_sha1 without mucking with its mode or type
	 * bits; we are not going to write this index out -- we just
	 * want to run "diff-index --cached".
	 */
	discard_index(r->index);
	repo_read_index(r);

	len = strlen(path);
	if (!mode) {
		int pos = index_name_pos(r->index, path, len);
		if (0 <= pos)
			mode = r->index->cache[pos]->ce_mode;
		else
			/* Let's not bother reading from HEAD tree */
			mode = S_IFREG | 0644;
	}
	ce = make_empty_cache_entry(r->index, len);
	oidcpy(&ce->oid, &origin->blob_oid);
	memcpy(ce->name, path, len);
	ce->ce_flags = create_ce_flags(0);
	ce->ce_namelen = len;
	ce->ce_mode = create_ce_mode(mode);
	add_index_entry(r->index, ce,
			ADD_CACHE_OK_TO_ADD | ADD_CACHE_OK_TO_REPLACE);

	cache_tree_invalidate_path(r->index, path);

	return commit;
}

315static int diff_hunks(mmfile_t *file_a, mmfile_t *file_b,316xdl_emit_hunk_consume_func_t hunk_func, void *cb_data, int xdl_opts)317{
318xpparam_t xpp = {0};319xdemitconf_t xecfg = {0};320xdemitcb_t ecb = {NULL};321
322xpp.flags = xdl_opts;323xecfg.hunk_func = hunk_func;324ecb.priv = cb_data;325return xdi_diff(file_a, file_b, &xpp, &xecfg, &ecb);326}
327
/*
 * Return a pointer one past the next newline in [start, end), or "end"
 * when the range contains no newline.
 */
static const char *get_next_line(const char *start, const char *end)
{
	const char *nl = memchr(start, '\n', end - start);

	if (nl)
		return nl + 1;
	return end;
}

/*
 * Fill *line_starts with the byte offset of each line in "buf", plus a
 * final sentinel entry holding "len". Returns the number of lines.
 * The caller owns and must free *line_starts.
 */
static int find_line_starts(int **line_starts, const char *buf,
			    unsigned long len)
{
	const char *limit = buf + len;
	const char *scan;
	int *out;
	int nr_lines = 0;

	/* First pass: count lines so the array can be sized exactly. */
	for (scan = buf; scan < limit; scan = get_next_line(scan, limit))
		nr_lines++;

	ALLOC_ARRAY(*line_starts, nr_lines + 1);
	out = *line_starts;

	/* Second pass: record where each line begins. */
	for (scan = buf; scan < limit; scan = get_next_line(scan, limit))
		*out++ = scan - buf;

	/* Sentinel simplifies computing the length of the last line. */
	*out = len;

	return nr_lines;
}

struct fingerprint_entry;

/* A fingerprint is intended to loosely represent a string, such that two
 * fingerprints can be quickly compared to give an indication of the similarity
 * of the strings that they represent.
 *
 * A fingerprint is represented as a multiset of the lower-cased byte pairs in
 * the string that it represents. Whitespace is added at each end of the
 * string. Whitespace pairs are ignored. Whitespace is converted to '\0'.
 * For example, the string "Darth Radar" will be converted to the following
 * fingerprint:
 * {"\0d", "da", "da", "ar", "ar", "rt", "th", "h\0", "\0r", "ra", "ad", "r\0"}
 *
 * The similarity between two fingerprints is the size of the intersection of
 * their multisets, including repeated elements. See fingerprint_similarity for
 * examples.
 *
 * For ease of implementation, the fingerprint is implemented as a map
 * of byte pairs to the count of that byte pair in the string, instead of
 * allowing repeated elements in a set.
 */
struct fingerprint {
	struct hashmap map;
	/* As we know the maximum number of entries in advance, it's
	 * convenient to store the entries in a single array instead of having
	 * the hashmap manage the memory.
	 */
	struct fingerprint_entry *entries;
};

/* A byte pair in a fingerprint. Stores the number of times the byte pair
 * occurs in the string that the fingerprint represents.
 */
struct fingerprint_entry {
	/* The hashmap entry - the hash represents the byte pair in its
	 * entirety so we don't need to store the byte pair separately.
	 */
	struct hashmap_entry entry;
	/* The number of times the byte pair occurs in the string that the
	 * fingerprint represents.
	 */
	int count;
};

/* See `struct fingerprint` for an explanation of what a fingerprint is.
 * \param result the fingerprint of the string is stored here. This must be
 *               freed later using free_fingerprint.
 * \param line_begin the start of the string
 * \param line_end the end of the string (exclusive; *line_end is never read)
 */
static void get_fingerprint(struct fingerprint *result,
			    const char *line_begin,
			    const char *line_end)
{
	unsigned int hash, c0 = 0, c1;
	const char *p;
	/* One entry per byte pair is the worst case (all pairs distinct). */
	int max_map_entry_count = 1 + line_end - line_begin;
	struct fingerprint_entry *entry = xcalloc(max_map_entry_count,
						  sizeof(struct fingerprint_entry));
	struct fingerprint_entry *found_entry;

	hashmap_init(&result->map, NULL, NULL, max_map_entry_count);
	/* Entries live in this single array; the hashmap only links them. */
	result->entries = entry;
	for (p = line_begin; p <= line_end; ++p, c0 = c1) {
		/* Always terminate the string with whitespace.
		 * Normalise whitespace to 0, and normalise letters to
		 * lower case. This won't work for multibyte characters but at
		 * worst will match some unrelated characters.
		 */
		if ((p == line_end) || isspace(*p))
			c1 = 0;
		else
			c1 = tolower(*p);
		hash = c0 | (c1 << 8);
		/* Ignore whitespace pairs */
		if (hash == 0)
			continue;
		hashmap_entry_init(&entry->entry, hash);

		found_entry = hashmap_get_entry(&result->map, entry,
						/* member name */ entry, NULL);
		if (found_entry) {
			found_entry->count += 1;
		} else {
			/* New pair: consume the next slot of the array. */
			entry->count = 1;
			hashmap_add(&result->map, &entry->entry);
			++entry;
		}
	}
}

/* Release the hashmap and the entry array backing fingerprint "f". */
static void free_fingerprint(struct fingerprint *f)
{
	hashmap_clear(&f->map);
	free(f->entries);
}

454/* Calculates the similarity between two fingerprints as the size of the
455* intersection of their multisets, including repeated elements. See
456* `struct fingerprint` for an explanation of the fingerprint representation.
457* The similarity between "cat mat" and "father rather" is 2 because "at" is
458* present twice in both strings while the similarity between "tim" and "mit"
459* is 0.
460*/
461static int fingerprint_similarity(struct fingerprint *a, struct fingerprint *b)462{
463int intersection = 0;464struct hashmap_iter iter;465const struct fingerprint_entry *entry_a, *entry_b;466
467hashmap_for_each_entry(&b->map, &iter, entry_b,468entry /* member name */) {469entry_a = hashmap_get_entry(&a->map, entry_b, entry, NULL);470if (entry_a) {471intersection += entry_a->count < entry_b->count ?472entry_a->count : entry_b->count;473}474}475return intersection;476}
477
/* Subtracts byte-pair elements in B from A, modifying A in place.
 * A pair whose count would drop to zero or below is removed from A's map
 * entirely (its array slot is not reclaimed; free_fingerprint handles that).
 */
static void fingerprint_subtract(struct fingerprint *a, struct fingerprint *b)
{
	struct hashmap_iter iter;
	struct fingerprint_entry *entry_a;
	const struct fingerprint_entry *entry_b;

	hashmap_iter_init(&b->map, &iter);

	hashmap_for_each_entry(&b->map, &iter, entry_b,
				entry /* member name */) {
		entry_a = hashmap_get_entry(&a->map, entry_b, entry, NULL);
		if (entry_a) {
			if (entry_a->count <= entry_b->count)
				hashmap_remove(&a->map, &entry_b->entry, NULL);
			else
				entry_a->count -= entry_b->count;
		}
	}
}

500/* Calculate fingerprints for a series of lines.
501* Puts the fingerprints in the fingerprints array, which must have been
502* preallocated to allow storing line_count elements.
503*/
504static void get_line_fingerprints(struct fingerprint *fingerprints,505const char *content, const int *line_starts,506long first_line, long line_count)507{
508int i;509const char *linestart, *lineend;510
511line_starts += first_line;512for (i = 0; i < line_count; ++i) {513linestart = content + line_starts[i];514lineend = content + line_starts[i + 1];515get_fingerprint(fingerprints + i, linestart, lineend);516}517}
518
519static void free_line_fingerprints(struct fingerprint *fingerprints,520int nr_fingerprints)521{
522int i;523
524for (i = 0; i < nr_fingerprints; i++)525free_fingerprint(&fingerprints[i]);526}
527
/* This contains the data necessary to linearly map a line number in one half
 * of a diff chunk to the line in the other half of the diff chunk that is
 * closest in terms of its position as a fraction of the length of the chunk.
 */
struct line_number_mapping {
	int destination_start, destination_length,
		source_start, source_length;
};

/* Given a line number in one range, offset and scale it to map it onto the
 * other range.
 * Essentially this mapping is a simple linear equation but the calculation is
 * more complicated to allow performing it with integer operations.
 * Another complication is that if a line could map onto many lines in the
 * destination range then we want to choose the line at the center of those
 * possibilities.
 * Example: if the chunk is 2 lines long in A and 10 lines long in B then the
 * first 5 lines in B will map onto the first line in the A chunk, while the
 * last 5 lines will all map onto the second line in the A chunk.
 * Example: if the chunk is 10 lines long in A and 2 lines long in B then line
 * 0 in B will map onto line 2 in A, and line 1 in B will map onto line 7 in A.
 */
static int map_line_number(int line_number,
		const struct line_number_mapping *mapping)
{
	int local = line_number - mapping->source_start;
	/* Work in half-line units so that the result rounds to the line at
	 * the center of the overlapping destination lines.
	 */
	int scaled = (2 * local + 1) * mapping->destination_length;

	return scaled / (2 * mapping->source_length) +
	       mapping->destination_start;
}

/* Get a pointer to the element storing the similarity between a line in A
 * and a line in B.
 *
 * The similarities are stored in a 2-dimensional array. Each "row" in the
 * array contains the similarities for a line in B. The similarities stored in
 * a row are the similarities between the line in B and the nearby lines in A.
 * To keep the length of each row the same, it is padded out with values of -1
 * where the search range extends beyond the lines in A.
 * For example, if max_search_distance_a is 2 and the two sides of a diff chunk
 * look like this:
 * a | m
 * b | n
 * c | o
 * d | p
 * e | q
 * Then the similarity array will contain:
 * [-1, -1, am, bm, cm,
 *  -1, an, bn, cn, dn,
 *  ao, bo, co, do, eo,
 *  bp, cp, dp, ep, -1,
 *  cq, dq, eq, -1, -1]
 * Where similarities are denoted either by -1 for invalid, or the
 * concatenation of the two lines in the diff being compared.
 *
 * \param similarities array of similarities between lines in A and B
 * \param line_a the index of the line in A, in the same frame of reference as
 *               closest_line_a.
 * \param local_line_b the index of the line in B, relative to the first line
 *                     in B that similarities represents.
 * \param closest_line_a the index of the line in A that is deemed to be
 *                       closest to local_line_b. This must be in the same
 *                       frame of reference as line_a. This value defines
 *                       where similarities is centered for the line in B.
 * \param max_search_distance_a maximum distance in lines from the closest line
 *                              in A for other lines in A for which
 *                              similarities may be calculated.
 */
static int *get_similarity(int *similarities,
			   int line_a, int local_line_b,
			   int closest_line_a, int max_search_distance_a)
{
	/* Each row holds the full search window around the closest line. */
	int row_width = max_search_distance_a * 2 + 1;
	int column = line_a - closest_line_a + max_search_distance_a;

	assert(abs(line_a - closest_line_a) <= max_search_distance_a);
	return similarities + local_line_b * row_width + column;
}

/* Sentinel certainty values: a line known to match nothing (never
 * recalculated), and a line whose certainty has not been computed yet.
 */
#define CERTAIN_NOTHING_MATCHES -2
#define CERTAINTY_NOT_CALCULATED -1

/* Given a line in B, first calculate its similarities with nearby lines in A
 * if not already calculated, then identify the most similar and second most
 * similar lines. The "certainty" is calculated based on those two
 * similarities.
 *
 * \param start_a the index of the first line of the chunk in A
 * \param length_a the length in lines of the chunk in A
 * \param local_line_b the index of the line in B, relative to the first line
 *                     in the chunk.
 * \param fingerprints_a array of fingerprints for the chunk in A
 * \param fingerprints_b array of fingerprints for the chunk in B
 * \param similarities 2-dimensional array of similarities between lines in A
 *                     and B. See get_similarity() for more details.
 * \param certainties array of values indicating how strongly a line in B is
 *                    matched with some line in A.
 * \param second_best_result array of absolute indices in A for the second
 *                           closest match of a line in B.
 * \param result array of absolute indices in A for the closest match of a line
 *               in B.
 * \param max_search_distance_a maximum distance in lines from the closest line
 *                              in A for other lines in A for which
 *                              similarities may be calculated.
 * \param map_line_number_in_b_to_a parameter to map_line_number().
 */
static void find_best_line_matches(
	int start_a,
	int length_a,
	int start_b,
	int local_line_b,
	struct fingerprint *fingerprints_a,
	struct fingerprint *fingerprints_b,
	int *similarities,
	int *certainties,
	int *second_best_result,
	int *result,
	const int max_search_distance_a,
	const struct line_number_mapping *map_line_number_in_b_to_a)
{

	int i, search_start, search_end, closest_local_line_a, *similarity,
		best_similarity = 0, second_best_similarity = 0,
		best_similarity_index = 0, second_best_similarity_index = 0;

	/* certainty has already been calculated so no need to redo the work */
	if (certainties[local_line_b] != CERTAINTY_NOT_CALCULATED)
		return;

	closest_local_line_a = map_line_number(
		local_line_b + start_b, map_line_number_in_b_to_a) - start_a;

	/* Clamp the search window to the chunk boundaries in A. */
	search_start = closest_local_line_a - max_search_distance_a;
	if (search_start < 0)
		search_start = 0;

	search_end = closest_local_line_a + max_search_distance_a + 1;
	if (search_end > length_a)
		search_end = length_a;

	for (i = search_start; i < search_end; ++i) {
		similarity = get_similarity(similarities,
					    i, local_line_b,
					    closest_local_line_a,
					    max_search_distance_a);
		if (*similarity == -1) {
			/* This value will never exceed 10 but assert just in
			 * case
			 */
			assert(abs(i - closest_local_line_a) < 1000);
			/* scale the similarity by (1000 - distance from
			 * closest line) to act as a tie break between lines
			 * that otherwise are equally similar.
			 */
			*similarity = fingerprint_similarity(
				fingerprints_b + local_line_b,
				fingerprints_a + i) *
				(1000 - abs(i - closest_local_line_a));
		}
		/* Track the best and second-best scoring lines in A. */
		if (*similarity > best_similarity) {
			second_best_similarity = best_similarity;
			second_best_similarity_index = best_similarity_index;
			best_similarity = *similarity;
			best_similarity_index = i;
		} else if (*similarity > second_best_similarity) {
			second_best_similarity = *similarity;
			second_best_similarity_index = i;
		}
	}

	if (best_similarity == 0) {
		/* this line definitely doesn't match with anything. Mark it
		 * with this special value so it doesn't get invalidated and
		 * won't be recalculated.
		 */
		certainties[local_line_b] = CERTAIN_NOTHING_MATCHES;
		result[local_line_b] = -1;
	} else {
		/* Calculate the certainty with which this line matches.
		 * If the line matches well with two lines then that reduces
		 * the certainty. However we still want to prioritise matching
		 * a line that matches very well with two lines over matching a
		 * line that matches poorly with one line, hence doubling
		 * best_similarity.
		 * This means that if we have
		 * line X that matches only one line with a score of 3,
		 * line Y that matches two lines equally with a score of 5,
		 * and line Z that matches only one line with a score or 2,
		 * then the lines in order of certainty are X, Y, Z.
		 */
		certainties[local_line_b] = best_similarity * 2 -
			second_best_similarity;

		/* We keep both the best and second best results to allow us to
		 * check at a later stage of the matching process whether the
		 * result needs to be invalidated.
		 */
		result[local_line_b] = start_a + best_similarity_index;
		second_best_result[local_line_b] =
			start_a + second_best_similarity_index;
	}
}

/*
 * This finds the line that we can match with the most confidence, and
 * uses it as a partition. It then calls itself on the lines on either side of
 * that partition. In this way we avoid lines appearing out of order, and
 * retain a sensible line ordering.
 * \param start_a index of the first line in A with which lines in B may be
 *                compared.
 * \param start_b index of the first line in B for which matching should be
 *                done.
 * \param length_a number of lines in A with which lines in B may be compared.
 * \param length_b number of lines in B for which matching should be done.
 * \param fingerprints_a mutable array of fingerprints in A. The first element
 *                       corresponds to the line at start_a.
 * \param fingerprints_b array of fingerprints in B. The first element
 *                       corresponds to the line at start_b.
 * \param similarities 2-dimensional array of similarities between lines in A
 *                     and B. See get_similarity() for more details.
 * \param certainties array of values indicating how strongly a line in B is
 *                    matched with some line in A.
 * \param second_best_result array of absolute indices in A for the second
 *                           closest match of a line in B.
 * \param result array of absolute indices in A for the closest match of a line
 *               in B.
 * \param max_search_distance_a maximum distance in lines from the closest line
 *                              in A for other lines in A for which
 *                              similarities may be calculated.
 * \param max_search_distance_b an upper bound on the greatest possible
 *                              distance between lines in B such that they will
 *                              both be compared with the same line in A
 *                              according to max_search_distance_a.
 * \param map_line_number_in_b_to_a parameter to map_line_number().
 */
static void fuzzy_find_matching_lines_recurse(
	int start_a, int start_b,
	int length_a, int length_b,
	struct fingerprint *fingerprints_a,
	struct fingerprint *fingerprints_b,
	int *similarities,
	int *certainties,
	int *second_best_result,
	int *result,
	int max_search_distance_a,
	int max_search_distance_b,
	const struct line_number_mapping *map_line_number_in_b_to_a)
{
	int i, invalidate_min, invalidate_max, offset_b,
		second_half_start_a, second_half_start_b,
		second_half_length_a, second_half_length_b,
		most_certain_line_a, most_certain_local_line_b = -1,
		most_certain_line_certainty = -1,
		closest_local_line_a;

	/* Score every line in B and remember the single most certain one. */
	for (i = 0; i < length_b; ++i) {
		find_best_line_matches(start_a,
				       length_a,
				       start_b,
				       i,
				       fingerprints_a,
				       fingerprints_b,
				       similarities,
				       certainties,
				       second_best_result,
				       result,
				       max_search_distance_a,
				       map_line_number_in_b_to_a);

		if (certainties[i] > most_certain_line_certainty) {
			most_certain_line_certainty = certainties[i];
			most_certain_local_line_b = i;
		}
	}

	/* No matches. */
	if (most_certain_local_line_b == -1)
		return;

	most_certain_line_a = result[most_certain_local_line_b];

	/*
	 * Subtract the most certain line's fingerprint in B from the matched
	 * fingerprint in A. This means that other lines in B can't also match
	 * the same parts of the line in A.
	 */
	fingerprint_subtract(fingerprints_a + most_certain_line_a - start_a,
			     fingerprints_b + most_certain_local_line_b);

	/* Invalidate results that may be affected by the choice of most
	 * certain line.
	 */
	invalidate_min = most_certain_local_line_b - max_search_distance_b;
	invalidate_max = most_certain_local_line_b + max_search_distance_b + 1;
	if (invalidate_min < 0)
		invalidate_min = 0;
	if (invalidate_max > length_b)
		invalidate_max = length_b;

	/* As the fingerprint in A has changed, discard previously calculated
	 * similarity values with that fingerprint.
	 */
	for (i = invalidate_min; i < invalidate_max; ++i) {
		closest_local_line_a = map_line_number(
			i + start_b, map_line_number_in_b_to_a) - start_a;

		/* Check that the lines in A and B are close enough that there
		 * is a similarity value for them.
		 */
		if (abs(most_certain_line_a - start_a - closest_local_line_a) >
				max_search_distance_a) {
			continue;
		}

		*get_similarity(similarities, most_certain_line_a - start_a,
				i, closest_local_line_a,
				max_search_distance_a) = -1;
	}

	/* More invalidating of results that may be affected by the choice of
	 * most certain line.
	 * Discard the matches for lines in B that are currently matched with a
	 * line in A such that their ordering contradicts the ordering imposed
	 * by the choice of most certain line.
	 */
	for (i = most_certain_local_line_b - 1; i >= invalidate_min; --i) {
		/* In this loop we discard results for lines in B that are
		 * before most-certain-line-B but are matched with a line in A
		 * that is after most-certain-line-A.
		 */
		if (certainties[i] >= 0 &&
		    (result[i] >= most_certain_line_a ||
		     second_best_result[i] >= most_certain_line_a)) {
			certainties[i] = CERTAINTY_NOT_CALCULATED;
		}
	}
	for (i = most_certain_local_line_b + 1; i < invalidate_max; ++i) {
		/* In this loop we discard results for lines in B that are
		 * after most-certain-line-B but are matched with a line in A
		 * that is before most-certain-line-A.
		 */
		if (certainties[i] >= 0 &&
		    (result[i] <= most_certain_line_a ||
		     second_best_result[i] <= most_certain_line_a)) {
			certainties[i] = CERTAINTY_NOT_CALCULATED;
		}
	}

	/* Repeat the matching process for lines before the most certain line.
	 */
	if (most_certain_local_line_b > 0) {
		fuzzy_find_matching_lines_recurse(
			start_a, start_b,
			most_certain_line_a + 1 - start_a,
			most_certain_local_line_b,
			fingerprints_a, fingerprints_b, similarities,
			certainties, second_best_result, result,
			max_search_distance_a,
			max_search_distance_b,
			map_line_number_in_b_to_a);
	}
	/* Repeat the matching process for lines after the most certain line.
	 */
	if (most_certain_local_line_b + 1 < length_b) {
		second_half_start_a = most_certain_line_a;
		offset_b = most_certain_local_line_b + 1;
		second_half_start_b = start_b + offset_b;
		second_half_length_a =
			length_a + start_a - second_half_start_a;
		second_half_length_b =
			length_b + start_b - second_half_start_b;
		fuzzy_find_matching_lines_recurse(
			second_half_start_a, second_half_start_b,
			second_half_length_a, second_half_length_b,
			fingerprints_a + second_half_start_a - start_a,
			fingerprints_b + offset_b,
			similarities +
			    offset_b * (max_search_distance_a * 2 + 1),
			certainties + offset_b,
			second_best_result + offset_b, result + offset_b,
			max_search_distance_a,
			max_search_distance_b,
			map_line_number_in_b_to_a);
	}
}

/* Find the lines in the parent line range that most closely match the lines in
 * the target line range. This is accomplished by matching fingerprints in each
 * blame_origin, and choosing the best matches that preserve the line ordering.
 * See struct fingerprint for details of fingerprint matching, and
 * fuzzy_find_matching_lines_recurse for details of preserving line ordering.
 *
 * The performance is believed to be O(n log n) in the typical case and O(n^2)
 * in a pathological case, where n is the number of lines in the target range.
 *
 * Returns a caller-owned array of length (same - tlno) holding, for each
 * target line, the absolute parent line index of its match (-1 for none),
 * or NULL when the parent range is empty.
 */
static int *fuzzy_find_matching_lines(struct blame_origin *parent,
				      struct blame_origin *target,
				      int tlno, int parent_slno, int same,
				      int parent_len)
{
	/* We use the terminology "A" for the left hand side of the diff AKA
	 * parent, and "B" for the right hand side of the diff AKA target. */
	int start_a = parent_slno;
	int length_a = parent_len;
	int start_b = tlno;
	int length_b = same - tlno;

	struct line_number_mapping map_line_number_in_b_to_a = {
		start_a, length_a, start_b, length_b
	};

	struct fingerprint *fingerprints_a = parent->fingerprints;
	struct fingerprint *fingerprints_b = target->fingerprints;

	int i, *result, *second_best_result,
		*certainties, *similarities, similarity_count;

	/*
	 * max_search_distance_a means that given a line in B, compare it to
	 * the line in A that is closest to its position, and the lines in A
	 * that are no greater than max_search_distance_a lines away from the
	 * closest line in A.
	 *
	 * max_search_distance_b is an upper bound on the greatest possible
	 * distance between lines in B such that they will both be compared
	 * with the same line in A according to max_search_distance_a.
	 */
	int max_search_distance_a = 10, max_search_distance_b;

	if (length_a <= 0)
		return NULL;

	if (max_search_distance_a >= length_a)
		max_search_distance_a = length_a ? length_a - 1 : 0;

	max_search_distance_b = ((2 * max_search_distance_a + 1) * length_b
				 - 1) / length_a;

	CALLOC_ARRAY(result, length_b);
	CALLOC_ARRAY(second_best_result, length_b);
	CALLOC_ARRAY(certainties, length_b);

	/* See get_similarity() for details of similarities. */
	similarity_count = length_b * (max_search_distance_a * 2 + 1);
	CALLOC_ARRAY(similarities, similarity_count);

	for (i = 0; i < length_b; ++i) {
		result[i] = -1;
		second_best_result[i] = -1;
		certainties[i] = CERTAINTY_NOT_CALCULATED;
	}

	for (i = 0; i < similarity_count; ++i)
		similarities[i] = -1;

	fuzzy_find_matching_lines_recurse(start_a, start_b,
					  length_a, length_b,
					  fingerprints_a + start_a,
					  fingerprints_b + start_b,
					  similarities,
					  certainties,
					  second_best_result,
					  result,
					  max_search_distance_a,
					  max_search_distance_b,
					  &map_line_number_in_b_to_a);

	free(similarities);
	free(certainties);
	free(second_best_result);

	return result;
}

1002static void fill_origin_fingerprints(struct blame_origin *o)1003{
1004int *line_starts;1005
1006if (o->fingerprints)1007return;1008o->num_lines = find_line_starts(&line_starts, o->file.ptr,1009o->file.size);1010CALLOC_ARRAY(o->fingerprints, o->num_lines);1011get_line_fingerprints(o->fingerprints, o->file.ptr, line_starts,10120, o->num_lines);1013free(line_starts);1014}
1015
1016static void drop_origin_fingerprints(struct blame_origin *o)1017{
1018if (o->fingerprints) {1019free_line_fingerprints(o->fingerprints, o->num_lines);1020o->num_lines = 0;1021FREE_AND_NULL(o->fingerprints);1022}1023}
1024
/*
 * Given an origin, prepare mmfile_t structure to be used by the
 * diff machinery
 */
static void fill_origin_blob(struct diff_options *opt,
			     struct blame_origin *o, mmfile_t *file,
			     int *num_read_blob, int fill_fingerprints)
{
	if (!o->file.ptr) {
		enum object_type type;
		unsigned long file_size;

		(*num_read_blob)++;
		/* Prefer a textconv'd buffer when the caller allows it;
		 * otherwise read the raw blob from the object store. */
		if (opt->flags.allow_textconv &&
		    textconv_object(opt->repo, o->path, o->mode,
				    &o->blob_oid, 1, &file->ptr, &file_size))
			;
		else
			file->ptr = repo_read_object_file(the_repository,
							  &o->blob_oid, &type,
							  &file_size);
		file->size = file_size;

		if (!file->ptr)
			die("Cannot read blob %s for path %s",
			    oid_to_hex(&o->blob_oid),
			    o->path);
		/* Cache the contents on the origin; *file aliases o->file
		 * from here on, and drop_origin_blob() frees it later. */
		o->file = *file;
	}
	else
		*file = o->file;
	if (fill_fingerprints)
		fill_origin_fingerprints(o);
}
1059
1060static void drop_origin_blob(struct blame_origin *o)1061{
1062FREE_AND_NULL(o->file.ptr);1063drop_origin_fingerprints(o);1064}
1065
/*
 * Any merge of blames happens on lists of blames that arrived via
 * different parents in a single suspect. In this case, we want to
 * sort according to the suspect line numbers as opposed to the final
 * image line numbers. The function body is somewhat longish because
 * it avoids unnecessary writes.
 */

static struct blame_entry *blame_merge(struct blame_entry *list1,
				       struct blame_entry *list2)
{
	struct blame_entry *p1 = list1, *p2 = list2,
		**tail = &list1;

	if (!p1)
		return p2;
	if (!p2)
		return p1;

	/* Skip the leading run of list1 entries that sort before p2. */
	if (p1->s_lno <= p2->s_lno) {
		do {
			tail = &p1->next;
			if (!(p1 = *tail)) {
				*tail = p2;
				return list1;
			}
		} while (p1->s_lno <= p2->s_lno);
	}
	/* Alternate between splicing runs of p2 and p1 onto the tail;
	 * each inner loop exits either by exhausting a list (attach the
	 * remainder of the other and return) or when the other list's
	 * head sorts first again. */
	for (;;) {
		*tail = p2;
		do {
			tail = &p2->next;
			if (!(p2 = *tail)) {
				*tail = p1;
				return list1;
			}
		} while (p1->s_lno > p2->s_lno);
		*tail = p1;
		do {
			tail = &p1->next;
			if (!(p1 = *tail)) {
				*tail = p2;
				return list1;
			}
		} while (p1->s_lno <= p2->s_lno);
	}
}
1113
1114DEFINE_LIST_SORT(static, sort_blame_entries, struct blame_entry, next);1115
1116/*
1117* Final image line numbers are all different, so we don't need a
1118* three-way comparison here.
1119*/
1120
1121static int compare_blame_final(const struct blame_entry *e1,1122const struct blame_entry *e2)1123{
1124return e1->lno > e2->lno ? 1 : -1;1125}
1126
1127static int compare_blame_suspect(const struct blame_entry *s1,1128const struct blame_entry *s2)1129{
1130/*1131* to allow for collating suspects, we sort according to the
1132* respective pointer value as the primary sorting criterion.
1133* The actual relation is pretty unimportant as long as it
1134* establishes a total order. Comparing as integers gives us
1135* that.
1136*/
1137if (s1->suspect != s2->suspect)1138return (intptr_t)s1->suspect > (intptr_t)s2->suspect ? 1 : -1;1139if (s1->s_lno == s2->s_lno)1140return 0;1141return s1->s_lno > s2->s_lno ? 1 : -1;1142}
1143
1144void blame_sort_final(struct blame_scoreboard *sb)1145{
1146sort_blame_entries(&sb->ent, compare_blame_final);1147}
1148
/*
 * prio_queue comparator: the exact opposite ordering of
 * compare_commits_by_commit_date().
 */
static int compare_commits_by_reverse_commit_date(const void *a,
						  const void *b,
						  void *c)
{
	int forward = compare_commits_by_commit_date(a, b, c);

	return -forward;
}
1155
1156/*
1157* For debugging -- origin is refcounted, and this asserts that
1158* we do not underflow.
1159*/
1160static void sanity_check_refcnt(struct blame_scoreboard *sb)1161{
1162int baa = 0;1163struct blame_entry *ent;1164
1165for (ent = sb->ent; ent; ent = ent->next) {1166/* Nobody should have zero or negative refcnt */1167if (ent->suspect->refcnt <= 0) {1168fprintf(stderr, "%s in %s has negative refcnt %d\n",1169ent->suspect->path,1170oid_to_hex(&ent->suspect->commit->object.oid),1171ent->suspect->refcnt);1172baa = 1;1173}1174}1175if (baa)1176sb->on_sanity_fail(sb, baa);1177}
1178
/*
 * If two blame entries that are next to each other came from
 * contiguous lines in the same origin (i.e. <commit, path> pair),
 * merge them together.
 */
void blame_coalesce(struct blame_scoreboard *sb)
{
	struct blame_entry *ent, *next;

	for (ent = sb->ent; ent && (next = ent->next); ent = next) {
		/* Merge only when both origin lines and final-image lines
		 * are contiguous, and the ignored/unblamable flags agree. */
		if (ent->suspect == next->suspect &&
		    ent->s_lno + ent->num_lines == next->s_lno &&
		    ent->lno + ent->num_lines == next->lno &&
		    ent->ignored == next->ignored &&
		    ent->unblamable == next->unblamable) {
			ent->num_lines += next->num_lines;
			ent->next = next->next;
			blame_origin_decref(next->suspect);
			free(next);
			/* invalidate the cached score of the grown entry */
			ent->score = 0;
			next = ent; /* again */
		}
	}

	if (sb->debug) /* sanity */
		sanity_check_refcnt(sb);
}
1206
/*
 * Merge the given sorted list of blames into a preexisting origin.
 * If there were no previous blames to that commit, it is entered into
 * the commit priority queue of the score board.
 */

static void queue_blames(struct blame_scoreboard *sb, struct blame_origin *porigin,
			 struct blame_entry *sorted)
{
	if (porigin->suspects)
		porigin->suspects = blame_merge(porigin->suspects, sorted);
	else {
		struct blame_origin *o;
		/*
		 * If any other origin of the same commit already carries
		 * suspects, the commit is already in the priority queue;
		 * avoid inserting it a second time.
		 */
		for (o = get_blame_suspects(porigin->commit); o; o = o->next) {
			if (o->suspects) {
				porigin->suspects = sorted;
				return;
			}
		}
		porigin->suspects = sorted;
		prio_queue_put(&sb->commits, porigin->commit);
	}
}
1230
1231/*
1232* Fill the blob_sha1 field of an origin if it hasn't, so that later
1233* call to fill_origin_blob() can use it to locate the data. blob_sha1
1234* for an origin is also used to pass the blame for the entire file to
1235* the parent to detect the case where a child's blob is identical to
1236* that of its parent's.
1237*
1238* This also fills origin->mode for corresponding tree path.
1239*/
1240static int fill_blob_sha1_and_mode(struct repository *r,1241struct blame_origin *origin)1242{
1243if (!is_null_oid(&origin->blob_oid))1244return 0;1245if (get_tree_entry(r, &origin->commit->object.oid, origin->path, &origin->blob_oid, &origin->mode))1246goto error_out;1247if (oid_object_info(r, &origin->blob_oid, NULL) != OBJ_BLOB)1248goto error_out;1249return 0;1250error_out:1251oidclr(&origin->blob_oid, the_repository->hash_algo);1252origin->mode = S_IFINVALID;1253return -1;1254}
1255
struct blame_bloom_data {
	/*
	 * Changed-path Bloom filter keys. These can help prevent
	 * computing diffs against first parents, but we need to
	 * expand the list as code is moved or files are renamed.
	 */
	struct bloom_filter_settings *settings;
	struct bloom_key **keys;	/* growable array of keys */
	int nr;				/* keys currently in use */
	int alloc;			/* allocated capacity of keys */
};
/* Statistics on Bloom filter usage, kept for diagnostics. */
static int bloom_count_queries = 0;
static int bloom_count_no = 0;
/*
 * Use the changed-path Bloom filter of origin->commit to decide whether
 * any of the tracked paths may have been modified in it.  Returns 1
 * ("maybe changed", a diff is needed) unless the filter definitively
 * rules all paths out, in which case 0 is returned.  Conservatively
 * returns 1 whenever no filter data is available.
 */
static int maybe_changed_path(struct repository *r,
			      struct blame_origin *origin,
			      struct blame_bloom_data *bd)
{
	int i;
	struct bloom_filter *filter;

	if (!bd)
		return 1;

	/* commits not in the commit-graph have no filter data */
	if (commit_graph_generation(origin->commit) == GENERATION_NUMBER_INFINITY)
		return 1;

	filter = get_bloom_filter(r, origin->commit);

	if (!filter)
		return 1;

	bloom_count_queries++;
	for (i = 0; i < bd->nr; i++) {
		if (bloom_filter_contains(filter,
					  bd->keys[i],
					  bd->settings))
			return 1;
	}

	bloom_count_no++;
	return 0;
}
1299
1300static void add_bloom_key(struct blame_bloom_data *bd,1301const char *path)1302{
1303if (!bd)1304return;1305
1306if (bd->nr >= bd->alloc) {1307bd->alloc *= 2;1308REALLOC_ARRAY(bd->keys, bd->alloc);1309}1310
1311bd->keys[bd->nr] = xmalloc(sizeof(struct bloom_key));1312fill_bloom_key(path, strlen(path), bd->keys[bd->nr], bd->settings);1313bd->nr++;1314}
1315
/*
 * We have an origin -- check if the same path exists in the
 * parent and return an origin structure to represent it.
 */
static struct blame_origin *find_origin(struct repository *r,
					struct commit *parent,
					struct blame_origin *origin,
					struct blame_bloom_data *bd)
{
	struct blame_origin *porigin;
	struct diff_options diff_opts;
	const char *paths[2];

	/* First check any existing origins */
	for (porigin = get_blame_suspects(parent); porigin; porigin = porigin->next)
		if (!strcmp(porigin->path, origin->path)) {
			/*
			 * The same path between origin and its parent
			 * without renaming -- the most common case.
			 */
			return blame_origin_incref (porigin);
		}

	/* See if the origin->path is different between parent
	 * and origin first.  Most of the time they are the
	 * same and diff-tree is fairly efficient about this.
	 */
	repo_diff_setup(r, &diff_opts);
	diff_opts.flags.recursive = 1;
	diff_opts.detect_rename = 0;
	diff_opts.output_format = DIFF_FORMAT_NO_OUTPUT;
	paths[0] = origin->path;
	paths[1] = NULL;

	parse_pathspec(&diff_opts.pathspec,
		       PATHSPEC_ALL_MAGIC & ~PATHSPEC_LITERAL,
		       PATHSPEC_LITERAL_PATH, "", paths);
	diff_setup_done(&diff_opts);

	if (is_null_oid(&origin->commit->object.oid))
		do_diff_cache(get_commit_tree_oid(parent), &diff_opts);
	else {
		int compute_diff = 1;
		/*
		 * When diffing against the first parent, the changed-path
		 * Bloom filter may let us skip computing the diff entirely.
		 */
		if (origin->commit->parents &&
		    oideq(&parent->object.oid,
			  &origin->commit->parents->item->object.oid))
			compute_diff = maybe_changed_path(r, origin, bd);

		if (compute_diff)
			diff_tree_oid(get_commit_tree_oid(parent),
				      get_commit_tree_oid(origin->commit),
				      "", &diff_opts);
	}
	diffcore_std(&diff_opts);

	if (!diff_queued_diff.nr) {
		/* The path is the same as parent */
		porigin = get_origin(parent, origin->path);
		oidcpy(&porigin->blob_oid, &origin->blob_oid);
		porigin->mode = origin->mode;
	} else {
		/*
		 * Since origin->path is a pathspec, if the parent
		 * commit had it as a directory, we will see a whole
		 * bunch of deletion of files in the directory that we
		 * do not care about.
		 */
		int i;
		struct diff_filepair *p = NULL;
		for (i = 0; i < diff_queued_diff.nr; i++) {
			const char *name;
			p = diff_queued_diff.queue[i];
			name = p->one->path ? p->one->path : p->two->path;
			if (!strcmp(name, origin->path))
				break;
		}
		if (!p)
			die("internal error in blame::find_origin");
		switch (p->status) {
		default:
			die("internal error in blame::find_origin (%c)",
			    p->status);
		case 'M':
			/* modified in place: blame continues at the same
			 * path with the parent's blob */
			porigin = get_origin(parent, origin->path);
			oidcpy(&porigin->blob_oid, &p->one->oid);
			porigin->mode = p->one->mode;
			break;
		case 'A':
		case 'T':
			/* Did not exist in parent, or type changed;
			 * porigin stays NULL for the caller to notice */
			break;
		}
	}
	diff_flush(&diff_opts);
	return porigin;
}
1412
/*
 * We have an origin -- find the path that corresponds to it in its
 * parent and return an origin structure to represent it.
 */
static struct blame_origin *find_rename(struct repository *r,
					struct commit *parent,
					struct blame_origin *origin,
					struct blame_bloom_data *bd)
{
	struct blame_origin *porigin = NULL;
	struct diff_options diff_opts;
	int i;

	/* Run a rename-detecting diff restricted to following origin->path. */
	repo_diff_setup(r, &diff_opts);
	diff_opts.flags.recursive = 1;
	diff_opts.detect_rename = DIFF_DETECT_RENAME;
	diff_opts.output_format = DIFF_FORMAT_NO_OUTPUT;
	diff_opts.single_follow = origin->path;
	diff_setup_done(&diff_opts);

	if (is_null_oid(&origin->commit->object.oid))
		do_diff_cache(get_commit_tree_oid(parent), &diff_opts);
	else
		diff_tree_oid(get_commit_tree_oid(parent),
			      get_commit_tree_oid(origin->commit),
			      "", &diff_opts);
	diffcore_std(&diff_opts);

	for (i = 0; i < diff_queued_diff.nr; i++) {
		struct diff_filepair *p = diff_queued_diff.queue[i];
		/* A rename or copy whose destination is our path: blame
		 * continues at the source path in the parent. */
		if ((p->status == 'R' || p->status == 'C') &&
		    !strcmp(p->two->path, origin->path)) {
			add_bloom_key(bd, p->one->path);
			porigin = get_origin(parent, p->one->path);
			oidcpy(&porigin->blob_oid, &p->one->oid);
			porigin->mode = p->one->mode;
			break;
		}
	}
	diff_flush(&diff_opts);
	/* NULL when no rename/copy source was found in the parent */
	return porigin;
}
1455
1456/*
1457* Append a new blame entry to a given output queue.
1458*/
1459static void add_blame_entry(struct blame_entry ***queue,1460const struct blame_entry *src)1461{
1462struct blame_entry *e = xmalloc(sizeof(*e));1463memcpy(e, src, sizeof(*e));1464blame_origin_incref(e->suspect);1465
1466e->next = **queue;1467**queue = e;1468*queue = &e->next;1469}
1470
/*
 * src typically is on-stack; we want to copy the information in it to
 * a malloced blame_entry that gets added to the given queue. The
 * origin of dst loses a refcnt.
 */
static void dup_entry(struct blame_entry ***queue,
		      struct blame_entry *dst, struct blame_entry *src)
{
	/* incref before decref: if both point at the same origin, the
	 * refcount must never transiently hit zero and free it */
	blame_origin_incref(src->suspect);
	blame_origin_decref(dst->suspect);
	memcpy(dst, src, sizeof(*src));
	/* splice dst onto the queue and advance the tail pointer */
	dst->next = **queue;
	**queue = dst;
	*queue = &dst->next;
}
1486
1487const char *blame_nth_line(struct blame_scoreboard *sb, long lno)1488{
1489return sb->final_buf + sb->lineno[lno];1490}
1491
/*
 * It is known that lines between tlno to same came from parent, and e
 * has an overlap with that range.  it also is known that parent's
 * line plno corresponds to e's line tlno.
 *
 *                <---- e ----->
 *                   <------>
 *                   <------------>
 *             <------------>
 *             <------------------>
 *
 * Split e into potentially three parts; before this chunk, the chunk
 * to be blamed for the parent, and after that portion.
 */
static void split_overlap(struct blame_entry *split,
			  struct blame_entry *e,
			  int tlno, int plno, int same,
			  struct blame_origin *parent)
{
	int chunk_end_lno;
	int i;
	memset(split, 0, sizeof(struct blame_entry [3]));

	/* all three pieces inherit e's ignored/unblamable flags */
	for (i = 0; i < 3; i++) {
		split[i].ignored = e->ignored;
		split[i].unblamable = e->unblamable;
	}

	if (e->s_lno < tlno) {
		/* there is a pre-chunk part not blamed on parent */
		split[0].suspect = blame_origin_incref(e->suspect);
		split[0].lno = e->lno;
		split[0].s_lno = e->s_lno;
		split[0].num_lines = tlno - e->s_lno;
		split[1].lno = e->lno + tlno - e->s_lno;
		split[1].s_lno = plno;
	}
	else {
		/* e starts inside the chunk; map its start into the parent */
		split[1].lno = e->lno;
		split[1].s_lno = plno + (e->s_lno - tlno);
	}

	if (same < e->s_lno + e->num_lines) {
		/* there is a post-chunk part not blamed on parent */
		split[2].suspect = blame_origin_incref(e->suspect);
		split[2].lno = e->lno + (same - e->s_lno);
		split[2].s_lno = e->s_lno + (same - e->s_lno);
		split[2].num_lines = e->s_lno + e->num_lines - same;
		chunk_end_lno = split[2].lno;
	}
	else
		chunk_end_lno = e->lno + e->num_lines;
	split[1].num_lines = chunk_end_lno - split[1].lno;

	/*
	 * if it turns out there is nothing to blame the parent for,
	 * forget about the splitting.  !split[1].suspect signals this.
	 */
	if (split[1].num_lines < 1)
		return;
	split[1].suspect = blame_origin_incref(parent);
}
1554
/*
 * split_overlap() divided an existing blame e into up to three parts
 * in split.  Any assigned blame is moved to queue to
 * reflect the split.  split[1] always goes to the parent (blamed);
 * split[0]/split[2], when present, stay with the target (unblamed).
 */
static void split_blame(struct blame_entry ***blamed,
			struct blame_entry ***unblamed,
			struct blame_entry *split,
			struct blame_entry *e)
{
	if (split[0].suspect && split[2].suspect) {
		/* The first part (reuse storage for the existing entry e) */
		dup_entry(unblamed, e, &split[0]);

		/* The last part -- me */
		add_blame_entry(unblamed, &split[2]);

		/* ... and the middle part -- parent */
		add_blame_entry(blamed, &split[1]);
	}
	else if (!split[0].suspect && !split[2].suspect)
		/*
		 * The parent covers the entire area; reuse storage for
		 * e and replace it with the parent.
		 */
		dup_entry(blamed, e, &split[1]);
	else if (split[0].suspect) {
		/* me and then parent */
		dup_entry(unblamed, e, &split[0]);
		add_blame_entry(blamed, &split[1]);
	}
	else {
		/* parent and then me */
		dup_entry(blamed, e, &split[1]);
		add_blame_entry(unblamed, &split[2]);
	}
}
1592
1593/*
1594* After splitting the blame, the origins used by the
1595* on-stack blame_entry should lose one refcnt each.
1596*/
1597static void decref_split(struct blame_entry *split)1598{
1599int i;1600
1601for (i = 0; i < 3; i++)1602blame_origin_decref(split[i].suspect);1603}
1604
1605/*
1606* reverse_blame reverses the list given in head, appending tail.
1607* That allows us to build lists in reverse order, then reverse them
1608* afterwards. This can be faster than building the list in proper
1609* order right away. The reason is that building in proper order
1610* requires writing a link in the _previous_ element, while building
1611* in reverse order just requires placing the list head into the
1612* _current_ element.
1613*/
1614
1615static struct blame_entry *reverse_blame(struct blame_entry *head,1616struct blame_entry *tail)1617{
1618while (head) {1619struct blame_entry *next = head->next;1620head->next = tail;1621tail = head;1622head = next;1623}1624return tail;1625}
1626
1627/*
1628* Splits a blame entry into two entries at 'len' lines. The original 'e'
1629* consists of len lines, i.e. [e->lno, e->lno + len), and the second part,
1630* which is returned, consists of the remainder: [e->lno + len, e->lno +
1631* e->num_lines). The caller needs to sort out the reference counting for the
1632* new entry's suspect.
1633*/
1634static struct blame_entry *split_blame_at(struct blame_entry *e, int len,1635struct blame_origin *new_suspect)1636{
1637struct blame_entry *n = xcalloc(1, sizeof(struct blame_entry));1638
1639n->suspect = new_suspect;1640n->ignored = e->ignored;1641n->unblamable = e->unblamable;1642n->lno = e->lno + len;1643n->s_lno = e->s_lno + len;1644n->num_lines = e->num_lines - len;1645e->num_lines = len;1646e->score = 0;1647return n;1648}
1649
1650struct blame_line_tracker {1651int is_parent;1652int s_lno;1653};1654
1655static int are_lines_adjacent(struct blame_line_tracker *first,1656struct blame_line_tracker *second)1657{
1658return first->is_parent == second->is_parent &&1659first->s_lno + 1 == second->s_lno;1660}
1661
/*
 * Scan nr_lines of the parent's fingerprints starting at "from" for the
 * line most similar to target line t_idx.  Returns the best parent line
 * index, or -1 when nothing reaches the similarity threshold.
 */
static int scan_parent_range(struct fingerprint *p_fps,
			     struct fingerprint *t_fps, int t_idx,
			     int from, int nr_lines)
{
	int sim, p_idx;
	/* minimum similarity required before a match is accepted */
	#define FINGERPRINT_FILE_THRESHOLD	10
	int best_sim_val = FINGERPRINT_FILE_THRESHOLD;
	int best_sim_idx = -1;

	for (p_idx = from; p_idx < from + nr_lines; p_idx++) {
		sim = fingerprint_similarity(&t_fps[t_idx], &p_fps[p_idx]);
		if (sim < best_sim_val)
			continue;
		/* Break ties with the closest-to-target line number */
		if (sim == best_sim_val && best_sim_idx != -1 &&
		    abs(best_sim_idx - t_idx) < abs(p_idx - t_idx))
			continue;
		best_sim_val = sim;
		best_sim_idx = p_idx;
	}
	return best_sim_idx;
}
1684
/*
 * The first pass checks the blame entry (from the target) against the parent's
 * diff chunk.  If that fails for a line, the second pass tries to match that
 * line to any part of parent file.  That catches cases where a change was
 * broken into two chunks by 'context.'
 */
static void guess_line_blames(struct blame_origin *parent,
			      struct blame_origin *target,
			      int tlno, int offset, int same, int parent_len,
			      struct blame_line_tracker *line_blames)
{
	int i, best_idx, target_idx;
	int parent_slno = tlno + offset;
	int *fuzzy_matches;

	/* first pass: fuzzy match constrained to the diff chunk */
	fuzzy_matches = fuzzy_find_matching_lines(parent, target,
						  tlno, parent_slno, same,
						  parent_len);
	for (i = 0; i < same - tlno; i++) {
		target_idx = tlno + i;
		if (fuzzy_matches && fuzzy_matches[i] >= 0) {
			best_idx = fuzzy_matches[i];
		} else {
			/* second pass: scan the entire parent file */
			best_idx = scan_parent_range(parent->fingerprints,
						     target->fingerprints,
						     target_idx, 0,
						     parent->num_lines);
		}
		if (best_idx >= 0) {
			line_blames[i].is_parent = 1;
			line_blames[i].s_lno = best_idx;
		} else {
			/* no plausible parent line; stays with the target */
			line_blames[i].is_parent = 0;
			line_blames[i].s_lno = target_idx;
		}
	}
	free(fuzzy_matches);
}
1723
/*
 * This decides which parts of a blame entry go to the parent (added to the
 * ignoredp list) and which stay with the target (added to the diffp list). The
 * actual decision was made in a separate heuristic function, and those answers
 * for the lines in 'e' are in line_blames.  This consumes e, essentially
 * putting it on a list.
 *
 * Note that the blame entries on the ignoredp list are not necessarily sorted
 * with respect to the parent's line numbers yet.
 */
static void ignore_blame_entry(struct blame_entry *e,
			       struct blame_origin *parent,
			       struct blame_entry **diffp,
			       struct blame_entry **ignoredp,
			       struct blame_line_tracker *line_blames)
{
	int entry_len, nr_lines, i;

	/*
	 * We carve new entries off the front of e.  Each entry comes from a
	 * contiguous chunk of lines: adjacent lines from the same origin
	 * (either the parent or the target).
	 */
	entry_len = 1;
	nr_lines = e->num_lines;	/* e changes in the loop */
	for (i = 0; i < nr_lines; i++) {
		struct blame_entry *next = NULL;

		/*
		 * We are often adjacent to the next line - only split the blame
		 * entry when we have to.
		 */
		if (i + 1 < nr_lines) {
			if (are_lines_adjacent(&line_blames[i],
					       &line_blames[i + 1])) {
				entry_len++;
				continue;
			}
			next = split_blame_at(e, entry_len,
					      blame_origin_incref(e->suspect));
		}
		if (line_blames[i].is_parent) {
			/* chunk matched the parent: re-point the suspect */
			e->ignored = 1;
			blame_origin_decref(e->suspect);
			e->suspect = blame_origin_incref(parent);
			/* s_lno of the chunk's first line in the parent */
			e->s_lno = line_blames[i - entry_len + 1].s_lno;
			e->next = *ignoredp;
			*ignoredp = e;
		} else {
			e->unblamable = 1;
			/* e->s_lno is already in the target's address space. */
			e->next = *diffp;
			*diffp = e;
		}
		assert(e->num_lines == entry_len);
		e = next;
		entry_len = 1;
	}
	assert(!e);
}
1784
/*
 * Process one hunk from the patch between the current suspect for
 * blame_entry e and its parent.  This first blames any unfinished
 * entries before the chunk (which is where target and parent start
 * differing) on the parent, and then splits blame entries at the
 * start and at the end of the difference region.  Since use of -M and
 * -C options may lead to overlapping/duplicate source line number
 * ranges, all we can rely on from sorting/merging is the order of the
 * first suspect line number.
 *
 * tlno: line number in the target where this chunk begins
 * same: line number in the target where this chunk ends
 * offset: add to tlno to get the chunk starting point in the parent
 * parent_len: number of lines in the parent chunk
 */
static void blame_chunk(struct blame_entry ***dstq, struct blame_entry ***srcq,
			int tlno, int offset, int same, int parent_len,
			struct blame_origin *parent,
			struct blame_origin *target, int ignore_diffs)
{
	struct blame_entry *e = **srcq;
	struct blame_entry *samep = NULL, *diffp = NULL, *ignoredp = NULL;
	struct blame_line_tracker *line_blames = NULL;

	while (e && e->s_lno < tlno) {
		struct blame_entry *next = e->next;
		/*
		 * current record starts before differing portion.  If
		 * it reaches into it, we need to split it up and
		 * examine the second part separately.
		 */
		if (e->s_lno + e->num_lines > tlno) {
			/* Move second half to a new record */
			struct blame_entry *n;

			n = split_blame_at(e, tlno - e->s_lno, e->suspect);
			/* Push new record to diffp */
			n->next = diffp;
			diffp = n;
		} else
			blame_origin_decref(e->suspect);
		/* Pass blame for everything before the differing
		 * chunk to the parent */
		e->suspect = blame_origin_incref(parent);
		e->s_lno += offset;
		e->next = samep;
		samep = e;
		e = next;
	}
	/*
	 * As we don't know how much of a common stretch after this
	 * diff will occur, the currently blamed parts are all that we
	 * can assign to the parent for now.
	 */

	if (samep) {
		**dstq = reverse_blame(samep, **dstq);
		*dstq = &samep->next;
	}
	/*
	 * Prepend the split off portions: everything after e starts
	 * after the blameable portion.
	 */
	e = reverse_blame(diffp, e);

	/*
	 * Now retain records on the target while parts are different
	 * from the parent.
	 */
	samep = NULL;
	diffp = NULL;

	/* when ignoring a commit's diffs, heuristically attribute each
	 * differing line to a parent line first */
	if (ignore_diffs && same - tlno > 0) {
		CALLOC_ARRAY(line_blames, same - tlno);
		guess_line_blames(parent, target, tlno, offset, same,
				  parent_len, line_blames);
	}

	while (e && e->s_lno < same) {
		struct blame_entry *next = e->next;

		/*
		 * If current record extends into sameness, need to split.
		 */
		if (e->s_lno + e->num_lines > same) {
			/*
			 * Move second half to a new record to be
			 * processed by later chunks
			 */
			struct blame_entry *n;

			n = split_blame_at(e, same - e->s_lno,
					   blame_origin_incref(e->suspect));
			/* Push new record to samep */
			n->next = samep;
			samep = n;
		}
		if (ignore_diffs) {
			ignore_blame_entry(e, parent, &diffp, &ignoredp,
					   line_blames + e->s_lno - tlno);
		} else {
			e->next = diffp;
			diffp = e;
		}
		e = next;
	}
	free(line_blames);
	if (ignoredp) {
		/*
		 * Note ignoredp is not sorted yet, and thus neither is dstq.
		 * That list must be sorted before we queue_blames().  We defer
		 * sorting until after all diff hunks are processed, so that
		 * guess_line_blames() can pick *any* line in the parent.  The
		 * slight drawback is that we end up sorting all blame entries
		 * passed to the parent, including those that are unrelated to
		 * changes made by the ignored commit.
		 */
		**dstq = reverse_blame(ignoredp, **dstq);
		*dstq = &ignoredp->next;
	}
	**srcq = reverse_blame(diffp, reverse_blame(samep, e));
	/* Move across elements that are in the unblamable portion */
	if (diffp)
		*srcq = &diffp->next;
}
1910
1911struct blame_chunk_cb_data {1912struct blame_origin *parent;1913struct blame_origin *target;1914long offset;1915int ignore_diffs;1916struct blame_entry **dstq;1917struct blame_entry **srcq;1918};1919
/* diff chunks are from parent to target */
static int blame_chunk_cb(long start_a, long count_a,
			  long start_b, long count_b, void *data)
{
	struct blame_chunk_cb_data *d = data;
	/* the hunk start must agree with the offset accumulated so far */
	if (start_a - start_b != d->offset)
		die("internal error in blame::blame_chunk_cb");
	blame_chunk(&d->dstq, &d->srcq, start_b, start_a - start_b,
		    start_b + count_b, count_a, d->parent, d->target,
		    d->ignore_diffs);
	/* update the offset for the stretch following this hunk */
	d->offset = start_a + count_a - (start_b + count_b);
	return 0;
}
1933
/*
 * We are looking at the origin 'target' and aiming to pass blame
 * for the lines it is suspected to its parent.  Run diff to find
 * which lines came from parent and pass blame for them.
 */
static void pass_blame_to_parent(struct blame_scoreboard *sb,
				 struct blame_origin *target,
				 struct blame_origin *parent, int ignore_diffs)
{
	mmfile_t file_p, file_o;
	struct blame_chunk_cb_data d;
	struct blame_entry *newdest = NULL;

	if (!target->suspects)
		return; /* nothing remains for this target */

	d.parent = parent;
	d.target = target;
	d.offset = 0;
	d.ignore_diffs = ignore_diffs;
	d.dstq = &newdest; d.srcq = &target->suspects;

	/* fingerprints are only needed when ignoring this commit's diffs */
	fill_origin_blob(&sb->revs->diffopt, parent, &file_p,
			 &sb->num_read_blob, ignore_diffs);
	fill_origin_blob(&sb->revs->diffopt, target, &file_o,
			 &sb->num_read_blob, ignore_diffs);
	sb->num_get_patch++;

	if (diff_hunks(&file_p, &file_o, blame_chunk_cb, &d, sb->xdl_opts))
		die("unable to generate diff (%s -> %s)",
		    oid_to_hex(&parent->commit->object.oid),
		    oid_to_hex(&target->commit->object.oid));
	/* The rest are the same as the parent */
	blame_chunk(&d.dstq, &d.srcq, INT_MAX, d.offset, INT_MAX, 0,
		    parent, target, 0);
	*d.dstq = NULL;
	/* entries produced via ignore lists are not yet in suspect order */
	if (ignore_diffs)
		sort_blame_entries(&newdest, compare_blame_suspect);
	queue_blames(sb, parent, newdest);

	return;
}
1976
1977/*
1978* The lines in blame_entry after splitting blames many times can become
1979* very small and trivial, and at some point it becomes pointless to
1980* blame the parents. E.g. "\t\t}\n\t}\n\n" appears everywhere in any
1981* ordinary C program, and it is not worth to say it was copied from
1982* totally unrelated file in the parent.
1983*
1984* Compute how trivial the lines in the blame_entry are.
1985*/
1986unsigned blame_entry_score(struct blame_scoreboard *sb, struct blame_entry *e)1987{
1988unsigned score;1989const char *cp, *ep;1990
1991if (e->score)1992return e->score;1993
1994score = 1;1995cp = blame_nth_line(sb, e->lno);1996ep = blame_nth_line(sb, e->lno + e->num_lines);1997while (cp < ep) {1998unsigned ch = *((unsigned char *)cp);1999if (isalnum(ch))2000score++;2001cp++;2002}2003e->score = score;2004return score;2005}
2006
/*
 * best_so_far[] and potential[] are both a split of an existing blame_entry
 * that passes blame to the parent.  Maintain best_so_far the best split so
 * far, by comparing potential and best_so_far and copying potential into
 * best_so_far as needed.
 */
static void copy_split_if_better(struct blame_scoreboard *sb,
				 struct blame_entry *best_so_far,
				 struct blame_entry *potential)
{
	int i;

	/* !potential[1].suspect means "no blame for the parent here" */
	if (!potential[1].suspect)
		return;
	if (best_so_far[1].suspect) {
		/* keep the split that blames the higher-scoring chunk */
		if (blame_entry_score(sb, &potential[1]) <
		    blame_entry_score(sb, &best_so_far[1]))
			return;
	}

	/* take references before releasing the ones we overwrite */
	for (i = 0; i < 3; i++)
		blame_origin_incref(potential[i].suspect);
	decref_split(best_so_far);
	memcpy(best_so_far, potential, sizeof(struct blame_entry[3]));
}
2032
/*
 * We are looking at a part of the final image represented by
 * ent (tlno and same are offset by ent->s_lno).
 * tlno is where we are looking at in the final image.
 * up to (but not including) same match preimage.
 * plno is where we are looking at in the preimage.
 *
 * <-------------- final image ---------------------->
 *       <------ent------>
 *         ^tlno ^same
 *    <---------preimage----->
 *         ^plno
 *
 * All line numbers are 0-based.
 */
static void handle_split(struct blame_scoreboard *sb,
			 struct blame_entry *ent,
			 int tlno, int plno, int same,
			 struct blame_origin *parent,
			 struct blame_entry *split)
{
	if (ent->num_lines <= tlno)
		return;
	if (tlno < same) {
		struct blame_entry potential[3];
		/* convert from ent-relative to absolute line numbers */
		tlno += ent->s_lno;
		same += ent->s_lno;
		split_overlap(potential, ent, tlno, plno, same, parent);
		copy_split_if_better(sb, split, potential);
		decref_split(potential);
	}
}
2065
2066struct handle_split_cb_data {2067struct blame_scoreboard *sb;2068struct blame_entry *ent;2069struct blame_origin *parent;2070struct blame_entry *split;2071long plno;2072long tlno;2073};2074
2075static int handle_split_cb(long start_a, long count_a,2076long start_b, long count_b, void *data)2077{
2078struct handle_split_cb_data *d = data;2079handle_split(d->sb, d->ent, d->tlno, d->plno, start_b, d->parent,2080d->split);2081d->plno = start_a + count_a;2082d->tlno = start_b + count_b;2083return 0;2084}
2085
/*
 * Find the lines from parent that are the same as ent so that
 * we can pass blames to it. file_p has the blob contents for
 * the parent.  On return, split[] holds the best split found
 * (split[1].suspect is NULL when nothing matched).
 */
static void find_copy_in_blob(struct blame_scoreboard *sb,
			      struct blame_entry *ent,
			      struct blame_origin *parent,
			      struct blame_entry *split,
			      mmfile_t *file_p)
{
	const char *cp;
	mmfile_t file_o;
	struct handle_split_cb_data d;

	memset(&d, 0, sizeof(d));
	d.sb = sb; d.ent = ent; d.parent = parent; d.split = split;
	/*
	 * Prepare mmfile that contains only the lines in ent.
	 */
	cp = blame_nth_line(sb, ent->lno);
	file_o.ptr = (char *) cp;
	file_o.size = blame_nth_line(sb, ent->lno + ent->num_lines) - cp;

	/*
	 * file_o is a part of final image we are annotating.
	 * file_p partially may match that image.
	 */
	memset(split, 0, sizeof(struct blame_entry [3]));
	if (diff_hunks(file_p, &file_o, handle_split_cb, &d, sb->xdl_opts))
		die("unable to generate diff (%s)",
		    oid_to_hex(&parent->commit->object.oid));
	/* remainder, if any, all match the preimage */
	handle_split(sb, ent, d.tlno, d.plno, ent->num_lines, parent, split);
}
2121
/* Move all blame entries from list *source whose score is no greater
 * than score_min to the front of list *small (note: the comparison is
 * inclusive, "<= score_min").
 * Returns a pointer to the link pointing to the old head of the small list.
 */

static struct blame_entry **filter_small(struct blame_scoreboard *sb,
					 struct blame_entry **small,
					 struct blame_entry **source,
					 unsigned score_min)
{
	struct blame_entry *p = *source;
	struct blame_entry *oldsmall = *small;
	/* partition: relink each node onto either the small or source tail */
	while (p) {
		if (blame_entry_score(sb, p) <= score_min) {
			*small = p;
			small = &p->next;
			p = *small;
		} else {
			*source = p;
			source = &p->next;
			p = *source;
		}
	}
	/* reattach the previous small-list head behind the moved entries */
	*small = oldsmall;
	*source = NULL;
	return small;
}
2149
/*
 * See if lines currently target is suspected for can be attributed to
 * parent (code movement within the same path, i.e. "blame -M").
 */
static void find_move_in_parent(struct blame_scoreboard *sb,
				struct blame_entry ***blamed,
				struct blame_entry **toosmall,
				struct blame_origin *target,
				struct blame_origin *parent)
{
	struct blame_entry *e, split[3];
	struct blame_entry *unblamed = target->suspects;
	struct blame_entry *leftover = NULL;
	mmfile_t file_p;

	if (!unblamed)
		return; /* nothing remains for this target */

	fill_origin_blob(&sb->revs->diffopt, parent, &file_p,
			 &sb->num_read_blob, 0);
	if (!file_p.ptr)
		return;

	/* At each iteration, unblamed has a NULL-terminated list of
	 * entries that have not yet been tested for blame. leftover
	 * contains the reversed list of entries that have been tested
	 * without being assignable to the parent.
	 */
	do {
		struct blame_entry **unblamedtail = &unblamed;
		struct blame_entry *next;
		for (e = unblamed; e; e = next) {
			next = e->next;
			find_copy_in_blob(sb, e, parent, split, &file_p);
			if (split[1].suspect &&
			    sb->move_score < blame_entry_score(sb, &split[1])) {
				/* a sufficiently large chunk moved from parent */
				split_blame(blamed, &unblamedtail, split, e);
			} else {
				e->next = leftover;
				leftover = e;
			}
			decref_split(split);
		}
		*unblamedtail = NULL;
		/* drop below-threshold fragments so they are not retried */
		toosmall = filter_small(sb, toosmall, &unblamed, sb->move_score);
	} while (unblamed);
	target->suspects = reverse_blame(leftover, NULL);
}
2198
2199struct blame_list {2200struct blame_entry *ent;2201struct blame_entry split[3];2202};2203
2204/*
2205* Count the number of entries the target is suspected for,
2206* and prepare a list of entry and the best split.
2207*/
2208static struct blame_list *setup_blame_list(struct blame_entry *unblamed,2209int *num_ents_p)2210{
2211struct blame_entry *e;2212int num_ents, i;2213struct blame_list *blame_list = NULL;2214
2215for (e = unblamed, num_ents = 0; e; e = e->next)2216num_ents++;2217if (num_ents) {2218CALLOC_ARRAY(blame_list, num_ents);2219for (e = unblamed, i = 0; e; e = e->next)2220blame_list[i++].ent = e;2221}2222*num_ents_p = num_ents;2223return blame_list;2224}
2225
/*
 * For lines target is suspected for, see if we can find code movement
 * across file boundary from the parent commit. porigin is the path
 * in the parent we already tried ("blame -C").
 */
static void find_copy_in_parent(struct blame_scoreboard *sb,
				struct blame_entry ***blamed,
				struct blame_entry **toosmall,
				struct blame_origin *target,
				struct commit *parent,
				struct blame_origin *porigin,
				int opt)
{
	struct diff_options diff_opts;
	int i, j;
	struct blame_list *blame_list;
	int num_ents;
	struct blame_entry *unblamed = target->suspects;
	struct blame_entry *leftover = NULL;

	if (!unblamed)
		return; /* nothing remains for this target */

	repo_diff_setup(sb->repo, &diff_opts);
	diff_opts.flags.recursive = 1;
	diff_opts.output_format = DIFF_FORMAT_NO_OUTPUT;

	diff_setup_done(&diff_opts);

	/* Try "find copies harder" on new path if requested;
	 * we do not want to use diffcore_rename() actually to
	 * match things up; find_copies_harder is set only to
	 * force diff_tree_oid() to feed all filepairs to diff_queue,
	 * and this code needs to be after diff_setup_done(), which
	 * usually makes find-copies-harder imply copy detection.
	 */
	if ((opt & PICKAXE_BLAME_COPY_HARDEST)
	    || ((opt & PICKAXE_BLAME_COPY_HARDER)
		&& (!porigin || strcmp(target->path, porigin->path))))
		diff_opts.flags.find_copies_harder = 1;

	/* a null oid means the fake working-tree commit; diff the cache */
	if (is_null_oid(&target->commit->object.oid))
		do_diff_cache(get_commit_tree_oid(parent), &diff_opts);
	else
		diff_tree_oid(get_commit_tree_oid(parent),
			      get_commit_tree_oid(target->commit),
			      "", &diff_opts);

	if (!diff_opts.flags.find_copies_harder)
		diffcore_std(&diff_opts);

	do {
		struct blame_entry **unblamedtail = &unblamed;
		blame_list = setup_blame_list(unblamed, &num_ents);

		/* try every changed path in the parent as a copy source */
		for (i = 0; i < diff_queued_diff.nr; i++) {
			struct diff_filepair *p = diff_queued_diff.queue[i];
			struct blame_origin *norigin;
			mmfile_t file_p;
			struct blame_entry potential[3];

			if (!DIFF_FILE_VALID(p->one))
				continue; /* does not exist in parent */
			if (S_ISGITLINK(p->one->mode))
				continue; /* ignore git links */
			if (porigin && !strcmp(p->one->path, porigin->path))
				/* find_move already dealt with this path */
				continue;

			norigin = get_origin(parent, p->one->path);
			oidcpy(&norigin->blob_oid, &p->one->oid);
			norigin->mode = p->one->mode;
			fill_origin_blob(&sb->revs->diffopt, norigin, &file_p,
					 &sb->num_read_blob, 0);
			if (!file_p.ptr)
				continue;

			for (j = 0; j < num_ents; j++) {
				find_copy_in_blob(sb, blame_list[j].ent,
						  norigin, potential, &file_p);
				copy_split_if_better(sb, blame_list[j].split,
						     potential);
				decref_split(potential);
			}
			blame_origin_decref(norigin);
		}

		/* apply the best split found for each entry, if big enough */
		for (j = 0; j < num_ents; j++) {
			struct blame_entry *split = blame_list[j].split;
			if (split[1].suspect &&
			    sb->copy_score < blame_entry_score(sb, &split[1])) {
				split_blame(blamed, &unblamedtail, split,
					    blame_list[j].ent);
			} else {
				blame_list[j].ent->next = leftover;
				leftover = blame_list[j].ent;
			}
			decref_split(split);
		}
		free(blame_list);
		*unblamedtail = NULL;
		toosmall = filter_small(sb, toosmall, &unblamed, sb->copy_score);
	} while (unblamed);
	target->suspects = reverse_blame(leftover, NULL);
	diff_flush(&diff_opts);
}
2332
2333/*
2334* The blobs of origin and porigin exactly match, so everything
2335* origin is suspected for can be blamed on the parent.
2336*/
2337static void pass_whole_blame(struct blame_scoreboard *sb,2338struct blame_origin *origin, struct blame_origin *porigin)2339{
2340struct blame_entry *e, *suspects;2341
2342if (!porigin->file.ptr && origin->file.ptr) {2343/* Steal its file */2344porigin->file = origin->file;2345origin->file.ptr = NULL;2346}2347suspects = origin->suspects;2348origin->suspects = NULL;2349for (e = suspects; e; e = e->next) {2350blame_origin_incref(porigin);2351blame_origin_decref(e->suspect);2352e->suspect = porigin;2353}2354queue_blames(sb, porigin, suspects);2355}
2356
2357/*
2358* We pass blame from the current commit to its parents. We keep saying
2359* "parent" (and "porigin"), but what we mean is to find scapegoat to
2360* exonerate ourselves.
2361*/
2362static struct commit_list *first_scapegoat(struct rev_info *revs, struct commit *commit,2363int reverse)2364{
2365if (!reverse) {2366if (revs->first_parent_only &&2367commit->parents &&2368commit->parents->next) {2369free_commit_list(commit->parents->next);2370commit->parents->next = NULL;2371}2372return commit->parents;2373}2374return lookup_decoration(&revs->children, &commit->object);2375}
2376
/* How many scapegoats (parents, or children in reverse mode) are there? */
static int num_scapegoats(struct rev_info *revs, struct commit *commit, int reverse)
{
	return commit_list_count(first_scapegoat(revs, commit, reverse));
}
2382
/* Distribute collected unsorted blames to the respected sorted lists
 * in the various origins.
 */
static void distribute_blame(struct blame_scoreboard *sb, struct blame_entry *blamed)
{
	/* group entries with the same suspect next to each other */
	sort_blame_entries(&blamed, compare_blame_suspect);
	while (blamed)
	{
		struct blame_origin *porigin = blamed->suspect;
		struct blame_entry *suspects = NULL;
		/* peel off the run of entries sharing this suspect (reversed) */
		do {
			struct blame_entry *next = blamed->next;
			blamed->next = suspects;
			suspects = blamed;
			blamed = next;
		} while (blamed && blamed->suspect == porigin);
		/* restore original order before merging into the origin */
		suspects = reverse_blame(suspects, NULL);
		queue_blames(sb, porigin, suspects);
	}
}
2403
/* Size of the on-stack scapegoat-origin array in pass_blame() */
#define MAXSG 16

/* Signature shared by find_origin() and find_rename(): locate the blamed
 * path (possibly renamed) in one candidate parent commit. */
typedef struct blame_origin *(*blame_find_alg)(struct repository *,
					       struct commit *,
					       struct blame_origin *,
					       struct blame_bloom_data *);
/*
 * Pass the blame for the lines "origin" is suspected for to its
 * scapegoats (parents, or children under --reverse), optionally
 * detecting moved (-M) and copied (-C) lines.
 */
static void pass_blame(struct blame_scoreboard *sb, struct blame_origin *origin, int opt)
{
	struct rev_info *revs = sb->revs;
	int i, pass, num_sg;
	struct commit *commit = origin->commit;
	struct commit_list *sg;
	struct blame_origin *sg_buf[MAXSG];
	struct blame_origin *porigin, **sg_origin = sg_buf;
	struct blame_entry *toosmall = NULL;
	struct blame_entry *blames, **blametail = &blames;

	num_sg = num_scapegoats(revs, commit, sb->reverse);
	if (!num_sg)
		goto finish;
	else if (num_sg < ARRAY_SIZE(sg_buf))
		memset(sg_buf, 0, sizeof(sg_buf));
	else
		/* too many scapegoats for the stack buffer; go to the heap */
		CALLOC_ARRAY(sg_origin, num_sg);

	/*
	 * The first pass looks for unrenamed path to optimize for
	 * common cases, then we look for renames in the second pass.
	 */
	for (pass = 0; pass < 2 - sb->no_whole_file_rename; pass++) {
		blame_find_alg find = pass ? find_rename : find_origin;

		for (i = 0, sg = first_scapegoat(revs, commit, sb->reverse);
		     i < num_sg && sg;
		     sg = sg->next, i++) {
			struct commit *p = sg->item;
			int j, same;

			if (sg_origin[i])
				continue;
			if (repo_parse_commit(the_repository, p))
				continue;
			porigin = find(sb->repo, p, origin, sb->bloom_data);
			if (!porigin)
				continue;
			if (oideq(&porigin->blob_oid, &origin->blob_oid)) {
				/* identical blob: the parent takes everything */
				pass_whole_blame(sb, origin, porigin);
				blame_origin_decref(porigin);
				goto finish;
			}
			/* skip a parent whose blob we already hold */
			for (j = same = 0; j < i; j++)
				if (sg_origin[j] &&
				    oideq(&sg_origin[j]->blob_oid, &porigin->blob_oid)) {
					same = 1;
					break;
				}
			if (!same)
				sg_origin[i] = porigin;
			else
				blame_origin_decref(porigin);
		}
	}

	sb->num_commits++;
	for (i = 0, sg = first_scapegoat(revs, commit, sb->reverse);
	     i < num_sg && sg;
	     sg = sg->next, i++) {
		struct blame_origin *porigin = sg_origin[i];
		if (!porigin)
			continue;
		if (!origin->previous) {
			blame_origin_incref(porigin);
			origin->previous = porigin;
		}
		pass_blame_to_parent(sb, origin, porigin, 0);
		if (!origin->suspects)
			goto finish;
	}

	/*
	 * Pass remaining suspects for ignored commits to their parents.
	 */
	if (oidset_contains(&sb->ignore_list, &commit->object.oid)) {
		for (i = 0, sg = first_scapegoat(revs, commit, sb->reverse);
		     i < num_sg && sg;
		     sg = sg->next, i++) {
			struct blame_origin *porigin = sg_origin[i];

			if (!porigin)
				continue;
			pass_blame_to_parent(sb, origin, porigin, 1);
			/*
			 * Preemptively drop porigin so we can refresh the
			 * fingerprints if we use the parent again, which can
			 * occur if you ignore back-to-back commits.
			 */
			drop_origin_blob(porigin);
			if (!origin->suspects)
				goto finish;
		}
	}

	/*
	 * Optionally find moves in parents' files.
	 */
	if (opt & PICKAXE_BLAME_MOVE) {
		filter_small(sb, &toosmall, &origin->suspects, sb->move_score);
		if (origin->suspects) {
			for (i = 0, sg = first_scapegoat(revs, commit, sb->reverse);
			     i < num_sg && sg;
			     sg = sg->next, i++) {
				struct blame_origin *porigin = sg_origin[i];
				if (!porigin)
					continue;
				find_move_in_parent(sb, &blametail, &toosmall, origin, porigin);
				if (!origin->suspects)
					break;
			}
		}
	}

	/*
	 * Optionally find copies from parents' files.
	 */
	if (opt & PICKAXE_BLAME_COPY) {
		/* re-filter against the copy threshold if it differs */
		if (sb->copy_score > sb->move_score)
			filter_small(sb, &toosmall, &origin->suspects, sb->copy_score);
		else if (sb->copy_score < sb->move_score) {
			origin->suspects = blame_merge(origin->suspects, toosmall);
			toosmall = NULL;
			filter_small(sb, &toosmall, &origin->suspects, sb->copy_score);
		}
		if (!origin->suspects)
			goto finish;

		for (i = 0, sg = first_scapegoat(revs, commit, sb->reverse);
		     i < num_sg && sg;
		     sg = sg->next, i++) {
			struct blame_origin *porigin = sg_origin[i];
			find_copy_in_parent(sb, &blametail, &toosmall,
					    origin, sg->item, porigin, opt);
			if (!origin->suspects)
				goto finish;
		}
	}

finish:
	*blametail = NULL;
	distribute_blame(sb, blames);
	/*
	 * prepend toosmall to origin->suspects
	 *
	 * There is no point in sorting: this ends up on a big
	 * unsorted list in the caller anyway.
	 */
	if (toosmall) {
		struct blame_entry **tail = &toosmall;
		while (*tail)
			tail = &(*tail)->next;
		*tail = origin->suspects;
		origin->suspects = toosmall;
	}
	for (i = 0; i < num_sg; i++) {
		if (sg_origin[i]) {
			if (!sg_origin[i]->suspects)
				drop_origin_blob(sg_origin[i]);
			blame_origin_decref(sg_origin[i]);
		}
	}
	drop_origin_blob(origin);
	if (sg_buf != sg_origin)
		free(sg_origin);
}
2578
/*
 * The main loop -- while we have blobs with lines whose true origin
 * is still unknown, pick one blob, and allow its lines to pass blames
 * to its parents. */
void assign_blame(struct blame_scoreboard *sb, int opt)
{
	struct rev_info *revs = sb->revs;
	struct commit *commit = prio_queue_get(&sb->commits);

	while (commit) {
		struct blame_entry *ent;
		struct blame_origin *suspect = get_blame_suspects(commit);

		/* find one suspect to break down */
		while (suspect && !suspect->suspects)
			suspect = suspect->next;

		if (!suspect) {
			/* nothing left to blame on this commit; next one */
			commit = prio_queue_get(&sb->commits);
			continue;
		}

		assert(commit == suspect->commit);

		/*
		 * We will use this suspect later in the loop,
		 * so hold onto it in the meantime.
		 */
		blame_origin_incref(suspect);
		repo_parse_commit(the_repository, commit);
		if (sb->reverse ||
		    (!(commit->object.flags & UNINTERESTING) &&
		     !(revs->max_age != -1 && commit->date < revs->max_age)))
			pass_blame(sb, suspect, opt);
		else {
			/* outside the requested range; stop digging here */
			commit->object.flags |= UNINTERESTING;
			if (commit->object.parsed)
				mark_parents_uninteresting(sb->revs, commit);
		}
		/* treat root commit as boundary */
		if (!commit->parents && !sb->show_root)
			commit->object.flags |= UNINTERESTING;

		/* Take responsibility for the remaining entries */
		ent = suspect->suspects;
		if (ent) {
			suspect->guilty = 1;
			for (;;) {
				struct blame_entry *next = ent->next;
				if (sb->found_guilty_entry)
					sb->found_guilty_entry(ent, sb->found_guilty_entry_data);
				if (next) {
					ent = next;
					continue;
				}
				/* splice the whole list onto sb->ent */
				ent->next = sb->ent;
				sb->ent = suspect->suspects;
				suspect->suspects = NULL;
				break;
			}
		}
		blame_origin_decref(suspect);

		if (sb->debug) /* sanity */
			sanity_check_refcnt(sb);
	}
}
2646
2647/*
2648* To allow quick access to the contents of nth line in the
2649* final image, prepare an index in the scoreboard.
2650*/
2651static int prepare_lines(struct blame_scoreboard *sb)2652{
2653sb->num_lines = find_line_starts(&sb->lineno, sb->final_buf,2654sb->final_buf_size);2655return sb->num_lines;2656}
2657
/*
 * Find the single interesting (non-UNINTERESTING) commit among the
 * pending objects; dies when there is more than one.  Returns NULL when
 * none is found.  When name_p is given, it receives a freshly allocated
 * copy of the rev name (or NULL) that the caller must free.
 */
static struct commit *find_single_final(struct rev_info *revs,
					const char **name_p)
{
	int i;
	struct commit *found = NULL;
	const char *name = NULL;

	for (i = 0; i < revs->pending.nr; i++) {
		struct object *obj = revs->pending.objects[i].item;
		if (obj->flags & UNINTERESTING)
			continue;
		obj = deref_tag(revs->repo, obj, NULL, 0);
		if (!obj || obj->type != OBJ_COMMIT)
			die("Non commit %s?", revs->pending.objects[i].name);
		if (found)
			die("More than one commit to dig from %s and %s?",
			    revs->pending.objects[i].name, name);
		found = (struct commit *)obj;
		name = revs->pending.objects[i].name;
	}
	if (name_p)
		*name_p = xstrdup_or_null(name);
	return found;
}
2682
static struct commit *dwim_reverse_initial(struct rev_info *revs,
					   const char **name_p)
{
	/*
	 * DWIM "git blame --reverse ONE -- PATH" as
	 * "git blame --reverse ONE..HEAD -- PATH" but only do so
	 * when it makes sense.
	 * Returns the bottom commit (ONE) on success, NULL otherwise.
	 */
	struct object *obj;
	struct commit *head_commit;
	struct object_id head_oid;

	if (revs->pending.nr != 1)
		return NULL;

	/* Is that sole rev a committish? */
	obj = revs->pending.objects[0].item;
	obj = deref_tag(revs->repo, obj, NULL, 0);
	if (!obj || obj->type != OBJ_COMMIT)
		return NULL;

	/* Do we have HEAD? */
	if (!refs_resolve_ref_unsafe(get_main_ref_store(the_repository), "HEAD", RESOLVE_REF_READING, &head_oid, NULL))
		return NULL;
	head_commit = lookup_commit_reference_gently(revs->repo,
						     &head_oid, 1);
	if (!head_commit)
		return NULL;

	/* Turn "ONE" into "ONE..HEAD" then */
	obj->flags |= UNINTERESTING;
	add_pending_object(revs, &head_commit->object, "HEAD");

	if (name_p)
		*name_p = revs->pending.objects[0].name;
	return (struct commit *)obj;
}
2720
/*
 * Find the single negative (UNINTERESTING) commit among the pending
 * objects -- the bottom of the range for --reverse blame.  Falls back to
 * dwim_reverse_initial() when no negative rev was given; dies when none
 * can be determined or more than one exists.  When name_p is given, it
 * receives a freshly allocated copy of the rev name the caller must free.
 */
static struct commit *find_single_initial(struct rev_info *revs,
					  const char **name_p)
{
	int i;
	struct commit *found = NULL;
	const char *name = NULL;

	/*
	 * There must be one and only one negative commit, and it must be
	 * the boundary.
	 */
	for (i = 0; i < revs->pending.nr; i++) {
		struct object *obj = revs->pending.objects[i].item;
		if (!(obj->flags & UNINTERESTING))
			continue;
		obj = deref_tag(revs->repo, obj, NULL, 0);
		if (!obj || obj->type != OBJ_COMMIT)
			die("Non commit %s?", revs->pending.objects[i].name);
		if (found)
			die("More than one commit to dig up from, %s and %s?",
			    revs->pending.objects[i].name, name);
		found = (struct commit *) obj;
		name = revs->pending.objects[i].name;
	}

	if (!name)
		found = dwim_reverse_initial(revs, &name);
	if (!name)
		die("No commit to dig up from?");

	if (name_p)
		*name_p = xstrdup(name);
	return found;
}
2755
2756void init_scoreboard(struct blame_scoreboard *sb)2757{
2758memset(sb, 0, sizeof(struct blame_scoreboard));2759sb->move_score = BLAME_DEFAULT_MOVE_SCORE;2760sb->copy_score = BLAME_DEFAULT_COPY_SCORE;2761}
2762
/*
 * Finish preparing the scoreboard for blaming: determine the final
 * (or, under --reverse, the initial) commit, fabricate a working-tree
 * commit when blaming uncommitted contents, set up the revision walk,
 * and read the final image of the blamed path.  When orig is given, it
 * receives the origin of the final image.
 */
void setup_scoreboard(struct blame_scoreboard *sb,
		      struct blame_origin **orig)
{
	const char *final_commit_name = NULL;
	struct blame_origin *o;
	struct commit *final_commit = NULL;
	enum object_type type;

	init_blame_suspects(&blame_suspects);

	if (sb->reverse && sb->contents_from)
		die(_("--contents and --reverse do not blend well."));

	if (!sb->repo)
		BUG("repo is NULL");

	if (!sb->reverse) {
		sb->final = find_single_final(sb->revs, &final_commit_name);
		sb->commits.compare = compare_commits_by_commit_date;
	} else {
		sb->final = find_single_initial(sb->revs, &final_commit_name);
		sb->commits.compare = compare_commits_by_reverse_commit_date;
	}

	/* disable children decoration until the walk below rebuilds it */
	if (sb->reverse && sb->revs->first_parent_only)
		sb->revs->children.name = NULL;

	if (sb->contents_from || !sb->final) {
		struct object_id head_oid, *parent_oid;

		/*
		 * Build a fake commit at the top of the history, when
		 * (1) "git blame [^A] --path", i.e. with no positive end
		 *     of the history range, in which case we build such
		 *     a fake commit on top of the HEAD to blame in-tree
		 *     modifications.
		 * (2) "git blame --contents=file [A] -- path", with or
		 *     without positive end of the history range but with
		 *     --contents, in which case we pretend that there is
		 *     a fake commit on top of the positive end (defaulting to
		 *     HEAD) that has the given contents in the path.
		 */
		if (sb->final) {
			parent_oid = &sb->final->object.oid;
		} else {
			if (!refs_resolve_ref_unsafe(get_main_ref_store(the_repository), "HEAD", RESOLVE_REF_READING, &head_oid, NULL))
				die("no such ref: HEAD");
			parent_oid = &head_oid;
		}

		if (!sb->contents_from)
			setup_work_tree();

		sb->final = fake_working_tree_commit(sb->repo,
						     &sb->revs->diffopt,
						     sb->path, sb->contents_from,
						     parent_oid);
		add_pending_object(sb->revs, &(sb->final->object), ":");
	}

	if (sb->reverse && sb->revs->first_parent_only) {
		final_commit = find_single_final(sb->revs, NULL);
		if (!final_commit)
			die(_("--reverse and --first-parent together require specified latest commit"));
	}

	/*
	 * If we have bottom, this will mark the ancestors of the
	 * bottom commits we would reach while traversing as
	 * uninteresting.
	 */
	if (prepare_revision_walk(sb->revs))
		die(_("revision walk setup failed"));

	if (sb->reverse && sb->revs->first_parent_only) {
		struct commit *c = final_commit;

		/* record each commit as the (sole) child of its first parent */
		sb->revs->children.name = "children";
		while (c->parents &&
		       !oideq(&c->object.oid, &sb->final->object.oid)) {
			struct commit_list *l = xcalloc(1, sizeof(*l));

			l->item = c;
			if (add_decoration(&sb->revs->children,
					   &c->parents->item->object, l))
				BUG("not unique item in first-parent chain");
			c = c->parents->item;
		}

		if (!oideq(&c->object.oid, &sb->final->object.oid))
			die(_("--reverse --first-parent together require range along first-parent chain"));
	}

	if (is_null_oid(&sb->final->object.oid)) {
		/* fake working-tree commit: its contents are already in memory */
		o = get_blame_suspects(sb->final);
		sb->final_buf = xmemdupz(o->file.ptr, o->file.size);
		sb->final_buf_size = o->file.size;
	}
	else {
		o = get_origin(sb->final, sb->path);
		if (fill_blob_sha1_and_mode(sb->repo, o))
			die(_("no such path %s in %s"), sb->path, final_commit_name);

		if (sb->revs->diffopt.flags.allow_textconv &&
		    textconv_object(sb->repo, sb->path, o->mode, &o->blob_oid, 1, (char **) &sb->final_buf,
				    &sb->final_buf_size))
			;
		else
			sb->final_buf = repo_read_object_file(the_repository,
							      &o->blob_oid,
							      &type,
							      &sb->final_buf_size);

		if (!sb->final_buf)
			die(_("cannot read blob %s for path %s"),
			    oid_to_hex(&o->blob_oid),
			    sb->path);
	}
	sb->num_read_blob++;
	prepare_lines(sb);

	if (orig)
		*orig = o;

	free((char *)final_commit_name);
}
2889
2890
2891
2892struct blame_entry *blame_entry_prepend(struct blame_entry *head,2893long start, long end,2894struct blame_origin *o)2895{
2896struct blame_entry *new_head = xcalloc(1, sizeof(struct blame_entry));2897new_head->lno = start;2898new_head->num_lines = end - start;2899new_head->suspect = o;2900new_head->s_lno = start;2901new_head->next = head;2902blame_origin_incref(o);2903return new_head;2904}
2905
2906void setup_blame_bloom_data(struct blame_scoreboard *sb)2907{
2908struct blame_bloom_data *bd;2909struct bloom_filter_settings *bs;2910
2911if (!sb->repo->objects->commit_graph)2912return;2913
2914bs = get_bloom_filter_settings(sb->repo);2915if (!bs)2916return;2917
2918bd = xmalloc(sizeof(struct blame_bloom_data));2919
2920bd->settings = bs;2921
2922bd->alloc = 4;2923bd->nr = 0;2924ALLOC_ARRAY(bd->keys, bd->alloc);2925
2926add_bloom_key(bd, sb->path);2927
2928sb->bloom_data = bd;2929}
2930
/*
 * Release everything the scoreboard owns: the line index, the commit
 * queue, the ignore list, and (if present) the Bloom data, emitting
 * trace2 statistics about Bloom filter usage before freeing it.
 */
void cleanup_scoreboard(struct blame_scoreboard *sb)
{
	free(sb->lineno);
	clear_prio_queue(&sb->commits);
	oidset_clear(&sb->ignore_list);

	if (sb->bloom_data) {
		int i;
		for (i = 0; i < sb->bloom_data->nr; i++) {
			free(sb->bloom_data->keys[i]->hashes);
			free(sb->bloom_data->keys[i]);
		}
		free(sb->bloom_data->keys);
		FREE_AND_NULL(sb->bloom_data);

		trace2_data_intmax("blame", sb->repo,
				   "bloom/queries", bloom_count_queries);
		trace2_data_intmax("blame", sb->repo,
				   "bloom/response-no", bloom_count_no);
	}
}
2952