From 612702e8ea3aa070586001f1aa0a2070058664e0 Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Fri, 20 Oct 2006 23:49:31 -0700 Subject: [PATCH 1/4] git-pickaxe: do not keep commit buffer. We need the commit buffer data while generating the final result, but until then we do not need them. Signed-off-by: Junio C Hamano --- builtin-pickaxe.c | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/builtin-pickaxe.c b/builtin-pickaxe.c index e5a50b9278..e628b5a807 100644 --- a/builtin-pickaxe.c +++ b/builtin-pickaxe.c @@ -108,6 +108,7 @@ struct scoreboard { /* linked list of blames */ struct blame_entry *ent; + /* look-up a line in the final buffer */ int num_lines; int *lineno; }; @@ -188,7 +189,8 @@ static struct origin *find_rename(struct scoreboard *sb, for (i = 0; i < diff_queued_diff.nr; i++) { struct diff_filepair *p = diff_queued_diff.queue[i]; - if (p->status == 'R' && !strcmp(p->one->path, origin->path)) { + if ((p->status == 'R' || p->status == 'C') && + !strcmp(p->one->path, origin->path)) { porigin = find_origin(sb, parent, p->two->path); break; } @@ -457,7 +459,7 @@ static void split_blame(struct scoreboard *sb, if (1) { /* sanity */ struct blame_entry *ent; - int lno = 0, corrupt = 0; + int lno = sb->ent->lno, corrupt = 0; for (ent = sb->ent; ent; ent = ent->next) { if (lno != ent->lno) @@ -467,7 +469,7 @@ static void split_blame(struct scoreboard *sb, lno += ent->num_lines; } if (corrupt) { - lno = 0; + lno = sb->ent->lno; for (ent = sb->ent; ent; ent = ent->next) { printf("L %8d l %8d n %8d\n", lno, ent->lno, ent->num_lines); @@ -508,10 +510,9 @@ static void blame_chunk(struct scoreboard *sb, int tlno, int plno, int same, struct origin *target, struct origin *parent) { - struct blame_entry *e, *n; + struct blame_entry *e; - for (e = sb->ent; e; e = n) { - n = e->next; + for (e = sb->ent; e; e = e->next) { if (e->guilty || e->suspect != target) continue; if (same <= e->s_lno) @@ -556,7 +557,7 @@ static unsigned ent_score(struct scoreboard *sb, struct blame_entry *e) if (e->score) return e->score; - score = 0; + score = 1; cp = nth_line(sb, e->lno); ep = nth_line(sb, e->lno + e->num_lines); while (cp < ep) { @@ -933,6 +934,15 @@ static void get_commit_info(struct commit *commit, static char committer_buf[1024]; static char summary_buf[1024]; + /* We've operated without save_commit_buffer, so + * we now need to populate them for output. + */ + if (!commit->buffer) { + char type[20]; + unsigned long size; + commit->buffer = + read_sha1_file(commit->object.sha1, type, &size); + } ret->author = author_buf; get_ac_line(commit->buffer, "\nauthor ", sizeof(author_buf), author_buf, &ret->author_mail, @@ -1214,6 +1224,8 @@ int cmd_pickaxe(int argc, const char **argv, const char *prefix) const char *final_commit_name = NULL; char type[10]; + save_commit_buffer = 0; + opt = 0; bottom = top = 0; seen_dashdash = 0; From 46014766bdacec8ad22355ce78898b895bf172d6 Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Sat, 21 Oct 2006 00:41:38 -0700 Subject: [PATCH 2/4] git-pickaxe: do not confuse two origins that are the same. It used to be that we can compare the address of the origin structure to determine if they are the same because they are always registered with scoreboard. After introduction of the loop to try finding the best split, that is not true anymore. The current code has rather serious leaks with origin structure, but more importantly it gets confused when two origins that points at the same commit and same path. We might eventually have to refcount and gc origin, but let's fix the correctness issue first. Signed-off-by: Junio C Hamano --- builtin-pickaxe.c | 45 +++++++++++++++++++++++++-------------------- 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/builtin-pickaxe.c b/builtin-pickaxe.c index e628b5a807..3ab87efac7 100644 --- a/builtin-pickaxe.c +++ b/builtin-pickaxe.c @@ -113,12 +113,20 @@ struct scoreboard { int *lineno; }; +static int cmp_suspect(struct origin *a, struct origin *b) +{ + int cmp = hashcmp(a->commit->object.sha1, b->commit->object.sha1); + if (cmp) + return cmp; + return strcmp(a->path, b->path); +} + static void coalesce(struct scoreboard *sb) { struct blame_entry *ent, *next; for (ent = sb->ent; ent && (next = ent->next); ent = next) { - if (ent->suspect == next->suspect && + if (!cmp_suspect(ent->suspect, next->suspect) && ent->guilty == next->guilty && ent->s_lno + ent->num_lines == next->s_lno) { ent->num_lines += next->num_lines; @@ -126,6 +134,7 @@ static void coalesce(struct scoreboard *sb) if (ent->next) ent->next->prev = ent; free(next); + ent->score = 0; next = ent; /* again */ } } @@ -498,7 +507,7 @@ static int find_last_in_target(struct scoreboard *sb, struct origin *target) int last_in_target = -1; for (e = sb->ent; e; e = e->next) { - if (e->guilty || e->suspect != target) + if (e->guilty || cmp_suspect(e->suspect, target)) continue; if (last_in_target < e->s_lno + e->num_lines) last_in_target = e->s_lno + e->num_lines; @@ -513,7 +522,7 @@ static void blame_chunk(struct scoreboard *sb, struct blame_entry *e; for (e = sb->ent; e; e = e->next) { - if (e->guilty || e->suspect != target) + if (e->guilty || cmp_suspect(e->suspect, target)) continue; if (same <= e->s_lno) continue; @@ -634,7 +643,7 @@ static int find_move_in_parent(struct scoreboard *sb, struct origin *parent) { int last_in_target; - struct blame_entry *ent, split[3]; + struct blame_entry *e, split[3]; mmfile_t file_p; char type[10]; char *blob_p; @@ -651,13 +660,13 @@ static int find_move_in_parent(struct scoreboard *sb, return 0; } - for (ent = sb->ent; ent; ent = ent->next) { - if (ent->suspect != target || ent->guilty) + for (e = sb->ent; e; e = e->next) { + if (e->guilty || cmp_suspect(e->suspect, target)) continue; - find_copy_in_blob(sb, ent, parent, split, &file_p); + find_copy_in_blob(sb, e, parent, split, &file_p); if (split[1].suspect && blame_move_score < ent_score(sb, &split[1])) - split_blame(sb, split, ent); + split_blame(sb, split, e); } free(blob_p); return 0; @@ -671,7 +680,7 @@ static int find_copy_in_parent(struct scoreboard *sb, { struct diff_options diff_opts; const char *paths[1]; - struct blame_entry *ent; + struct blame_entry *e; int i; if (find_last_in_target(sb, target) < 0) @@ -696,9 +705,9 @@ static int find_copy_in_parent(struct scoreboard *sb, "", &diff_opts); diffcore_std(&diff_opts); - for (ent = sb->ent; ent; ent = ent->next) { + for (e = sb->ent; e; e = e->next) { struct blame_entry split[3]; - if (ent->suspect != target || ent->guilty) + if (e->guilty || cmp_suspect(e->suspect, target)) continue; memset(split, 0, sizeof(split)); @@ -724,12 +733,12 @@ static int find_copy_in_parent(struct scoreboard *sb, free(blob); continue; } - find_copy_in_blob(sb, ent, norigin, this, &file_p); + find_copy_in_blob(sb, e, norigin, this, &file_p); copy_split_if_better(sb, split, this); } if (split[1].suspect && blame_copy_score < ent_score(sb, &split[1])) - split_blame(sb, split, ent); + split_blame(sb, split, e); } diff_flush(&diff_opts); @@ -763,12 +772,6 @@ static void pass_blame(struct scoreboard *sb, struct origin *origin, int opt) for (e = sb->ent; e; e = e->next) if (e->suspect == origin) e->suspect = porigin; - /* now everything blamed for origin is blamed for - * porigin, we do not need to keep it anymore. - * Do not free porigin (or the ones we got from - * earlier round); they may still be used elsewhere. - */ - free_origin(origin); return; } parent_origin[i] = porigin; @@ -834,8 +837,10 @@ static void assign_blame(struct scoreboard *sb, struct rev_info *revs, int opt) /* Take responsibility for the remaining entries */ for (ent = sb->ent; ent; ent = ent->next) - if (ent->suspect == suspect) + if (!cmp_suspect(ent->suspect, suspect)) ent->guilty = 1; + + coalesce(sb); } } From f6c0e191020ad330c06438c144e0ea787ca964fd Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Sat, 21 Oct 2006 02:56:33 -0700 Subject: [PATCH 3/4] git-pickaxe: get rid of wasteful find_origin(). After finding out which path in the parent to scan to pass blames, using get_tree_entry() to extract the blob information again was quite wasteful, since diff-tree already gave us that information. Separate the function to create an origin out as get_origin(). You'll never know what is more efficient unless you try and/or think hard. I somehow thought that extracting one known path out of commit's tree is cheaper than running a diff-tree for the current path between the commit and its parent, but it is not the case. In real, non-toy projects, most commits do not touch the path you are interested in, and if the path is a few levels away from the toplevel, whole-subdirectory comparison logic diff-tree allows us to skip opening lower subdirectories. This commit rewrites find_origin() function to use a single-path diff-tree to see if the parent has the same blob as the current suspect, which is cheaper than extracting the blob information using get_tree_entry() and comparing it with what the current suspect has. This shaves about 6% overhead when annotating kernel/sched.c in the Linux kernel repository on my machine. The saving rises to 25% for arch/i386/kernel/Makefile. Signed-off-by: Junio C Hamano --- builtin-pickaxe.c | 138 +++++++++++++++++++++++++++++++++------------- 1 file changed, 101 insertions(+), 37 deletions(-) diff --git a/builtin-pickaxe.c b/builtin-pickaxe.c index 3ab87efac7..cf474b0d11 100644 --- a/builtin-pickaxe.c +++ b/builtin-pickaxe.c @@ -140,48 +140,103 @@ static void coalesce(struct scoreboard *sb) } } -static void free_origin(struct origin *o) +static struct origin *get_origin(struct scoreboard *sb, + struct commit *commit, + const char *path) { - free(o); -} - -static struct origin *find_origin(struct scoreboard *sb, - struct commit *commit, - const char *path) -{ - struct blame_entry *ent; + struct blame_entry *e; struct origin *o; - unsigned mode; - char type[10]; - for (ent = sb->ent; ent; ent = ent->next) { - if (ent->suspect->commit == commit && - !strcmp(ent->suspect->path, path)) - return ent->suspect; + for (e = sb->ent; e; e = e->next) { + if (e->suspect->commit == commit && + !strcmp(e->suspect->path, path)) + return e->suspect; } - o = xcalloc(1, sizeof(*o) + strlen(path) + 1); o->commit = commit; strcpy(o->path, path); - if (get_tree_entry(commit->object.sha1, path, o->blob_sha1, &mode)) - goto err_out; - if (sha1_object_info(o->blob_sha1, type, NULL) || - strcmp(type, blob_type)) - goto err_out; return o; - err_out: - free_origin(o); - return NULL; } -static struct origin *find_rename(struct scoreboard *sb, +static int fill_blob_sha1(struct origin *origin) +{ + unsigned mode; + char type[10]; + + if (!is_null_sha1(origin->blob_sha1)) + return 0; + if (get_tree_entry(origin->commit->object.sha1, + origin->path, + origin->blob_sha1, &mode)) + goto error_out; + if (sha1_object_info(origin->blob_sha1, type, NULL) || + strcmp(type, blob_type)) + goto error_out; + return 0; + error_out: + hashclr(origin->blob_sha1); + return -1; +} + +static struct origin *find_origin(struct scoreboard *sb, struct commit *parent, struct origin *origin) { struct origin *porigin = NULL; struct diff_options diff_opts; int i; - const char *paths[1]; + const char *paths[2]; + + /* See if the origin->path is different between parent + * and origin first. Most of the time they are the + * same and diff-tree is fairly efficient about this. + */ + diff_setup(&diff_opts); + diff_opts.recursive = 1; + diff_opts.detect_rename = 0; + diff_opts.output_format = DIFF_FORMAT_NO_OUTPUT; + paths[0] = origin->path; + paths[1] = NULL; + + diff_tree_setup_paths(paths, &diff_opts); + if (diff_setup_done(&diff_opts) < 0) + die("diff-setup"); + diff_tree_sha1(parent->tree->object.sha1, + origin->commit->tree->object.sha1, + "", &diff_opts); + diffcore_std(&diff_opts); + + /* It is either one entry that says "modified", or "created", + * or nothing. + */ + if (!diff_queued_diff.nr) { + /* The path is the same as parent */ + porigin = get_origin(sb, parent, origin->path); + hashcpy(porigin->blob_sha1, origin->blob_sha1); + } + else if (diff_queued_diff.nr != 1) + die("internal error in pickaxe::find_origin"); + else { + struct diff_filepair *p = diff_queued_diff.queue[0]; + switch (p->status) { + default: + die("internal error in pickaxe::find_origin (%c)", + p->status); + case 'M': + porigin = get_origin(sb, parent, origin->path); + hashcpy(porigin->blob_sha1, p->one->sha1); + break; + case 'A': + case 'T': + /* Did not exist in parent, or type changed */ + break; + } + } + diff_flush(&diff_opts); + if (porigin) + return porigin; + + /* Otherwise we would look for a rename */ diff_setup(&diff_opts); diff_opts.recursive = 1; @@ -191,16 +246,17 @@ static struct origin *find_rename(struct scoreboard *sb, diff_tree_setup_paths(paths, &diff_opts); if (diff_setup_done(&diff_opts) < 0) die("diff-setup"); - diff_tree_sha1(origin->commit->tree->object.sha1, - parent->tree->object.sha1, + diff_tree_sha1(parent->tree->object.sha1, + origin->commit->tree->object.sha1, "", &diff_opts); diffcore_std(&diff_opts); for (i = 0; i < diff_queued_diff.nr; i++) { struct diff_filepair *p = diff_queued_diff.queue[i]; if ((p->status == 'R' || p->status == 'C') && - !strcmp(p->one->path, origin->path)) { - porigin = find_origin(sb, parent, p->two->path); + !strcmp(p->two->path, origin->path)) { + porigin = get_origin(sb, parent, p->one->path); + hashcpy(porigin->blob_sha1, p->one->sha1); break; } } @@ -705,6 +761,15 @@ static int find_copy_in_parent(struct scoreboard *sb, "", &diff_opts); diffcore_std(&diff_opts); + + /* + * NEEDSWORK: This loop is wasteful in that it opens the same + * blob in the parent number of times. We should swap the + * loop inside out, which would require keeping track of + * "best blame so far" for blame entries that the current + * "target" is being suspected. + */ + for (e = sb->ent; e; e = e->next) { struct blame_entry split[3]; if (e->guilty || cmp_suspect(e->suspect, target)) @@ -724,8 +789,8 @@ static int find_copy_in_parent(struct scoreboard *sb, if (porigin && !strcmp(p->one->path, porigin->path)) /* find_move already dealt with this path */ continue; - norigin = find_origin(sb, parent, p->one->path); - + norigin = get_origin(sb, parent, p->one->path); + hashcpy(norigin->blob_sha1, p->one->sha1); blob = read_sha1_file(norigin->blob_sha1, type, (unsigned long *) &file_p.size); file_p.ptr = blob; @@ -735,6 +800,7 @@ static int find_copy_in_parent(struct scoreboard *sb, } find_copy_in_blob(sb, e, norigin, this, &file_p); copy_split_if_better(sb, split, this); + free(blob); } if (split[1].suspect && blame_copy_score < ent_score(sb, &split[1])) @@ -762,9 +828,7 @@ static void pass_blame(struct scoreboard *sb, struct origin *origin, int opt) if (parse_commit(p)) continue; - porigin = find_origin(sb, parent->item, origin->path); - if (!porigin) - porigin = find_rename(sb, parent->item, origin); + porigin = find_origin(sb, parent->item, origin); if (!porigin) continue; if (!hashcmp(porigin->blob_sha1, origin->blob_sha1)) { @@ -1423,8 +1487,8 @@ int cmd_pickaxe(int argc, const char **argv, const char *prefix) */ prepare_revision_walk(&revs); - o = find_origin(&sb, sb.final, path); - if (!o) + o = get_origin(&sb, sb.final, path); + if (fill_blob_sha1(o)) die("no such path %s in %s", path, final_commit_name); sb.final_buf = read_sha1_file(o->blob_sha1, type, &sb.final_buf_size); From aec8fa1f587d68a0e50ada2720c8dc6e09709a9c Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Sat, 21 Oct 2006 03:30:53 -0700 Subject: [PATCH 4/4] git-pickaxe: swap comparison loop used for -C When assigning blames for code movements across file boundaries, we used to iterate over blame entries (i.e. groups of lines to be blamed) in the outer loop and compared each entry with paths in the parent commit in an inner loop. This meant that we opened the blob data from each path number of times. Reorganize the loop so that we read the same path only once, and compare it against all relevant blame entries. This should perform better, but seems to give mixed results, though. Signed-off-by: Junio C Hamano --- builtin-pickaxe.c | 91 ++++++++++++++++++++++++++--------------------- 1 file changed, 50 insertions(+), 41 deletions(-) diff --git a/builtin-pickaxe.c b/builtin-pickaxe.c index cf474b0d11..663b96dac5 100644 --- a/builtin-pickaxe.c +++ b/builtin-pickaxe.c @@ -737,11 +737,27 @@ static int find_copy_in_parent(struct scoreboard *sb, struct diff_options diff_opts; const char *paths[1]; struct blame_entry *e; - int i; + int i, j; + struct blame_list { + struct blame_entry *ent; + struct blame_entry split[3]; + } *blame_list; + int num_ents; - if (find_last_in_target(sb, target) < 0) + /* Count the number of entries the target is suspected for, + * and prepare a list of entry and the best split. + */ + for (e = sb->ent, num_ents = 0; e; e = e->next) + if (!e->guilty && !cmp_suspect(e->suspect, target)) + num_ents++; + if (!num_ents) return 1; /* nothing remains for this target */ + blame_list = xcalloc(num_ents, sizeof(struct blame_list)); + for (e = sb->ent, i = 0; e; e = e->next) + if (!e->guilty && !cmp_suspect(e->suspect, target)) + blame_list[i++].ent = e; + diff_setup(&diff_opts); diff_opts.recursive = 1; diff_opts.output_format = DIFF_FORMAT_NO_OUTPUT; @@ -761,53 +777,46 @@ static int find_copy_in_parent(struct scoreboard *sb, "", &diff_opts); diffcore_std(&diff_opts); + for (i = 0; i < diff_queued_diff.nr; i++) { + struct diff_filepair *p = diff_queued_diff.queue[i]; + struct origin *norigin; + mmfile_t file_p; + char type[10]; + char *blob; + struct blame_entry this[3]; - /* - * NEEDSWORK: This loop is wasteful in that it opens the same - * blob in the parent number of times. We should swap the - * loop inside out, which would require keeping track of - * "best blame so far" for blame entries that the current - * "target" is being suspected. - */ - - for (e = sb->ent; e; e = e->next) { - struct blame_entry split[3]; - if (e->guilty || cmp_suspect(e->suspect, target)) + if (!DIFF_FILE_VALID(p->one)) + continue; /* does not exist in parent */ + if (porigin && !strcmp(p->one->path, porigin->path)) + /* find_move already dealt with this path */ continue; - memset(split, 0, sizeof(split)); - for (i = 0; i < diff_queued_diff.nr; i++) { - struct diff_filepair *p = diff_queued_diff.queue[i]; - struct origin *norigin; - mmfile_t file_p; - char type[10]; - char *blob; - struct blame_entry this[3]; + norigin = get_origin(sb, parent, p->one->path); + hashcpy(norigin->blob_sha1, p->one->sha1); + blob = read_sha1_file(norigin->blob_sha1, type, + (unsigned long *) &file_p.size); + file_p.ptr = blob; + if (!file_p.ptr) + continue; - if (!DIFF_FILE_VALID(p->one)) - continue; /* does not exist in parent */ - if (porigin && !strcmp(p->one->path, porigin->path)) - /* find_move already dealt with this path */ - continue; - norigin = get_origin(sb, parent, p->one->path); - hashcpy(norigin->blob_sha1, p->one->sha1); - blob = read_sha1_file(norigin->blob_sha1, type, - (unsigned long *) &file_p.size); - file_p.ptr = blob; - if (!file_p.ptr) { - free(blob); - continue; - } - find_copy_in_blob(sb, e, norigin, this, &file_p); - copy_split_if_better(sb, split, this); - free(blob); + for (j = 0; j < num_ents; j++) { + find_copy_in_blob(sb, blame_list[j].ent, norigin, + this, &file_p); + copy_split_if_better(sb, blame_list[j].split, + this); } - if (split[1].suspect && - blame_copy_score < ent_score(sb, &split[1])) - split_blame(sb, split, e); + free(blob); } diff_flush(&diff_opts); + for (j = 0; j < num_ents; j++) { + struct blame_entry *split = blame_list[j].split; + if (split[1].suspect && + blame_copy_score < ent_score(sb, &split[1])) + split_blame(sb, split, blame_list[j].ent); + } + free(blame_list); + return 0; }