From 1617baa5878c9b3886926f0f2e6dc52d067ada4a Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Fri, 20 Oct 2006 18:37:10 -0700 Subject: [PATCH 1/4] git-pickaxe: pagenate output by default. Signed-off-by: Junio C Hamano --- git.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/git.c b/git.c index 6164380667..b944c37401 100644 --- a/git.c +++ b/git.c @@ -245,7 +245,7 @@ static void handle_internal_command(int argc, const char **argv, char **envp) { "mv", cmd_mv, RUN_SETUP }, { "name-rev", cmd_name_rev, RUN_SETUP }, { "pack-objects", cmd_pack_objects, RUN_SETUP }, - { "pickaxe", cmd_pickaxe, RUN_SETUP }, + { "pickaxe", cmd_pickaxe, RUN_SETUP | USE_PAGER }, { "prune", cmd_prune, RUN_SETUP }, { "prune-packed", cmd_prune_packed, RUN_SETUP }, { "push", cmd_push, RUN_SETUP }, From 1ca6ca876e3553d9609823a70d168c50d7beba7e Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Fri, 20 Oct 2006 18:48:18 -0700 Subject: [PATCH 2/4] git-pickaxe: fix nth_line() We would want to be able to refer to the end of the file as "the beginning of Nth line" for a file that is N lines long. Signed-off-by: Junio C Hamano --- builtin-pickaxe.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/builtin-pickaxe.c b/builtin-pickaxe.c index 74c7c9a33b..b595299bf8 100644 --- a/builtin-pickaxe.c +++ b/builtin-pickaxe.c @@ -1085,6 +1085,9 @@ static int prepare_lines(struct scoreboard *sb) bol = 1; } } + sb->lineno = xrealloc(sb->lineno, + sizeof(int* ) * (num + incomplete + 1)); + sb->lineno[num + incomplete] = buf - sb->final_buf; sb->num_lines = num + incomplete; return sb->num_lines; } From 5ff62c300209d761e8e7b3c78da19b910cd9b860 Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Fri, 20 Oct 2006 14:51:12 -0700 Subject: [PATCH 3/4] git-pickaxe: improve "best match" heuristics Instead of comparing number of lines matched, look at the matched characters and count alnums, so that we do not pass blame on not-so-interesting lines, such as an empty line and a line that is indentation followed by a closing brace. Add an option --score-debug to show the score of each blame_entry while we cook this further on the "next" branch. Signed-off-by: Junio C Hamano --- builtin-pickaxe.c | 71 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 48 insertions(+), 23 deletions(-) diff --git a/builtin-pickaxe.c b/builtin-pickaxe.c index b595299bf8..52e32dcc71 100644 --- a/builtin-pickaxe.c +++ b/builtin-pickaxe.c @@ -34,8 +34,7 @@ static int longest_file; static int longest_author; static int max_orig_digits; static int max_digits; - -#define DEBUG 0 +static int max_score_digits; #define PICKAXE_BLAME_MOVE 01 #define PICKAXE_BLAME_COPY 02 @@ -78,6 +77,11 @@ struct blame_entry { * suspect's file; internally all line numbers are 0 based. */ int s_lno; + + /* how significant this entry is -- cached to avoid + * scanning the lines over and over + */ + unsigned score; }; struct scoreboard { @@ -215,9 +219,6 @@ static void process_u_diff(void *state_, char *line, unsigned long len) struct chunk *chunk; int off1, off2, len1, len2, num; - if (DEBUG) - fprintf(stderr, "%.*s", (int) len, line); - num = state->ret->num; if (len < 4 || line[0] != '@' || line[1] != '@') { if (state->hunk_in_pre_context && line[0] == ' ') @@ -295,10 +296,6 @@ static struct patch *get_patch(struct origin *parent, struct origin *origin) char *blob_p, *blob_o; struct patch *patch; - if (DEBUG) fprintf(stderr, "get patch %.8s %.8s\n", - sha1_to_hex(parent->commit->object.sha1), - sha1_to_hex(origin->commit->object.sha1)); - blob_p = read_sha1_file(parent->blob_sha1, type, (unsigned long *) &file_p.size); blob_o = read_sha1_file(origin->blob_sha1, type, @@ -352,6 +349,7 @@ static void dup_entry(struct blame_entry *dst, struct blame_entry *src) memcpy(dst, src, sizeof(*src)); dst->prev = p; dst->next = n; + dst->score = 0; } static const char *nth_line(struct scoreboard *sb, int lno) @@ -448,7 +446,7 @@ static void split_blame(struct scoreboard *sb, add_blame_entry(sb, new_entry); } - if (DEBUG) { + if (1) { /* sanity */ struct blame_entry *ent; int lno = 0, corrupt = 0; @@ -530,12 +528,6 @@ static int pass_blame_to_parent(struct scoreboard *sb, for (i = 0; i < patch->num; i++) { struct chunk *chunk = &patch->chunks[i]; - if (DEBUG) - fprintf(stderr, - "plno = %d, tlno = %d, " - "same as parent up to %d, resync %d and %d\n", - plno, tlno, - chunk->same, chunk->p_next, chunk->t_next); blame_chunk(sb, tlno, plno, chunk->same, target, parent); plno = chunk->p_next; tlno = chunk->t_next; @@ -547,14 +539,37 @@ static int pass_blame_to_parent(struct scoreboard *sb, return 0; } -static void copy_split_if_better(struct blame_entry best_so_far[3], +static unsigned ent_score(struct scoreboard *sb, struct blame_entry *e) +{ + unsigned score; + const char *cp, *ep; + + if (e->score) + return e->score; + + score = 0; + cp = nth_line(sb, e->lno); + ep = nth_line(sb, e->lno + e->num_lines); + while (cp < ep) { + unsigned ch = *((unsigned char *)cp); + if (isalnum(ch)) + score++; + cp++; + } + e->score = score; + return score; +} + +static void copy_split_if_better(struct scoreboard *sb, + struct blame_entry best_so_far[3], struct blame_entry this[3]) { if (!this[1].suspect) return; - if (best_so_far[1].suspect && - (this[1].num_lines < best_so_far[1].num_lines)) - return; + if (best_so_far[1].suspect) { + if (ent_score(sb, &this[1]) < ent_score(sb, &best_so_far[1])) + return; + } memcpy(best_so_far, this, sizeof(struct blame_entry [3])); } @@ -596,7 +611,7 @@ static void find_copy_in_blob(struct scoreboard *sb, tlno + ent->s_lno, plno, chunk->same + ent->s_lno, parent); - copy_split_if_better(split, this); + copy_split_if_better(sb, split, this); } plno = chunk->p_next; tlno = chunk->t_next; @@ -699,7 +714,7 @@ static int find_copy_in_parent(struct scoreboard *sb, continue; } find_copy_in_blob(sb, ent, norigin, this, &file_p); - copy_split_if_better(split, this); + copy_split_if_better(sb, split, this); } if (split[1].suspect) split_blame(sb, split, ent); @@ -944,6 +959,7 @@ static void get_commit_info(struct commit *commit, #define OUTPUT_PORCELAIN 010 #define OUTPUT_SHOW_NAME 020 #define OUTPUT_SHOW_NUMBER 040 +#define OUTPUT_SHOW_SCORE 0100 static void emit_porcelain(struct scoreboard *sb, struct blame_entry *ent) { @@ -1016,6 +1032,8 @@ static void emit_other(struct scoreboard *sb, struct blame_entry *ent, int opt) show_raw_time), ent->lno + 1 + cnt); else { + if (opt & OUTPUT_SHOW_SCORE) + printf(" %*d", max_score_digits, ent->score); if (opt & OUTPUT_SHOW_NAME) printf(" %-*.*s", longest_file, longest_file, suspect->path); @@ -1060,8 +1078,9 @@ static void output(struct scoreboard *sb, int option) for (ent = sb->ent; ent; ent = ent->next) { if (option & OUTPUT_PORCELAIN) emit_porcelain(sb, ent); - else + else { emit_other(sb, ent, option); + } } } @@ -1121,6 +1140,7 @@ static void find_alignment(struct scoreboard *sb, int *option) { int longest_src_lines = 0; int longest_dst_lines = 0; + unsigned largest_score = 0; struct blame_entry *e; for (e = sb->ent; e; e = e->next) { @@ -1146,9 +1166,12 @@ static void find_alignment(struct scoreboard *sb, int *option) num = e->lno + e->num_lines; if (longest_dst_lines < num) longest_dst_lines = num; + if (largest_score < ent_score(sb, e)) + largest_score = ent_score(sb, e); } max_orig_digits = lineno_width(longest_src_lines); max_digits = lineno_width(longest_dst_lines); + max_score_digits = lineno_width(largest_score); } static int has_path_in_work_tree(const char *path) @@ -1209,6 +1232,8 @@ int cmd_pickaxe(int argc, const char **argv, const char *prefix) tmp = top; top = bottom; bottom = tmp; } } + else if (!strcmp("--score-debug", arg)) + output_option |= OUTPUT_SHOW_SCORE; else if (!strcmp("-f", arg) || !strcmp("--show-name", arg)) output_option |= OUTPUT_SHOW_NAME; From 4a0fc95f182dbf5b0c58e6d6cfe8cd7459da7ab6 Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Fri, 20 Oct 2006 15:37:12 -0700 Subject: [PATCH 4/4] git-pickaxe: introduce heuristics to avoid "trivial" chunks This adds scoring logic to blame_entry to prevent blames on very trivial chunks (e.g. lots of empty lines, indent followed by a closing brace) from being passed down to unrelated lines in the parent. The current heuristics are quite simple and may need to be tweaked later, but we need to start somewhere. Signed-off-by: Junio C Hamano --- builtin-pickaxe.c | 36 ++++++++++++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 4 deletions(-) diff --git a/builtin-pickaxe.c b/builtin-pickaxe.c index 52e32dcc71..e5a50b9278 100644 --- a/builtin-pickaxe.c +++ b/builtin-pickaxe.c @@ -40,6 +40,15 @@ static int max_score_digits; #define PICKAXE_BLAME_COPY 02 #define PICKAXE_BLAME_COPY_HARDER 04 +/* + * blame for a blame_entry with score lower than these thresholds + * is not passed to the parent using move/copy logic. + */ +static unsigned blame_move_score; +static unsigned blame_copy_score; +#define BLAME_DEFAULT_MOVE_SCORE 20 +#define BLAME_DEFAULT_COPY_SCORE 40 + /* bits #0..7 in revision.h, #8..11 used for merge_bases() in commit.c */ #define METAINFO_SHOWN (1u<<12) #define MORE_THAN_ONE_PATH (1u<<13) @@ -645,7 +654,8 @@ static int find_move_in_parent(struct scoreboard *sb, if (ent->suspect != target || ent->guilty) continue; find_copy_in_blob(sb, ent, parent, split, &file_p); - if (split[1].suspect) + if (split[1].suspect && + blame_move_score < ent_score(sb, &split[1])) split_blame(sb, split, ent); } free(blob_p); @@ -716,7 +726,8 @@ static int find_copy_in_parent(struct scoreboard *sb, find_copy_in_blob(sb, ent, norigin, this, &file_p); copy_split_if_better(sb, split, this); } - if (split[1].suspect) + if (split[1].suspect && + blame_copy_score < ent_score(sb, &split[1])) split_blame(sb, split, ent); } diff_flush(&diff_opts); @@ -1180,6 +1191,15 @@ static int has_path_in_work_tree(const char *path) return !lstat(path, &st); } +static unsigned parse_score(const char *arg) +{ + char *end; + unsigned long score = strtoul(arg, &end, 10); + if (*end) + return 0; + return score; +} + int cmd_pickaxe(int argc, const char **argv, const char *prefix) { struct rev_info revs; @@ -1209,12 +1229,15 @@ int cmd_pickaxe(int argc, const char **argv, const char *prefix) output_option |= OUTPUT_LONG_OBJECT_NAME; else if (!strcmp("-S", arg) && ++i < argc) revs_file = argv[i]; - else if (!strcmp("-M", arg)) + else if (!strncmp("-M", arg, 2)) { opt |= PICKAXE_BLAME_MOVE; - else if (!strcmp("-C", arg)) { + blame_move_score = parse_score(arg+2); + } + else if (!strncmp("-C", arg, 2)) { if (opt & PICKAXE_BLAME_COPY) opt |= PICKAXE_BLAME_COPY_HARDER; opt |= PICKAXE_BLAME_COPY | PICKAXE_BLAME_MOVE; + blame_copy_score = parse_score(arg+2); } else if (!strcmp("-L", arg) && ++i < argc) { char *term; @@ -1252,6 +1275,11 @@ int cmd_pickaxe(int argc, const char **argv, const char *prefix) argv[unk++] = arg; } + if (!blame_move_score) + blame_move_score = BLAME_DEFAULT_MOVE_SCORE; + if (!blame_copy_score) + blame_copy_score = BLAME_DEFAULT_COPY_SCORE; + /* We have collected options unknown to us in argv[1..unk] * which are to be passed to revision machinery if we are * going to do the "bottom" procesing.