From 4b5a808bb906896e07ca147645b33feb81e91c4c Mon Sep 17 00:00:00 2001 From: Victoria Dye Date: Fri, 20 May 2022 15:01:45 -0400 Subject: [PATCH 1/3] repack: respect --keep-pack with geometric repack Update 'repack' to ignore packs named on the command line with the '--keep-pack' option. Specifically, modify 'init_pack_geometry()' to treat command line-kept packs the same way it treats packs with an on-disk '.keep' file (that is, skip the pack and do not include it in the 'geometry' structure). Without this handling, a '--keep-pack' pack would be included in the 'geometry' structure. If the pack is *before* the geometry split line (with at least one other pack and/or loose objects present), 'repack' assumes the pack's contents are "rolled up" into another pack via 'pack-objects'. However, because the internally-invoked 'pack-objects' properly excludes '--keep-pack' objects, any new pack it creates will not contain the kept objects. Finally, 'repack' deletes the '--keep-pack' as "redundant" (since it assumes 'pack-objects' created a new pack with its contents), resulting in possible object loss and repository corruption. Add a test ensuring that '--keep-pack' packs are now appropriately handled. Co-authored-by: Taylor Blau Signed-off-by: Victoria Dye Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- builtin/repack.c | 50 +++++++++++++++++++++++++++---------- t/t7703-repack-geometric.sh | 28 +++++++++++++++++++++ 2 files changed, 65 insertions(+), 13 deletions(-) diff --git a/builtin/repack.c b/builtin/repack.c index d1a563d5b6..ea56e92ad5 100644 --- a/builtin/repack.c +++ b/builtin/repack.c @@ -137,6 +137,8 @@ static void collect_pack_filenames(struct string_list *fname_nonkept_list, string_list_append_nodup(fname_nonkept_list, fname); } closedir(dir); + + string_list_sort(fname_kept_list); } static void remove_redundant_pack(const char *dir_name, const char *base_name) @@ -332,17 +334,38 @@ static int geometry_cmp(const void *va, const void *vb) return 0; } -static void init_pack_geometry(struct pack_geometry **geometry_p) +static void init_pack_geometry(struct pack_geometry **geometry_p, + struct string_list *existing_kept_packs) { struct packed_git *p; struct pack_geometry *geometry; + struct strbuf buf = STRBUF_INIT; *geometry_p = xcalloc(1, sizeof(struct pack_geometry)); geometry = *geometry_p; for (p = get_all_packs(the_repository); p; p = p->next) { - if (!pack_kept_objects && p->pack_keep) - continue; + if (!pack_kept_objects) { + /* + * Any pack that has its pack_keep bit set will appear + * in existing_kept_packs below, but this saves us from + * doing a more expensive check. + */ + if (p->pack_keep) + continue; + + /* + * The pack may be kept via the --keep-pack option; + * check 'existing_kept_packs' to determine whether to + * ignore it. + */ + strbuf_reset(&buf); + strbuf_addstr(&buf, pack_basename(p)); + strbuf_strip_suffix(&buf, ".pack"); + + if (string_list_has_string(existing_kept_packs, buf.buf)) + continue; + } ALLOC_GROW(geometry->pack, geometry->pack_nr + 1, @@ -353,6 +376,7 @@ static void init_pack_geometry(struct pack_geometry **geometry_p) } QSORT(geometry->pack, geometry->pack_nr, geometry_cmp); + strbuf_release(&buf); } static void split_pack_geometry(struct pack_geometry *geometry, int factor) @@ -714,17 +738,20 @@ int cmd_repack(int argc, const char **argv, const char *prefix) strbuf_release(&path); } - if (geometric_factor) { - if (pack_everything) - die(_("options '%s' and '%s' cannot be used together"), "--geometric", "-A/-a"); - init_pack_geometry(&geometry); - split_pack_geometry(geometry, geometric_factor); - } - packdir = mkpathdup("%s/pack", get_object_directory()); packtmp_name = xstrfmt(".tmp-%d-pack", (int)getpid()); packtmp = mkpathdup("%s/%s", packdir, packtmp_name); + collect_pack_filenames(&existing_nonkept_packs, &existing_kept_packs, + &keep_pack_list); + + if (geometric_factor) { + if (pack_everything) + die(_("options '%s' and '%s' cannot be used together"), "--geometric", "-A/-a"); + init_pack_geometry(&geometry, &existing_kept_packs); + split_pack_geometry(geometry, geometric_factor); + } + sigchain_push_common(remove_pack_on_signal); prepare_pack_objects(&cmd, &po_args); @@ -764,9 +791,6 @@ int cmd_repack(int argc, const char **argv, const char *prefix) if (use_delta_islands) strvec_push(&cmd.args, "--delta-islands"); - collect_pack_filenames(&existing_nonkept_packs, &existing_kept_packs, - &keep_pack_list); - if (pack_everything & ALL_INTO_ONE) { repack_promisor_objects(&po_args, &names); diff --git a/t/t7703-repack-geometric.sh b/t/t7703-repack-geometric.sh index bdbbcbf1ec..91bb2b37a8 100755 --- a/t/t7703-repack-geometric.sh +++ b/t/t7703-repack-geometric.sh @@ -180,6 +180,34 @@ test_expect_success '--geometric ignores kept packs' ' ) ' +test_expect_success '--geometric ignores --keep-pack packs' ' + git init geometric && + test_when_finished "rm -fr geometric" && + ( + cd geometric && + + # Create two equal-sized packs + test_commit kept && # 3 objects + git repack -d && + test_commit pack && # 3 objects + git repack -d && + + find $objdir/pack -type f -name "*.pack" | sort >packs.before && + git repack --geometric 2 -dm \ + --keep-pack="$(basename "$(head -n 1 packs.before)")" >out && + find $objdir/pack -type f -name "*.pack" | sort >packs.after && + + # Packs should not have changed (only one non-kept pack, no + # loose objects), but $midx should now exist. + grep "Nothing new to pack" out && + test_path_is_file $midx && + + test_cmp packs.before packs.after && + + git fsck + ) +' + test_expect_success '--geometric chooses largest MIDX preferred pack' ' git init geometric && test_when_finished "rm -fr geometric" && From aab7bea14fee0f185b26413bf27a1cfefbe0114d Mon Sep 17 00:00:00 2001 From: Taylor Blau Date: Fri, 20 May 2022 15:01:48 -0400 Subject: [PATCH 2/3] t7703: demonstrate object corruption with pack.packSizeLimit When doing a `--geometric=` repack, `git repack` determines a splitting point among packs ordered by their object count such that: - each pack above the split has at least `` times as many objects as the next-largest pack by object count, and - the first pack above the split has at least `` times as many object as the sum of all packs below the split line combined `git repack` then creates a pack containing all of the objects contained in packs below the split line by running `git pack-objects --stdin-packs` underneath. Once packs are moved into place, then any packs below the split line are removed, since their objects were just combined into a new pack. But `git repack` tries to be careful to avoid removing a pack that it just wrote, by checking: struct packed_git *p = geometry->pack[i]; if (string_list_has_string(&names, hash_to_hex(p->hash))) continue; in the `delete_redundant` and `geometric` conditional towards the end of `cmd_repack`. But it's possible to trick `git repack` into not recognizing a pack that it just wrote when `names` is out-of-order (which violates `string_list_has_string()`'s assumption that the list is sorted and thus binary search-able). When this happens in just the right circumstances, it is possible to remove a pack that we just wrote, leading to object corruption. Luckily, this is quite difficult to provoke in practice (for a couple of reasons): - we ordinarily write just one pack, so `names` usually contains just one entry, and is thus sorted - when we do write more than one pack (e.g., due to `--max-pack-size`) we have to: (a) write a pack identical to one that already exists, (b) have that pack be below the split line, and (c) have the set of packs written by `pack-objects` occur in an order which tricks `string_list_has_string()`. Demonstrate the above scenario in a failing test, which causes `git repack --geometric` to write a pack which occurs below the split line, _and_ fail to recognize that it wrote that pack. The following patch will fix this bug. Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- t/t7703-repack-geometric.sh | 47 +++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/t/t7703-repack-geometric.sh b/t/t7703-repack-geometric.sh index 91bb2b37a8..2cd1de7295 100755 --- a/t/t7703-repack-geometric.sh +++ b/t/t7703-repack-geometric.sh @@ -7,6 +7,7 @@ test_description='git repack --geometric works correctly' GIT_TEST_MULTI_PACK_INDEX=0 objdir=.git/objects +packdir=$objdir/pack midx=$objdir/pack/multi-pack-index test_expect_success '--geometric with no packs' ' @@ -230,4 +231,50 @@ test_expect_success '--geometric chooses largest MIDX preferred pack' ' ) ' +test_expect_failure '--geometric with pack.packSizeLimit' ' + git init pack-rewrite && + test_when_finished "rm -fr pack-rewrite" && + ( + cd pack-rewrite && + + test-tool genrandom foo 1048576 >foo && + test-tool genrandom bar 1048576 >bar && + + git add foo bar && + test_tick && + git commit -m base && + + git rev-parse HEAD:foo HEAD:bar >p1.objects && + git rev-parse HEAD HEAD^{tree} >p2.objects && + + # These two packs each contain two objects, so the following + # `--geometric` repack will try to combine them. + p1="$(git pack-objects $packdir/pack Date: Fri, 20 May 2022 15:01:51 -0400 Subject: [PATCH 3/3] builtin/repack.c: ensure that `names` is sorted The previous patch demonstrates a scenario where the list of packs written by `pack-objects` (and stored in the `names` string_list) is out-of-order, and can thus cause us to delete packs we shouldn't. This patch resolves that bug by ensuring that `names` is sorted in all cases, not just when delete_redundant && pack_everything & ALL_INTO_ONE is true. Because we did sort `names` in that case (which, prior to `--geometric` repacks, was the only time we would actually delete packs, this is only a bug for `--geometric` repacks. It would be sufficient to only sort `names` when `delete_redundant` is set to a non-zero value. But sorting a small list of strings is cheap, and it is defensive against future calls to `string_list_has_string()` on this list. Co-discovered-by: Victoria Dye Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- builtin/repack.c | 3 ++- t/t7703-repack-geometric.sh | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/builtin/repack.c b/builtin/repack.c index ea56e92ad5..0e4aae80c0 100644 --- a/builtin/repack.c +++ b/builtin/repack.c @@ -856,6 +856,8 @@ int cmd_repack(int argc, const char **argv, const char *prefix) if (!names.nr && !po_args.quiet) printf_ln(_("Nothing new to pack.")); + string_list_sort(&names); + for_each_string_list_item(item, &names) { item->util = (void *)(uintptr_t)populate_pack_exts(item->string); } @@ -896,7 +898,6 @@ int cmd_repack(int argc, const char **argv, const char *prefix) if (delete_redundant && pack_everything & ALL_INTO_ONE) { const int hexsz = the_hash_algo->hexsz; - string_list_sort(&names); for_each_string_list_item(item, &existing_nonkept_packs) { char *sha1; size_t len = strlen(item->string); diff --git a/t/t7703-repack-geometric.sh b/t/t7703-repack-geometric.sh index 2cd1de7295..da87f8b2d8 100755 --- a/t/t7703-repack-geometric.sh +++ b/t/t7703-repack-geometric.sh @@ -231,7 +231,7 @@ test_expect_success '--geometric chooses largest MIDX preferred pack' ' ) ' -test_expect_failure '--geometric with pack.packSizeLimit' ' +test_expect_success '--geometric with pack.packSizeLimit' ' git init pack-rewrite && test_when_finished "rm -fr pack-rewrite" && (