From 26fc7b59cd00ee4042494b0a01afbda62c9d5b1a Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Mon, 23 Feb 2026 17:00:06 +0100 Subject: [PATCH 1/4] t/helper: improve "genrandom" test helper The `test-tool genrandom` test helper can be used to generate random data, either as an infinite stream or with a specified number of bytes. The way we handle parsing the number of bytes is lacking though: - We don't have good error handling, so if the caller for example uses `test-tool genrandom 200xyz` then we'll end up generating 200 bytes of random data successfully. - Many callers want to generate e.g. 1 kilobyte or megabyte of data, but they have to either use unwieldy numbers like 1048576, or they have to precompute them. Fix both of these issues by using `git_parse_ulong()` to parse the argument. This function has better error handling, and it knows to handle unit suffixes. Adapt a couple of our tests to use suffixes instead of manual computations. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- t/helper/test-genrandom.c | 5 ++++- t/t1006-cat-file.sh | 2 +- t/t1050-large.sh | 6 +++--- t/t1450-fsck.sh | 2 +- t/t5301-sliding-window.sh | 2 +- t/t5310-pack-bitmaps.sh | 2 +- t/t5710-promisor-remote-capability.sh | 4 ++-- t/t7700-repack.sh | 6 +++--- 8 files changed, 16 insertions(+), 13 deletions(-) diff --git a/t/helper/test-genrandom.c b/t/helper/test-genrandom.c index 51b67f2f87..d681961abb 100644 --- a/t/helper/test-genrandom.c +++ b/t/helper/test-genrandom.c @@ -6,6 +6,7 @@ #include "test-tool.h" #include "git-compat-util.h" +#include "parse.h" int cmd__genrandom(int argc, const char **argv) { @@ -22,7 +23,9 @@ int cmd__genrandom(int argc, const char **argv) next = next * 11 + *c; } while (*c++); - count = (argc == 3) ? 
strtoul(argv[2], NULL, 0) : ULONG_MAX; + count = ULONG_MAX; + if (argc == 3 && !git_parse_ulong(argv[2], &count)) + return error_errno("cannot parse argument '%s'", argv[2]); while (count--) { next = next * 1103515245 + 12345; diff --git a/t/t1006-cat-file.sh b/t/t1006-cat-file.sh index 0eee3bb878..5499be8dc9 100755 --- a/t/t1006-cat-file.sh +++ b/t/t1006-cat-file.sh @@ -643,7 +643,7 @@ test_expect_success 'object reference via commit text search' ' ' test_expect_success 'setup blobs which are likely to delta' ' - test-tool genrandom foo 10240 >foo && + test-tool genrandom foo 10k >foo && { cat foo && echo plus; } >foo-plus && git add foo foo-plus && git commit -m foo && diff --git a/t/t1050-large.sh b/t/t1050-large.sh index 5be273611a..7d40d08521 100755 --- a/t/t1050-large.sh +++ b/t/t1050-large.sh @@ -104,9 +104,9 @@ test_expect_success 'packsize limit' ' # mid1 and mid2 will fit within 256k limit but # appending mid3 will bust the limit and will # result in a separate packfile. - test-tool genrandom "a" $(( 66 * 1024 )) >mid1 && - test-tool genrandom "b" $(( 80 * 1024 )) >mid2 && - test-tool genrandom "c" $(( 128 * 1024 )) >mid3 && + test-tool genrandom "a" 66k >mid1 && + test-tool genrandom "b" 80k >mid2 && + test-tool genrandom "c" 128k >mid3 && git add mid1 mid2 mid3 && count=0 && diff --git a/t/t1450-fsck.sh b/t/t1450-fsck.sh index 3fae05f9d9..8fb79b3e5d 100755 --- a/t/t1450-fsck.sh +++ b/t/t1450-fsck.sh @@ -918,7 +918,7 @@ test_expect_success 'fsck detects trailing loose garbage (large blob)' ' test_expect_success 'fsck detects truncated loose object' ' # make it big enough that we know we will truncate in the data # portion, not the header - test-tool genrandom truncate 4096 >file && + test-tool genrandom truncate 4k >file && blob=$(git hash-object -w file) && file=$(sha1_file $blob) && test_when_finished "remove_object $blob" && diff --git a/t/t5301-sliding-window.sh b/t/t5301-sliding-window.sh index ff6b5159a3..3c3666b278 100755 --- 
a/t/t5301-sliding-window.sh +++ b/t/t5301-sliding-window.sh @@ -12,7 +12,7 @@ test_expect_success 'setup' ' for i in a b c do echo $i >$i && - test-tool genrandom "$i" 32768 >>$i && + test-tool genrandom "$i" 32k >>$i && git update-index --add $i || return 1 done && echo d >d && cat c >>d && git update-index --add d && diff --git a/t/t5310-pack-bitmaps.sh b/t/t5310-pack-bitmaps.sh index 6718fb98c0..3e3366f57d 100755 --- a/t/t5310-pack-bitmaps.sh +++ b/t/t5310-pack-bitmaps.sh @@ -242,7 +242,7 @@ test_bitmap_cases () { ' test_expect_success 'splitting packs does not generate bogus bitmaps' ' - test-tool genrandom foo $((1024 * 1024)) >rand && + test-tool genrandom foo 1m >rand && git add rand && git commit -m "commit with big file" && git -c pack.packSizeLimit=500k repack -adb && diff --git a/t/t5710-promisor-remote-capability.sh b/t/t5710-promisor-remote-capability.sh index 023735d6a8..66af84cd56 100755 --- a/t/t5710-promisor-remote-capability.sh +++ b/t/t5710-promisor-remote-capability.sh @@ -20,7 +20,7 @@ test_expect_success 'setup: create "template" repository' ' test_commit -C template 1 && test_commit -C template 2 && test_commit -C template 3 && - test-tool genrandom foo 10240 >template/foo && + test-tool genrandom foo 10k >template/foo && git -C template add foo && git -C template commit -m foo ' @@ -376,7 +376,7 @@ test_expect_success "clone with promisor.advertise set to 'true' but don't delet test_expect_success "setup for subsequent fetches" ' # Generate new commit with large blob - test-tool genrandom bar 10240 >template/bar && + test-tool genrandom bar 10k >template/bar && git -C template add bar && git -C template commit -m bar && diff --git a/t/t7700-repack.sh b/t/t7700-repack.sh index 73b78bdd88..439ab24d23 100755 --- a/t/t7700-repack.sh +++ b/t/t7700-repack.sh @@ -319,7 +319,7 @@ test_expect_success 'no bitmaps created if .keep files present' ' test_expect_success 'auto-bitmaps do not complain if unavailable' ' test_config -C bare.git 
pack.packSizeLimit 1M && - blob=$(test-tool genrandom big $((1024*1024)) | + blob=$(test-tool genrandom big 1m | git -C bare.git hash-object -w --stdin) && git -C bare.git update-ref refs/tags/big $blob && @@ -495,9 +495,9 @@ test_expect_success '--filter works with --max-pack-size' ' cd max-pack-size && test_commit base && # two blobs which exceed the maximum pack size - test-tool genrandom foo 1048576 >foo && + test-tool genrandom foo 1m >foo && git hash-object -w foo && - test-tool genrandom bar 1048576 >bar && + test-tool genrandom bar 1m >bar && git hash-object -w bar && git add foo bar && git commit -m "adding foo and bar" From 10a6762719f612bb5edc554e62239a744bbc4283 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Mon, 23 Feb 2026 17:00:07 +0100 Subject: [PATCH 2/4] object-file: adapt `stream_object_signature()` to take a stream The function `stream_object_signature()` is responsible for verifying whether the given object ID matches the actual hash of the object's contents. In contrast to `check_object_signature()` it does so in a streaming fashion so that we don't have to load the full object into memory. In a subsequent commit we'll want to adapt one of its callsites to pass a preconstructed stream. Prepare for this by accepting a stream as input that the caller needs to assemble. While at it, improve the error reporting in `parse_object_with_flags()` to tell apart the two failure modes. Helped-by: Jeff King Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- object-file.c | 10 +++------- object-file.h | 4 +++- object.c | 19 ++++++++++++++++--- pack-check.c | 12 +++++++++--- 4 files changed, 31 insertions(+), 14 deletions(-) diff --git a/object-file.c b/object-file.c index e7e4c3348f..a59b7abaaf 100644 --- a/object-file.c +++ b/object-file.c @@ -129,18 +129,15 @@ int check_object_signature(struct repository *r, const struct object_id *oid, return !oideq(oid, &real_oid) ? 
-1 : 0; } -int stream_object_signature(struct repository *r, const struct object_id *oid) +int stream_object_signature(struct repository *r, + struct odb_read_stream *st, + const struct object_id *oid) { struct object_id real_oid; - struct odb_read_stream *st; struct git_hash_ctx c; char hdr[MAX_HEADER_LEN]; int hdrlen; - st = odb_read_stream_open(r->objects, oid, NULL); - if (!st) - return -1; - /* Generate the header */ hdrlen = format_object_header(hdr, sizeof(hdr), st->type, st->size); @@ -160,7 +157,6 @@ int stream_object_signature(struct repository *r, const struct object_id *oid) git_hash_update(&c, buf, readlen); } git_hash_final_oid(&real_oid, &c); - odb_read_stream_close(st); return !oideq(oid, &real_oid) ? -1 : 0; } diff --git a/object-file.h b/object-file.h index 1229d5f675..6936fd0fef 100644 --- a/object-file.h +++ b/object-file.h @@ -164,7 +164,9 @@ int check_object_signature(struct repository *r, const struct object_id *oid, * Try reading the object named with "oid" using * the streaming interface and rehash it to do the same. 
*/ -int stream_object_signature(struct repository *r, const struct object_id *oid); +int stream_object_signature(struct repository *r, + struct odb_read_stream *stream, + const struct object_id *oid); enum finalize_object_file_flags { FOF_SKIP_COLLISION_CHECK = 1, diff --git a/object.c b/object.c index 4669b8d65e..9d2c676b16 100644 --- a/object.c +++ b/object.c @@ -6,6 +6,7 @@ #include "object.h" #include "replace-object.h" #include "object-file.h" +#include "odb/streaming.h" #include "blob.h" #include "statinfo.h" #include "tree.h" @@ -330,9 +331,21 @@ struct object *parse_object_with_flags(struct repository *r, if ((!obj || obj->type == OBJ_NONE || obj->type == OBJ_BLOB) && odb_read_object_info(r->objects, oid, NULL) == OBJ_BLOB) { - if (!skip_hash && stream_object_signature(r, repl) < 0) { - error(_("hash mismatch %s"), oid_to_hex(oid)); - return NULL; + if (!skip_hash) { + struct odb_read_stream *stream = odb_read_stream_open(r->objects, oid, NULL); + + if (!stream) { + error(_("unable to open object stream for %s"), oid_to_hex(oid)); + return NULL; + } + + if (stream_object_signature(r, stream, repl) < 0) { + error(_("hash mismatch %s"), oid_to_hex(oid)); + odb_read_stream_close(stream); + return NULL; + } + + odb_read_stream_close(stream); } parse_blob_buffer(lookup_blob(r, oid)); return lookup_object(r, oid); diff --git a/pack-check.c b/pack-check.c index 67cb2cf72f..46782a29d5 100644 --- a/pack-check.c +++ b/pack-check.c @@ -9,6 +9,7 @@ #include "packfile.h" #include "object-file.h" #include "odb.h" +#include "odb/streaming.h" struct idx_entry { off_t offset; @@ -104,6 +105,7 @@ static int verify_packfile(struct repository *r, QSORT(entries, nr_objects, compare_entries); for (i = 0; i < nr_objects; i++) { + struct odb_read_stream *stream = NULL; void *data; struct object_id oid; enum object_type type; @@ -152,7 +154,9 @@ static int verify_packfile(struct repository *r, type) < 0) err = error("packed %s from %s is corrupt", oid_to_hex(&oid), p->pack_name); - 
else if (!data && stream_object_signature(r, &oid) < 0) + else if (!data && + (!(stream = odb_read_stream_open(r->objects, &oid, NULL)) || + stream_object_signature(r, stream, &oid) < 0)) err = error("packed %s from %s is corrupt", oid_to_hex(&oid), p->pack_name); else if (fn) { @@ -163,12 +167,14 @@ static int verify_packfile(struct repository *r, } if (((base_count + i) & 1023) == 0) display_progress(progress, base_count + i); - free(data); + if (stream) + odb_read_stream_close(stream); + free(data); } + display_progress(progress, base_count + i); free(entries); - return err; } From 41b42e3527eb6b0ab4b557d16adedcd255c9045f Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Mon, 23 Feb 2026 17:00:08 +0100 Subject: [PATCH 3/4] packfile: expose function to read object stream for an offset The function `packfile_store_read_object_stream()` takes as input an object ID and then constructs a `struct odb_read_stream` from it. In a subsequent commit we'll want to create an object stream for a given combination of packfile and offset though, which is not something that can currently be done. Extract a new function `packfile_read_object_stream()` that makes this functionality available. 
Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- packfile.c | 40 ++++++++++++++++++++++++---------------- packfile.h | 5 +++++ 2 files changed, 29 insertions(+), 16 deletions(-) diff --git a/packfile.c b/packfile.c index 402c3b5dc7..3e61176128 100644 --- a/packfile.c +++ b/packfile.c @@ -2553,32 +2553,28 @@ static int close_istream_pack_non_delta(struct odb_read_stream *_st) return 0; } -int packfile_store_read_object_stream(struct odb_read_stream **out, - struct packfile_store *store, - const struct object_id *oid) +int packfile_read_object_stream(struct odb_read_stream **out, + const struct object_id *oid, + struct packed_git *pack, + off_t offset) { struct odb_packed_read_stream *stream; struct pack_window *window = NULL; - struct object_info oi = OBJECT_INFO_INIT; enum object_type in_pack_type; unsigned long size; - oi.sizep = &size; + in_pack_type = unpack_object_header(pack, &window, &offset, &size); + unuse_pack(&window); - if (packfile_store_read_object_info(store, oid, &oi, 0) || - oi.u.packed.type == PACKED_OBJECT_TYPE_REF_DELTA || - oi.u.packed.type == PACKED_OBJECT_TYPE_OFS_DELTA || - repo_settings_get_big_file_threshold(store->source->odb->repo) >= size) + if (repo_settings_get_big_file_threshold(pack->repo) >= size) return -1; - in_pack_type = unpack_object_header(oi.u.packed.pack, - &window, - &oi.u.packed.offset, - &size); - unuse_pack(&window); switch (in_pack_type) { default: return -1; /* we do not do deltas for now */ + case OBJ_BAD: + mark_bad_packed_object(pack, oid); + return -1; case OBJ_COMMIT: case OBJ_TREE: case OBJ_BLOB: @@ -2592,10 +2588,22 @@ int packfile_store_read_object_stream(struct odb_read_stream **out, stream->base.type = in_pack_type; stream->base.size = size; stream->z_state = ODB_PACKED_READ_STREAM_UNINITIALIZED; - stream->pack = oi.u.packed.pack; - stream->pos = oi.u.packed.offset; + stream->pack = pack; + stream->pos = offset; *out = &stream->base; return 0; } + +int 
packfile_store_read_object_stream(struct odb_read_stream **out, + struct packfile_store *store, + const struct object_id *oid) +{ + struct pack_entry e; + + if (!find_pack_entry(store, oid, &e)) + return -1; + + return packfile_read_object_stream(out, oid, e.p, e.offset); +} diff --git a/packfile.h b/packfile.h index acc5c55ad5..b9f5f1c18c 100644 --- a/packfile.h +++ b/packfile.h @@ -436,6 +436,11 @@ off_t get_delta_base(struct packed_git *p, struct pack_window **w_curs, off_t *curpos, enum object_type type, off_t delta_obj_offset); +int packfile_read_object_stream(struct odb_read_stream **out, + const struct object_id *oid, + struct packed_git *pack, + off_t offset); + void release_pack_memory(size_t); /* global flag to enable extra checks when accessing packed objects */ From 13eb65d36615d7269df053015bddf7987ef6d923 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Mon, 23 Feb 2026 17:00:09 +0100 Subject: [PATCH 4/4] pack-check: fix verification of large objects It was reported [1] that git-fsck(1) may sometimes run into an infinite loop when processing packfiles. This bug was bisected to c31bad4f7d (packfile: track packs via the MRU list exclusively, 2025-10-30), which refactored our list of packfiles to only be tracked via an MRU list, exclusively. This isn't entirely surprising: any caller that iterates through the list of packfiles and then hits `find_pack_entry()`, for example because they read an object from it, may cause the MRU list to be updated. And if the caller is unlucky, this may cause the mentioned infinite loop. While this mechanism is somewhat fragile, it is still surprising that we encounter it when verifying the packfile. We iterate through objects in a given pack one by one and then read them via their offset, and doing this shouldn't ever end up in `find_pack_entry()`. But there is an edge case here: when the object in question is a blob bigger than "core.largeFileThreshold", then we will be careful to not read it into memory. 
Instead, we read it via an object stream by calling `odb_read_object_stream()`, and that function will perform an object lookup via `odb_read_object_info()`. So in the case where there are at least two blobs in two different packfiles, and both of these blobs exceed "core.largeFileThreshold", then we'll run into an infinite loop because we'll always update the MRU. We could fix this by improving `repo_for_each_pack()` to not update the MRU, and this would address the issue. But the fun part is that using `odb_read_object_stream()` is the wrong thing to do in the first place: it may open _any_ instance of this object, so we ultimately cannot be sure that we even verified the object in our given packfile. Fix this bug by creating the object stream for the packed object directly via `packfile_read_object_stream()`. Add a test that would have caused the infinite loop. [1]: <20260222183710.2963424-1-sandals@crustytoothpaste.net> Reported-by: brian m. carlson Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- pack-check.c | 2 +- t/t1450-fsck.sh | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 1 deletion(-) diff --git a/pack-check.c b/pack-check.c index 46782a29d5..7378c80730 100644 --- a/pack-check.c +++ b/pack-check.c @@ -155,7 +155,7 @@ static int verify_packfile(struct repository *r, err = error("packed %s from %s is corrupt", oid_to_hex(&oid), p->pack_name); else if (!data && - (!(stream = odb_read_stream_open(r->objects, &oid, NULL)) || + (packfile_read_object_stream(&stream, &oid, p, entries[i].offset) < 0 || stream_object_signature(r, stream, &oid) < 0)) err = error("packed %s from %s is corrupt", oid_to_hex(&oid), p->pack_name); diff --git a/t/t1450-fsck.sh b/t/t1450-fsck.sh index 8fb79b3e5d..54e81c2636 100755 --- a/t/t1450-fsck.sh +++ b/t/t1450-fsck.sh @@ -852,6 +852,44 @@ test_expect_success 'fsck errors in packed objects' ' ! 
grep corrupt out ' +test_expect_success 'fsck handles multiple packfiles with big blobs' ' + test_when_finished "rm -rf repo" && + git init repo && + ( + cd repo && + + # We construct two packfiles with two objects in common and one + # object not in common. The objects in common can then be + # corrupted in one of the packfiles, respectively. The other + # objects that are unique to the packs are merely used to not + # have both packs contain the same data. + blob_one=$(test-tool genrandom one 200k | git hash-object -t blob -w --stdin) && + blob_two=$(test-tool genrandom two 200k | git hash-object -t blob -w --stdin) && + blob_three=$(test-tool genrandom three 200k | git hash-object -t blob -w --stdin) && + blob_four=$(test-tool genrandom four 200k | git hash-object -t blob -w --stdin) && + pack_one=$(printf "%s\n" "$blob_one" "$blob_two" "$blob_three" | git pack-objects .git/objects/pack/pack) && + pack_two=$(printf "%s\n" "$blob_two" "$blob_three" "$blob_four" | git pack-objects .git/objects/pack/pack) && + chmod a+w .git/objects/pack/pack-*.pack && + + # Corrupt blob two in the first pack. + git verify-pack -v .git/objects/pack/pack-$pack_one >objects && + offset_one=$(sed objects && + offset_two=$(sed err && + test_grep "unknown object type 0 at offset $offset_one in .git/objects/pack/pack-$pack_one.pack" err && + test_grep "unknown object type 0 at offset $offset_two in .git/objects/pack/pack-$pack_two.pack" err + ) +' + test_expect_success 'fsck fails on corrupt packfile' ' hsh=$(git commit-tree -m mycommit HEAD^{tree}) && pack=$(echo $hsh | git pack-objects .git/objects/pack/pack) &&