From 53eeed0a81dbd486a84b3252f35642c4cc2e9488 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Fri, 16 May 2025 00:49:28 -0400 Subject: [PATCH 01/13] object-file.h: fix typo in variable declaration This should be "compat", not "comapt". Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- object-file.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/object-file.h b/object-file.h index a85b2e5b49..fd715663fb 100644 --- a/object-file.h +++ b/object-file.h @@ -180,7 +180,7 @@ enum { int write_object_file_flags(const void *buf, unsigned long len, enum object_type type, struct object_id *oid, - struct object_id *comapt_oid_in, unsigned flags); + struct object_id *compat_oid_in, unsigned flags); static inline int write_object_file(const void *buf, unsigned long len, enum object_type type, struct object_id *oid) { From f227fc7d43d9607edb286eaab0f7714a2f1e4659 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Fri, 16 May 2025 00:49:35 -0400 Subject: [PATCH 02/13] cat-file: make --allow-unknown-type a noop The cat-file command has some minor support for handling objects with "unknown" types. I.e., strings that are not "blob", "commit", "tree", or "tag". In theory this could be used for debugging or experimenting with extensions to Git. But in practice this support is not very useful: 1. You can get the type and size of such objects, but nothing else. Not even the contents! 2. Only loose objects are supported, since packfiles use numeric ids for the types, rather than strings. 3. Likewise you cannot ever transfer objects between repositories, because they cannot be represented in the packfiles used for the on-the-wire protocol. The support for these unknown types complicates the object-parsing code, and has led to bugs such as b748ddb7a4 (unpack_loose_header(): fix infinite loop on broken zlib input, 2025-02-25). So let's drop it. The first step is to remove the user-facing parts, which are accessible only via cat-file. This is technically backwards-incompatible, but given the limitations listed above, these objects couldn't possibly be useful in any workflow. However, we can't just rip out the option entirely. That would hurt a caller who ran: git cat-file -t --allow-unknown-object and fed it normal, well-formed objects. There --allow-unknown-type was doing nothing, but we wouldn't want to start bailing with an error. So to protect any such callers, we'll retain --allow-unknown-type as a noop. The code change is fairly small (but we'll able to clean up more code in follow-on patches). The test updates drop any use of the option. We still retain tests that feed the broken objects to cat-file without --allow-unknown-type, as we should continue to confirm that those objects are rejected. Note that in one spot we can drop a layer of loop, re-indenting the body; viewing the diff with "-w" helps there. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- Documentation/git-cat-file.adoc | 6 +- builtin/cat-file.c | 18 +-- t/t1006-cat-file.sh | 217 ++++++++------------------------ 3 files changed, 59 insertions(+), 182 deletions(-) diff --git a/Documentation/git-cat-file.adoc b/Documentation/git-cat-file.adoc index fc4b92f104..cde79ad242 100644 --- a/Documentation/git-cat-file.adoc +++ b/Documentation/git-cat-file.adoc @@ -9,8 +9,7 @@ SYNOPSIS -------- [verse] 'git cat-file' -'git cat-file' (-e | -p) -'git cat-file' (-t | -s) [--allow-unknown-type] +'git cat-file' (-e | -p | -t | -s) 'git cat-file' (--textconv | --filters) [: | --path= ] 'git cat-file' (--batch | --batch-check | --batch-command) [--batch-all-objects] @@ -202,9 +201,6 @@ flush:: only once, even if it is stored multiple times in the repository. ---allow-unknown-type:: - Allow `-s` or `-t` to query broken/corrupt objects of unknown type. - --follow-symlinks:: With `--batch` or `--batch-check`, follow symlinks inside the repository when requesting objects with extended SHA-1 diff --git a/builtin/cat-file.c b/builtin/cat-file.c index 3914a2a3f6..4adc19aa29 100644 --- a/builtin/cat-file.c +++ b/builtin/cat-file.c @@ -100,8 +100,7 @@ static int stream_blob(const struct object_id *oid) return 0; } -static int cat_one_file(int opt, const char *exp_type, const char *obj_name, - int unknown_type) +static int cat_one_file(int opt, const char *exp_type, const char *obj_name) { int ret; struct object_id oid; @@ -121,9 +120,6 @@ static int cat_one_file(int opt, const char *exp_type, const char *obj_name, if (!path && opt_cw) get_oid_flags |= GET_OID_REQUIRE_PATH; - if (unknown_type) - flags |= OBJECT_INFO_ALLOW_UNKNOWN_TYPE; - if (get_oid_with_context(the_repository, obj_name, get_oid_flags, &oid, &obj_context)) die("Not a valid object name %s", obj_name); @@ -1038,8 +1034,7 @@ int cmd_cat_file(int argc, const char * const builtin_catfile_usage[] = { N_("git cat-file "), - N_("git cat-file (-e | -p) "), - N_("git cat-file (-t | -s) [--allow-unknown-type] "), + N_("git cat-file (-e | -p | -t | -s) "), N_("git cat-file (--textconv | --filters)\n" " [: | --path= ]"), N_("git cat-file (--batch | --batch-check | --batch-command) [--batch-all-objects]\n" @@ -1057,8 +1052,8 @@ int cmd_cat_file(int argc, OPT_GROUP(N_("Emit [broken] object attributes")), OPT_CMDMODE('t', NULL, &opt, N_("show object type (one of 'blob', 'tree', 'commit', 'tag', ...)"), 't'), OPT_CMDMODE('s', NULL, &opt, N_("show object size"), 's'), - OPT_BOOL(0, "allow-unknown-type", &unknown_type, - N_("allow -s and -t to work with broken/corrupt objects")), + OPT_HIDDEN_BOOL(0, "allow-unknown-type", &unknown_type, + N_("historical option -- no-op")), OPT_BOOL(0, "use-mailmap", &use_mailmap, N_("use mail map file")), OPT_ALIAS(0, "mailmap", "use-mailmap"), /* Batch mode */ @@ -1209,10 +1204,7 @@ int cmd_cat_file(int argc, obj_name = argv[1]; } - if (unknown_type && opt != 't' && opt != 's') - die("git cat-file --allow-unknown-type: use with -s or -t"); - - ret = cat_one_file(opt, exp_type, obj_name, unknown_type); + ret = cat_one_file(opt, exp_type, obj_name); out: list_objects_filter_release(&batch.objects_filter); diff --git a/t/t1006-cat-file.sh b/t/t1006-cat-file.sh index ce8b27bf54..d96d02ad7d 100755 --- a/t/t1006-cat-file.sh +++ b/t/t1006-cat-file.sh @@ -136,18 +136,6 @@ $content" test_cmp expect actual ' - test_expect_success "Type of $type is correct using --allow-unknown-type" ' - echo $type >expect && - git cat-file -t --allow-unknown-type $oid >actual && - test_cmp expect actual - ' - - test_expect_success "Size of $type is correct using --allow-unknown-type" ' - echo $size >expect && - git cat-file -s --allow-unknown-type $oid >actual && - test_cmp expect actual - ' - test -z "$content" || test_expect_success "Content of $type is correct" ' echo_without_newline "$content" >expect && @@ -677,95 +665,67 @@ test_expect_success 'setup bogus data' ' bogus_long_oid=$(echo_without_newline "$bogus_long_content" | git hash-object -t $bogus_long_type --literally -w --stdin) ' -for arg1 in '' --allow-unknown-type +for arg1 in -s -t -p do - for arg2 in -s -t -p - do - if test "$arg1" = "--allow-unknown-type" && test "$arg2" = "-p" + test_expect_success "cat-file $arg1 error on bogus short OID" ' + cat >expect <<-\EOF && + fatal: invalid object type + EOF + + test_must_fail git cat-file $arg1 $bogus_short_oid >out 2>actual && + test_must_be_empty out && + test_cmp expect actual + ' + + test_expect_success "cat-file $arg1 error on bogus full OID" ' + if test "$arg1" = "-p" then - continue - fi - - - test_expect_success "cat-file $arg1 $arg2 error on bogus short OID" ' - cat >expect <<-\EOF && - fatal: invalid object type + cat >expect <<-EOF + error: header for $bogus_long_oid too long, exceeds 32 bytes + fatal: Not a valid object name $bogus_long_oid EOF - - if test "$arg1" = "--allow-unknown-type" - then - git cat-file $arg1 $arg2 $bogus_short_oid - else - test_must_fail git cat-file $arg1 $arg2 $bogus_short_oid >out 2>actual && - test_must_be_empty out && - test_cmp expect actual - fi - ' - - test_expect_success "cat-file $arg1 $arg2 error on bogus full OID" ' - if test "$arg2" = "-p" - then - cat >expect <<-EOF - error: header for $bogus_long_oid too long, exceeds 32 bytes - fatal: Not a valid object name $bogus_long_oid - EOF - else - cat >expect <<-EOF - error: header for $bogus_long_oid too long, exceeds 32 bytes - fatal: git cat-file: could not get object info - EOF - fi && - - if test "$arg1" = "--allow-unknown-type" - then - git cat-file $arg1 $arg2 $bogus_short_oid - else - test_must_fail git cat-file $arg1 $arg2 $bogus_long_oid >out 2>actual && - test_must_be_empty out && - test_cmp expect actual - fi - ' - - test_expect_success "cat-file $arg1 $arg2 error on missing short OID" ' - cat >expect.err <<-EOF && - fatal: Not a valid object name $(test_oid deadbeef_short) + else + cat >expect <<-EOF + error: header for $bogus_long_oid too long, exceeds 32 bytes + fatal: git cat-file: could not get object info EOF - test_must_fail git cat-file $arg1 $arg2 $(test_oid deadbeef_short) >out 2>err.actual && - test_must_be_empty out && - test_cmp expect.err err.actual - ' + fi && - test_expect_success "cat-file $arg1 $arg2 error on missing full OID" ' - if test "$arg2" = "-p" - then - cat >expect.err <<-EOF - fatal: Not a valid object name $(test_oid deadbeef) - EOF - else - cat >expect.err <<-\EOF - fatal: git cat-file: could not get object info - EOF - fi && - test_must_fail git cat-file $arg1 $arg2 $(test_oid deadbeef) >out 2>err.actual && - test_must_be_empty out && - test_cmp expect.err err.actual - ' - done + test_must_fail git cat-file $arg1 $bogus_long_oid >out 2>actual && + test_must_be_empty out && + test_cmp expect actual + ' + + test_expect_success "cat-file $arg1 error on missing short OID" ' + cat >expect.err <<-EOF && + fatal: Not a valid object name $(test_oid deadbeef_short) + EOF + test_must_fail git cat-file $arg1 $(test_oid deadbeef_short) >out 2>err.actual && + test_must_be_empty out && + test_cmp expect.err err.actual + ' + + test_expect_success "cat-file $arg1 error on missing full OID" ' + if test "$arg1" = "-p" + then + cat >expect.err <<-EOF + fatal: Not a valid object name $(test_oid deadbeef) + EOF + else + cat >expect.err <<-\EOF + fatal: git cat-file: could not get object info + EOF + fi && + test_must_fail git cat-file $arg1 $(test_oid deadbeef) >out 2>err.actual && + test_must_be_empty out && + test_cmp expect.err err.actual + ' done -test_expect_success '-e is OK with a broken object without --allow-unknown-type' ' +test_expect_success '-e is OK with a broken object' ' git cat-file -e $bogus_short_oid ' -test_expect_success '-e can not be combined with --allow-unknown-type' ' - test_expect_code 128 git cat-file -e --allow-unknown-type $bogus_short_oid -' - -test_expect_success '-p cannot print a broken object even with --allow-unknown-type' ' - test_must_fail git cat-file -p $bogus_short_oid && - test_expect_code 128 git cat-file -p --allow-unknown-type $bogus_short_oid -' - test_expect_success ' does not work with objects of broken types' ' cat >err.expect <<-\EOF && fatal: invalid object type "bogus" @@ -788,60 +748,8 @@ test_expect_success 'broken types combined with --batch and --batch-check' ' test_cmp err.expect err.actual ' -test_expect_success 'the --batch and --batch-check options do not combine with --allow-unknown-type' ' - test_expect_code 128 git cat-file --batch --allow-unknown-type expect <<-EOF && - $bogus_short_type - EOF - git cat-file -t --allow-unknown-type $bogus_short_oid >actual && - test_cmp expect actual && - - # Create it manually, as "git replace" will die on bogus - # types. - head=$(git rev-parse --verify HEAD) && - test_when_finished "test-tool ref-store main delete-refs 0 msg refs/replace/$bogus_short_oid" && - test-tool ref-store main update-ref msg "refs/replace/$bogus_short_oid" $head $ZERO_OID REF_SKIP_OID_VERIFICATION && - - cat >expect <<-EOF && - commit - EOF - git cat-file -t --allow-unknown-type $bogus_short_oid >actual && - test_cmp expect actual -' - -test_expect_success "Type of broken object is correct" ' - echo $bogus_short_type >expect && - git cat-file -t --allow-unknown-type $bogus_short_oid >actual && - test_cmp expect actual -' - -test_expect_success "Size of broken object is correct" ' - echo $bogus_short_size >expect && - git cat-file -s --allow-unknown-type $bogus_short_oid >actual && - test_cmp expect actual -' - -test_expect_success 'clean up broken object' ' - rm .git/objects/$(test_oid_to_path $bogus_short_oid) -' - -test_expect_success "Type of broken object is correct when type is large" ' - echo $bogus_long_type >expect && - git cat-file -t --allow-unknown-type $bogus_long_oid >actual && - test_cmp expect actual -' - -test_expect_success "Size of large broken object is correct when type is large" ' - echo $bogus_long_size >expect && - git cat-file -s --allow-unknown-type $bogus_long_oid >actual && - test_cmp expect actual -' - -test_expect_success 'clean up broken object' ' +test_expect_success 'clean up broken objects' ' + rm .git/objects/$(test_oid_to_path $bogus_short_oid) && rm .git/objects/$(test_oid_to_path $bogus_long_oid) ' @@ -903,25 +811,6 @@ test_expect_success 'cat-file -t and -s on corrupt loose object' ' ) ' -test_expect_success 'truncated object with --allow-unknown-type' - <<\EOT - objtype='a really long type name that exceeds the 32-byte limit' && - blob=$(git hash-object -w --literally -t "$objtype" /dev/null) && - objpath=.git/objects/$(test_oid_to_path "$blob") && - - # We want to truncate the object far enough in that we don't hit the - # end while inflating the first 32 bytes (since we want to have to dig - # for the trailing NUL of the header). But we don't want to go too far, - # since our header isn't very big. And of course we are counting - # deflated zlib bytes in the on-disk file, so it's a bit of a guess. - # Empirically 50 seems to work. - mv "$objpath" obj.bak && - test_when_finished 'mv obj.bak "$objpath"' && - test_copy_bytes 50 "$objpath" && - - test_must_fail git cat-file --allow-unknown-type -t $blob 2>err && - test_grep "unable to unpack $blob header" err -EOT - test_expect_success 'object reading handles zlib dictionary' - <<\EOT echo 'content that will be recompressed' >file && blob=$(git hash-object -w file) && From ae24b032a04ccd1565cb1ce13317b56daa77ce7f Mon Sep 17 00:00:00 2001 From: Jeff King Date: Fri, 16 May 2025 00:49:45 -0400 Subject: [PATCH 03/13] object-file: drop OBJECT_INFO_ALLOW_UNKNOWN_TYPE flag Since cat-file dropped its "--allow-unknown-type" option in the previous commit, there are no more uses of the internal flag that implemented it. Let's drop it. That in turn lets us drop the strbuf parameter of unpack_loose_header(), which now is always NULL. And without that, we can drop all of the additional code to inflate larger headers into the strbuf. Arguably we could drop ULHR_TOO_LONG, as no callers really care about the distinction from ULHR_BAD. But it's easy enough to retain, and it does let us produce a slightly more specific message in one instance. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- object-file.c | 45 +++++++-------------------------------------- object-file.h | 10 ++-------- object-store.h | 2 -- streaming.c | 2 +- 4 files changed, 10 insertions(+), 49 deletions(-) diff --git a/object-file.c b/object-file.c index dc56a4766d..1127e154f6 100644 --- a/object-file.c +++ b/object-file.c @@ -299,8 +299,7 @@ enum unpack_loose_header_result unpack_loose_header(git_zstream *stream, unsigned char *map, unsigned long mapsize, void *buffer, - unsigned long bufsiz, - struct strbuf *header) + unsigned long bufsiz) { int status; @@ -325,32 +324,9 @@ enum unpack_loose_header_result unpack_loose_header(git_zstream *stream, return ULHR_OK; /* - * We have a header longer than MAX_HEADER_LEN. The "header" - * here is only non-NULL when we run "cat-file - * --allow-unknown-type". + * We have a header longer than MAX_HEADER_LEN. */ - if (!header) - return ULHR_TOO_LONG; - - /* - * buffer[0..bufsiz] was not large enough. Copy the partial - * result out to header, and then append the result of further - * reading the stream. - */ - strbuf_add(header, buffer, stream->next_out - (unsigned char *)buffer); - - do { - stream->next_out = buffer; - stream->avail_out = bufsiz; - - obj_read_unlock(); - status = git_inflate(stream, 0); - obj_read_lock(); - strbuf_add(header, buffer, stream->next_out - (unsigned char *)buffer); - if (memchr(buffer, '\0', stream->next_out - (unsigned char *)buffer)) - return 0; - } while (status == Z_OK); - return ULHR_BAD; + return ULHR_TOO_LONG; } static void *unpack_loose_rest(git_zstream *stream, @@ -476,10 +452,8 @@ int loose_object_info(struct repository *r, void *map; git_zstream stream; char hdr[MAX_HEADER_LEN]; - struct strbuf hdrbuf = STRBUF_INIT; unsigned long size_scratch; enum object_type type_scratch; - int allow_unknown = flags & OBJECT_INFO_ALLOW_UNKNOWN_TYPE; if (oi->delta_base_oid) oidclr(oi->delta_base_oid, the_repository->hash_algo); @@ -521,18 +495,15 @@ int loose_object_info(struct repository *r, if (oi->disk_sizep) *oi->disk_sizep = mapsize; - switch (unpack_loose_header(&stream, map, mapsize, hdr, sizeof(hdr), - allow_unknown ? &hdrbuf : NULL)) { + switch (unpack_loose_header(&stream, map, mapsize, hdr, sizeof(hdr))) { case ULHR_OK: - if (parse_loose_header(hdrbuf.len ? hdrbuf.buf : hdr, oi) < 0) + if (parse_loose_header(hdr, oi) < 0) status = error(_("unable to parse %s header"), oid_to_hex(oid)); - else if (!allow_unknown && *oi->typep < 0) + else if (*oi->typep < 0) die(_("invalid object type")); if (!oi->contentp) break; - if (hdrbuf.len) - BUG("unpacking content with unknown types not yet supported"); *oi->contentp = unpack_loose_rest(&stream, hdr, *oi->sizep, oid); if (*oi->contentp) goto cleanup; @@ -558,7 +529,6 @@ cleanup: munmap(map, mapsize); if (oi->sizep == &size_scratch) oi->sizep = NULL; - strbuf_release(&hdrbuf); if (oi->typep == &type_scratch) oi->typep = NULL; oi->whence = OI_LOOSE; @@ -1682,8 +1652,7 @@ int read_loose_object(const char *path, goto out; } - if (unpack_loose_header(&stream, map, mapsize, hdr, sizeof(hdr), - NULL) != ULHR_OK) { + if (unpack_loose_header(&stream, map, mapsize, hdr, sizeof(hdr)) != ULHR_OK) { error(_("unable to unpack header of %s"), path); goto out_inflate; } diff --git a/object-file.h b/object-file.h index fd715663fb..a979fd5e4d 100644 --- a/object-file.h +++ b/object-file.h @@ -133,12 +133,7 @@ int format_object_header(char *str, size_t size, enum object_type type, * - ULHR_BAD on error * - ULHR_TOO_LONG if the header was too long * - * It will only parse up to MAX_HEADER_LEN bytes unless an optional - * "hdrbuf" argument is non-NULL. This is intended for use with - * OBJECT_INFO_ALLOW_UNKNOWN_TYPE to extract the bad type for (error) - * reporting. The full header will be extracted to "hdrbuf" for use - * with parse_loose_header(), ULHR_TOO_LONG will still be returned - * from this function to indicate that the header was too long. + * It will only parse up to MAX_HEADER_LEN bytes. */ enum unpack_loose_header_result { ULHR_OK, @@ -149,8 +144,7 @@ enum unpack_loose_header_result unpack_loose_header(git_zstream *stream, unsigned char *map, unsigned long mapsize, void *buffer, - unsigned long bufsiz, - struct strbuf *hdrbuf); + unsigned long bufsiz); /** * parse_loose_header() parses the starting " \0" of an diff --git a/object-store.h b/object-store.h index c2fe5a1960..cf908fe68e 100644 --- a/object-store.h +++ b/object-store.h @@ -240,8 +240,6 @@ struct object_info { /* Invoke lookup_replace_object() on the given hash */ #define OBJECT_INFO_LOOKUP_REPLACE 1 -/* Allow reading from a loose object file of unknown/bogus type */ -#define OBJECT_INFO_ALLOW_UNKNOWN_TYPE 2 /* Do not retry packed storage after checking packed and loose storage */ #define OBJECT_INFO_QUICK 8 /* diff --git a/streaming.c b/streaming.c index 127d6b5d6a..6d6512e2e0 100644 --- a/streaming.c +++ b/streaming.c @@ -238,7 +238,7 @@ static int open_istream_loose(struct git_istream *st, struct repository *r, return -1; switch (unpack_loose_header(&st->z, st->u.loose.mapped, st->u.loose.mapsize, st->u.loose.hdr, - sizeof(st->u.loose.hdr), NULL)) { + sizeof(st->u.loose.hdr))) { case ULHR_OK: break; case ULHR_BAD: From aac2abeca7077aa5f87f4132b98d37dd938b3573 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Fri, 16 May 2025 00:49:47 -0400 Subject: [PATCH 04/13] cat-file: use type enum instead of buffer for -t option Now that we no longer support OBJECT_INFO_ALLOW_UNKNOWN_TYPE, there is no need to pass a strbuf into oid_object_info_extended() to record the type. The regular object_type enum is sufficient to capture all of the types we will allow. This simplifies the code a bit, and will eventually let us drop object_info's type_name strbuf support. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- builtin/cat-file.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/builtin/cat-file.c b/builtin/cat-file.c index 4adc19aa29..67a5ff2b9e 100644 --- a/builtin/cat-file.c +++ b/builtin/cat-file.c @@ -109,7 +109,6 @@ static int cat_one_file(int opt, const char *exp_type, const char *obj_name) unsigned long size; struct object_context obj_context = {0}; struct object_info oi = OBJECT_INFO_INIT; - struct strbuf sb = STRBUF_INIT; unsigned flags = OBJECT_INFO_LOOKUP_REPLACE; unsigned get_oid_flags = GET_OID_RECORD_PATH | @@ -132,16 +131,12 @@ static int cat_one_file(int opt, const char *exp_type, const char *obj_name) buf = NULL; switch (opt) { case 't': - oi.type_name = &sb; + oi.typep = &type; if (oid_object_info_extended(the_repository, &oid, &oi, flags) < 0) die("git cat-file: could not get object info"); - if (sb.len) { - printf("%s\n", sb.buf); - strbuf_release(&sb); - ret = 0; - goto cleanup; - } - break; + printf("%s\n", type_name(type)); + ret = 0; + goto cleanup; case 's': oi.sizep = &size; From b32b434bfe241cde380c5f3aca48a1fdcd86961b Mon Sep 17 00:00:00 2001 From: Jeff King Date: Fri, 16 May 2025 00:49:50 -0400 Subject: [PATCH 05/13] oid_object_info_convert(): stop using string for object type In oid_object_info_convert(), we convert objects between their sha1 and sha256 variants. To do this, we naturally need to know the type, which we get from oid_object_info_extended() using its type_name strbuf option. But getting the value as a string (versus an object_type enum) is not helpful. Since we do not allow unknown types, the regular enum is sufficient. And the resulting code is a bit simpler, as we no longer have to manage the extra allocation nor convert the string to an enum ourselves. Note that at first glance, it might seem like we should retain the error check for "type == -1" to catch bogus types found by the underlying parser. But we don't need it, as an unknown type would have yielded an error from the call to oid_object_info_extended(), which would already have caused us to return an error. In fact, I suspect this was always impossible to trigger. Even when we were converting the string to a type enum ourselves, an invalid type would never have escaped oid_object_info_extended(), since we never passed the (now removed) OBJECT_INFO_ALLOW_UNKNOWN_TYPE option. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- object-store.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/object-store.c b/object-store.c index 2f51d0e3b0..b8f6955ea7 100644 --- a/object-store.c +++ b/object-store.c @@ -727,7 +727,7 @@ static int oid_object_info_convert(struct repository *r, { const struct git_hash_algo *input_algo = &hash_algos[input_oid->algo]; int do_die = flags & OBJECT_INFO_DIE_IF_CORRUPT; - struct strbuf type_name = STRBUF_INIT; + enum object_type type; struct object_id oid, delta_base_oid; struct object_info new_oi, *oi; unsigned long size; @@ -753,7 +753,7 @@ static int oid_object_info_convert(struct repository *r, if (input_oi->sizep || input_oi->contentp) { new_oi.contentp = &content; new_oi.sizep = &size; - new_oi.type_name = &type_name; + new_oi.typep = &type; } oi = &new_oi; } @@ -766,12 +766,7 @@ static int oid_object_info_convert(struct repository *r, if (new_oi.contentp) { struct strbuf outbuf = STRBUF_INIT; - enum object_type type; - type = type_from_string_gently(type_name.buf, type_name.len, - !do_die); - if (type == -1) - return -1; if (type != OBJ_BLOB) { ret = convert_object_file(the_repository, &outbuf, the_hash_algo, input_algo, @@ -788,10 +783,8 @@ static int oid_object_info_convert(struct repository *r, *input_oi->contentp = content; else free(content); - if (input_oi->type_name) - *input_oi->type_name = type_name; - else - strbuf_release(&type_name); + if (input_oi->typep) + *input_oi->typep = type; } if (new_oi.delta_base_oid == &delta_base_oid) { if (repo_oid_to_algop(r, &delta_base_oid, input_algo, From 4ae0e9423c95c63c17f66fb2de255c46dc14c4e5 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Fri, 16 May 2025 00:49:53 -0400 Subject: [PATCH 06/13] fsck: stop using object_info->type_name strbuf When fsck-ing a loose object, we use object_info's type_name strbuf to record the parsed object type as a string. For most objects this is redundant with the object_type enum, but it does let us report the string when we encounter an object with an unknown type (for which there is no matching enum value). There are a few downsides, though: 1. The code to report these cases is not actually robust. Since we did not pass a strbuf to unpack_loose_header(), we only retrieved types from headers up to 32 bytes. In longer cases, we'd simply say "object corrupt or missing". 2. This is the last caller that uses object_info's type_name strbuf support. It would be nice to refactor it so that we can simplify that code. 3. Likewise, we'll check the hash of the object using its unknown type (again, as long as that type is short enough). That depends on the hash_object_file_literally() code, which we'd eventually like to get rid of. So we can simplify things by bailing immediately in read_loose_object() when we encounter an unknown type. This has a few user-visible effects: a. Instead of producing a single line of error output like this: error: 26ed13ce3564fbbb44e35bde42c7da717ea004a6: object is of unknown type 'bogus': .git/objects/26/ed13ce3564fbbb44e35bde42c7da717ea004a6 we'll now issue two lines (the first from read_loose_object() when we see the unparsable header, and the second from the fsck code, since we couldn't read the object): error: unable to parse type from header 'bogus 4' of .git/objects/26/ed13ce3564fbbb44e35bde42c7da717ea004a6 error: 26ed13ce3564fbbb44e35bde42c7da717ea004a6: object corrupt or missing: .git/objects/26/ed13ce3564fbbb44e35bde42c7da717ea004a6 This is a little more verbose, but this sort of error should be rare (such objects are almost impossible to work with, and cannot be transferred between repositories as they are not representable in packfiles). And as a bonus, reporting the broken header in full could help with debugging other cases (e.g., a header like "blob xyzzy\0" would fail in parsing the size, but previously we'd not have showed the offending bytes). b. An object with an unknown type will be reported as corrupt, without actually doing a hash check. Again, I think this is unlikely to matter in practice since such objects are totally unusable. We'll update one fsck test to match the new error strings. And we can remove another test that covered the case of an object with an unknown type _and_ a hash corruption. Since we'll skip the hash check now in this case, the test is no longer interesting. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- builtin/fsck.c | 13 ++----------- object-file.c | 12 +++++++++--- t/t1450-fsck.sh | 29 +++-------------------------- 3 files changed, 14 insertions(+), 40 deletions(-) diff --git a/builtin/fsck.c b/builtin/fsck.c index 6cac28356c..e7d96a9c8e 100644 --- a/builtin/fsck.c +++ b/builtin/fsck.c @@ -614,12 +614,11 @@ static void get_default_heads(void) struct for_each_loose_cb { struct progress *progress; - struct strbuf obj_type; }; -static int fsck_loose(const struct object_id *oid, const char *path, void *data) +static int fsck_loose(const struct object_id *oid, const char *path, + void *data UNUSED) { - struct for_each_loose_cb *cb_data = data; struct object *obj; enum object_type type = OBJ_NONE; unsigned long size; @@ -629,8 +628,6 @@ static int fsck_loose(const struct object_id *oid, const char *path, void *data) struct object_id real_oid = *null_oid(the_hash_algo); int err = 0; - strbuf_reset(&cb_data->obj_type); - oi.type_name = &cb_data->obj_type; oi.sizep = &size; oi.typep = &type; @@ -642,10 +639,6 @@ static int fsck_loose(const struct object_id *oid, const char *path, void *data) err = error(_("%s: object corrupt or missing: %s"), oid_to_hex(oid), path); } - if (type != OBJ_NONE && type < 0) - err = error(_("%s: object is of unknown type '%s': %s"), - oid_to_hex(&real_oid), cb_data->obj_type.buf, - path); if (err < 0) { errors_found |= ERROR_OBJECT; free(contents); @@ -697,7 +690,6 @@ static void fsck_object_dir(const char *path) { struct progress *progress = NULL; struct for_each_loose_cb cb_data = { - .obj_type = STRBUF_INIT, .progress = progress, }; @@ -712,7 +704,6 @@ static void fsck_object_dir(const char *path) &cb_data); display_progress(progress, 256); stop_progress(&progress); - strbuf_release(&cb_data.obj_type); } static int fsck_head_link(const char *head_ref_name, diff --git a/object-file.c b/object-file.c index 1127e154f6..7a35bde96e 100644 --- a/object-file.c +++ b/object-file.c @@ -1662,6 +1662,12 @@ int read_loose_object(const char *path, goto out_inflate; } + if (*oi->typep < 0) { + error(_("unable to parse type from header '%s' of %s"), + hdr, path); + goto out_inflate; + } + if (*oi->typep == OBJ_BLOB && *size > repo_settings_get_big_file_threshold(the_repository)) { if (check_stream_oid(&stream, hdr, *size, path, expected_oid) < 0) @@ -1672,9 +1678,9 @@ int read_loose_object(const char *path, error(_("unable to unpack contents of %s"), path); goto out_inflate; } - hash_object_file_literally(the_repository->hash_algo, - *contents, *size, - oi->type_name->buf, real_oid); + hash_object_file(the_repository->hash_algo, + *contents, *size, + *oi->typep, real_oid); if (!oideq(expected_oid, real_oid)) goto out_inflate; } diff --git a/t/t1450-fsck.sh b/t/t1450-fsck.sh index 0105045376..3f52dd5abc 100755 --- a/t/t1450-fsck.sh +++ b/t/t1450-fsck.sh @@ -71,30 +71,6 @@ test_expect_success 'object with hash mismatch' ' ) ' -test_expect_success 'object with hash and type mismatch' ' - git init --bare hash-type-mismatch && - ( - cd hash-type-mismatch && - - oid=$(echo blob | git hash-object -w --stdin -t garbage --literally) && - oldoid=$oid && - old=$(test_oid_to_path "$oid") && - new=$(dirname $old)/$(test_oid ff_2) && - oid="$(dirname $new)$(basename $new)" && - - mv objects/$old objects/$new && - git update-index --add --cacheinfo 100644 $oid foo && - tree=$(git write-tree) && - cmt=$(echo bogus | git commit-tree $tree) && - git update-ref refs/heads/bogus $cmt && - - - test_must_fail git fsck 2>out && - grep "^error: $oldoid: hash-path mismatch, found at: .*$new" out && - grep "^error: $oldoid: object is of unknown type '"'"'garbage'"'"'" out - ) -' - test_expect_success 'zlib corrupt loose object output ' ' git init --bare corrupt-loose-output && ( @@ -1001,8 +977,9 @@ test_expect_success 'fsck error and recovery on invalid object type' ' test_must_fail git fsck 2>err && grep -e "^error" -e "^fatal" err >errors && - test_line_count = 1 errors && - grep "$garbage_blob: object is of unknown type '"'"'garbage'"'"':" err + test_line_count = 2 errors && + test_grep "unable to parse type from header .garbage" err && + test_grep "$garbage_blob: object corrupt or missing:" err ) ' From d2956385a9319155928e2d7bc5f9d90eeac5d0a5 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Fri, 16 May 2025 00:49:56 -0400 Subject: [PATCH 07/13] oid_object_info(): drop type_name strbuf We provide a mechanism for callers to get the object type as a raw string, rather than an object_type enum. This was in theory useful for returning types that are not representable in the enum, but we consider any such type to be an error, and there are no callers that use the strbuf anymore. Let's drop support to simplify the code a bit. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- object-file.c | 4 +--- object-store.c | 2 -- object-store.h | 1 - packfile.c | 7 +------ 4 files changed, 2 insertions(+), 12 deletions(-) diff --git a/object-file.c b/object-file.c index 7a35bde96e..b10e283529 100644 --- a/object-file.c +++ b/object-file.c @@ -403,8 +403,6 @@ int parse_loose_header(const char *hdr, struct object_info *oi) } type = type_from_string_gently(type_buf, type_len, 1); - if (oi->type_name) - strbuf_add(oi->type_name, type_buf, type_len); if (oi->typep) *oi->typep = type; @@ -466,7 +464,7 @@ int loose_object_info(struct repository *r, * return value implicitly indicates whether the * object even exists. */ - if (!oi->typep && !oi->type_name && !oi->sizep && !oi->contentp) { + if (!oi->typep && !oi->sizep && !oi->contentp) { struct stat st; if (!oi->disk_sizep && (flags & OBJECT_INFO_QUICK)) return quick_has_loose(r, oid) ? 0 : -1; diff --git a/object-store.c b/object-store.c index b8f6955ea7..216c61dcf2 100644 --- a/object-store.c +++ b/object-store.c @@ -646,8 +646,6 @@ static int do_oid_object_info_extended(struct repository *r, *(oi->disk_sizep) = 0; if (oi->delta_base_oid) oidclr(oi->delta_base_oid, the_repository->hash_algo); - if (oi->type_name) - strbuf_addstr(oi->type_name, type_name(co->type)); if (oi->contentp) *oi->contentp = xmemdupz(co->buf, co->size); oi->whence = OI_CACHED; diff --git a/object-store.h b/object-store.h index cf908fe68e..6b55c245eb 100644 --- a/object-store.h +++ b/object-store.h @@ -205,7 +205,6 @@ struct object_info { unsigned long *sizep; off_t *disk_sizep; struct object_id *delta_base_oid; - struct strbuf *type_name; void **contentp; /* Response */ diff --git a/packfile.c b/packfile.c index d91016f1c7..80e35f1032 100644 --- a/packfile.c +++ b/packfile.c @@ -1598,17 +1598,12 @@ int packed_object_info(struct repository *r, struct packed_git *p, *oi->disk_sizep = pack_pos_to_offset(p, pos + 1) - obj_offset; } - if (oi->typep || oi->type_name) { + if (oi->typep) { enum object_type ptot; ptot = packed_to_object_type(r, p, obj_offset, type, &w_curs, curpos); if (oi->typep) *oi->typep = ptot; - if (oi->type_name) { - const char *tn = type_name(ptot); - if (tn) - strbuf_addstr(oi->type_name, tn); - } if (ptot < 0) { type = OBJ_BAD; goto out; From f2ed511a2f8f7339e21e4f2792ebe230e92dd669 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Fri, 16 May 2025 00:49:59 -0400 Subject: [PATCH 08/13] t/helper: add zlib test-tool It's occasionally useful when testing or debugging to be able to do raw zlib inflate/deflate operations (e.g., to check the bytes of a specific loose or packed object). Even though zlib's deflate algorithm is used by many other programs, this is surprisingly hard to do in a portable way. E.g., gzip can do this if you manually munge some header bytes. But the result is somewhat arcane, and we don't assume gzip is available anyway. Likewise, pigz will handle raw zlib, but we can't assume it is available. So let's introduce a short test helper for just doing zlib operations. We'll use it in subsequent patches to add some new tests, but it would also have come in handy a few times in the past: - The hard-coded pack data from 3b910d0c5e (add tests for indexing packs with delta cycles, 2013-08-23) could probably be generated on the fly. - Likewise we could avoid the hard-coded data from 0b1493c2d4 (git_inflate(): skip zlib_post_call() sanity check on Z_NEED_DICT, 2025-02-25). Though note this would require support for more zlib options. - It would have helped with the debugging documented in 41dfbb2dbe (howto: add article on recovering a corrupted object, 2013-10-25). I'll leave refactoring existing tests for another day, but I hope the examples above show the general utility. I aimed for simplicity in the code. In particular, it will read all input into a memory buffer, rather than streaming. That makes the zlib loops harder to get wrong (which has been a source of subtle bugs in the past). Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- Makefile | 1 + t/helper/meson.build | 1 + t/helper/test-tool.c | 1 + t/helper/test-tool.h | 1 + t/helper/test-zlib.c | 62 ++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 66 insertions(+) create mode 100644 t/helper/test-zlib.c diff --git a/Makefile b/Makefile index de73c6ddcd..14616ff625 100644 --- a/Makefile +++ b/Makefile @@ -859,6 +859,7 @@ TEST_BUILTINS_OBJS += test-wildmatch.o TEST_BUILTINS_OBJS += test-windows-named-pipe.o TEST_BUILTINS_OBJS += test-write-cache.o TEST_BUILTINS_OBJS += test-xml-encode.o +TEST_BUILTINS_OBJS += test-zlib.o # Do not add more tests here unless they have extra dependencies. Add # them in TEST_BUILTINS_OBJS above. diff --git a/t/helper/meson.build b/t/helper/meson.build index d4e8b26df8..675e64c010 100644 --- a/t/helper/meson.build +++ b/t/helper/meson.build @@ -77,6 +77,7 @@ test_tool_sources = [ 'test-windows-named-pipe.c', 'test-write-cache.c', 'test-xml-encode.c', + 'test-zlib.c', ] test_tool = executable('test-tool', diff --git a/t/helper/test-tool.c b/t/helper/test-tool.c index 74812ed86d..a7abc618b3 100644 --- a/t/helper/test-tool.c +++ b/t/helper/test-tool.c @@ -91,6 +91,7 @@ static struct test_cmd cmds[] = { { "windows-named-pipe", cmd__windows_named_pipe }, #endif { "write-cache", cmd__write_cache }, + { "zlib", cmd__zlib }, }; static NORETURN void die_usage(void) diff --git a/t/helper/test-tool.h b/t/helper/test-tool.h index 2571a3ccfe..7f150fa1eb 100644 --- a/t/helper/test-tool.h +++ b/t/helper/test-tool.h @@ -84,6 +84,7 @@ int cmd__wildmatch(int argc, const char **argv); int cmd__windows_named_pipe(int argc, const char **argv); #endif int cmd__write_cache(int argc, const char **argv); +int cmd__zlib(int argc, const char **argv); int cmd_hash_impl(int ac, const char **av, int algo, int unsafe); diff --git a/t/helper/test-zlib.c b/t/helper/test-zlib.c new file mode 100644 index 0000000000..de7e9edee1 --- /dev/null +++ b/t/helper/test-zlib.c @@ -0,0 +1,62 @@ +#include "test-tool.h" +#include "git-zlib.h" +#include "strbuf.h" + +static const char *zlib_usage = "test-tool zlib [inflate|deflate]"; + +static void do_zlib(struct git_zstream *stream, + int (*zlib_func)(git_zstream *, int), + int fd_in, int fd_out) +{ + struct strbuf buf_in = STRBUF_INIT; + int status = Z_OK; + + if (strbuf_read(&buf_in, fd_in, 0) < 0) + die_errno("read error"); + + stream->next_in = (unsigned char *)buf_in.buf; + stream->avail_in = buf_in.len; + + while (status == Z_OK || + (status == Z_BUF_ERROR && !stream->avail_out)) { + unsigned char buf_out[4096]; + + stream->next_out = buf_out; + stream->avail_out = sizeof(buf_out); + + status = zlib_func(stream, Z_FINISH); + if (write_in_full(fd_out, buf_out, + sizeof(buf_out) - stream->avail_out) < 0) + die_errno("write error"); + } + + if (status != Z_STREAM_END) + die("zlib error %d", status); + + strbuf_release(&buf_in); +} + +int cmd__zlib(int argc, const char **argv) +{ + git_zstream stream; + + if (argc != 2) + usage(zlib_usage); + + memset(&stream, 0, sizeof(stream)); + + if (!strcmp(argv[1], "inflate")) { + git_inflate_init(&stream); + do_zlib(&stream, git_inflate, 0, 1); + git_inflate_end(&stream); + } else if (!strcmp(argv[1], "deflate")) { + git_deflate_init(&stream, Z_DEFAULT_COMPRESSION); + do_zlib(&stream, git_deflate, 0, 1); + git_deflate_end(&stream); + } else { + error("unknown mode: %s", argv[1]); + usage(zlib_usage); + } + + return 0; +} From b5643b60acb71e3c117558b37020a8db8fe17c69 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Fri, 16 May 2025 00:50:02 -0400 Subject: [PATCH 09/13] t: add lib-loose.sh This commit adds a shell library for writing raw loose objects into the object database. Normally this is done with hash-object, but the specific intent here is to allow broken objects that hash-object may not support. We'll convert several cases that use "hash-object --literally" to write objects with invalid types. That works currently, but dropping this dependency will allow us to remove that feature and simplify the object-writing code. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- t/lib-loose.sh | 30 +++++++++++++++++++++++++++++ t/t1006-cat-file.sh | 5 +++-- t/t1450-fsck.sh | 3 ++- t/t1512-rev-parse-disambiguation.sh | 5 +++-- 4 files changed, 38 insertions(+), 5 deletions(-) create mode 100644 t/lib-loose.sh diff --git a/t/lib-loose.sh b/t/lib-loose.sh new file mode 100644 index 0000000000..3613631eaf --- /dev/null +++ b/t/lib-loose.sh @@ -0,0 +1,30 @@ +# Support routines for hand-crafting loose objects. + +# Write a loose object into the odb at $1, with object type $2 and contents +# from stdin. Writes the oid to stdout. Example: +# +# oid=$(echo foo | loose_obj .git/objects blob) +# +loose_obj () { + cat >tmp_loose.content && + size=$(wc -c tmp_loose.raw && + + oid=$(test-tool $test_hash_algo tmp_loose.zlib && + mkdir -p "$dir" && + mv tmp_loose.zlib "$file" && + + rm tmp_loose.raw tmp_loose.content && + echo "$oid" +} diff --git a/t/t1006-cat-file.sh b/t/t1006-cat-file.sh index d96d02ad7d..317da6869c 100755 --- a/t/t1006-cat-file.sh +++ b/t/t1006-cat-file.sh @@ -3,6 +3,7 @@ test_description='git cat-file' . ./test-lib.sh +. "$TEST_DIRECTORY/lib-loose.sh" test_cmdmode_usage () { test_expect_code 129 "$@" 2>err && @@ -657,12 +658,12 @@ test_expect_success 'setup bogus data' ' bogus_short_type="bogus" && bogus_short_content="bogus" && bogus_short_size=$(strlen "$bogus_short_content") && - bogus_short_oid=$(echo_without_newline "$bogus_short_content" | git hash-object -t $bogus_short_type --literally -w --stdin) && + bogus_short_oid=$(echo_without_newline "$bogus_short_content" | loose_obj .git/objects $bogus_short_type) && bogus_long_type="abcdefghijklmnopqrstuvwxyz1234679" && bogus_long_content="bogus" && bogus_long_size=$(strlen "$bogus_long_content") && - bogus_long_oid=$(echo_without_newline "$bogus_long_content" | git hash-object -t $bogus_long_type --literally -w --stdin) + bogus_long_oid=$(echo_without_newline "$bogus_long_content" | loose_obj .git/objects $bogus_long_type) ' for arg1 in -s -t -p diff --git a/t/t1450-fsck.sh b/t/t1450-fsck.sh index 3f52dd5abc..5ae86c42be 100755 --- a/t/t1450-fsck.sh +++ b/t/t1450-fsck.sh @@ -7,6 +7,7 @@ test_description='git fsck random collection of tests ' . ./test-lib.sh +. "$TEST_DIRECTORY/lib-loose.sh" test_expect_success setup ' git config gc.auto 0 && @@ -973,7 +974,7 @@ test_expect_success 'fsck error and recovery on invalid object type' ' ( cd garbage-type && - garbage_blob=$(git hash-object --stdin -w -t garbage --literally err && grep -e "^error" -e "^fatal" err >errors && diff --git a/t/t1512-rev-parse-disambiguation.sh b/t/t1512-rev-parse-disambiguation.sh index 70f1e0a998..1a380a4184 100755 --- a/t/t1512-rev-parse-disambiguation.sh +++ b/t/t1512-rev-parse-disambiguation.sh @@ -24,6 +24,7 @@ GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME . ./test-lib.sh +. "$TEST_DIRECTORY/lib-loose.sh" test_cmp_failed_rev_parse () { dir=$1 @@ -67,8 +68,8 @@ test_expect_success 'ambiguous loose bad object parsed as OBJ_BAD' ' cd blob.bad && # Both have the prefix "bad0" - echo xyzfaowcoh | git hash-object -t bad -w --stdin --literally && - echo xyzhjpyvwl | git hash-object -t bad -w --stdin --literally + echo xyzfaowcoh | loose_obj objects bad && + echo xyzhjpyvwl | loose_obj objects bad ) && test_cmp_failed_rev_parse blob.bad bad0 <<-\EOF From 65a6a79b4204a2038498fd14be993b89067a046a Mon Sep 17 00:00:00 2001 From: Jeff King Date: Fri, 16 May 2025 00:50:05 -0400 Subject: [PATCH 10/13] hash-object: stop allowing unknown types When passed the "--literally" option, hash-object will allow any arbitrary string for its "-t" type option. Such objects are only useful for testing or debugging, as they cannot be used in the normal way (e.g., you cannot fetch their contents!). Let's drop this feature, which will eventually let us simplify the object-writing code. This is technically backwards incompatible, but since such objects were never really functional, it seems unlikely that anybody will notice. We will retain the --literally flag, as it also instructs hash-object not to worry about other format issues (e.g., type-specific things that fsck would complain about). The documentation does not need to be updated, as it was always vague about which checks we're loosening (it uses only the phrase "any garbage"). The code change is a bit hard to verify from just the patch text. We can drop our local hash_literally() helper, but it was really just wrapping write_object_file_literally(). We now replace that with calling index_fd(), as we do for the non-literal code path, but dropping the INDEX_FORMAT_CHECK flag. This ends up being the same semantically as what the _literally() code path was doing (modulo handling unknown types, which is our goal). We'll be able to clean up these code paths a bit more in subsequent patches. The existing test is flipped to show that we now reject the unknown type. The additional "extra-long type" test is now redundant, as we bail early upon seeing a bogus type. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- builtin/hash-object.c | 29 +++++------------------------ t/t1007-hash-object.sh | 11 ++--------- 2 files changed, 7 insertions(+), 33 deletions(-) diff --git a/builtin/hash-object.c b/builtin/hash-object.c index cd53fa3bde..3c6949b3fa 100644 --- a/builtin/hash-object.c +++ b/builtin/hash-object.c @@ -24,26 +24,6 @@ enum { HASH_OBJECT_WRITE = (1 << 1), }; -/* - * This is to create corrupt objects for debugging and as such it - * needs to bypass the data conversion performed by, and the type - * limitation imposed by, index_fd() and its callees. - */ -static int hash_literally(struct object_id *oid, int fd, const char *type, unsigned flags) -{ - struct strbuf buf = STRBUF_INIT; - int ret; - - if (strbuf_read(&buf, fd, 4096) < 0) - ret = -1; - else - ret = write_object_file_literally(buf.buf, buf.len, type, oid, - (flags & HASH_OBJECT_WRITE) ? WRITE_OBJECT_FILE_PERSIST : 0); - close(fd); - strbuf_release(&buf); - return ret; -} - static void hash_fd(int fd, const char *type, const char *path, unsigned flags, int literally) { @@ -56,11 +36,12 @@ static void hash_fd(int fd, const char *type, const char *path, unsigned flags, if (flags & HASH_OBJECT_CHECK) index_flags |= INDEX_FORMAT_CHECK; + if (literally) + index_flags &= ~INDEX_FORMAT_CHECK; + if (fstat(fd, &st) < 0 || - (literally - ? hash_literally(&oid, fd, type, flags) - : index_fd(the_repository->index, &oid, fd, &st, - type_from_string(type), path, index_flags))) + index_fd(the_repository->index, &oid, fd, &st, + type_from_string(type), path, index_flags)) die((flags & HASH_OBJECT_WRITE) ? "Unable to add %s to database" : "Unable to hash %s", path); diff --git a/t/t1007-hash-object.sh b/t/t1007-hash-object.sh index b3cf53ff8c..dbbe9fb0d4 100755 --- a/t/t1007-hash-object.sh +++ b/t/t1007-hash-object.sh @@ -248,15 +248,8 @@ test_expect_success 'hash-object complains about truncated type name' ' test_must_fail git hash-object -t bl --stdin Date: Fri, 16 May 2025 00:50:08 -0400 Subject: [PATCH 11/13] hash-object: merge HASH_* and INDEX_* flags The hash-object command has its own custom flag bits that it sets based on command-line options. But since we dropped hash_literally() in the previous commit, the only thing we do with those flag bits is convert them directly into "index_flags" to pass to index_fd(). This extra layer of indirection makes the code harder to read and reason about. Let's just use the INDEX_* flags directly. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- builtin/hash-object.c | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/builtin/hash-object.c b/builtin/hash-object.c index 3c6949b3fa..1ecb70b551 100644 --- a/builtin/hash-object.c +++ b/builtin/hash-object.c @@ -19,30 +19,19 @@ #include "strbuf.h" #include "write-or-die.h" -enum { - HASH_OBJECT_CHECK = (1 << 0), - HASH_OBJECT_WRITE = (1 << 1), -}; - static void hash_fd(int fd, const char *type, const char *path, unsigned flags, int literally) { - unsigned int index_flags = 0; struct stat st; struct object_id oid; - if (flags & HASH_OBJECT_WRITE) - index_flags |= INDEX_WRITE_OBJECT; - if (flags & HASH_OBJECT_CHECK) - index_flags |= INDEX_FORMAT_CHECK; - if (literally) - index_flags &= ~INDEX_FORMAT_CHECK; + flags &= ~INDEX_FORMAT_CHECK; if (fstat(fd, &st) < 0 || index_fd(the_repository->index, &oid, fd, &st, - type_from_string(type), path, index_flags)) - die((flags & HASH_OBJECT_WRITE) + type_from_string(type), path, flags)) + die((flags & INDEX_WRITE_OBJECT) ? "Unable to add %s to database" : "Unable to hash %s", path); printf("%s\n", oid_to_hex(&oid)); @@ -94,13 +83,13 @@ int cmd_hash_object(int argc, int no_filters = 0; int literally = 0; int nongit = 0; - unsigned flags = HASH_OBJECT_CHECK; + unsigned flags = INDEX_FORMAT_CHECK; const char *vpath = NULL; char *vpath_free = NULL; const struct option hash_object_options[] = { OPT_STRING('t', NULL, &type, N_("type"), N_("object type")), OPT_BIT('w', NULL, &flags, N_("write the object into the object database"), - HASH_OBJECT_WRITE), + INDEX_WRITE_OBJECT), OPT_COUNTUP( 0 , "stdin", &hashstdin, N_("read the object from stdin")), OPT_BOOL( 0 , "stdin-paths", &stdin_paths, N_("read file names from stdin")), OPT_BOOL( 0 , "no-filters", &no_filters, N_("store file as is without filters")), @@ -114,7 +103,7 @@ int cmd_hash_object(int argc, argc = parse_options(argc, argv, prefix, hash_object_options, hash_object_usage, 0); - if (flags & HASH_OBJECT_WRITE) + if (flags & INDEX_WRITE_OBJECT) prefix = setup_git_directory(); else prefix = setup_git_directory_gently(&nongit); From f710fd7b49218ce3407a88b2c548704299c7c664 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Fri, 16 May 2025 00:50:10 -0400 Subject: [PATCH 12/13] hash-object: handle --literally with OPT_NEGBIT Since we recently removed the hash_literally() function, the hash-object --literally option has been simplified to just removing the INDEX_FORMAT_CHECK flag. Rather than pass it around as a separate bool, we can just have the option parser remove the bit from the set of flags directly. This simplifies the helper functions. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- builtin/hash-object.c | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/builtin/hash-object.c b/builtin/hash-object.c index 1ecb70b551..6a99ec250d 100644 --- a/builtin/hash-object.c +++ b/builtin/hash-object.c @@ -19,15 +19,11 @@ #include "strbuf.h" #include "write-or-die.h" -static void hash_fd(int fd, const char *type, const char *path, unsigned flags, - int literally) +static void hash_fd(int fd, const char *type, const char *path, unsigned flags) { struct stat st; struct object_id oid; - if (literally) - flags &= ~INDEX_FORMAT_CHECK; - if (fstat(fd, &st) < 0 || index_fd(the_repository->index, &oid, fd, &st, type_from_string(type), path, flags)) @@ -39,15 +35,14 @@ static void hash_fd(int fd, const char *type, const char *path, unsigned flags, } static void hash_object(const char *path, const char *type, const char *vpath, - unsigned flags, int literally) + unsigned flags) { int fd; fd = xopen(path, O_RDONLY); - hash_fd(fd, type, vpath, flags, literally); + hash_fd(fd, type, vpath, flags); } -static void hash_stdin_paths(const char *type, int no_filters, unsigned flags, - int literally) +static void hash_stdin_paths(const char *type, int no_filters, unsigned flags) { struct strbuf buf = STRBUF_INIT; struct strbuf unquoted = STRBUF_INIT; @@ -59,8 +54,7 @@ static void hash_stdin_paths(const char *type, int no_filters, unsigned flags, die("line is badly quoted"); strbuf_swap(&buf, &unquoted); } - hash_object(buf.buf, type, no_filters ? NULL : buf.buf, flags, - literally); + hash_object(buf.buf, type, no_filters ? NULL : buf.buf, flags); } strbuf_release(&buf); strbuf_release(&unquoted); @@ -81,7 +75,6 @@ int cmd_hash_object(int argc, int hashstdin = 0; int stdin_paths = 0; int no_filters = 0; - int literally = 0; int nongit = 0; unsigned flags = INDEX_FORMAT_CHECK; const char *vpath = NULL; @@ -93,7 +86,9 @@ int cmd_hash_object(int argc, OPT_COUNTUP( 0 , "stdin", &hashstdin, N_("read the object from stdin")), OPT_BOOL( 0 , "stdin-paths", &stdin_paths, N_("read file names from stdin")), OPT_BOOL( 0 , "no-filters", &no_filters, N_("store file as is without filters")), - OPT_BOOL( 0, "literally", &literally, N_("just hash any random garbage to create corrupt objects for debugging Git")), + OPT_NEGBIT( 0, "literally", &flags, + N_("just hash any random garbage to create corrupt objects for debugging Git"), + INDEX_FORMAT_CHECK), OPT_STRING( 0 , "path", &vpath, N_("file"), N_("process file as it were from this path")), OPT_END() }; @@ -139,7 +134,7 @@ int cmd_hash_object(int argc, } if (hashstdin) - hash_fd(0, type, vpath, flags, literally); + hash_fd(0, type, vpath, flags); for (i = 0 ; i < argc; i++) { const char *arg = argv[i]; @@ -148,12 +143,12 @@ int cmd_hash_object(int argc, if (prefix) arg = to_free = prefix_filename(prefix, arg); hash_object(arg, type, no_filters ? NULL : vpath ? vpath : arg, - flags, literally); + flags); free(to_free); } if (stdin_paths) - hash_stdin_paths(type, no_filters, flags, literally); + hash_stdin_paths(type, no_filters, flags); free(vpath_free); From 141f8c8c0535004fa5432d9a6d57bf08129a7dd8 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Fri, 16 May 2025 00:50:13 -0400 Subject: [PATCH 13/13] object-file: drop support for writing objects with unknown types Since "hash-object --literally" no longer supports objects with unknown types, there are now no callers of write_object_file_literally() and its helpers. Let's drop them to simplify the code. In particular, this gets rid of some ugly copy-and-paste code from write_object_file_literally(), which is a parallel implementation of write_object_file(). When the split was originally made, the two weren't that long, but commits like 63a6745a07 (object-file: update the loose object map when writing loose objects, 2023-10-01) ended up having to duplicate some tricky code. This patch drops all of that duplication and should make things less error-prone going forward. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- object-file.c | 81 ++++----------------------------------------------- object-file.h | 5 +--- 2 files changed, 6 insertions(+), 80 deletions(-) diff --git a/object-file.c b/object-file.c index b10e283529..1ac04c2891 100644 --- a/object-file.c +++ b/object-file.c @@ -130,12 +130,6 @@ int has_loose_object(const struct object_id *oid) return check_and_freshen(oid, 0); } -static int format_object_header_literally(char *str, size_t size, - const char *type, size_t objsize) -{ - return xsnprintf(str, size, "%s %"PRIuMAX, type, (uintmax_t)objsize) + 1; -} - int format_object_header(char *str, size_t size, enum object_type type, size_t objsize) { @@ -144,7 +138,7 @@ int format_object_header(char *str, size_t size, enum object_type type, if (!name) BUG("could not get a type name for 'enum object_type' value %d", type); - return format_object_header_literally(str, size, name, objsize); + return xsnprintf(str, size, "%s %"PRIuMAX, name, (uintmax_t)objsize) + 1; } int check_object_signature(struct repository *r, const struct object_id *oid, @@ -558,17 +552,6 @@ static void write_object_file_prepare(const struct git_hash_algo *algo, hash_object_body(algo, &c, buf, len, oid, hdr, hdrlen); } -static void write_object_file_prepare_literally(const struct git_hash_algo *algo, - const void *buf, unsigned long len, - const char *type, struct object_id *oid, - char *hdr, int *hdrlen) -{ - struct git_hash_ctx c; - - *hdrlen = format_object_header_literally(hdr, *hdrlen, type, len); - hash_object_body(algo, &c, buf, len, oid, hdr, hdrlen); -} - #define CHECK_COLLISION_DEST_VANISHED -2 static int check_collision(const char *source, const char *dest) @@ -698,21 +681,14 @@ out: return 0; } -static void hash_object_file_literally(const struct git_hash_algo *algo, - const void *buf, unsigned long len, - const char *type, struct object_id *oid) -{ - char hdr[MAX_HEADER_LEN]; - int hdrlen = sizeof(hdr); - - write_object_file_prepare_literally(algo, buf, len, type, oid, hdr, &hdrlen); -} - void hash_object_file(const struct git_hash_algo *algo, const void *buf, unsigned long len, enum object_type type, struct object_id *oid) { - hash_object_file_literally(algo, buf, len, type_name(type), oid); + char hdr[MAX_HEADER_LEN]; + int hdrlen = sizeof(hdr); + + write_object_file_prepare(algo, buf, len, type, oid, hdr, &hdrlen); } /* Finalize a file on disk, and close it. */ @@ -1114,53 +1090,6 @@ int write_object_file_flags(const void *buf, unsigned long len, return 0; } -int write_object_file_literally(const void *buf, unsigned long len, - const char *type, struct object_id *oid, - unsigned flags) -{ - char *header; - struct repository *repo = the_repository; - const struct git_hash_algo *algo = repo->hash_algo; - const struct git_hash_algo *compat = repo->compat_hash_algo; - struct object_id compat_oid; - int hdrlen, status = 0; - int compat_type = -1; - - if (compat) { - compat_type = type_from_string_gently(type, -1, 1); - if (compat_type == OBJ_BLOB) - hash_object_file(compat, buf, len, compat_type, - &compat_oid); - else if (compat_type != -1) { - struct strbuf converted = STRBUF_INIT; - convert_object_file(the_repository, - &converted, algo, compat, - buf, len, compat_type, 0); - hash_object_file(compat, converted.buf, converted.len, - compat_type, &compat_oid); - strbuf_release(&converted); - } - } - - /* type string, SP, %lu of the length plus NUL must fit this */ - hdrlen = strlen(type) + MAX_HEADER_LEN; - header = xmalloc(hdrlen); - write_object_file_prepare_literally(the_hash_algo, buf, len, type, - oid, header, &hdrlen); - - if (!(flags & WRITE_OBJECT_FILE_PERSIST)) - goto cleanup; - if (freshen_packed_object(oid) || freshen_loose_object(oid)) - goto cleanup; - status = write_loose_object(oid, header, hdrlen, buf, len, 0, 0); - if (compat_type != -1) - return repo_add_loose_object_map(repo, oid, &compat_oid); - -cleanup: - free(header); - return status; -} - int force_object_loose(const struct object_id *oid, time_t mtime) { struct repository *repo = the_repository; diff --git a/object-file.h b/object-file.h index a979fd5e4d..6f41142452 100644 --- a/object-file.h +++ b/object-file.h @@ -159,7 +159,7 @@ int parse_loose_header(const char *hdr, struct object_info *oi); enum { /* - * By default, `write_object_file_literally()` does not actually write + * By default, `write_object_file()` does not actually write * anything into the object store, but only computes the object ID. * This flag changes that so that the object will be written as a loose * object and persisted. @@ -187,9 +187,6 @@ struct input_stream { int is_finished; }; -int write_object_file_literally(const void *buf, unsigned long len, - const char *type, struct object_id *oid, - unsigned flags); int stream_loose_object(struct input_stream *in_stream, size_t len, struct object_id *oid);