From f23ac77a4325fc776ebb38044a34f5e9629e4f67 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Mon, 16 Feb 2026 16:38:01 +0100 Subject: [PATCH 1/3] commit: avoid parsing non-commits in `lookup_commit_reference_gently()` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The function `lookup_commit_reference_gently()` can be used to look up a committish by object ID. As such, the function knows to peel for example tag objects so that we eventually end up with the commit. The function is used quite a lot throughout our tree. One such user is "shallow.c" via `assign_shallow_commits_to_refs()`. The intent of this function is to figure out whether a shallow push is missing any objects that are required to satisfy the ref updates, and if so, which of the ref updates is missing objects. This is done by painting the tree with `UNINTERESTING`. We start painting by calling `refs_for_each_ref()` so that we can mark all existing referenced objects as the boundary of objects that we already have, and which are supposed to be fully connected. The reference tips are then parsed via `lookup_commit_reference_gently()`, and the commit is then marked as uninteresting. But references may not necessarily point to a committish, and if a lot of them aren't then this step takes a lot of time. This is mostly due to the way that `lookup_commit_reference_gently()` is implemented: before we learn about the type of the object we already call `parse_object()` on the object ID. This has two consequences: - We parse all objects, including trees and blobs, even though we don't even need the contents of them. - More importantly though, `parse_object()` will cause us to check whether the object ID matches its contents. Combined this means that we deflate and hash every non-committish object, and that of course ends up being both CPU- and memory-intensive. Improve the logic so that we first use `peel_object()`. This function won't parse the object for us, and thus it allows us to learn about the object's type before we parse and return it. The following benchmark pushes a single object from a shallow clone into a repository that has 100,000 refs. These refs were created by listing all objects via `git rev-list(1) --objects --all` and creating refs for a subset of them, so lots of those refs will cover non-commit objects. Benchmark 1: git-receive-pack (rev = HEAD~) Time (mean ± σ): 62.571 s ± 0.413 s [User: 58.331 s, System: 4.053 s] Range (min … max): 62.191 s … 63.010 s 3 runs Benchmark 2: git-receive-pack (rev = HEAD) Time (mean ± σ): 38.339 s ± 0.192 s [User: 36.220 s, System: 1.992 s] Range (min … max): 38.176 s … 38.551 s 3 runs Summary git-receive-pack . Signed-off-by: Junio C Hamano --- commit.c | 32 +++++++++++++++++++++++++++----- object.c | 23 ++++++++++++++++++----- object.h | 5 +++++ 3 files changed, 50 insertions(+), 10 deletions(-) diff --git a/commit.c b/commit.c index 28bb5ce029..fb1b01b9c3 100644 --- a/commit.c +++ b/commit.c @@ -43,13 +43,35 @@ const char *commit_type = "commit"; struct commit *lookup_commit_reference_gently(struct repository *r, const struct object_id *oid, int quiet) { - struct object *obj = deref_tag(r, - parse_object(r, oid), - NULL, 0); + const struct object_id *maybe_peeled; + struct object_id peeled_oid; + struct object *object; + enum object_type type; - if (!obj) + switch (peel_object_ext(r, oid, &peeled_oid, 0, &type)) { + case PEEL_NON_TAG: + maybe_peeled = oid; + break; + case PEEL_PEELED: + maybe_peeled = &peeled_oid; + break; + default: return NULL; - return object_as_type(obj, OBJ_COMMIT, quiet); + } + + if (type != OBJ_COMMIT) { + if (!quiet) + error(_("object %s is a %s, not a %s"), + oid_to_hex(oid), type_name(type), + type_name(OBJ_COMMIT)); + return NULL; + } + + object = parse_object(r, maybe_peeled); + if (!object) + return NULL; + + return object_as_type(object, OBJ_COMMIT, quiet); } struct commit *lookup_commit_reference(struct repository *r, const struct object_id *oid) diff --git a/object.c b/object.c index 4669b8d65e..99b6df3780 100644 --- a/object.c +++ b/object.c @@ -207,10 +207,11 @@ struct object *lookup_object_by_type(struct repository *r, } } -enum peel_status peel_object(struct repository *r, - const struct object_id *name, - struct object_id *oid, - unsigned flags) +enum peel_status peel_object_ext(struct repository *r, + const struct object_id *name, + struct object_id *oid, + unsigned flags, + enum object_type *typep) { struct object *o = lookup_unknown_object(r, name); @@ -220,8 +221,10 @@ enum peel_status peel_object(struct repository *r, return PEEL_INVALID; } - if (o->type != OBJ_TAG) + if (o->type != OBJ_TAG) { + *typep = o->type; return PEEL_NON_TAG; + } while (o && o->type == OBJ_TAG) { o = parse_object(r, &o->oid); @@ -241,9 +244,19 @@ enum peel_status peel_object(struct repository *r, return PEEL_INVALID; oidcpy(oid, &o->oid); + *typep = o->type; return PEEL_PEELED; } +enum peel_status peel_object(struct repository *r, + const struct object_id *name, + struct object_id *oid, + unsigned flags) +{ + enum object_type dummy; + return peel_object_ext(r, name, oid, flags, &dummy); +} + struct object *parse_object_buffer(struct repository *r, const struct object_id *oid, enum object_type type, unsigned long size, void *buffer, int *eaten_p) { struct object *obj; diff --git a/object.h b/object.h index 4bca957b8d..8f98382243 100644 --- a/object.h +++ b/object.h @@ -309,6 +309,11 @@ enum peel_status peel_object(struct repository *r, const struct object_id *name, struct object_id *oid, unsigned flags); +enum peel_status peel_object_ext(struct repository *r, + const struct object_id *name, + struct object_id *oid, + unsigned flags, + enum object_type *typep); struct object_list *object_list_insert(struct object *item, struct object_list **list_p); From 024b4c96976fabdc8b73f4183d6bb8626ffe2c7d Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Mon, 16 Feb 2026 16:38:02 +0100 Subject: [PATCH 2/3] commit: make `repo_parse_commit_no_graph()` more robust In the next commit we will start to parse more commits via the commit-graph. This change will lead to a segfault though because we try to access the tree of a commit via `repo_get_commit_tree()`, but: - The commit has been parsed via the commit-graph, and thus its `maybe_tree` field is not yet populated. - We cannot use the commit-graph to populate the commit's tree because we're in the process of writing the commit-graph. The consequence is that we'll get a `NULL` pointer for the tree in `write_graph_chunk_data()`. In theory we are already mindful of this situation, as we explicitly use `repo_parse_commit_no_graph()` to parse the commit without the help of the commit-graph. But that doesn't do the trick as the commit is already marked as parsed, so the function will not re-populate it. And as the commit-graph has been closed, neither will `get_commit_tree_oid()` be able to load the tree for us. It seems like this issue can only be hit under artificial circumstances: the error was hit via `git_test_write_commit_graph_or_die()`, which is run by git-commit(1) and git-merge(1) in case `GIT_TEST_COMMIT_GRAPH=1`: $ GIT_TEST_COMMIT_GRAPH=1 meson test t7507-commit-verbose \ --test-args=-ix -i ... ++ git -c commit.verbose=true commit --amend hint: Waiting for your editor to close the file... ./test-lib.sh: line 1012: 55895 Segmentation fault (core dumped) git -c commit.verbose=true commit --amend To the best of my knowledge, this is the only case where we end up writing a commit-graph in the same process that might have already consulted the commit-graph to look up arbitrary objects. But regardless of that, this feels like a bigger accident that is just waiting to happen. Make the code more robust by extending `repo_parse_commit_no_graph()` to unparse a commit first in case we detect it's coming from a graph. This ensures that we will re-read the object without it, and thus we will populate `maybe_tree` properly. This fix shouldn't have any performance consequences: the function is only ever called in the "commit-graph.c" code, and we'll only re-parse the commit at most once. Add an exclusion to our Coccinelle rules so that it doesn't complain about us accessing `maybe_tree` directly. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- commit.h | 14 ++++++++++++-- contrib/coccinelle/commit.cocci | 2 +- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/commit.h b/commit.h index 79a761c37d..05165a48a5 100644 --- a/commit.h +++ b/commit.h @@ -103,16 +103,26 @@ static inline int repo_parse_commit(struct repository *r, struct commit *item) return repo_parse_commit_gently(r, item, 0); } +void unparse_commit(struct repository *r, const struct object_id *oid); + static inline int repo_parse_commit_no_graph(struct repository *r, struct commit *commit) { + /* + * When the commit has been parsed but its tree wasn't populated then + * this is an indicator that it has been parsed via the commit-graph. + * We cannot read the tree via the commit-graph, as we're explicitly + * told not to use it. We thus have to first un-parse the object so + * that we can re-parse it without the graph. + */ + if (commit->object.parsed && !commit->maybe_tree) + unparse_commit(r, &commit->object.oid); + return repo_parse_commit_internal(r, commit, 0, 0); } void parse_commit_or_die(struct commit *item); -void unparse_commit(struct repository *r, const struct object_id *oid); - struct buffer_slab; struct buffer_slab *allocate_commit_buffer_slab(void); void free_commit_buffer_slab(struct buffer_slab *bs); diff --git a/contrib/coccinelle/commit.cocci b/contrib/coccinelle/commit.cocci index c5284604c5..42725161e9 100644 --- a/contrib/coccinelle/commit.cocci +++ b/contrib/coccinelle/commit.cocci @@ -26,7 +26,7 @@ expression s; // repo_get_commit_tree() on the LHS. @@ identifier f != { repo_get_commit_tree, get_commit_tree_in_graph_one, - load_tree_for_commit, set_commit_tree }; + load_tree_for_commit, set_commit_tree, repo_parse_commit_no_graph }; expression c; @@ f(...) {<... From bb5da75d6116c35924a04a418ef4c3182663d0a2 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Mon, 16 Feb 2026 16:38:03 +0100 Subject: [PATCH 3/3] commit: use commit graph in `lookup_commit_reference_gently()` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the preceding commit we refactored `lookup_commit_reference_gently()` so that it doesn't parse non-commit objects anymore. This has led to a speedup when git-receive-pack(1) accepts a shallow push into a repo with lots of refs that point to blobs or trees. But while this case is now faster, we still have the issue that accepting pushes with lots of "normal" refs that point to commits are still slow. This is mostly because we look up the commits via the object database, and that is rather costly. Adapt the code to use `repo_parse_commit_gently()` instead of `parse_object()` to parse the resulting commit object. This function knows to use the commit-graph to fill in the object, which is way more cost efficient. This leads to another significant speedup when accepting shallow pushes. The following benchmark pushes a single objects from a shallow clone into a repository with 600,000 references that all point to commits: Benchmark 1: git-receive-pack (rev = HEAD~) Time (mean ± σ): 9.179 s ± 0.031 s [User: 8.858 s, System: 0.528 s] Range (min … max): 9.154 s … 9.213 s 3 runs Benchmark 2: git-receive-pack (rev = HEAD) Time (mean ± σ): 2.337 s ± 0.032 s [User: 2.331 s, System: 0.234 s] Range (min … max): 2.308 s … 2.371 s 3 runs Summary git-receive-pack . Signed-off-by: Junio C Hamano --- commit.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/commit.c b/commit.c index fb1b01b9c3..25cdbd3be4 100644 --- a/commit.c +++ b/commit.c @@ -45,7 +45,7 @@ struct commit *lookup_commit_reference_gently(struct repository *r, { const struct object_id *maybe_peeled; struct object_id peeled_oid; - struct object *object; + struct commit *commit; enum object_type type; switch (peel_object_ext(r, oid, &peeled_oid, 0, &type)) { @@ -67,11 +67,11 @@ struct commit *lookup_commit_reference_gently(struct repository *r, return NULL; } - object = parse_object(r, maybe_peeled); - if (!object) + commit = lookup_commit(r, maybe_peeled); + if (!commit || repo_parse_commit_gently(r, commit, quiet) < 0) return NULL; - return object_as_type(object, OBJ_COMMIT, quiet); + return commit; } struct commit *lookup_commit_reference(struct repository *r, const struct object_id *oid)