From d80523bbf998f405b65f53a33f86212c265ebb43 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Tue, 10 Mar 2026 16:18:21 +0100 Subject: [PATCH 1/6] odb: stop including "odb/source.h" The "odb.h" header currently includes the "odb/source.h" file. This is somewhat roundabout though: most callers shouldn't have to care about the `struct odb_source`, but should rather use the ODB-level functions. Furthermore, it means that a couple of definitions have to live on the source level even though they should be part of the generic interface. Reverse the relation between "odb/source.h" and "odb.h" and move the enums and typedefs that relate to the generic interfaces back into "odb.h". Add the necessary includes to all files that rely on the transitive include. Suggested-by: Justin Tobler Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- builtin/multi-pack-index.c | 1 + builtin/submodule--helper.c | 1 + odb.h | 50 ++++++++++++++++++++++++++++++++++- odb/source.h | 52 +------------------------------------ odb/streaming.c | 1 + repository.c | 1 + submodule-config.c | 1 + tmp-objdir.c | 1 + 8 files changed, 56 insertions(+), 52 deletions(-) diff --git a/builtin/multi-pack-index.c b/builtin/multi-pack-index.c index 5f364aa816..3fcb207f1a 100644 --- a/builtin/multi-pack-index.c +++ b/builtin/multi-pack-index.c @@ -9,6 +9,7 @@ #include "strbuf.h" #include "trace2.h" #include "odb.h" +#include "odb/source.h" #include "replace-object.h" #include "repository.h" diff --git a/builtin/submodule--helper.c b/builtin/submodule--helper.c index 143f7cb3cc..4957487536 100644 --- a/builtin/submodule--helper.c +++ b/builtin/submodule--helper.c @@ -29,6 +29,7 @@ #include "object-file.h" #include "object-name.h" #include "odb.h" +#include "odb/source.h" #include "advice.h" #include "branch.h" #include "list-objects-filter-options.h" diff --git a/odb.h b/odb.h index 86e0365c24..7a583e3873 100644 --- a/odb.h +++ b/odb.h @@ -3,7 +3,6 @@ #include "hashmap.h" #include "object.h" 
-#include "odb/source.h" #include "oidset.h" #include "oidmap.h" #include "string-list.h" @@ -12,6 +11,7 @@ struct oidmap; struct oidtree; struct strbuf; +struct strvec; struct repository; struct multi_pack_index; @@ -339,6 +339,42 @@ struct object_info { */ #define OBJECT_INFO_INIT { 0 } +/* Flags that can be passed to `odb_read_object_info_extended()`. */ +enum object_info_flags { + /* Invoke lookup_replace_object() on the given hash. */ + OBJECT_INFO_LOOKUP_REPLACE = (1 << 0), + + /* Do not reprepare object sources when the first lookup has failed. */ + OBJECT_INFO_QUICK = (1 << 1), + + /* + * Do not attempt to fetch the object if missing (even if fetch_is_missing is + * nonzero). + */ + OBJECT_INFO_SKIP_FETCH_OBJECT = (1 << 2), + + /* Die if object corruption (not just an object being missing) was detected. */ + OBJECT_INFO_DIE_IF_CORRUPT = (1 << 3), + + /* + * We have already tried reading the object, but it couldn't be found + * via any of the attached sources, and are now doing a second read. + * This second read asks the individual sources to also evaluate + * whether any on-disk state may have changed that may have caused the + * object to appear. + * + * This flag is for internal use, only. The second read only occurs + * when `OBJECT_INFO_QUICK` was not passed. + */ + OBJECT_INFO_SECOND_READ = (1 << 4), + + /* + * This is meant for bulk prefetching of missing blobs in a partial + * clone. Implies OBJECT_INFO_SKIP_FETCH_OBJECT and OBJECT_INFO_QUICK. + */ + OBJECT_INFO_FOR_PREFETCH = (OBJECT_INFO_SKIP_FETCH_OBJECT | OBJECT_INFO_QUICK), +}; + /* * Read object info from the object database and populate the `object_info` * structure. Returns 0 on success, a negative error code otherwise. @@ -432,6 +468,18 @@ enum odb_for_each_object_flags { ODB_FOR_EACH_OBJECT_SKIP_ON_DISK_KEPT_PACKS = (1<<4), }; +/* + * A callback function that can be used to iterate through objects. 
If given, + * the optional `oi` parameter will be populated the same as if you would call + * `odb_read_object_info()`. + * + * Returning a non-zero error code will cause iteration to abort. The error + * code will be propagated. + */ +typedef int (*odb_for_each_object_cb)(const struct object_id *oid, + struct object_info *oi, + void *cb_data); + /* * Iterate through all objects contained in the object database. Note that * objects may be iterated over multiple times in case they are either stored diff --git a/odb/source.h b/odb/source.h index caac558149..a1fd9dd920 100644 --- a/odb/source.h +++ b/odb/source.h @@ -2,6 +2,7 @@ #define ODB_SOURCE_H #include "object.h" +#include "odb.h" enum odb_source_type { /* @@ -14,61 +15,10 @@ enum odb_source_type { ODB_SOURCE_FILES, }; -/* Flags that can be passed to `odb_read_object_info_extended()`. */ -enum object_info_flags { - /* Invoke lookup_replace_object() on the given hash. */ - OBJECT_INFO_LOOKUP_REPLACE = (1 << 0), - - /* Do not reprepare object sources when the first lookup has failed. */ - OBJECT_INFO_QUICK = (1 << 1), - - /* - * Do not attempt to fetch the object if missing (even if fetch_is_missing is - * nonzero). - */ - OBJECT_INFO_SKIP_FETCH_OBJECT = (1 << 2), - - /* Die if object corruption (not just an object being missing) was detected. */ - OBJECT_INFO_DIE_IF_CORRUPT = (1 << 3), - - /* - * We have already tried reading the object, but it couldn't be found - * via any of the attached sources, and are now doing a second read. - * This second read asks the individual sources to also evaluate - * whether any on-disk state may have changed that may have caused the - * object to appear. - * - * This flag is for internal use, only. The second read only occurs - * when `OBJECT_INFO_QUICK` was not passed. - */ - OBJECT_INFO_SECOND_READ = (1 << 4), - - /* - * This is meant for bulk prefetching of missing blobs in a partial - * clone. Implies OBJECT_INFO_SKIP_FETCH_OBJECT and OBJECT_INFO_QUICK. 
- */ - OBJECT_INFO_FOR_PREFETCH = (OBJECT_INFO_SKIP_FETCH_OBJECT | OBJECT_INFO_QUICK), -}; - struct object_id; -struct object_info; struct odb_read_stream; -struct odb_transaction; -struct odb_write_stream; struct strvec; -/* - * A callback function that can be used to iterate through objects. If given, - * the optional `oi` parameter will be populated the same as if you would call - * `odb_read_object_info()`. - * - * Returning a non-zero error code will cause iteration to abort. The error - * code will be propagated. - */ -typedef int (*odb_for_each_object_cb)(const struct object_id *oid, - struct object_info *oi, - void *cb_data); - /* * The source is the part of the object database that stores the actual * objects. It thus encapsulates the logic to read and write the specific diff --git a/odb/streaming.c b/odb/streaming.c index a4355cd245..5927a12954 100644 --- a/odb/streaming.c +++ b/odb/streaming.c @@ -7,6 +7,7 @@ #include "environment.h" #include "repository.h" #include "odb.h" +#include "odb/source.h" #include "odb/streaming.h" #include "replace-object.h" diff --git a/repository.c b/repository.c index e7fa42c14f..05c26bdbc3 100644 --- a/repository.c +++ b/repository.c @@ -2,6 +2,7 @@ #include "abspath.h" #include "repository.h" #include "odb.h" +#include "odb/source.h" #include "config.h" #include "object.h" #include "lockfile.h" diff --git a/submodule-config.c b/submodule-config.c index 1f19fe2077..72a46b7a54 100644 --- a/submodule-config.c +++ b/submodule-config.c @@ -14,6 +14,7 @@ #include "strbuf.h" #include "object-name.h" #include "odb.h" +#include "odb/source.h" #include "parse-options.h" #include "thread-utils.h" #include "tree-walk.h" diff --git a/tmp-objdir.c b/tmp-objdir.c index e436eed07e..d199d39e7c 100644 --- a/tmp-objdir.c +++ b/tmp-objdir.c @@ -11,6 +11,7 @@ #include "strvec.h" #include "quote.h" #include "odb.h" +#include "odb/source.h" #include "repository.h" struct tmp_objdir { From ebf0664e7a803e7e161c4d3a7bc0876646383aeb Mon Sep 17 
00:00:00 2001 From: Patrick Steinhardt Date: Tue, 10 Mar 2026 16:18:22 +0100 Subject: [PATCH 2/6] packfile: extract logic to count number of objects In a subsequent commit we're about to introduce a new `odb_source_count_objects()` function so that we can make the logic pluggable. Prepare for this change by extracting the logic that we have to count packed objects into a standalone function. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- packfile.c | 45 +++++++++++++++++++++++++++++++++++---------- packfile.h | 9 +++++++++ 2 files changed, 44 insertions(+), 10 deletions(-) diff --git a/packfile.c b/packfile.c index 215a23e42b..1ee5dd3da3 100644 --- a/packfile.c +++ b/packfile.c @@ -1101,6 +1101,36 @@ struct packfile_list_entry *packfile_store_get_packs(struct packfile_store *stor return store->packs.head; } +int packfile_store_count_objects(struct packfile_store *store, + unsigned long *out) +{ + struct packfile_list_entry *e; + struct multi_pack_index *m; + unsigned long count = 0; + int ret; + + m = get_multi_pack_index(store->source); + if (m) + count += m->num_objects + m->num_objects_in_base; + + for (e = packfile_store_get_packs(store); e; e = e->next) { + if (e->pack->multi_pack_index) + continue; + if (open_pack_index(e->pack)) { + ret = -1; + goto out; + } + + count += e->pack->num_objects; + } + + *out = count; + ret = 0; + +out: + return ret; +} + /* * Give a fast, rough count of the number of objects in the repository. This * ignores loose objects completely. 
If you have a lot of them, then either @@ -1113,21 +1143,16 @@ unsigned long repo_approximate_object_count(struct repository *r) if (!r->objects->approximate_object_count_valid) { struct odb_source *source; unsigned long count = 0; - struct packed_git *p; odb_prepare_alternates(r->objects); - for (source = r->objects->sources; source; source = source->next) { - struct multi_pack_index *m = get_multi_pack_index(source); - if (m) - count += m->num_objects + m->num_objects_in_base; + struct odb_source_files *files = odb_source_files_downcast(source); + unsigned long c; + + if (!packfile_store_count_objects(files->packed, &c)) + count += c; } - repo_for_each_pack(r, p) { - if (p->multi_pack_index || open_pack_index(p)) - continue; - count += p->num_objects; - } r->objects->approximate_object_count = count; r->objects->approximate_object_count_valid = 1; } diff --git a/packfile.h b/packfile.h index 8b04a258a7..1da8c729cb 100644 --- a/packfile.h +++ b/packfile.h @@ -268,6 +268,15 @@ enum kept_pack_type { KEPT_PACK_IN_CORE = (1 << 1), }; +/* + * Count the number of objects contained in the given packfile store. If + * successful, the number of objects will be written to the `out` pointer. + * + * Return 0 on success, a negative error code otherwise. + */ +int packfile_store_count_objects(struct packfile_store *store, + unsigned long *out); + /* * Retrieve the cache of kept packs from the given packfile store. Accepts a * combination of `kept_pack_type` flags. The cache is computed on demand and From 3b5ca32b5f24a288a84b2ae9dc755404ad90f5cc Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Tue, 10 Mar 2026 16:18:23 +0100 Subject: [PATCH 3/6] object-file: extract logic to approximate object count In "builtin/gc.c" we have some logic that checks whether we need to repack objects. This is done by counting the number of objects that we have and checking whether it exceeds a certain threshold. 
We don't really need an accurate object count though, which is why we only open a single object directory shard and then extrapolate from there. Extract this logic into a new function that is owned by the loose object database source. This is done to prepare for a subsequent change, where we'll introduce object counting on the object database source level. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- builtin/gc.c | 35 ++++++++--------------------------- object-file.c | 41 +++++++++++++++++++++++++++++++++++++++++ object-file.h | 13 +++++++++++++ 3 files changed, 62 insertions(+), 27 deletions(-) diff --git a/builtin/gc.c b/builtin/gc.c index fb329c2cff..a08c7554cb 100644 --- a/builtin/gc.c +++ b/builtin/gc.c @@ -467,37 +467,18 @@ out: static int too_many_loose_objects(int limit) { /* - * Quickly check if a "gc" is needed, by estimating how - * many loose objects there are. Because SHA-1 is evenly - * distributed, we can check only one and get a reasonable - * estimate. + * This is weird, but stems from legacy behaviour: the GC auto + * threshold was always essentially interpreted as if it was rounded up + * to the next multiple of 256, so we retain this behaviour for now. 
*/ - DIR *dir; - struct dirent *ent; - int auto_threshold; - int num_loose = 0; - int needed = 0; - const unsigned hexsz_loose = the_hash_algo->hexsz - 2; - char *path; + int auto_threshold = DIV_ROUND_UP(limit, 256) * 256; + unsigned long loose_count; - path = repo_git_path(the_repository, "objects/17"); - dir = opendir(path); - free(path); - if (!dir) + if (odb_source_loose_approximate_object_count(the_repository->objects->sources, + &loose_count) < 0) return 0; - auto_threshold = DIV_ROUND_UP(limit, 256); - while ((ent = readdir(dir)) != NULL) { - if (strspn(ent->d_name, "0123456789abcdef") != hexsz_loose || - ent->d_name[hexsz_loose] != '\0') - continue; - if (++num_loose > auto_threshold) { - needed = 1; - break; - } - } - closedir(dir); - return needed; + return loose_count > auto_threshold; } static struct packed_git *find_base_packs(struct string_list *packs, diff --git a/object-file.c b/object-file.c index a3ff7f586c..da67e3c9ff 100644 --- a/object-file.c +++ b/object-file.c @@ -1868,6 +1868,47 @@ int odb_source_loose_for_each_object(struct odb_source *source, NULL, NULL, &data); } +int odb_source_loose_approximate_object_count(struct odb_source *source, + unsigned long *out) +{ + const unsigned hexsz = source->odb->repo->hash_algo->hexsz - 2; + unsigned long count = 0; + struct dirent *ent; + char *path = NULL; + DIR *dir = NULL; + int ret; + + path = xstrfmt("%s/17", source->path); + + dir = opendir(path); + if (!dir) { + if (errno == ENOENT) { + *out = 0; + ret = 0; + goto out; + } + + ret = error_errno("cannot open object shard '%s'", path); + goto out; + } + + while ((ent = readdir(dir)) != NULL) { + if (strspn(ent->d_name, "0123456789abcdef") != hexsz || + ent->d_name[hexsz] != '\0') + continue; + count++; + } + + *out = count * 256; + ret = 0; + +out: + if (dir) + closedir(dir); + free(path); + return ret; +} + static int append_loose_object(const struct object_id *oid, const char *path UNUSED, void *data) diff --git a/object-file.h b/object-file.h 
index ff6da65296..b870ea9fa8 100644 --- a/object-file.h +++ b/object-file.h @@ -139,6 +139,19 @@ int odb_source_loose_for_each_object(struct odb_source *source, void *cb_data, unsigned flags); +/* + * Count the number of loose objects in this source. + * + * The object count is approximated by opening a single sharding directory for + * loose objects and scanning its contents. The result is then extrapolated by + * 256. This should generally work as a reasonable estimate given that the + * object hash is supposed to be indistinguishable from random. + * + * Returns 0 on success, a negative error code otherwise. + */ +int odb_source_loose_approximate_object_count(struct odb_source *source, + unsigned long *out); + /** * format_object_header() is a thin wrapper around s xsnprintf() that * writes the initial " " part of the loose object From f7d1918469d763f46a7acbae437e6424ed49c0a3 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Tue, 10 Mar 2026 16:18:24 +0100 Subject: [PATCH 4/6] object-file: generalize counting objects Generalize the function introduced in the preceding commit to not only be able to approximate the number of loose objects, but to also provide an accurate count. The behaviour can be toggled via a new flag. 
Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- builtin/gc.c | 5 +++-- object-file.c | 58 ++++++++++++++++++++++++++++++++------------------- object-file.h | 5 +++-- odb.h | 9 ++++++++ 4 files changed, 52 insertions(+), 25 deletions(-) diff --git a/builtin/gc.c b/builtin/gc.c index a08c7554cb..3a64d28da8 100644 --- a/builtin/gc.c +++ b/builtin/gc.c @@ -474,8 +474,9 @@ static int too_many_loose_objects(int limit) int auto_threshold = DIV_ROUND_UP(limit, 256) * 256; unsigned long loose_count; - if (odb_source_loose_approximate_object_count(the_repository->objects->sources, - &loose_count) < 0) + if (odb_source_loose_count_objects(the_repository->objects->sources, + ODB_COUNT_OBJECTS_APPROXIMATE, + &loose_count) < 0) return 0; return loose_count > auto_threshold; diff --git a/object-file.c b/object-file.c index da67e3c9ff..d35cec201f 100644 --- a/object-file.c +++ b/object-file.c @@ -1868,40 +1868,56 @@ int odb_source_loose_for_each_object(struct odb_source *source, NULL, NULL, &data); } -int odb_source_loose_approximate_object_count(struct odb_source *source, - unsigned long *out) +static int count_loose_object(const struct object_id *oid UNUSED, + struct object_info *oi UNUSED, + void *payload) +{ + unsigned long *count = payload; + (*count)++; + return 0; +} + +int odb_source_loose_count_objects(struct odb_source *source, + enum odb_count_objects_flags flags, + unsigned long *out) { const unsigned hexsz = source->odb->repo->hash_algo->hexsz - 2; - unsigned long count = 0; - struct dirent *ent; char *path = NULL; DIR *dir = NULL; int ret; - path = xstrfmt("%s/17", source->path); + if (flags & ODB_COUNT_OBJECTS_APPROXIMATE) { + unsigned long count = 0; + struct dirent *ent; - dir = opendir(path); - if (!dir) { - if (errno == ENOENT) { - *out = 0; - ret = 0; + path = xstrfmt("%s/17", source->path); + + dir = opendir(path); + if (!dir) { + if (errno == ENOENT) { + *out = 0; + ret = 0; + goto out; + } + + ret = error_errno("cannot open object shard 
'%s'", path); goto out; } - ret = error_errno("cannot open object shard '%s'", path); - goto out; - } + while ((ent = readdir(dir)) != NULL) { + if (strspn(ent->d_name, "0123456789abcdef") != hexsz || + ent->d_name[hexsz] != '\0') + continue; + count++; + } - while ((ent = readdir(dir)) != NULL) { - if (strspn(ent->d_name, "0123456789abcdef") != hexsz || - ent->d_name[hexsz] != '\0') - continue; - count++; + *out = count * 256; + ret = 0; + } else { + ret = odb_source_loose_for_each_object(source, NULL, count_loose_object, + out, 0); } - *out = count * 256; - ret = 0; - out: if (dir) closedir(dir); diff --git a/object-file.h b/object-file.h index b870ea9fa8..f8d8805a18 100644 --- a/object-file.h +++ b/object-file.h @@ -149,8 +149,9 @@ int odb_source_loose_for_each_object(struct odb_source *source, * * Returns 0 on success, a negative error code otherwise. */ -int odb_source_loose_approximate_object_count(struct odb_source *source, - unsigned long *out); +int odb_source_loose_count_objects(struct odb_source *source, + enum odb_count_objects_flags flags, + unsigned long *out); /** * format_object_header() is a thin wrapper around s xsnprintf() that diff --git a/odb.h b/odb.h index 7a583e3873..e6057477f6 100644 --- a/odb.h +++ b/odb.h @@ -500,6 +500,15 @@ int odb_for_each_object(struct object_database *odb, void *cb_data, unsigned flags); +enum odb_count_objects_flags { + /* + * Instead of providing an accurate count, allow the number of objects + * to be approximated. Details of how this approximation works are + * subject to the specific source's implementation. 
+ */ + ODB_COUNT_OBJECTS_APPROXIMATE = (1 << 0), +}; + enum { /* * By default, `odb_write_object()` does not actually write anything From 2f2492f7ee85dcc1fe491fb90b2f80aa90ffc368 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Tue, 10 Mar 2026 16:18:25 +0100 Subject: [PATCH 5/6] odb/source: introduce generic object counting Introduce generic object counting on the object database source level with a new backend-specific callback function. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- odb/source-files.c | 30 ++++++++++++++++++++++++++++++ odb/source.h | 27 +++++++++++++++++++++++++++ packfile.c | 4 ++-- packfile.h | 1 + 4 files changed, 60 insertions(+), 2 deletions(-) diff --git a/odb/source-files.c b/odb/source-files.c index 14cb9adeca..c08d8993e3 100644 --- a/odb/source-files.c +++ b/odb/source-files.c @@ -93,6 +93,35 @@ static int odb_source_files_for_each_object(struct odb_source *source, return 0; } +static int odb_source_files_count_objects(struct odb_source *source, + enum odb_count_objects_flags flags, + unsigned long *out) +{ + struct odb_source_files *files = odb_source_files_downcast(source); + unsigned long count; + int ret; + + ret = packfile_store_count_objects(files->packed, flags, &count); + if (ret < 0) + goto out; + + if (!(flags & ODB_COUNT_OBJECTS_APPROXIMATE)) { + unsigned long loose_count; + + ret = odb_source_loose_count_objects(source, flags, &loose_count); + if (ret < 0) + goto out; + + count += loose_count; + } + + *out = count; + ret = 0; + +out: + return ret; +} + static int odb_source_files_freshen_object(struct odb_source *source, const struct object_id *oid) { @@ -220,6 +249,7 @@ struct odb_source_files *odb_source_files_new(struct object_database *odb, files->base.read_object_info = odb_source_files_read_object_info; files->base.read_object_stream = odb_source_files_read_object_stream; files->base.for_each_object = odb_source_files_for_each_object; + files->base.count_objects = 
odb_source_files_count_objects; files->base.freshen_object = odb_source_files_freshen_object; files->base.write_object = odb_source_files_write_object; files->base.write_object_stream = odb_source_files_write_object_stream; diff --git a/odb/source.h b/odb/source.h index a1fd9dd920..96c906e7a1 100644 --- a/odb/source.h +++ b/odb/source.h @@ -142,6 +142,21 @@ struct odb_source { void *cb_data, unsigned flags); + /* + * This callback is expected to count objects in the given object + * database source. The callback function does not have to guarantee + * that only unique objects are counted. The result shall be assigned + * to the `out` pointer. + * + * Accepts `enum odb_count_objects_flags` flags to alter the behaviour. + * + * The callback is expected to return 0 on success, or a negative error + * code otherwise. + */ + int (*count_objects)(struct odb_source *source, + enum odb_count_objects_flags flags, + unsigned long *out); + /* * This callback is expected to freshen the given object so that its * last access time is set to the current time. This is used to ensure @@ -333,6 +348,18 @@ static inline int odb_source_for_each_object(struct odb_source *source, return source->for_each_object(source, request, cb, cb_data, flags); } +/* + * Count the number of objects in the given object database source. + * + * Returns 0 on success, a negative error code otherwise. + */ +static inline int odb_source_count_objects(struct odb_source *source, + enum odb_count_objects_flags flags, + unsigned long *out) +{ + return source->count_objects(source, flags, out); +} + /* * Freshen an object in the object database by updating its timestamp. 
* Returns 1 in case the object has been freshened, 0 in case the object does diff --git a/packfile.c b/packfile.c index 1ee5dd3da3..8ee462303a 100644 --- a/packfile.c +++ b/packfile.c @@ -1102,6 +1102,7 @@ struct packfile_list_entry *packfile_store_get_packs(struct packfile_store *stor } int packfile_store_count_objects(struct packfile_store *store, + enum odb_count_objects_flags flags UNUSED, unsigned long *out) { struct packfile_list_entry *e; @@ -1146,10 +1147,9 @@ unsigned long repo_approximate_object_count(struct repository *r) odb_prepare_alternates(r->objects); for (source = r->objects->sources; source; source = source->next) { - struct odb_source_files *files = odb_source_files_downcast(source); unsigned long c; - if (!packfile_store_count_objects(files->packed, &c)) + if (!odb_source_count_objects(source, ODB_COUNT_OBJECTS_APPROXIMATE, &c)) count += c; } diff --git a/packfile.h b/packfile.h index 1da8c729cb..74b6bc58c5 100644 --- a/packfile.h +++ b/packfile.h @@ -275,6 +275,7 @@ enum kept_pack_type { * Return 0 on success, a negative error code otherwise. */ int packfile_store_count_objects(struct packfile_store *store, + enum odb_count_objects_flags flags, unsigned long *out); /* From 5b21c20953af5129929092292799af0e2ab0ff78 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Tue, 10 Mar 2026 16:18:26 +0100 Subject: [PATCH 6/6] odb: introduce generic object counting Similar to the preceding commit, introduce counting of objects on the object database level, replacing the logic that we have in `repo_approximate_object_count()`. Note that the function knows to cache the object count. It's unclear whether this cache is really required as we shouldn't have that many cases where we count objects repeatedly. But to be on the safe side the caching mechanism is retained, with the only exception being that we also have to use the passed flags as caching key. 
Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- builtin/gc.c | 6 +++++- commit-graph.c | 3 ++- object-name.c | 6 +++++- odb.c | 37 ++++++++++++++++++++++++++++++++++++- odb.h | 17 +++++++++++++++-- packfile.c | 27 --------------------------- packfile.h | 6 ------ 7 files changed, 63 insertions(+), 39 deletions(-) diff --git a/builtin/gc.c b/builtin/gc.c index 3a64d28da8..cb9ca89a97 100644 --- a/builtin/gc.c +++ b/builtin/gc.c @@ -574,9 +574,13 @@ static uint64_t total_ram(void) static uint64_t estimate_repack_memory(struct gc_config *cfg, struct packed_git *pack) { - unsigned long nr_objects = repo_approximate_object_count(the_repository); + unsigned long nr_objects; size_t os_cache, heap; + if (odb_count_objects(the_repository->objects, + ODB_COUNT_OBJECTS_APPROXIMATE, &nr_objects) < 0) + return 0; + if (!pack || !nr_objects) return 0; diff --git a/commit-graph.c b/commit-graph.c index f8e24145a5..c030003330 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -2607,7 +2607,8 @@ int write_commit_graph(struct odb_source *source, replace = ctx.opts->split_flags & COMMIT_GRAPH_SPLIT_REPLACE; } - ctx.approx_nr_objects = repo_approximate_object_count(r); + if (odb_count_objects(r->objects, ODB_COUNT_OBJECTS_APPROXIMATE, &ctx.approx_nr_objects) < 0) + ctx.approx_nr_objects = 0; if (ctx.append && g) { for (i = 0; i < g->num_commits; i++) { diff --git a/object-name.c b/object-name.c index 7b14c3bf9b..e5adec4c9d 100644 --- a/object-name.c +++ b/object-name.c @@ -837,7 +837,11 @@ int repo_find_unique_abbrev_r(struct repository *r, char *hex, const unsigned hexsz = algo->hexsz; if (len < 0) { - unsigned long count = repo_approximate_object_count(r); + unsigned long count; + + if (odb_count_objects(r->objects, ODB_COUNT_OBJECTS_APPROXIMATE, &count) < 0) + count = 0; + /* * Add one because the MSB only tells us the highest bit set, * not including the value of all the _other_ bits (so "15" diff --git a/odb.c b/odb.c index 84a31084d3..350e23f3c0 100644 --- 
a/odb.c +++ b/odb.c @@ -917,6 +917,41 @@ int odb_for_each_object(struct object_database *odb, return 0; } +int odb_count_objects(struct object_database *odb, + enum odb_count_objects_flags flags, + unsigned long *out) +{ + struct odb_source *source; + unsigned long count = 0; + int ret; + + if (odb->object_count_valid && odb->object_count_flags == flags) { + *out = odb->object_count; + return 0; + } + + odb_prepare_alternates(odb); + for (source = odb->sources; source; source = source->next) { + unsigned long c; + + ret = odb_source_count_objects(source, flags, &c); + if (ret < 0) + goto out; + + count += c; + } + + odb->object_count = count; + odb->object_count_valid = 1; + odb->object_count_flags = flags; + + *out = count; + ret = 0; + +out: + return ret; +} + void odb_assert_oid_type(struct object_database *odb, const struct object_id *oid, enum object_type expect) { @@ -1030,7 +1065,7 @@ void odb_reprepare(struct object_database *o) for (source = o->sources; source; source = source->next) odb_source_reprepare(source); - o->approximate_object_count_valid = 0; + o->object_count_valid = 0; obj_read_unlock(); } diff --git a/odb.h b/odb.h index e6057477f6..7b004f1cf4 100644 --- a/odb.h +++ b/odb.h @@ -112,8 +112,9 @@ struct object_database { * These two fields are not meant for direct access. Use * repo_approximate_object_count() instead. */ - unsigned long approximate_object_count; - unsigned approximate_object_count_valid : 1; + unsigned long object_count; + unsigned object_count_flags; + unsigned object_count_valid : 1; /* * Submodule source paths that will be added as additional sources to @@ -509,6 +510,18 @@ enum odb_count_objects_flags { ODB_COUNT_OBJECTS_APPROXIMATE = (1 << 0), }; +/* + * Count the number of objects in the given object database. This object count + * may double-count objects that are stored in multiple backends, or which are + * stored multiple times in a single backend. + * + * Returns 0 on success, a negative error code otherwise. 
The number of objects + * will be assigned to the `out` pointer on success. + */ +int odb_count_objects(struct object_database *odb, + enum odb_count_objects_flags flags, + unsigned long *out); + enum { /* * By default, `odb_write_object()` does not actually write anything diff --git a/packfile.c b/packfile.c index 8ee462303a..d4de9f3ffe 100644 --- a/packfile.c +++ b/packfile.c @@ -1132,33 +1132,6 @@ out: return ret; } -/* - * Give a fast, rough count of the number of objects in the repository. This - * ignores loose objects completely. If you have a lot of them, then either - * you should repack because your performance will be awful, or they are - * all unreachable objects about to be pruned, in which case they're not really - * interesting as a measure of repo size in the first place. - */ -unsigned long repo_approximate_object_count(struct repository *r) -{ - if (!r->objects->approximate_object_count_valid) { - struct odb_source *source; - unsigned long count = 0; - - odb_prepare_alternates(r->objects); - for (source = r->objects->sources; source; source = source->next) { - unsigned long c; - - if (!odb_source_count_objects(source, ODB_COUNT_OBJECTS_APPROXIMATE, &c)) - count += c; - } - - r->objects->approximate_object_count = count; - r->objects->approximate_object_count_valid = 1; - } - return r->objects->approximate_object_count; -} - unsigned long unpack_object_header_buffer(const unsigned char *buf, unsigned long len, enum object_type *type, unsigned long *sizep) { diff --git a/packfile.h b/packfile.h index 74b6bc58c5..a16ec3950d 100644 --- a/packfile.h +++ b/packfile.h @@ -375,12 +375,6 @@ int packfile_store_for_each_object(struct packfile_store *store, #define PACKDIR_FILE_GARBAGE 4 extern void (*report_garbage)(unsigned seen_bits, const char *path); -/* - * Give a rough count of objects in the repository. This sacrifices accuracy - * for speed. 
- */ -unsigned long repo_approximate_object_count(struct repository *r); - void pack_report(struct repository *repo); /*