diff --git a/builtin/gc.c b/builtin/gc.c
index fb329c2cff..cb9ca89a97 100644
--- a/builtin/gc.c
+++ b/builtin/gc.c
@@ -467,37 +467,19 @@ out:
 static int too_many_loose_objects(int limit)
 {
 	/*
-	 * Quickly check if a "gc" is needed, by estimating how
-	 * many loose objects there are. Because SHA-1 is evenly
-	 * distributed, we can check only one and get a reasonable
-	 * estimate.
+	 * This is weird, but stems from legacy behaviour: the GC auto
+	 * threshold was always essentially interpreted as if it was rounded up
+	 * to the next multiple of 256, so we retain this behaviour for now.
 	 */
-	DIR *dir;
-	struct dirent *ent;
-	int auto_threshold;
-	int num_loose = 0;
-	int needed = 0;
-	const unsigned hexsz_loose = the_hash_algo->hexsz - 2;
-	char *path;
+	int auto_threshold = DIV_ROUND_UP(limit, 256) * 256;
+	unsigned long loose_count;
 
-	path = repo_git_path(the_repository, "objects/17");
-	dir = opendir(path);
-	free(path);
-	if (!dir)
+	if (odb_source_loose_count_objects(the_repository->objects->sources,
+					   ODB_COUNT_OBJECTS_APPROXIMATE,
+					   &loose_count) < 0)
 		return 0;
 
-	auto_threshold = DIV_ROUND_UP(limit, 256);
-	while ((ent = readdir(dir)) != NULL) {
-		if (strspn(ent->d_name, "0123456789abcdef") != hexsz_loose ||
-		    ent->d_name[hexsz_loose] != '\0')
-			continue;
-		if (++num_loose > auto_threshold) {
-			needed = 1;
-			break;
-		}
-	}
-	closedir(dir);
-	return needed;
+	return loose_count > auto_threshold;
 }
 
 static struct packed_git *find_base_packs(struct string_list *packs,
@@ -592,9 +574,13 @@ static uint64_t total_ram(void)
 static uint64_t estimate_repack_memory(struct gc_config *cfg,
 				       struct packed_git *pack)
 {
-	unsigned long nr_objects = repo_approximate_object_count(the_repository);
+	unsigned long nr_objects;
 	size_t os_cache, heap;
 
+	if (odb_count_objects(the_repository->objects,
+			      ODB_COUNT_OBJECTS_APPROXIMATE, &nr_objects) < 0)
+		return 0;
+
 	if (!pack || !nr_objects)
 		return 0;
 
diff --git a/builtin/multi-pack-index.c b/builtin/multi-pack-index.c
index 2f24c113c8..0f72d96c02 100644
--- a/builtin/multi-pack-index.c
+++ b/builtin/multi-pack-index.c
@@ -9,6 +9,7 @@
 #include "strbuf.h"
 #include "trace2.h"
 #include "odb.h"
+#include "odb/source.h"
 #include "replace-object.h"
 #include "repository.h"
 
diff --git a/builtin/submodule--helper.c b/builtin/submodule--helper.c
index 143f7cb3cc..4957487536 100644
--- a/builtin/submodule--helper.c
+++ b/builtin/submodule--helper.c
@@ -29,6 +29,7 @@
 #include "object-file.h"
 #include "object-name.h"
 #include "odb.h"
+#include "odb/source.h"
 #include "advice.h"
 #include "branch.h"
 #include "list-objects-filter-options.h"
diff --git a/commit-graph.c b/commit-graph.c
index f8e24145a5..c030003330 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -2607,7 +2607,8 @@ int write_commit_graph(struct odb_source *source,
 		replace = ctx.opts->split_flags & COMMIT_GRAPH_SPLIT_REPLACE;
 	}
 
-	ctx.approx_nr_objects = repo_approximate_object_count(r);
+	if (odb_count_objects(r->objects, ODB_COUNT_OBJECTS_APPROXIMATE, &ctx.approx_nr_objects) < 0)
+		ctx.approx_nr_objects = 0;
 
 	if (ctx.append && g) {
 		for (i = 0; i < g->num_commits; i++) {
diff --git a/object-file.c b/object-file.c
index c62e5496e0..5629f35015 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1868,6 +1868,65 @@ int odb_source_loose_for_each_object(struct odb_source *source,
 					     NULL, NULL, &data);
 }
 
+static int count_loose_object(const struct object_id *oid UNUSED,
+			      struct object_info *oi UNUSED,
+			      void *payload)
+{
+	unsigned long *count = payload;
+	(*count)++;
+	return 0;
+}
+
+int odb_source_loose_count_objects(struct odb_source *source,
+				   enum odb_count_objects_flags flags,
+				   unsigned long *out)
+{
+	const unsigned hexsz = source->odb->repo->hash_algo->hexsz - 2;
+	char *path = NULL;
+	DIR *dir = NULL;
+	int ret;
+
+	if (flags & ODB_COUNT_OBJECTS_APPROXIMATE) {
+		unsigned long count = 0;
+		struct dirent *ent;
+
+		path = xstrfmt("%s/17", source->path);
+
+		dir = opendir(path);
+		if (!dir) {
+			if (errno == ENOENT) {
+				*out = 0;
+				ret = 0;
+				goto out;
+			}
+
+			ret = error_errno("cannot open object shard '%s'", path);
+			goto out;
+		}
+
+		while ((ent = readdir(dir)) != NULL) {
+			if (strspn(ent->d_name, "0123456789abcdef") != hexsz ||
+			    ent->d_name[hexsz] != '\0')
+				continue;
+			count++;
+		}
+
+		*out = count * 256;
+		ret = 0;
+	} else {
+		/* The callback increments in place, so start from zero. */
+		*out = 0;
+		ret = odb_source_loose_for_each_object(source, NULL, count_loose_object,
+						       out, 0);
+	}
+
+out:
+	if (dir)
+		closedir(dir);
+	free(path);
+	return ret;
+}
+
 static int append_loose_object(const struct object_id *oid,
 			       const char *path UNUSED,
 			       void *data)
diff --git a/object-file.h b/object-file.h
index ff6da65296..f8d8805a18 100644
--- a/object-file.h
+++ b/object-file.h
@@ -139,6 +139,23 @@ int odb_source_loose_for_each_object(struct odb_source *source,
 				     void *cb_data,
 				     unsigned flags);
 
+/*
+ * Count the number of loose objects in this source.
+ *
+ * With `ODB_COUNT_OBJECTS_APPROXIMATE`, the object count is approximated by
+ * opening a single sharding directory for loose objects and scanning its
+ * contents. The result is then extrapolated by 256. This should generally
+ * work as a reasonable estimate given that the object hash is supposed to be
+ * indistinguishable from random. Without the flag, all loose objects are
+ * enumerated to produce an exact count.
+ *
+ * Returns 0 on success, a negative error code otherwise. The number of
+ * objects is written to `out` on success.
+ */
+int odb_source_loose_count_objects(struct odb_source *source,
+				   enum odb_count_objects_flags flags,
+				   unsigned long *out);
+
 /**
  * format_object_header() is a thin wrapper around s xsnprintf() that
  * writes the initial "<type> <obj-len>" part of the loose object
diff --git a/object-name.c b/object-name.c
index 7b14c3bf9b..e5adec4c9d 100644
--- a/object-name.c
+++ b/object-name.c
@@ -837,7 +837,11 @@ int repo_find_unique_abbrev_r(struct repository *r, char *hex,
 	const unsigned hexsz = algo->hexsz;
 
 	if (len < 0) {
-		unsigned long count = repo_approximate_object_count(r);
+		unsigned long count;
+
+		if (odb_count_objects(r->objects, ODB_COUNT_OBJECTS_APPROXIMATE, &count) < 0)
+			count = 0;
+
 		/*
 		 * Add one because the MSB only tells us the highest bit set,
 		 * not including the value of all the _other_ bits (so "15"
diff --git a/odb.c b/odb.c
index 84a31084d3..350e23f3c0 100644
--- a/odb.c
+++ b/odb.c
@@ -917,6 +917,41 @@ int odb_for_each_object(struct object_database *odb,
 	return 0;
 }
 
+int odb_count_objects(struct object_database *odb,
+		      enum odb_count_objects_flags flags,
+		      unsigned long *out)
+{
+	struct odb_source *source;
+	unsigned long count = 0;
+	int ret;
+
+	if (odb->object_count_valid && odb->object_count_flags == flags) {
+		*out = odb->object_count;
+		return 0;
+	}
+
+	odb_prepare_alternates(odb);
+	for (source = odb->sources; source; source = source->next) {
+		unsigned long c;
+
+		ret = odb_source_count_objects(source, flags, &c);
+		if (ret < 0)
+			goto out;
+
+		count += c;
+	}
+
+	odb->object_count = count;
+	odb->object_count_valid = 1;
+	odb->object_count_flags = flags;
+
+	*out = count;
+	ret = 0;
+
+out:
+	return ret;
+}
+
 void odb_assert_oid_type(struct object_database *odb,
 			 const struct object_id *oid, enum object_type expect)
 {
@@ -1030,7 +1065,7 @@ void odb_reprepare(struct object_database *o)
 	for (source = o->sources; source; source = source->next)
 		odb_source_reprepare(source);
 
-	o->approximate_object_count_valid = 0;
+	o->object_count_valid = 0;
 
 	obj_read_unlock();
 }
diff --git a/odb.h b/odb.h
index 86e0365c24..7b004f1cf4 100644
--- a/odb.h
+++ b/odb.h
@@ -3,7 +3,6 @@
 
 #include "hashmap.h"
 #include "object.h"
-#include "odb/source.h"
 #include "oidset.h"
 #include "oidmap.h"
 #include "string-list.h"
@@ -12,6 +11,7 @@
 struct oidmap;
 struct oidtree;
 struct strbuf;
+struct strvec;
 struct repository;
 struct multi_pack_index;
 
@@ -112,8 +112,9 @@ struct object_database {
-	 * These two fields are not meant for direct access. Use
-	 * repo_approximate_object_count() instead.
+	 * These fields are not meant for direct access. Use
+	 * odb_count_objects() instead.
 	 */
-	unsigned long approximate_object_count;
-	unsigned approximate_object_count_valid : 1;
+	unsigned long object_count;
+	unsigned object_count_flags;
+	unsigned object_count_valid : 1;
 
 	/*
 	 * Submodule source paths that will be added as additional sources to
@@ -339,6 +340,42 @@ struct object_info {
  */
 #define OBJECT_INFO_INIT { 0 }
 
+/* Flags that can be passed to `odb_read_object_info_extended()`. */
+enum object_info_flags {
+	/* Invoke lookup_replace_object() on the given hash. */
+	OBJECT_INFO_LOOKUP_REPLACE = (1 << 0),
+
+	/* Do not reprepare object sources when the first lookup has failed. */
+	OBJECT_INFO_QUICK = (1 << 1),
+
+	/*
+	 * Do not attempt to fetch the object if missing (even if fetch_is_missing is
+	 * nonzero).
+	 */
+	OBJECT_INFO_SKIP_FETCH_OBJECT = (1 << 2),
+
+	/* Die if object corruption (not just an object being missing) was detected. */
+	OBJECT_INFO_DIE_IF_CORRUPT = (1 << 3),
+
+	/*
+	 * We have already tried reading the object, but it couldn't be found
+	 * via any of the attached sources, and are now doing a second read.
+	 * This second read asks the individual sources to also evaluate
+	 * whether any on-disk state may have changed that may have caused the
+	 * object to appear.
+	 *
+	 * This flag is for internal use, only. The second read only occurs
+	 * when `OBJECT_INFO_QUICK` was not passed.
+	 */
+	OBJECT_INFO_SECOND_READ = (1 << 4),
+
+	/*
+	 * This is meant for bulk prefetching of missing blobs in a partial
+	 * clone. Implies OBJECT_INFO_SKIP_FETCH_OBJECT and OBJECT_INFO_QUICK.
+	 */
+	OBJECT_INFO_FOR_PREFETCH = (OBJECT_INFO_SKIP_FETCH_OBJECT | OBJECT_INFO_QUICK),
+};
+
 /*
  * Read object info from the object database and populate the `object_info`
  * structure. Returns 0 on success, a negative error code otherwise.
@@ -432,6 +469,18 @@ enum odb_for_each_object_flags {
 	ODB_FOR_EACH_OBJECT_SKIP_ON_DISK_KEPT_PACKS = (1<<4),
 };
 
+/*
+ * A callback function that can be used to iterate through objects. If given,
+ * the optional `oi` parameter will be populated the same as if you would call
+ * `odb_read_object_info()`.
+ *
+ * Returning a non-zero error code will cause iteration to abort. The error
+ * code will be propagated.
+ */
+typedef int (*odb_for_each_object_cb)(const struct object_id *oid,
+				      struct object_info *oi,
+				      void *cb_data);
+
 /*
  * Iterate through all objects contained in the object database. Note that
  * objects may be iterated over multiple times in case they are either stored
@@ -452,6 +501,27 @@ int odb_for_each_object(struct object_database *odb,
 			void *cb_data,
 			unsigned flags);
 
+enum odb_count_objects_flags {
+	/*
+	 * Instead of providing an accurate count, allow the number of objects
+	 * to be approximated. Details of how this approximation works are
+	 * subject to the specific source's implementation.
+	 */
+	ODB_COUNT_OBJECTS_APPROXIMATE = (1 << 0),
+};
+
+/*
+ * Count the number of objects in the given object database. This object count
+ * may double-count objects that are stored in multiple backends, or which are
+ * stored multiple times in a single backend.
+ *
+ * Returns 0 on success, a negative error code otherwise. The number of objects
+ * will be assigned to the `out` pointer on success.
+ */
+int odb_count_objects(struct object_database *odb,
+		      enum odb_count_objects_flags flags,
+		      unsigned long *out);
+
 enum {
 	/*
 	 * By default, `odb_write_object()` does not actually write anything
diff --git a/odb/source-files.c b/odb/source-files.c
index 14cb9adeca..c08d8993e3 100644
--- a/odb/source-files.c
+++ b/odb/source-files.c
@@ -93,6 +93,35 @@ static int odb_source_files_for_each_object(struct odb_source *source,
 	return 0;
 }
 
+static int odb_source_files_count_objects(struct odb_source *source,
+					  enum odb_count_objects_flags flags,
+					  unsigned long *out)
+{
+	struct odb_source_files *files = odb_source_files_downcast(source);
+	unsigned long count;
+	int ret;
+
+	ret = packfile_store_count_objects(files->packed, flags, &count);
+	if (ret < 0)
+		goto out;
+
+	if (!(flags & ODB_COUNT_OBJECTS_APPROXIMATE)) {
+		unsigned long loose_count;
+
+		ret = odb_source_loose_count_objects(source, flags, &loose_count);
+		if (ret < 0)
+			goto out;
+
+		count += loose_count;
+	}
+
+	*out = count;
+	ret = 0;
+
+out:
+	return ret;
+}
+
 static int odb_source_files_freshen_object(struct odb_source *source,
 					   const struct object_id *oid)
 {
@@ -220,6 +249,7 @@ struct odb_source_files *odb_source_files_new(struct object_database *odb,
 	files->base.read_object_info = odb_source_files_read_object_info;
 	files->base.read_object_stream = odb_source_files_read_object_stream;
 	files->base.for_each_object = odb_source_files_for_each_object;
+	files->base.count_objects = odb_source_files_count_objects;
 	files->base.freshen_object = odb_source_files_freshen_object;
 	files->base.write_object = odb_source_files_write_object;
 	files->base.write_object_stream = odb_source_files_write_object_stream;
diff --git a/odb/source.h b/odb/source.h
index caac558149..96c906e7a1 100644
--- a/odb/source.h
+++ b/odb/source.h
@@ -2,6 +2,7 @@
 #define ODB_SOURCE_H
 
 #include "object.h"
+#include "odb.h"
 
 enum odb_source_type {
 	/*
@@ -14,61 +15,10 @@ enum odb_source_type {
 	ODB_SOURCE_FILES,
 };
 
-/* Flags that can be passed to `odb_read_object_info_extended()`. */
-enum object_info_flags {
-	/* Invoke lookup_replace_object() on the given hash. */
-	OBJECT_INFO_LOOKUP_REPLACE = (1 << 0),
-
-	/* Do not reprepare object sources when the first lookup has failed. */
-	OBJECT_INFO_QUICK = (1 << 1),
-
-	/*
-	 * Do not attempt to fetch the object if missing (even if fetch_is_missing is
-	 * nonzero).
-	 */
-	OBJECT_INFO_SKIP_FETCH_OBJECT = (1 << 2),
-
-	/* Die if object corruption (not just an object being missing) was detected. */
-	OBJECT_INFO_DIE_IF_CORRUPT = (1 << 3),
-
-	/*
-	 * We have already tried reading the object, but it couldn't be found
-	 * via any of the attached sources, and are now doing a second read.
-	 * This second read asks the individual sources to also evaluate
-	 * whether any on-disk state may have changed that may have caused the
-	 * object to appear.
-	 *
-	 * This flag is for internal use, only. The second read only occurs
-	 * when `OBJECT_INFO_QUICK` was not passed.
-	 */
-	OBJECT_INFO_SECOND_READ = (1 << 4),
-
-	/*
-	 * This is meant for bulk prefetching of missing blobs in a partial
-	 * clone. Implies OBJECT_INFO_SKIP_FETCH_OBJECT and OBJECT_INFO_QUICK.
-	 */
-	OBJECT_INFO_FOR_PREFETCH = (OBJECT_INFO_SKIP_FETCH_OBJECT | OBJECT_INFO_QUICK),
-};
-
 struct object_id;
-struct object_info;
 struct odb_read_stream;
-struct odb_transaction;
-struct odb_write_stream;
 struct strvec;
 
-/*
- * A callback function that can be used to iterate through objects. If given,
- * the optional `oi` parameter will be populated the same as if you would call
- * `odb_read_object_info()`.
- *
- * Returning a non-zero error code will cause iteration to abort. The error
- * code will be propagated.
- */
-typedef int (*odb_for_each_object_cb)(const struct object_id *oid,
-				      struct object_info *oi,
-				      void *cb_data);
-
 /*
  * The source is the part of the object database that stores the actual
  * objects. It thus encapsulates the logic to read and write the specific
@@ -192,6 +142,21 @@ struct odb_source {
 			       void *cb_data,
 			       unsigned flags);
 
+	/*
+	 * This callback is expected to count objects in the given object
+	 * database source. The callback function does not have to guarantee
+	 * that only unique objects are counted. The result shall be assigned
+	 * to the `out` pointer.
+	 *
+	 * Accepts `enum odb_count_objects_flags` flags to alter the behaviour.
+	 *
+	 * The callback is expected to return 0 on success, or a negative error
+	 * code otherwise.
+	 */
+	int (*count_objects)(struct odb_source *source,
+			     enum odb_count_objects_flags flags,
+			     unsigned long *out);
+
 	/*
 	 * This callback is expected to freshen the given object so that its
 	 * last access time is set to the current time. This is used to ensure
@@ -383,6 +348,18 @@ static inline int odb_source_for_each_object(struct odb_source *source,
 	return source->for_each_object(source, request, cb, cb_data, flags);
 }
 
+/*
+ * Count the number of objects in the given object database source.
+ *
+ * Returns 0 on success, a negative error code otherwise.
+ */
+static inline int odb_source_count_objects(struct odb_source *source,
+					   enum odb_count_objects_flags flags,
+					   unsigned long *out)
+{
+	return source->count_objects(source, flags, out);
+}
+
 /*
  * Freshen an object in the object database by updating its timestamp.
  * Returns 1 in case the object has been freshened, 0 in case the object does
diff --git a/odb/streaming.c b/odb/streaming.c
index a4355cd245..5927a12954 100644
--- a/odb/streaming.c
+++ b/odb/streaming.c
@@ -7,6 +7,7 @@
 #include "environment.h"
 #include "repository.h"
 #include "odb.h"
+#include "odb/source.h"
 #include "odb/streaming.h"
 #include "replace-object.h"
 
diff --git a/packfile.c b/packfile.c
index 215a23e42b..d4de9f3ffe 100644
--- a/packfile.c
+++ b/packfile.c
@@ -1101,37 +1101,35 @@ struct packfile_list_entry *packfile_store_get_packs(struct packfile_store *stor
 	return store->packs.head;
 }
 
-/*
- * Give a fast, rough count of the number of objects in the repository. This
- * ignores loose objects completely. If you have a lot of them, then either
- * you should repack because your performance will be awful, or they are
- * all unreachable objects about to be pruned, in which case they're not really
- * interesting as a measure of repo size in the first place.
- */
-unsigned long repo_approximate_object_count(struct repository *r)
+int packfile_store_count_objects(struct packfile_store *store,
+				 enum odb_count_objects_flags flags UNUSED,
+				 unsigned long *out)
 {
-	if (!r->objects->approximate_object_count_valid) {
-		struct odb_source *source;
-		unsigned long count = 0;
-		struct packed_git *p;
+	struct packfile_list_entry *e;
+	struct multi_pack_index *m;
+	unsigned long count = 0;
+	int ret;
 
-		odb_prepare_alternates(r->objects);
+	m = get_multi_pack_index(store->source);
+	if (m)
+		count += m->num_objects + m->num_objects_in_base;
 
-		for (source = r->objects->sources; source; source = source->next) {
-			struct multi_pack_index *m = get_multi_pack_index(source);
-			if (m)
-				count += m->num_objects + m->num_objects_in_base;
+	for (e = packfile_store_get_packs(store); e; e = e->next) {
+		if (e->pack->multi_pack_index)
+			continue;
+		if (open_pack_index(e->pack)) {
+			ret = -1;
+			goto out;
 		}
 
-		repo_for_each_pack(r, p) {
-			if (p->multi_pack_index || open_pack_index(p))
-				continue;
-			count += p->num_objects;
-		}
-		r->objects->approximate_object_count = count;
-		r->objects->approximate_object_count_valid = 1;
+		count += e->pack->num_objects;
 	}
-	return r->objects->approximate_object_count;
+
+	*out = count;
+	ret = 0;
+
+out:
+	return ret;
 }
 
 unsigned long unpack_object_header_buffer(const unsigned char *buf,
diff --git a/packfile.h b/packfile.h
index 8b04a258a7..a16ec3950d 100644
--- a/packfile.h
+++ b/packfile.h
@@ -268,6 +268,16 @@ enum kept_pack_type {
 	KEPT_PACK_IN_CORE = (1 << 1),
 };
 
+/*
+ * Count the number of objects contained in the given packfile store. If
+ * successful, the number of objects will be written to the `out` pointer.
+ *
+ * Return 0 on success, a negative error code otherwise.
+ */
+int packfile_store_count_objects(struct packfile_store *store,
+				 enum odb_count_objects_flags flags,
+				 unsigned long *out);
+
 /*
  * Retrieve the cache of kept packs from the given packfile store. Accepts a
  * combination of `kept_pack_type` flags. The cache is computed on demand and
@@ -365,12 +375,6 @@ int packfile_store_for_each_object(struct packfile_store *store,
 #define PACKDIR_FILE_GARBAGE 4
 extern void (*report_garbage)(unsigned seen_bits, const char *path);
 
-/*
- * Give a rough count of objects in the repository. This sacrifices accuracy
- * for speed.
- */
-unsigned long repo_approximate_object_count(struct repository *r);
-
 void pack_report(struct repository *repo);
 
 /*
diff --git a/repository.c b/repository.c
index 0b8f7ec200..9e5537f539 100644
--- a/repository.c
+++ b/repository.c
@@ -3,6 +3,7 @@
 #include "repository.h"
 #include "hook.h"
 #include "odb.h"
+#include "odb/source.h"
 #include "config.h"
 #include "gettext.h"
 #include "object.h"
diff --git a/submodule-config.c b/submodule-config.c
index 1f19fe2077..72a46b7a54 100644
--- a/submodule-config.c
+++ b/submodule-config.c
@@ -14,6 +14,7 @@
 #include "strbuf.h"
 #include "object-name.h"
 #include "odb.h"
+#include "odb/source.h"
 #include "parse-options.h"
 #include "thread-utils.h"
 #include "tree-walk.h"
diff --git a/tmp-objdir.c b/tmp-objdir.c
index e436eed07e..d199d39e7c 100644
--- a/tmp-objdir.c
+++ b/tmp-objdir.c
@@ -11,6 +11,7 @@
 #include "strvec.h"
 #include "quote.h"
 #include "odb.h"
+#include "odb/source.h"
 #include "repository.h"
 
 struct tmp_objdir {