object-file: extract logic to approximate object count

In "builtin/gc.c" we have some logic that checks whether we need to
repack objects. This is done by counting the number of objects that we
have and checking whether it exceeds a certain threshold. We don't
really need an accurate object count though, which is why we only
open a single object directory shard and then extrapolate from there.

Extract this logic into a new function that is owned by the loose object
database source. This is done to prepare for a subsequent change, where
we'll introduce object counting on the object database source level.

Signed-off-by: Patrick Steinhardt <ps@pks.im>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
This commit is contained in:
Patrick Steinhardt
2026-03-10 16:18:23 +01:00
committed by Junio C Hamano
parent ebf0664e7a
commit 3b5ca32b5f
3 changed files with 62 additions and 27 deletions

View File

@@ -467,37 +467,18 @@ out:
/*
 * Decide whether an automatic "gc" should repack loose objects.
 *
 * `limit` is the configured gc.auto threshold. Returns 1 when the
 * approximate number of loose objects exceeds the (rounded-up)
 * threshold, 0 otherwise or when the count cannot be obtained.
 */
static int too_many_loose_objects(int limit)
{
	/*
	 * This is weird, but stems from legacy behaviour: the GC auto
	 * threshold was always essentially interpreted as if it was rounded up
	 * to the next multiple of 256, so we retain this behaviour for now.
	 */
	int auto_threshold = DIV_ROUND_UP(limit, 256) * 256;
	unsigned long loose_count;

	/*
	 * A failure to approximate the count is treated as "not too many":
	 * we'd rather skip an automatic gc than fail it outright.
	 */
	if (odb_source_loose_approximate_object_count(the_repository->objects->sources,
						      &loose_count) < 0)
		return 0;

	return loose_count > auto_threshold;
}
static struct packed_git *find_base_packs(struct string_list *packs,

View File

@@ -1868,6 +1868,47 @@ int odb_source_loose_for_each_object(struct odb_source *source,
NULL, NULL, &data);
}
int odb_source_loose_approximate_object_count(struct odb_source *source,
					      unsigned long *out)
{
	/* Filename length of a loose object: hash minus the 2-char shard. */
	const unsigned want_len = source->odb->repo->hash_algo->hexsz - 2;
	char *shard_path = xstrfmt("%s/17", source->path);
	DIR *shard_dir = opendir(shard_path);
	unsigned long nr_loose = 0;
	int ret = 0;

	if (!shard_dir) {
		/*
		 * A missing shard directory simply means there are no
		 * objects in it; any other error is reported to the caller.
		 */
		if (errno == ENOENT)
			*out = 0;
		else
			ret = error_errno("cannot open object shard '%s'", shard_path);
		free(shard_path);
		return ret;
	}

	for (;;) {
		struct dirent *de = readdir(shard_dir);
		if (!de)
			break;
		/* Only count entries that look like loose object files. */
		if (strspn(de->d_name, "0123456789abcdef") == want_len &&
		    de->d_name[want_len] == '\0')
			nr_loose++;
	}

	/*
	 * The hash is effectively uniformly distributed across the 256
	 * shards, so one shard's population extrapolates to the total.
	 */
	*out = nr_loose * 256;

	closedir(shard_dir);
	free(shard_path);
	return ret;
}
static int append_loose_object(const struct object_id *oid,
const char *path UNUSED,
void *data)

View File

@@ -139,6 +139,19 @@ int odb_source_loose_for_each_object(struct odb_source *source,
void *cb_data,
unsigned flags);
/*
* Count the number of loose objects in this source.
*
* The object count is approximated by opening a single sharding directory for
* loose objects and scanning its contents. The result is then extrapolated by
* 256. This should generally work as a reasonable estimate given that the
* object hash is supposed to be indistinguishable from random.
*
* Returns 0 on success, a negative error code otherwise.
*/
int odb_source_loose_approximate_object_count(struct odb_source *source,
unsigned long *out);
/**
* format_object_header() is a thin wrapper around xsnprintf() that
* writes the initial "<type> <obj-len>" part of the loose object