Merge branch 'ps/object-counting' into jch

* ps/object-counting:
  odb: introduce generic object counting
  odb/source: introduce generic object counting
  object-file: generalize counting objects
  object-file: extract logic to approximate object count
  packfile: extract logic to count number of objects
  odb: stop including "odb/source.h"
This commit is contained in:
Junio C Hamano
2026-03-10 14:24:04 -07:00
17 changed files with 298 additions and 116 deletions

View File

@@ -467,37 +467,19 @@ out:
static int too_many_loose_objects(int limit)
{
/*
* Quickly check if a "gc" is needed, by estimating how
* many loose objects there are. Because SHA-1 is evenly
* distributed, we can check only one and get a reasonable
* estimate.
* This is weird, but stems from legacy behaviour: the GC auto
* threshold was always essentially interpreted as if it was rounded up
* to the next multiple of 256, so we retain this behaviour for now.
*/
DIR *dir;
struct dirent *ent;
int auto_threshold;
int num_loose = 0;
int needed = 0;
const unsigned hexsz_loose = the_hash_algo->hexsz - 2;
char *path;
int auto_threshold = DIV_ROUND_UP(limit, 256) * 256;
unsigned long loose_count;
path = repo_git_path(the_repository, "objects/17");
dir = opendir(path);
free(path);
if (!dir)
if (odb_source_loose_count_objects(the_repository->objects->sources,
ODB_COUNT_OBJECTS_APPROXIMATE,
&loose_count) < 0)
return 0;
auto_threshold = DIV_ROUND_UP(limit, 256);
while ((ent = readdir(dir)) != NULL) {
if (strspn(ent->d_name, "0123456789abcdef") != hexsz_loose ||
ent->d_name[hexsz_loose] != '\0')
continue;
if (++num_loose > auto_threshold) {
needed = 1;
break;
}
}
closedir(dir);
return needed;
return loose_count > auto_threshold;
}
static struct packed_git *find_base_packs(struct string_list *packs,
@@ -592,9 +574,13 @@ static uint64_t total_ram(void)
static uint64_t estimate_repack_memory(struct gc_config *cfg,
struct packed_git *pack)
{
unsigned long nr_objects = repo_approximate_object_count(the_repository);
unsigned long nr_objects;
size_t os_cache, heap;
if (odb_count_objects(the_repository->objects,
ODB_COUNT_OBJECTS_APPROXIMATE, &nr_objects) < 0)
return 0;
if (!pack || !nr_objects)
return 0;

View File

@@ -9,6 +9,7 @@
#include "strbuf.h"
#include "trace2.h"
#include "odb.h"
#include "odb/source.h"
#include "replace-object.h"
#include "repository.h"

View File

@@ -29,6 +29,7 @@
#include "object-file.h"
#include "object-name.h"
#include "odb.h"
#include "odb/source.h"
#include "advice.h"
#include "branch.h"
#include "list-objects-filter-options.h"

View File

@@ -2607,7 +2607,8 @@ int write_commit_graph(struct odb_source *source,
replace = ctx.opts->split_flags & COMMIT_GRAPH_SPLIT_REPLACE;
}
ctx.approx_nr_objects = repo_approximate_object_count(r);
if (odb_count_objects(r->objects, ODB_COUNT_OBJECTS_APPROXIMATE, &ctx.approx_nr_objects) < 0)
ctx.approx_nr_objects = 0;
if (ctx.append && g) {
for (i = 0; i < g->num_commits; i++) {

View File

@@ -1868,6 +1868,63 @@ int odb_source_loose_for_each_object(struct odb_source *source,
NULL, NULL, &data);
}
/*
 * Per-object callback used when counting loose objects: `payload` points to
 * an `unsigned long` counter that is bumped by one for every object visited.
 * Always returns 0 so that the iteration is never aborted.
 */
static int count_loose_object(const struct object_id *oid UNUSED,
			      struct object_info *oi UNUSED,
			      void *payload)
{
	unsigned long *nr_seen = payload;

	*nr_seen += 1;
	return 0;
}
/*
 * Count the loose objects stored in `source` and store the result in `out`.
 *
 * With ODB_COUNT_OBJECTS_APPROXIMATE only the "17" shard directory below the
 * source's path is scanned; the number of entries whose name consists of
 * exactly `hexsz - 2` lowercase hex characters is multiplied by 256. A
 * missing shard directory is not an error and yields a count of 0.
 *
 * Without that flag every loose object is visited via
 * odb_source_loose_for_each_object() and counted individually.
 *
 * Returns 0 on success, a negative error code otherwise. `out` is only
 * written on success.
 */
int odb_source_loose_count_objects(struct odb_source *source,
				   enum odb_count_objects_flags flags,
				   unsigned long *out)
{
	const unsigned hexsz = source->odb->repo->hash_algo->hexsz - 2;
	unsigned long count = 0;
	char *path = NULL;
	DIR *dir = NULL;
	int ret;

	if (flags & ODB_COUNT_OBJECTS_APPROXIMATE) {
		struct dirent *ent;

		path = xstrfmt("%s/17", source->path);

		dir = opendir(path);
		if (!dir) {
			/* No shard directory simply means no objects in it. */
			if (errno == ENOENT) {
				*out = 0;
				ret = 0;
				goto out;
			}

			ret = error_errno("cannot open object shard '%s'", path);
			goto out;
		}

		while ((ent = readdir(dir)) != NULL) {
			/* Skip anything that does not look like an object file. */
			if (strspn(ent->d_name, "0123456789abcdef") != hexsz ||
			    ent->d_name[hexsz] != '\0')
				continue;
			count++;
		}

		/* Extrapolate from the one shard to all 256 of them. */
		count *= 256;
	} else {
		/*
		 * The callback increments its payload in place, so it must
		 * start from a known zero rather than from whatever value the
		 * caller's `out` happens to hold.
		 */
		ret = odb_source_loose_for_each_object(source, NULL, count_loose_object,
						       &count, 0);
		if (ret)
			goto out;
	}

	*out = count;
	ret = 0;

out:
	if (dir)
		closedir(dir);
	free(path);
	return ret;
}
static int append_loose_object(const struct object_id *oid,
const char *path UNUSED,
void *data)

View File

@@ -139,6 +139,20 @@ int odb_source_loose_for_each_object(struct odb_source *source,
void *cb_data,
unsigned flags);
/*
* Count the number of loose objects in this source.
*
* If `ODB_COUNT_OBJECTS_APPROXIMATE` is passed, the object count is
* approximated by opening a single sharding directory for loose objects and
* scanning its contents. The result is then extrapolated by 256. This should
* generally work as a reasonable estimate given that the object hash is
* supposed to be indistinguishable from random. Otherwise, all loose objects
* are iterated and counted one by one.
*
* Returns 0 on success, a negative error code otherwise.
*/
int odb_source_loose_count_objects(struct odb_source *source,
enum odb_count_objects_flags flags,
unsigned long *out);
/**
* format_object_header() is a thin wrapper around xsnprintf() that
* writes the initial "<type> <obj-len>" part of the loose object

View File

@@ -837,7 +837,11 @@ int repo_find_unique_abbrev_r(struct repository *r, char *hex,
const unsigned hexsz = algo->hexsz;
if (len < 0) {
unsigned long count = repo_approximate_object_count(r);
unsigned long count;
if (odb_count_objects(r->objects, ODB_COUNT_OBJECTS_APPROXIMATE, &count) < 0)
count = 0;
/*
* Add one because the MSB only tells us the highest bit set,
* not including the value of all the _other_ bits (so "15"

37
odb.c
View File

@@ -917,6 +917,41 @@ int odb_for_each_object(struct object_database *odb,
return 0;
}
/*
 * Sum up the object counts of all sources attached to `odb` and store the
 * total in `out`.
 *
 * The most recent result is cached on the database together with the flags it
 * was computed with; a later call with identical flags is served from that
 * cache without consulting the sources again.
 *
 * Returns 0 on success. If any source fails, its negative error code is
 * returned and neither `out` nor the cache is touched.
 */
int odb_count_objects(struct object_database *odb,
		      enum odb_count_objects_flags flags,
		      unsigned long *out)
{
	if (!odb->object_count_valid || odb->object_count_flags != flags) {
		unsigned long total = 0;
		struct odb_source *cur;

		odb_prepare_alternates(odb);

		for (cur = odb->sources; cur; cur = cur->next) {
			unsigned long nr;
			int err = odb_source_count_objects(cur, flags, &nr);

			if (err < 0)
				return err;
			total += nr;
		}

		/* Remember the result along with the flags it is valid for. */
		odb->object_count = total;
		odb->object_count_flags = flags;
		odb->object_count_valid = 1;
	}

	*out = odb->object_count;
	return 0;
}
void odb_assert_oid_type(struct object_database *odb,
const struct object_id *oid, enum object_type expect)
{
@@ -1030,7 +1065,7 @@ void odb_reprepare(struct object_database *o)
for (source = o->sources; source; source = source->next)
odb_source_reprepare(source);
o->approximate_object_count_valid = 0;
o->object_count_valid = 0;
obj_read_unlock();
}

76
odb.h
View File

@@ -3,7 +3,6 @@
#include "hashmap.h"
#include "object.h"
#include "odb/source.h"
#include "oidset.h"
#include "oidmap.h"
#include "string-list.h"
@@ -12,6 +11,7 @@
struct oidmap;
struct oidtree;
struct strbuf;
struct strvec;
struct repository;
struct multi_pack_index;
@@ -112,8 +112,9 @@ struct object_database {
* These fields are not meant for direct access. Use
* odb_count_objects() instead.
*/
unsigned long approximate_object_count;
unsigned approximate_object_count_valid : 1;
unsigned long object_count;
unsigned object_count_flags;
unsigned object_count_valid : 1;
/*
* Submodule source paths that will be added as additional sources to
@@ -339,6 +340,42 @@ struct object_info {
*/
#define OBJECT_INFO_INIT { 0 }
/* Flags that can be passed to `odb_read_object_info_extended()`. */
enum object_info_flags {
/* Invoke lookup_replace_object() on the given hash. */
OBJECT_INFO_LOOKUP_REPLACE = (1 << 0),
/* Do not reprepare object sources when the first lookup has failed. */
OBJECT_INFO_QUICK = (1 << 1),
/*
* Do not attempt to fetch the object if missing (even if fetch_is_missing is
* nonzero).
*/
OBJECT_INFO_SKIP_FETCH_OBJECT = (1 << 2),
/* Die if object corruption (not just an object being missing) was detected. */
OBJECT_INFO_DIE_IF_CORRUPT = (1 << 3),
/*
* We have already tried reading the object, but it couldn't be found
* via any of the attached sources, and are now doing a second read.
* This second read asks the individual sources to also evaluate
* whether any on-disk state may have changed that may have caused the
* object to appear.
*
* This flag is for internal use, only. The second read only occurs
* when `OBJECT_INFO_QUICK` was not passed.
*/
OBJECT_INFO_SECOND_READ = (1 << 4),
/*
* This is meant for bulk prefetching of missing blobs in a partial
* clone. Implies OBJECT_INFO_SKIP_FETCH_OBJECT and OBJECT_INFO_QUICK.
*/
OBJECT_INFO_FOR_PREFETCH = (OBJECT_INFO_SKIP_FETCH_OBJECT | OBJECT_INFO_QUICK),
};
/*
* Read object info from the object database and populate the `object_info`
* structure. Returns 0 on success, a negative error code otherwise.
@@ -432,6 +469,18 @@ enum odb_for_each_object_flags {
ODB_FOR_EACH_OBJECT_SKIP_ON_DISK_KEPT_PACKS = (1<<4),
};
/*
* A callback function that can be used to iterate through objects. If given,
* the optional `oi` parameter will be populated the same as if you would call
* `odb_read_object_info()`.
*
* Returning a non-zero error code will cause iteration to abort. The error
* code will be propagated.
*/
typedef int (*odb_for_each_object_cb)(const struct object_id *oid,
struct object_info *oi,
void *cb_data);
/*
* Iterate through all objects contained in the object database. Note that
* objects may be iterated over multiple times in case they are either stored
@@ -452,6 +501,27 @@ int odb_for_each_object(struct object_database *odb,
void *cb_data,
unsigned flags);
/* Flags that can be passed to `odb_count_objects()`. */
enum odb_count_objects_flags {
/*
* Instead of providing an accurate count, allow the number of objects
* to be approximated. Details of how this approximation works are
* subject to the specific source's implementation.
*/
ODB_COUNT_OBJECTS_APPROXIMATE = (1 << 0),
};
/*
* Count the number of objects in the given object database. This object count
* may double-count objects that are stored in multiple backends, or which are
* stored multiple times in a single backend.
*
* Returns 0 on success, a negative error code otherwise. The number of objects
* will be assigned to the `out` pointer on success.
*/
int odb_count_objects(struct object_database *odb,
enum odb_count_objects_flags flags,
unsigned long *out);
enum {
/*
* By default, `odb_write_object()` does not actually write anything

View File

@@ -93,6 +93,35 @@ static int odb_source_files_for_each_object(struct odb_source *source,
return 0;
}
/*
 * `count_objects` callback of the "files" source: reports the number of
 * objects in the source's packfile store and, unless
 * ODB_COUNT_OBJECTS_APPROXIMATE was requested, adds the number of loose
 * objects on top.
 *
 * Returns 0 on success and stores the total in `out`; on failure the negative
 * error code of the failing backend is returned and `out` is left untouched.
 */
static int odb_source_files_count_objects(struct odb_source *source,
					  enum odb_count_objects_flags flags,
					  unsigned long *out)
{
	struct odb_source_files *files = odb_source_files_downcast(source);
	unsigned long count;
	int ret;

	ret = packfile_store_count_objects(files->packed, flags, &count);
	if (ret < 0)
		return ret;

	/* Approximate counts only consider packed objects. */
	if (!(flags & ODB_COUNT_OBJECTS_APPROXIMATE)) {
		/*
		 * Start from zero: the exact loose-object count is produced by
		 * a callback that increments this variable in place.
		 */
		unsigned long loose_count = 0;

		ret = odb_source_loose_count_objects(source, flags, &loose_count);
		if (ret < 0)
			return ret;

		count += loose_count;
	}

	*out = count;
	return 0;
}
static int odb_source_files_freshen_object(struct odb_source *source,
const struct object_id *oid)
{
@@ -220,6 +249,7 @@ struct odb_source_files *odb_source_files_new(struct object_database *odb,
files->base.read_object_info = odb_source_files_read_object_info;
files->base.read_object_stream = odb_source_files_read_object_stream;
files->base.for_each_object = odb_source_files_for_each_object;
files->base.count_objects = odb_source_files_count_objects;
files->base.freshen_object = odb_source_files_freshen_object;
files->base.write_object = odb_source_files_write_object;
files->base.write_object_stream = odb_source_files_write_object_stream;

View File

@@ -2,6 +2,7 @@
#define ODB_SOURCE_H
#include "object.h"
#include "odb.h"
enum odb_source_type {
/*
@@ -14,61 +15,10 @@ enum odb_source_type {
ODB_SOURCE_FILES,
};
/* Flags that can be passed to `odb_read_object_info_extended()`. */
enum object_info_flags {
/* Invoke lookup_replace_object() on the given hash. */
OBJECT_INFO_LOOKUP_REPLACE = (1 << 0),
/* Do not reprepare object sources when the first lookup has failed. */
OBJECT_INFO_QUICK = (1 << 1),
/*
* Do not attempt to fetch the object if missing (even if fetch_is_missing is
* nonzero).
*/
OBJECT_INFO_SKIP_FETCH_OBJECT = (1 << 2),
/* Die if object corruption (not just an object being missing) was detected. */
OBJECT_INFO_DIE_IF_CORRUPT = (1 << 3),
/*
* We have already tried reading the object, but it couldn't be found
* via any of the attached sources, and are now doing a second read.
* This second read asks the individual sources to also evaluate
* whether any on-disk state may have changed that may have caused the
* object to appear.
*
* This flag is for internal use, only. The second read only occurs
* when `OBJECT_INFO_QUICK` was not passed.
*/
OBJECT_INFO_SECOND_READ = (1 << 4),
/*
* This is meant for bulk prefetching of missing blobs in a partial
* clone. Implies OBJECT_INFO_SKIP_FETCH_OBJECT and OBJECT_INFO_QUICK.
*/
OBJECT_INFO_FOR_PREFETCH = (OBJECT_INFO_SKIP_FETCH_OBJECT | OBJECT_INFO_QUICK),
};
struct object_id;
struct object_info;
struct odb_read_stream;
struct odb_transaction;
struct odb_write_stream;
struct strvec;
/*
* A callback function that can be used to iterate through objects. If given,
* the optional `oi` parameter will be populated the same as if you would call
* `odb_read_object_info()`.
*
* Returning a non-zero error code will cause iteration to abort. The error
* code will be propagated.
*/
typedef int (*odb_for_each_object_cb)(const struct object_id *oid,
struct object_info *oi,
void *cb_data);
/*
* The source is the part of the object database that stores the actual
* objects. It thus encapsulates the logic to read and write the specific
@@ -192,6 +142,21 @@ struct odb_source {
void *cb_data,
unsigned flags);
/*
* This callback is expected to count objects in the given object
* database source. The callback function does not have to guarantee
* that only unique objects are counted. The result shall be assigned
* to the `out` pointer.
*
* Accepts `enum odb_count_objects_flags` flags to alter the behaviour.
*
* The callback is expected to return 0 on success, or a negative error
* code otherwise.
*/
int (*count_objects)(struct odb_source *source,
enum odb_count_objects_flags flags,
unsigned long *out);
/*
* This callback is expected to freshen the given object so that its
* last access time is set to the current time. This is used to ensure
@@ -383,6 +348,18 @@ static inline int odb_source_for_each_object(struct odb_source *source,
return source->for_each_object(source, request, cb, cb_data, flags);
}
/*
* Count the number of objects in the given object database source by
* dispatching to the source's `count_objects` callback. `flags` and `out`
* are handed to the callback unchanged; the callback is expected to store
* the resulting count in `out`.
*
* Returns 0 on success, a negative error code otherwise.
*/
static inline int odb_source_count_objects(struct odb_source *source,
enum odb_count_objects_flags flags,
unsigned long *out)
{
return source->count_objects(source, flags, out);
}
/*
* Freshen an object in the object database by updating its timestamp.
* Returns 1 in case the object has been freshened, 0 in case the object does

View File

@@ -7,6 +7,7 @@
#include "environment.h"
#include "repository.h"
#include "odb.h"
#include "odb/source.h"
#include "odb/streaming.h"
#include "replace-object.h"

View File

@@ -1101,37 +1101,35 @@ struct packfile_list_entry *packfile_store_get_packs(struct packfile_store *stor
return store->packs.head;
}
/*
* Give a fast, rough count of the number of objects in the repository. This
* ignores loose objects completely. If you have a lot of them, then either
* you should repack because your performance will be awful, or they are
* all unreachable objects about to be pruned, in which case they're not really
* interesting as a measure of repo size in the first place.
*/
unsigned long repo_approximate_object_count(struct repository *r)
int packfile_store_count_objects(struct packfile_store *store,
enum odb_count_objects_flags flags UNUSED,
unsigned long *out)
{
if (!r->objects->approximate_object_count_valid) {
struct odb_source *source;
unsigned long count = 0;
struct packed_git *p;
struct packfile_list_entry *e;
struct multi_pack_index *m;
unsigned long count = 0;
int ret;
odb_prepare_alternates(r->objects);
m = get_multi_pack_index(store->source);
if (m)
count += m->num_objects + m->num_objects_in_base;
for (source = r->objects->sources; source; source = source->next) {
struct multi_pack_index *m = get_multi_pack_index(source);
if (m)
count += m->num_objects + m->num_objects_in_base;
for (e = packfile_store_get_packs(store); e; e = e->next) {
if (e->pack->multi_pack_index)
continue;
if (open_pack_index(e->pack)) {
ret = -1;
goto out;
}
repo_for_each_pack(r, p) {
if (p->multi_pack_index || open_pack_index(p))
continue;
count += p->num_objects;
}
r->objects->approximate_object_count = count;
r->objects->approximate_object_count_valid = 1;
count += e->pack->num_objects;
}
return r->objects->approximate_object_count;
*out = count;
ret = 0;
out:
return ret;
}
unsigned long unpack_object_header_buffer(const unsigned char *buf,

View File

@@ -268,6 +268,16 @@ enum kept_pack_type {
KEPT_PACK_IN_CORE = (1 << 1),
};
/*
* Count the number of objects contained in the given packfile store. If
* successful, the number of objects will be written to the `out` pointer.
*
* Return 0 on success, a negative error code otherwise.
*/
int packfile_store_count_objects(struct packfile_store *store,
enum odb_count_objects_flags flags,
unsigned long *out);
/*
* Retrieve the cache of kept packs from the given packfile store. Accepts a
* combination of `kept_pack_type` flags. The cache is computed on demand and
@@ -365,12 +375,6 @@ int packfile_store_for_each_object(struct packfile_store *store,
#define PACKDIR_FILE_GARBAGE 4
extern void (*report_garbage)(unsigned seen_bits, const char *path);
/*
* Give a rough count of objects in the repository. This sacrifices accuracy
* for speed.
*/
unsigned long repo_approximate_object_count(struct repository *r);
void pack_report(struct repository *repo);
/*

View File

@@ -3,6 +3,7 @@
#include "repository.h"
#include "hook.h"
#include "odb.h"
#include "odb/source.h"
#include "config.h"
#include "gettext.h"
#include "object.h"

View File

@@ -14,6 +14,7 @@
#include "strbuf.h"
#include "object-name.h"
#include "odb.h"
#include "odb/source.h"
#include "parse-options.h"
#include "thread-utils.h"
#include "tree-walk.h"

View File

@@ -11,6 +11,7 @@
#include "strvec.h"
#include "quote.h"
#include "odb.h"
#include "odb/source.h"
#include "repository.h"
struct tmp_objdir {