cat-file: use delta_base_cache entries directly

For objects already in the delta_base_cache, we can safely use
one entry at-a-time directly to avoid the malloc+memcpy+free
overhead.  For a 1MB delta base object, this eliminates the
speed penalty of duplicating large objects into memory and
speeds up those 1MB delta base cached content retrievals by
roughly 30%.

While only 2-7% of objects are delta bases in repos I've looked
at, this avoids up to 96MB of duplicated memory in the worst
case with the default git config.

The new delta_base_cache_lock is a simple single-threaded
assertion to ensure cat-file (and similar) is the exclusive user
of the delta_base_cache.  In other words, we cannot have diff
or similar commands using two or more entries directly from the
delta base cache.  The new lock has nothing to do with parallel
access via multiple threads at the moment.

Signed-off-by: Eric Wong <e@80x24.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
This commit is contained in:
Eric Wong
2024-08-23 22:46:25 +00:00
committed by Junio C Hamano
parent a5f683f93d
commit 98521d6f04
5 changed files with 62 additions and 4 deletions

View File

@@ -386,7 +386,20 @@ static void print_object_or_die(struct batch_options *opt, struct expand_data *d
if (data->content) {
batch_write(opt, data->content, data->size);
FREE_AND_NULL(data->content);
switch (data->info.whence) {
case OI_CACHED:
/*
* only blame uses OI_CACHED atm, so it's unlikely
* we'll ever hit this path
*/
BUG("TODO OI_CACHED support not done");
case OI_LOOSE:
case OI_PACKED:
FREE_AND_NULL(data->content);
break;
case OI_DBCACHED:
unlock_delta_base_cache();
}
} else if (data->type == OBJ_BLOB) {
if (opt->buffer_output)
fflush(stdout);
@@ -815,6 +828,7 @@ static int batch_objects(struct batch_options *opt)
data.info.sizep = &data.size;
data.info.contentp = &data.content;
data.info.content_limit = big_file_threshold;
data.info.direct_cache = 1;
}
}

View File

@@ -1586,6 +1586,11 @@ static int do_oid_object_info_extended(struct repository *r,
oidclr(oi->delta_base_oid, the_repository->hash_algo);
if (oi->type_name)
strbuf_addstr(oi->type_name, type_name(co->type));
/*
* Currently `blame' is the only command which creates
* OI_CACHED, and direct_cache is only used by `cat-file'.
*/
assert(!oi->direct_cache);
if (oi->contentp)
*oi->contentp = xmemdupz(co->buf, co->size);
oi->whence = OI_CACHED;

View File

@@ -298,6 +298,14 @@ struct object_info {
OI_PACKED,
OI_DBCACHED
} whence;
/*
* Set if caller is able to use OI_DBCACHED entries without copying.
* This only applies to OI_DBCACHED entries at the moment,
* not OI_CACHED or any other type of entry.
*/
unsigned direct_cache:1;
union {
/*
* struct {

View File

@@ -1362,6 +1362,14 @@ unwind:
static struct hashmap delta_base_cache;
static size_t delta_base_cached;
/*
* Ensures only a single object is used at-a-time via oi->direct_cache.
* Using two objects directly at once (e.g. diff) would cause corruption
* since populating the cache may invalidate existing entries.
* This lock has nothing to do with parallelism at the moment.
*/
static int delta_base_cache_lock;
static LIST_HEAD(delta_base_cache_lru);
struct delta_base_cache_key {
@@ -1444,6 +1452,18 @@ static void detach_delta_base_cache_entry(struct delta_base_cache_entry *ent)
free(ent);
}
static void lock_delta_base_cache(void)
{
delta_base_cache_lock++;
assert(delta_base_cache_lock == 1);
}
void unlock_delta_base_cache(void)
{
delta_base_cache_lock--;
assert(delta_base_cache_lock == 0);
}
static inline void release_delta_base_cache(struct delta_base_cache_entry *ent)
{
free(ent->data);
@@ -1453,6 +1473,7 @@ static inline void release_delta_base_cache(struct delta_base_cache_entry *ent)
void clear_delta_base_cache(void)
{
struct list_head *lru, *tmp;
assert(!delta_base_cache_lock);
list_for_each_safe(lru, tmp, &delta_base_cache_lru) {
struct delta_base_cache_entry *entry =
list_entry(lru, struct delta_base_cache_entry, lru);
@@ -1466,6 +1487,7 @@ static void add_delta_base_cache(struct packed_git *p, off_t base_offset,
struct delta_base_cache_entry *ent;
struct list_head *lru, *tmp;
assert(!delta_base_cache_lock);
/*
* Check required to avoid redundant entries when more than one thread
* is unpacking the same object, in unpack_entry() (since its phases I
@@ -1520,11 +1542,16 @@ int packed_object_info(struct repository *r, struct packed_git *p,
if (oi->sizep)
*oi->sizep = ent->size;
if (oi->contentp) {
if (!oi->content_limit ||
ent->size <= oi->content_limit)
/* ignore content_limit if avoiding copy from cache */
if (oi->direct_cache) {
lock_delta_base_cache();
*oi->contentp = ent->data;
} else if (!oi->content_limit ||
ent->size <= oi->content_limit) {
*oi->contentp = xmemdupz(ent->data, ent->size);
else
} else {
*oi->contentp = NULL; /* caller must stream */
}
}
} else if (oi->contentp && !oi->content_limit) {
*oi->contentp = unpack_entry(r, p, obj_offset, &type,

View File

@@ -210,4 +210,8 @@ int is_promisor_object(const struct object_id *oid);
int load_idx(const char *path, const unsigned int hashsz, void *idx_map,
size_t idx_size, struct packed_git *p);
/*
* release lock acquired via oi->direct_cache
*/
void unlock_delta_base_cache(void);
#endif