From efaf5b6916280cfc08ae027f6511010bfcf00da9 Mon Sep 17 00:00:00 2001 From: Jeff Hostetler Date: Thu, 17 Nov 2016 09:14:40 -0500 Subject: [PATCH 1/5] name-hash: eliminate duplicate memihash call Remove duplicate memihash() call in hash_dir_entry(). The existing code called memihash() to do the find_dir_entry() and it not found, called memihash() again to do the hashmap_add(). Signed-off-by: Jeff Hostetler --- name-hash.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/name-hash.c b/name-hash.c index 6d9f23e932..860b8dd0c7 100644 --- a/name-hash.c +++ b/name-hash.c @@ -23,13 +23,19 @@ static int dir_entry_cmp(const struct dir_entry *e1, name ? name : e2->name, e1->namelen); } +static struct dir_entry *find_dir_entry__hash(struct index_state *istate, + const char *name, unsigned int namelen, unsigned int hash) +{ + struct dir_entry key; + hashmap_entry_init(&key, hash); + key.namelen = namelen; + return hashmap_get(&istate->dir_hash, &key, name); +} + static struct dir_entry *find_dir_entry(struct index_state *istate, const char *name, unsigned int namelen) { - struct dir_entry key; - hashmap_entry_init(&key, memihash(name, namelen)); - key.namelen = namelen; - return hashmap_get(&istate->dir_hash, &key, name); + return find_dir_entry__hash(istate, name, namelen, memihash(name,namelen)); } static struct dir_entry *hash_dir_entry(struct index_state *istate, @@ -43,6 +49,7 @@ static struct dir_entry *hash_dir_entry(struct index_state *istate, * in index_state.name_hash (as ordinary cache_entries). */ struct dir_entry *dir; + unsigned int hash; /* get length of parent directory */ while (namelen > 0 && !is_dir_sep(ce->name[namelen - 1])) @@ -52,11 +59,12 @@ static struct dir_entry *hash_dir_entry(struct index_state *istate, namelen--; /* lookup existing entry for that directory */ - dir = find_dir_entry(istate, ce->name, namelen); + hash = memihash(ce->name, namelen); + dir = find_dir_entry__hash(istate, ce->name, namelen, hash); if (!dir) { /* not found, create it and add to hash table */ FLEX_ALLOC_MEM(dir, name, ce->name, namelen); - hashmap_entry_init(dir, memihash(ce->name, namelen)); + hashmap_entry_init(dir, hash); dir->namelen = namelen; hashmap_add(&istate->dir_hash, dir); From a2768acb1e1eaf5d35a6414e1c1f752944b14bdf Mon Sep 17 00:00:00 2001 From: Jeff Hostetler Date: Thu, 17 Nov 2016 09:23:55 -0500 Subject: [PATCH 2/5] hashmap: allow memihash computation to be continued Add variant of memihash() to allow the hash computation to be continued. There are times when we compute the hash on a full path and then the hash on just the path to the parent directory. This can be expensive on large repositories. With this, we can hash the parent directory first. And then continue the computation to include the "/filename". Signed-off-by: Jeff Hostetler --- hashmap.c | 17 +++++++++++++++++ hashmap.h | 1 + 2 files changed, 18 insertions(+) diff --git a/hashmap.c b/hashmap.c index b10b642229..505e63fd3c 100644 --- a/hashmap.c +++ b/hashmap.c @@ -50,6 +50,23 @@ unsigned int memihash(const void *buf, size_t len) return hash; } +/* + * Incoporate another chunk of data into a memihash + * computation. + */ +unsigned int memihash_cont(unsigned int hash_seed, const void *buf, size_t len) +{ + unsigned int hash = hash_seed; + unsigned char *ucbuf = (unsigned char *) buf; + while (len--) { + unsigned int c = *ucbuf++; + if (c >= 'a' && c <= 'z') + c -= 'a' - 'A'; + hash = (hash * FNV32_PRIME) ^ c; + } + return hash; +} + #define HASHMAP_INITIAL_SIZE 64 /* grow / shrink by 2^2 */ #define HASHMAP_RESIZE_BITS 2 diff --git a/hashmap.h b/hashmap.h index ab7958ae33..45eda693bd 100644 --- a/hashmap.h +++ b/hashmap.h @@ -12,6 +12,7 @@ extern unsigned int strhash(const char *buf); extern unsigned int strihash(const char *buf); extern unsigned int memhash(const void *buf, size_t len); extern unsigned int memihash(const void *buf, size_t len); +extern unsigned int memihash_cont(unsigned int hash_seed, const void *buf, size_t len); static inline unsigned int sha1hash(const unsigned char *sha1) { From c47c3dfdd370b55431bbbc2ce75cc51bf17b87a1 Mon Sep 17 00:00:00 2001 From: Jeff Hostetler Date: Thu, 17 Nov 2016 09:59:59 -0500 Subject: [PATCH 3/5] name-hash: precompute hash values during preload-index Precompute the istate.name_hash and istate.dir_hash values for each cache-entry during the preload-index phase. Move the expensive memihash() calculations from lazy_init_name_hash() to the multi-threaded preload-index phase. Signed-off-by: Jeff Hostetler --- cache.h | 16 ++++++++++++ name-hash.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++-- preload-index.c | 2 ++ read-cache.c | 3 +++ 4 files changed, 85 insertions(+), 2 deletions(-) diff --git a/cache.h b/cache.h index 5c035dae57..2a3ee42f7f 100644 --- a/cache.h +++ b/cache.h @@ -173,6 +173,9 @@ struct cache_entry { unsigned int ce_flags; unsigned int ce_namelen; unsigned int index; /* for link extension */ + unsigned int precompute_hash_state; + unsigned int precompute_hash_name; + unsigned int precompute_hash_dir; struct object_id oid; char name[FLEX_ARRAY]; /* more */ }; @@ -229,6 +232,19 @@ struct cache_entry { #error "CE_EXTENDED_FLAGS out of range" #endif +/* + * Bit set if preload-index precomputed the hash value(s) + * for this cache-entry. + */ +#define CE_PRECOMPUTE_HASH_STATE__SET (1 << 0) +/* + * Bit set if precompute-index also precomputed the hash value + * for the parent directory. + */ +#define CE_PRECOMPUTE_HASH_STATE__DIR (1 << 1) + +void precompute_istate_hashes(struct cache_entry *ce); + /* Forward structure decls */ struct pathspec; struct child_process; diff --git a/name-hash.c b/name-hash.c index 860b8dd0c7..7a220187ad 100644 --- a/name-hash.c +++ b/name-hash.c @@ -50,6 +50,17 @@ static struct dir_entry *hash_dir_entry(struct index_state *istate, */ struct dir_entry *dir; unsigned int hash; + int use_precomputed_dir_hash = 0; + + if (ce->precompute_hash_state & CE_PRECOMPUTE_HASH_STATE__SET) { + if (!(ce->precompute_hash_state & CE_PRECOMPUTE_HASH_STATE__DIR)) + return NULL; /* item does not have a parent directory */ + if (namelen == ce_namelen(ce)) { + /* dir hash only valid for outer-most call (not recursive ones) */ + use_precomputed_dir_hash = 1; + hash = ce->precompute_hash_dir; + } + } /* get length of parent directory */ while (namelen > 0 && !is_dir_sep(ce->name[namelen - 1])) @@ -59,7 +70,8 @@ static struct dir_entry *hash_dir_entry(struct index_state *istate, namelen--; /* lookup existing entry for that directory */ - hash = memihash(ce->name, namelen); + if (!use_precomputed_dir_hash) + hash = memihash(ce->name, namelen); dir = find_dir_entry__hash(istate, ce->name, namelen, hash); if (!dir) { /* not found, create it and add to hash table */ @@ -99,10 +111,18 @@ static void remove_dir_entry(struct index_state *istate, struct cache_entry *ce) static void hash_index_entry(struct index_state *istate, struct cache_entry *ce) { + unsigned int h; + if (ce->ce_flags & CE_HASHED) return; ce->ce_flags |= CE_HASHED; - hashmap_entry_init(ce, memihash(ce->name, ce_namelen(ce))); + + if (ce->precompute_hash_state & CE_PRECOMPUTE_HASH_STATE__SET) + h = ce->precompute_hash_name; + else + h = memihash(ce->name, ce_namelen(ce)); + + hashmap_entry_init(ce, h); hashmap_add(&istate->name_hash, ce); if (ignore_case) @@ -244,3 +264,45 @@ void free_name_hash(struct index_state *istate) hashmap_free(&istate->name_hash, 0); hashmap_free(&istate->dir_hash, 1); } + +/* + * Precompute the hash values for this cache_entry + * for use in the istate.name_hash and istate.dir_hash. + * + * If the item is in the root directory, just compute the + * hash value (for istate.name_hash) on the full path. + * + * If the item is in a subdirectory, first compute the + * hash value for the immediate parent directory (for + * istate.dir_hash) and then the hash value for the full + * path by continuing the computation. + * + * Note that these hashes will be used by + * wt_status_collect_untracked() as it scans the worktree + * and maps observed paths back to the index (optionally + * ignoring case). Therefore, we probably only *NEED* to + * precompute this for non-skip-worktree items (since + * status should not observe skipped items), but because + * lazy_init_name_hash() hashes everything, we force it + * here. + */ +void precompute_istate_hashes(struct cache_entry *ce) +{ + int namelen = ce_namelen(ce); + + while (namelen > 0 && !is_dir_sep(ce->name[namelen - 1])) + namelen--; + + if (namelen <= 0) { + ce->precompute_hash_name = memihash(ce->name, ce_namelen(ce)); + ce->precompute_hash_state = CE_PRECOMPUTE_HASH_STATE__SET; + } else { + namelen--; + ce->precompute_hash_dir = memihash(ce->name, namelen); + ce->precompute_hash_name = memihash_cont( + ce->precompute_hash_dir, &ce->name[namelen], + ce_namelen(ce) - namelen); + ce->precompute_hash_state = + CE_PRECOMPUTE_HASH_STATE__SET | CE_PRECOMPUTE_HASH_STATE__DIR; + } +} diff --git a/preload-index.c b/preload-index.c index c1fe3a3ef9..602737f9d0 100644 --- a/preload-index.c +++ b/preload-index.c @@ -47,6 +47,8 @@ static void *preload_thread(void *_data) struct cache_entry *ce = *cep++; struct stat st; + precompute_istate_hashes(ce); + if (ce_stage(ce)) continue; if (S_ISGITLINK(ce->ce_mode)) diff --git a/read-cache.c b/read-cache.c index 585ff02dc3..023e5a9d34 100644 --- a/read-cache.c +++ b/read-cache.c @@ -73,6 +73,7 @@ void rename_index_entry_at(struct index_state *istate, int nr, const char *new_n copy_cache_entry(new, old); new->ce_flags &= ~CE_HASHED; new->ce_namelen = namelen; + new->precompute_hash_state = 0; new->index = 0; memcpy(new->name, new_name, namelen + 1); @@ -621,6 +622,7 @@ static struct cache_entry *create_alias_ce(struct index_state *istate, new = xcalloc(1, cache_entry_size(len)); memcpy(new->name, alias->name, len); copy_cache_entry(new, ce); + new->precompute_hash_state = 0; save_or_free_index_entry(istate, ce); return new; } @@ -1456,6 +1458,7 @@ static struct cache_entry *cache_entry_from_ondisk(struct ondisk_cache_entry *on ce->ce_stat_data.sd_size = get_be32(&ondisk->size); ce->ce_flags = flags & ~CE_NAMEMASK; ce->ce_namelen = len; + ce->precompute_hash_state = 0; ce->index = 0; hashcpy(ce->oid.hash, ondisk->sha1); memcpy(ce->name, name, len); From 4723d1cfe62c6a89179a4a941d273e8c4d51cb26 Mon Sep 17 00:00:00 2001 From: Jeff Hostetler Date: Thu, 17 Nov 2016 10:05:26 -0500 Subject: [PATCH 4/5] name-hash: specify initial size for istate.dir_hash table Specify an initial size for the istate.dir_hash HashMap matching the size of the istate.name_hash. Previously hashmap_init() was given 0, causing a 64 bucket hashmap to be created. When working with very large repositories, this would cause numerous rehash() calls to realloc and rebalance the hashmap. This is especially true when the worktree is deep, with many directories containing a few files. Signed-off-by: Jeff Hostetler --- name-hash.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/name-hash.c b/name-hash.c index 7a220187ad..b2f6d2df7d 100644 --- a/name-hash.c +++ b/name-hash.c @@ -148,7 +148,8 @@ static void lazy_init_name_hash(struct index_state *istate) return; hashmap_init(&istate->name_hash, (hashmap_cmp_fn) cache_entry_cmp, istate->cache_nr); - hashmap_init(&istate->dir_hash, (hashmap_cmp_fn) dir_entry_cmp, 0); + hashmap_init(&istate->dir_hash, (hashmap_cmp_fn) dir_entry_cmp, + istate->cache_nr); for (nr = 0; nr < istate->cache_nr; nr++) hash_index_entry(istate, istate->cache[nr]); istate->name_hash_initialized = 1; From aac8f185f3f1cc85a15bb86682eb1967ea7e4897 Mon Sep 17 00:00:00 2001 From: Jeff Hostetler Date: Thu, 17 Nov 2016 10:37:01 -0500 Subject: [PATCH 5/5] name-hash: remember previous dir_entry during lazy_init_name_hash Teach hash_dir_entry() to remember the previously found dir_entry during lazy_init_name_hash() iteration. This is a performance optimization. Since items in the index array are sorted by full pathname, adjacent items are likely to be in the same directory. This can save memihash() computations and HashMap lookups. Signed-off-by: Jeff Hostetler --- name-hash.c | 43 +++++++++++++++++++++++++++++++------------ 1 file changed, 31 insertions(+), 12 deletions(-) diff --git a/name-hash.c b/name-hash.c index b2f6d2df7d..fcd2f90cf7 100644 --- a/name-hash.c +++ b/name-hash.c @@ -39,7 +39,7 @@ static struct dir_entry *find_dir_entry(struct index_state *istate, } static struct dir_entry *hash_dir_entry(struct index_state *istate, - struct cache_entry *ce, int namelen) + struct cache_entry *ce, int namelen, struct dir_entry **p_previous_dir) { /* * Throw each directory component in the hash for quick lookup @@ -70,9 +70,21 @@ static struct dir_entry *hash_dir_entry(struct index_state *istate, namelen--; /* lookup existing entry for that directory */ - if (!use_precomputed_dir_hash) - hash = memihash(ce->name, namelen); - dir = find_dir_entry__hash(istate, ce->name, namelen, hash); + if (p_previous_dir && *p_previous_dir + && namelen == (*p_previous_dir)->namelen + && memcmp(ce->name, (*p_previous_dir)->name, namelen) == 0) { + /* + * When our caller is sequentially iterating thru the index, + * items in the same directory will be sequential, and therefore + * refer to the same dir_entry. + */ + dir = *p_previous_dir; + } else { + if (!use_precomputed_dir_hash) + hash = memihash(ce->name, namelen); + dir = find_dir_entry__hash(istate, ce->name, namelen, hash); + } + if (!dir) { /* not found, create it and add to hash table */ FLEX_ALLOC_MEM(dir, name, ce->name, namelen); @@ -81,15 +93,20 @@ static struct dir_entry *hash_dir_entry(struct index_state *istate, hashmap_add(&istate->dir_hash, dir); /* recursively add missing parent directories */ - dir->parent = hash_dir_entry(istate, ce, namelen); + dir->parent = hash_dir_entry(istate, ce, namelen, NULL); } + + if (p_previous_dir) + *p_previous_dir = dir; + return dir; } -static void add_dir_entry(struct index_state *istate, struct cache_entry *ce) +static void add_dir_entry(struct index_state *istate, struct cache_entry *ce, + struct dir_entry **p_previous_dir) { /* Add reference to the directory entry (and parents if 0). */ - struct dir_entry *dir = hash_dir_entry(istate, ce, ce_namelen(ce)); + struct dir_entry *dir = hash_dir_entry(istate, ce, ce_namelen(ce), p_previous_dir); while (dir && !(dir->nr++)) dir = dir->parent; } @@ -100,7 +117,7 @@ static void remove_dir_entry(struct index_state *istate, struct cache_entry *ce) * Release reference to the directory entry. If 0, remove and continue * with parent directory. */ - struct dir_entry *dir = hash_dir_entry(istate, ce, ce_namelen(ce)); + struct dir_entry *dir = hash_dir_entry(istate, ce, ce_namelen(ce), NULL); while (dir && !(--dir->nr)) { struct dir_entry *parent = dir->parent; hashmap_remove(&istate->dir_hash, dir, NULL); @@ -109,7 +126,8 @@ static void remove_dir_entry(struct index_state *istate, struct cache_entry *ce) } } -static void hash_index_entry(struct index_state *istate, struct cache_entry *ce) +static void hash_index_entry(struct index_state *istate, struct cache_entry *ce, + struct dir_entry **p_previous_dir) { unsigned int h; @@ -126,7 +144,7 @@ static void hash_index_entry(struct index_state *istate, struct cache_entry *ce) hashmap_add(&istate->name_hash, ce); if (ignore_case) - add_dir_entry(istate, ce); + add_dir_entry(istate, ce, p_previous_dir); } static int cache_entry_cmp(const struct cache_entry *ce1, @@ -142,6 +160,7 @@ static int cache_entry_cmp(const struct cache_entry *ce1, static void lazy_init_name_hash(struct index_state *istate) { + struct dir_entry *previous_dir = NULL; int nr; if (istate->name_hash_initialized) @@ -151,14 +170,14 @@ static void lazy_init_name_hash(struct index_state *istate) hashmap_init(&istate->dir_hash, (hashmap_cmp_fn) dir_entry_cmp, istate->cache_nr); for (nr = 0; nr < istate->cache_nr; nr++) - hash_index_entry(istate, istate->cache[nr]); + hash_index_entry(istate, istate->cache[nr], &previous_dir); istate->name_hash_initialized = 1; } void add_name_hash(struct index_state *istate, struct cache_entry *ce) { if (istate->name_hash_initialized) - hash_index_entry(istate, ce); + hash_index_entry(istate, ce, NULL); } void remove_name_hash(struct index_state *istate, struct cache_entry *ce)