From eb591e42fd48dda59c309beba6991e95d552034f Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Fri, 1 May 2020 15:30:18 +0000 Subject: [PATCH 1/7] bloom: fix whitespace around tab length Fix alignment issues that were likely introduced due to an editor using tab lengths of 4 instead of 8. Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- bloom.c | 16 ++++++++-------- bloom.h | 4 ++-- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/bloom.c b/bloom.c index dd9bab9bbd..2e3e0f5037 100644 --- a/bloom.c +++ b/bloom.c @@ -29,8 +29,8 @@ static inline unsigned char get_bitmask(uint32_t pos) } static int load_bloom_filter_from_graph(struct commit_graph *g, - struct bloom_filter *filter, - struct commit *c) + struct bloom_filter *filter, + struct commit *c) { uint32_t lex_pos, start_index, end_index; @@ -123,9 +123,9 @@ uint32_t murmur3_seeded(uint32_t seed, const char *data, size_t len) } void fill_bloom_key(const char *data, - size_t len, - struct bloom_key *key, - const struct bloom_filter_settings *settings) + size_t len, + struct bloom_key *key, + const struct bloom_filter_settings *settings) { int i; const uint32_t seed0 = 0x293ae76f; @@ -139,8 +139,8 @@ void fill_bloom_key(const char *data, } void add_key_to_filter(const struct bloom_key *key, - struct bloom_filter *filter, - const struct bloom_filter_settings *settings) + struct bloom_filter *filter, + const struct bloom_filter_settings *settings) { int i; uint64_t mod = filter->len * BITS_PER_WORD; @@ -160,7 +160,7 @@ void init_bloom_filters(void) struct bloom_filter *get_bloom_filter(struct repository *r, struct commit *c, - int compute_if_not_present) + int compute_if_not_present) { struct bloom_filter *filter; struct bloom_filter_settings settings = DEFAULT_BLOOM_FILTER_SETTINGS; diff --git a/bloom.h b/bloom.h index b935186425..a51e371529 100644 --- a/bloom.h +++ b/bloom.h @@ -74,8 +74,8 @@ void fill_bloom_key(const char *data, const struct bloom_filter_settings *settings); void add_key_to_filter(const struct bloom_key *key, - struct bloom_filter *filter, - const struct bloom_filter_settings *settings); + struct bloom_filter *filter, + const struct bloom_filter_settings *settings); void init_bloom_filters(void); From 54c337be9c8ad47b49709a43b4c63f407ff168e6 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Fri, 1 May 2020 15:30:19 +0000 Subject: [PATCH 2/7] test-bloom: fix usage typo Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- t/helper/test-bloom.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/t/helper/test-bloom.c b/t/helper/test-bloom.c index 77eb27adac..00c1d44a56 100644 --- a/t/helper/test-bloom.c +++ b/t/helper/test-bloom.c @@ -44,7 +44,7 @@ static void get_bloom_filter_for_commit(const struct object_id *commit_oid) } static const char *bloom_usage = "\n" -" test-tool bloom get_murmer3 \n" +" test-tool bloom get_murmur3 \n" " test-tool bloom generate_filter [...]\n" " test-tool get_filter_for_commit \n"; From 891c17c95497712126700cdf5cd887bdb0ff3d55 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Mon, 11 May 2020 11:56:10 +0000 Subject: [PATCH 3/7] bloom: parse commit before computing filters When computing changed-path Bloom filters for a commit, we need to know if the commit has a parent or not. If the commit is not parsed, then its parent pointer will be NULL. As far as I can tell, the only opportunity to reach this code without parsing the commit is inside "test-tool bloom get_filter_for_commit" but it is best to be safe. Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- bloom.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/bloom.c b/bloom.c index 2e3e0f5037..9679278271 100644 --- a/bloom.c +++ b/bloom.c @@ -193,6 +193,9 @@ struct bloom_filter *get_bloom_filter(struct repository *r, diffopt.max_changes = max_changes; diff_setup_done(&diffopt); + /* ensure commit is parsed so we have parent information */ + repo_parse_commit(r, c); + if (c->parents) diff_tree_oid(&c->parents->item->object.oid, &c->object.oid, "", &diffopt); else From 88093289cdcfe99c5a3d7ef6f36ee45aa3018824 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Mon, 11 May 2020 11:56:11 +0000 Subject: [PATCH 4/7] Documentation: changed-path Bloom filters use byte words In Documentation/technical/commit-graph-format.txt, the definition of the BIDX chunk specifies the length is a number of 8-byte words. During development we discovered that using 8-byte words in the Murmur3 hash algorithm causes issues with big-endian versus little- endian machines. Thus, the hash algorithm was adapted to work on a byte-by-byte basis. However, this caused a change in the definition of a "word" in bloom.h. Now, a "word" is a single byte, which allows filters to be as small as two bytes. These length-two filters are demonstrated in t0095-bloom.sh, and a larger filter of length 25 is demonstrated as well. The original point of using 8-byte words was for alignment reasons. It also presented opportunities for extremely sparse Bloom filters when there were a small number of changes at a commit, creating a very low false-positive rate. However, modifying the format at this point is unlikely to be a valuable exercise. Also, this use of single-byte granularity does present opportunities to save space. It is unclear if 8-byte alignment of the filters would present any meaningful performance benefits. Modify the format document to reflect reality. Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- Documentation/technical/commit-graph-format.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/technical/commit-graph-format.txt b/Documentation/technical/commit-graph-format.txt index de56f9f1ef..1beef17182 100644 --- a/Documentation/technical/commit-graph-format.txt +++ b/Documentation/technical/commit-graph-format.txt @@ -97,10 +97,10 @@ CHUNK DATA: bit on. The other bits correspond to the position of the last parent. Bloom Filter Index (ID: {'B', 'I', 'D', 'X'}) (N * 4 bytes) [Optional] - * The ith entry, BIDX[i], stores the number of 8-byte word blocks in all - Bloom filters from commit 0 to commit i (inclusive) in lexicographic - order. The Bloom filter for the i-th commit spans from BIDX[i-1] to - BIDX[i] (plus header length), where BIDX[-1] is 0. + * The ith entry, BIDX[i], stores the number of bytes in all Bloom filters + from commit 0 to commit i (inclusive) in lexicographic order. The Bloom + filter for the i-th commit spans from BIDX[i-1] to BIDX[i] (plus header + length), where BIDX[-1] is 0. * The BIDX chunk is ignored if the BDAT chunk is not present. Bloom Filter Data (ID: {'B', 'D', 'A', 'T'}) [Optional] From 65c1a28bb60d1c17a672306c651bb402378f81b0 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Mon, 11 May 2020 11:56:12 +0000 Subject: [PATCH 5/7] bloom: de-duplicate directory entries When computing a changed-path Bloom filter, we need to take the files that changed from the diff computation and extract the parent directories. That way, a directory pathspec such as "Documentation" could match commits that change "Documentation/git.txt". However, the current code does a poor job of this process. The paths are added to a hashmap, but we do not check if an entry already exists with that path. This can create many duplicate entries and cause the filter to have a much larger length than it should. This means that the filter is more sparse than intended, which helps the false positive rate, but wastes a lot of space. Properly use hashmap_get() before hashmap_add(). Also be sure to include a comparison function so these can be matched correctly. This has an effect on a test in t0095-bloom.sh. This makes sense, there are ten changes inside "smallDir" so the total number of paths in the filter should be 11. This would result in 11 * 10 bits required, and with 8 bits per byte, this results in 14 bytes. Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- bloom.c | 35 ++++++++++++++++++++++++++--------- t/t0095-bloom.sh | 4 ++-- 2 files changed, 28 insertions(+), 11 deletions(-) diff --git a/bloom.c b/bloom.c index 9679278271..196cda0a1b 100644 --- a/bloom.c +++ b/bloom.c @@ -158,6 +158,19 @@ void init_bloom_filters(void) init_bloom_filter_slab(&bloom_filters); } +static int pathmap_cmp(const void *hashmap_cmp_fn_data, + const struct hashmap_entry *eptr, + const struct hashmap_entry *entry_or_key, + const void *keydata) +{ + const struct pathmap_hash_entry *e1, *e2; + + e1 = container_of(eptr, const struct pathmap_hash_entry, entry); + e2 = container_of(entry_or_key, const struct pathmap_hash_entry, entry); + + return strcmp(e1->path, e2->path); +} + struct bloom_filter *get_bloom_filter(struct repository *r, struct commit *c, int compute_if_not_present) @@ -206,25 +219,29 @@ struct bloom_filter *get_bloom_filter(struct repository *r, struct hashmap pathmap; struct pathmap_hash_entry *e; struct hashmap_iter iter; - hashmap_init(&pathmap, NULL, NULL, 0); + hashmap_init(&pathmap, pathmap_cmp, NULL, 0); for (i = 0; i < diff_queued_diff.nr; i++) { const char *path = diff_queued_diff.queue[i]->two->path; /* - * Add each leading directory of the changed file, i.e. for - * 'dir/subdir/file' add 'dir' and 'dir/subdir' as well, so - * the Bloom filter could be used to speed up commands like - * 'git log dir/subdir', too. - * - * Note that directories are added without the trailing '/'. - */ + * Add each leading directory of the changed file, i.e. for + * 'dir/subdir/file' add 'dir' and 'dir/subdir' as well, so + * the Bloom filter could be used to speed up commands like + * 'git log dir/subdir', too. + * + * Note that directories are added without the trailing '/'. + */ do { char *last_slash = strrchr(path, '/'); FLEX_ALLOC_STR(e, path, path); hashmap_entry_init(&e->entry, strhash(path)); - hashmap_add(&pathmap, &e->entry); + + if (!hashmap_get(&pathmap, &e->entry, NULL)) + hashmap_add(&pathmap, &e->entry); + else + free(e); if (!last_slash) last_slash = (char*)path; diff --git a/t/t0095-bloom.sh b/t/t0095-bloom.sh index 8f9eef116d..6defeb544f 100755 --- a/t/t0095-bloom.sh +++ b/t/t0095-bloom.sh @@ -89,8 +89,8 @@ test_expect_success 'get bloom filter for commit with 10 changes' ' git add smallDir && git commit -m "commit with 10 changes" && cat >expect <<-\EOF && - Filter_Length:25 - Filter_Data:82|a0|65|47|0c|92|90|c0|a1|40|02|a0|e2|40|e0|04|0a|9a|66|cf|80|19|85|42|23| + Filter_Length:14 + Filter_Data:02|b3|c4|a0|34|e7|fe|eb|cb|47|fe|a0|e8|72| EOF test-tool bloom get_filter_for_commit "$(git rev-parse HEAD)" >actual && test_cmp expect actual From 2f6775f00c4b39b905e84c46b95e5a04017ec9ba Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Mon, 11 May 2020 11:56:13 +0000 Subject: [PATCH 6/7] bloom: use num_changes not nr for limit detection As diff_tree_oid() computes a diff, it will terminate early if the total number of changed paths is strictly larger than max_changes. This includes the directories that changed, not just the file paths. However, only the file paths are reflected in the resulting diff queue's "nr" value. Use the "num_changes" from diffopt to check if the diff terminated early. This is incredibly important, as it can result in incorrect filters! For example, the first commit in the Linux kernel repo reports only 471 changes, but since these are nested inside several directories they expand to 513 "real" changes, and in fact the total list of changes is not reported. Thus, the computed filter for this commit is incorrect. Demonstrate the subtle difference by using one fewer file change in the 'get bloom filter for commit with 513 changes' test. Before, this edited 513 files inside "bigDir" which hit this inequality. However, dropping the file count by one demonstrates how the previous inequality was incorrect but the new one is correct. Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- bloom.c | 2 +- t/t0095-bloom.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bloom.c b/bloom.c index 196cda0a1b..e2ede44126 100644 --- a/bloom.c +++ b/bloom.c @@ -215,7 +215,7 @@ struct bloom_filter *get_bloom_filter(struct repository *r, diff_tree_oid(NULL, &c->object.oid, "", &diffopt); diffcore_std(&diffopt); - if (diff_queued_diff.nr <= max_changes) { + if (diffopt.num_changes <= max_changes) { struct hashmap pathmap; struct pathmap_hash_entry *e; struct hashmap_iter iter; diff --git a/t/t0095-bloom.sh b/t/t0095-bloom.sh index 6defeb544f..48a9062559 100755 --- a/t/t0095-bloom.sh +++ b/t/t0095-bloom.sh @@ -100,7 +100,7 @@ test_expect_success EXPENSIVE 'get bloom filter for commit with 513 changes' ' rm actual && rm expect && mkdir bigDir && - for i in $(test_seq 0 512) + for i in $(test_seq 0 511) do echo $i >bigDir/$i done && From b928e488bd40a30797bb7b87733864b39dd32c36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?SZEDER=20G=C3=A1bor?= Date: Mon, 11 May 2020 11:56:14 +0000 Subject: [PATCH 7/7] completion: offer '--(no-)patch' among 'git log' options MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: SZEDER Gábor Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- contrib/completion/git-completion.bash | 1 + 1 file changed, 1 insertion(+) diff --git a/contrib/completion/git-completion.bash b/contrib/completion/git-completion.bash index c21786f2fd..ec6ff1d5fb 100644 --- a/contrib/completion/git-completion.bash +++ b/contrib/completion/git-completion.bash @@ -1860,6 +1860,7 @@ _git_log () $merge $__git_diff_common_options --pickaxe-all --pickaxe-regex + --patch --no-patch " return ;;