From 3c94bd8dfb43d1e6f38184e34f4d5a0046dd00d7 Mon Sep 17 00:00:00 2001
From: Patrick Steinhardt
Date: Thu, 11 Jan 2024 11:06:39 +0100
Subject: [PATCH 1/7] reftable/stack: refactor stack reloading to have common
 exit path

The `reftable_stack_reload_maybe_reuse()` function is responsible for
reloading the reftable list from disk. The function is quite hard to
follow though because it has a bunch of different exit paths, many of
which have to free the same set of resources.

Refactor the function to have a common exit path. While at it, touch up
the style of this function a bit to match our usual coding style better.

Signed-off-by: Patrick Steinhardt
Signed-off-by: Junio C Hamano
---
 reftable/stack.c | 86 +++++++++++++++++++++++-------------------------
 1 file changed, 42 insertions(+), 44 deletions(-)

diff --git a/reftable/stack.c b/reftable/stack.c
index 16bab82063..bf869a6772 100644
--- a/reftable/stack.c
+++ b/reftable/stack.c
@@ -304,69 +304,67 @@ static int tv_cmp(struct timeval *a, struct timeval *b)
 static int reftable_stack_reload_maybe_reuse(struct reftable_stack *st,
 					     int reuse_open)
 {
-	struct timeval deadline = { 0 };
-	int err = gettimeofday(&deadline, NULL);
+	char **names = NULL, **names_after = NULL;
+	struct timeval deadline;
 	int64_t delay = 0;
-	int tries = 0;
+	int tries = 0, err;
+
+	err = gettimeofday(&deadline, NULL);
 	if (err < 0)
-		return err;
-
+		goto out;
 	deadline.tv_sec += 3;
-	while (1) {
-		char **names = NULL;
-		char **names_after = NULL;
-		struct timeval now = { 0 };
-		int err = gettimeofday(&now, NULL);
-		int err2 = 0;
-		if (err < 0) {
-			return err;
-		}
-		/* Only look at deadlines after the first few times. This
-		   simplifies debugging in GDB */
+
+	while (1) {
+		struct timeval now;
+
+		err = gettimeofday(&now, NULL);
+		if (err < 0)
+			goto out;
+
+		/*
+		 * Only look at deadlines after the first few times. This
+		 * simplifies debugging in GDB.
+		 */
 		tries++;
-		if (tries > 3 && tv_cmp(&now, &deadline) >= 0) {
-			break;
-		}
+		if (tries > 3 && tv_cmp(&now, &deadline) >= 0)
+			goto out;
 
 		err = read_lines(st->list_file, &names);
-		if (err < 0) {
-			free_names(names);
-			return err;
-		}
+		if (err < 0)
+			goto out;
+
 		err = reftable_stack_reload_once(st, names, reuse_open);
-		if (err == 0) {
-			free_names(names);
+		if (!err)
 			break;
-		}
-		if (err != REFTABLE_NOT_EXIST_ERROR) {
-			free_names(names);
-			return err;
-		}
-
-		/* err == REFTABLE_NOT_EXIST_ERROR can be caused by a concurrent
-		   writer. Check if there was one by checking if the name list
-		   changed.
-		*/
-		err2 = read_lines(st->list_file, &names_after);
-		if (err2 < 0) {
-			free_names(names);
-			return err2;
-		}
+		if (err != REFTABLE_NOT_EXIST_ERROR)
+			goto out;
+		/*
+		 * REFTABLE_NOT_EXIST_ERROR can be caused by a concurrent
+		 * writer. Check if there was one by checking if the name list
+		 * changed.
+		 */
+		err = read_lines(st->list_file, &names_after);
+		if (err < 0)
+			goto out;
 		if (names_equal(names_after, names)) {
-			free_names(names);
-			free_names(names_after);
-			return err;
+			err = REFTABLE_NOT_EXIST_ERROR;
+			goto out;
 		}
+
 		free_names(names);
+		names = NULL;
 		free_names(names_after);
+		names_after = NULL;
 		delay = delay + (delay * rand()) / RAND_MAX + 1;
 		sleep_millisec(delay);
 	}
 
-	return 0;
+out:
+	free_names(names);
+	free_names(names_after);
+	return err;
 }
 
 /* -1 = error

From c5b5d5fbbc43364a3d3c0aedf9e984a0ffe04537 Mon Sep 17 00:00:00 2001
From: Patrick Steinhardt
Date: Thu, 11 Jan 2024 11:06:43 +0100
Subject: [PATCH 2/7] reftable/stack: refactor reloading to use file descriptor

We're about to introduce a stat(3P)-based caching mechanism to reload
the list of stacks only when it has changed. In order to avoid race
conditions this requires us to have a file descriptor available that we
can use to call fstat(3P) on.

Prepare for this by converting the code to use `fd_read_lines()` so
that we have the file descriptor readily available.

Signed-off-by: Patrick Steinhardt
Signed-off-by: Junio C Hamano
---
 reftable/stack.c | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/reftable/stack.c b/reftable/stack.c
index bf869a6772..b1ee247601 100644
--- a/reftable/stack.c
+++ b/reftable/stack.c
@@ -308,6 +308,7 @@ static int reftable_stack_reload_maybe_reuse(struct reftable_stack *st,
 	struct timeval deadline;
 	int64_t delay = 0;
 	int tries = 0, err;
+	int fd = -1;
 
 	err = gettimeofday(&deadline, NULL);
 	if (err < 0)
@@ -329,9 +330,19 @@ static int reftable_stack_reload_maybe_reuse(struct reftable_stack *st,
 		if (tries > 3 && tv_cmp(&now, &deadline) >= 0)
 			goto out;
 
-		err = read_lines(st->list_file, &names);
-		if (err < 0)
-			goto out;
+		fd = open(st->list_file, O_RDONLY);
+		if (fd < 0) {
+			if (errno != ENOENT) {
+				err = REFTABLE_IO_ERROR;
+				goto out;
+			}
+
+			names = reftable_calloc(sizeof(char *));
+		} else {
+			err = fd_read_lines(fd, &names);
+			if (err < 0)
+				goto out;
+		}
 
 		err = reftable_stack_reload_once(st, names, reuse_open);
 		if (!err)
@@ -356,12 +367,16 @@ static int reftable_stack_reload_maybe_reuse(struct reftable_stack *st,
 		names = NULL;
 		free_names(names_after);
 		names_after = NULL;
+		close(fd);
+		fd = -1;
 		delay = delay + (delay * rand()) / RAND_MAX + 1;
 		sleep_millisec(delay);
 	}
 
 out:
+	if (fd >= 0)
+		close(fd);
 	free_names(names);
 	free_names(names_after);
 	return err;

From 6fdfaf15a0c60572ac58c979a46e34634051e12f Mon Sep 17 00:00:00 2001
From: Patrick Steinhardt
Date: Thu, 11 Jan 2024 11:06:48 +0100
Subject: [PATCH 3/7] reftable/stack: use stat info to avoid re-reading stack
 list
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Whenever we call into the refs interfaces we potentially have to reload
refs in case they have been concurrently modified, either in-process or
externally. While this happens somewhat automatically for loose refs
because we simply try to re-read the files, the "packed" backend will
reload its snapshot of the packed-refs file in case its stat info has
changed since last reading it.

In the reftable backend we have a similar mechanism that is provided by
`reftable_stack_reload()`. This function will read the list of stacks
from "tables.list" and, if they have changed from the currently stored
list, reload the stacks.

This is heavily inefficient though, as we have to check whether the
stack is up-to-date on basically every read and thus keep on re-reading
the file all the time even if it didn't change at all.

We can do better and use the same stat(3P)-based mechanism that the
"packed" backend uses. Instead of reading the file, we will only open
the file descriptor, fstat(3P) it, and then compare the info against
the cached value from the last time we have updated the stack. This
should always work alright because "tables.list" is updated atomically
via a rename, so even if the ctime or mtime wasn't granular enough to
identify a change, at least the inode number or file size should have
changed.

This change significantly speeds up operations where many refs are
read, like when using git-update-ref(1). The following benchmark
creates N refs in an otherwise-empty repository via
`git update-ref --stdin`:

  Benchmark 1: update-ref: create many refs (refcount = 1, revision = HEAD~)
    Time (mean ± σ):       5.1 ms ±  0.2 ms    [User: 2.4 ms, System: 2.6 ms]
    Range (min … max):     4.8 ms …  7.2 ms    109 runs

  Benchmark 2: update-ref: create many refs (refcount = 100, revision = HEAD~)
    Time (mean ± σ):      19.1 ms ±  0.9 ms    [User: 8.9 ms, System: 9.9 ms]
    Range (min … max):    18.4 ms … 26.7 ms    72 runs

  Benchmark 3: update-ref: create many refs (refcount = 10000, revision = HEAD~)
    Time (mean ± σ):      1.336 s ±  0.018 s    [User: 0.590 s, System: 0.724 s]
    Range (min … max):    1.314 s …  1.373 s    10 runs

  Benchmark 4: update-ref: create many refs (refcount = 1, revision = HEAD)
    Time (mean ± σ):       5.1 ms ±  0.2 ms    [User: 2.4 ms, System: 2.6 ms]
    Range (min … max):     4.8 ms …  7.2 ms    109 runs

  Benchmark 5: update-ref: create many refs (refcount = 100, revision = HEAD)
    Time (mean ± σ):      14.8 ms ±  0.2 ms    [User: 7.1 ms, System: 7.5 ms]
    Range (min … max):    14.2 ms … 15.2 ms    82 runs

  Benchmark 6: update-ref: create many refs (refcount = 10000, revision = HEAD)
    Time (mean ± σ):     927.6 ms ±  5.3 ms    [User: 437.8 ms, System: 489.5 ms]
    Range (min … max):   919.4 ms … 936.4 ms    10 runs

  Summary
    update-ref: create many refs (refcount = 1, revision = HEAD) ran
      1.00 ± 0.07 times faster than update-ref: create many refs (refcount = 1, revision = HEAD~)
      2.89 ± 0.14 times faster than update-ref: create many refs (refcount = 100, revision = HEAD)
      3.74 ± 0.25 times faster than update-ref: create many refs (refcount = 100, revision = HEAD~)
    181.26 ± 8.30 times faster than update-ref: create many refs (refcount = 10000, revision = HEAD)
    261.01 ± 12.35 times faster than update-ref: create many refs (refcount = 10000, revision = HEAD~)

Signed-off-by: Patrick Steinhardt
Signed-off-by: Junio C Hamano
---
 reftable/stack.c  | 12 +++++++++++-
 reftable/stack.h  |  1 +
 reftable/system.h |  1 +
 3 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/reftable/stack.c b/reftable/stack.c
index b1ee247601..c28d82299d 100644
--- a/reftable/stack.c
+++ b/reftable/stack.c
@@ -175,6 +175,7 @@ void reftable_stack_destroy(struct reftable_stack *st)
 		st->readers_len = 0;
 		FREE_AND_NULL(st->readers);
 	}
+	stat_validity_clear(&st->list_validity);
 	FREE_AND_NULL(st->list_file);
 	FREE_AND_NULL(st->reftable_dir);
 	reftable_free(st);
@@ -374,7 +375,11 @@ static int reftable_stack_reload_maybe_reuse(struct reftable_stack *st,
 		sleep_millisec(delay);
 	}
 
+	stat_validity_update(&st->list_validity, fd);
+
 out:
+	if (err)
+		stat_validity_clear(&st->list_validity);
 	if (fd >= 0)
 		close(fd);
 	free_names(names);
@@ -388,8 +393,13 @@ out:
 static int stack_uptodate(struct reftable_stack *st)
 {
 	char **names = NULL;
-	int err = read_lines(st->list_file, &names);
+	int err;
 	int i = 0;
+
+	if (stat_validity_check(&st->list_validity, st->list_file))
+		return 0;
+
+	err = read_lines(st->list_file, &names);
 	if (err < 0)
 		return err;
 
diff --git a/reftable/stack.h b/reftable/stack.h
index f57005846e..3f80cc598a 100644
--- a/reftable/stack.h
+++ b/reftable/stack.h
@@ -14,6 +14,7 @@ https://developers.google.com/open-source/licenses/bsd
 #include "reftable-stack.h"
 
 struct reftable_stack {
+	struct stat_validity list_validity;
 	char *list_file;
 	char *reftable_dir;
 	int disable_auto_compact;
diff --git a/reftable/system.h b/reftable/system.h
index 6b74a81514..2cc7adf271 100644
--- a/reftable/system.h
+++ b/reftable/system.h
@@ -12,6 +12,7 @@ https://developers.google.com/open-source/licenses/bsd
 /* This header glues the reftable library to the rest of Git */
 
 #include "git-compat-util.h"
+#include "statinfo.h"
 #include "strbuf.h"
 #include "hash-ll.h" /* hash ID, sizes.*/
 #include "dir.h" /* remove_dir_recursively, for tests.*/

From 85e72be15d9eed0fc9373a8f4f8bc0fcc75041b3 Mon Sep 17 00:00:00 2001
From: Patrick Steinhardt
Date: Thu, 11 Jan 2024 11:06:52 +0100
Subject: [PATCH 4/7] reftable/blocksource: refactor code to match our coding
 style

Refactor `reftable_block_source_from_file()` to match our coding style
better.

Signed-off-by: Patrick Steinhardt
Signed-off-by: Junio C Hamano
---
 reftable/blocksource.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/reftable/blocksource.c b/reftable/blocksource.c
index a1ea304429..1e2fb5e9fd 100644
--- a/reftable/blocksource.c
+++ b/reftable/blocksource.c
@@ -125,24 +125,23 @@ static struct reftable_block_source_vtable file_vtable = {
 int reftable_block_source_from_file(struct reftable_block_source *bs,
 				    const char *name)
 {
-	struct stat st = { 0 };
-	int err = 0;
-	int fd = open(name, O_RDONLY);
-	struct file_block_source *p = NULL;
+	struct file_block_source *p;
+	struct stat st;
+	int fd;
+
+	fd = open(name, O_RDONLY);
 	if (fd < 0) {
-		if (errno == ENOENT) {
+		if (errno == ENOENT)
 			return REFTABLE_NOT_EXIST_ERROR;
-		}
 		return -1;
 	}
 
-	err = fstat(fd, &st);
-	if (err < 0) {
+	if (fstat(fd, &st) < 0) {
 		close(fd);
 		return REFTABLE_IO_ERROR;
 	}
 
-	p = reftable_calloc(sizeof(struct file_block_source));
+	p = reftable_calloc(sizeof(*p));
 	p->size = st.st_size;
 	p->fd = fd;

From 718a93ecc06ed59dda4e6a5d91b1c2169275694f Mon Sep 17 00:00:00 2001
From: Patrick Steinhardt
Date: Thu, 11 Jan 2024 11:06:56 +0100
Subject: [PATCH 5/7] reftable/blocksource: use mmap to read tables
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The blocksource interface provides an interface to read blocks from a
reftable table. This interface is implemented using read(3P) calls on
the underlying file descriptor. While this works alright, this pattern
is very inefficient when repeatedly querying the reftable stack for one
or more refs. This inefficiency can mostly be attributed to the fact
that we often need to re-read the same blocks over and over again, and
every single time we need to call read(3P) again.

A natural fit in this context is to use mmap(3P) instead of read(3P),
which has a bunch of benefits:

  - We do not need to come up with a caching strategy for some of the
    blocks as this will be handled by the kernel already.

  - We can avoid the overhead of having to call into the read(3P)
    syscall repeatedly.

  - We do not need to allocate returned blocks repeatedly, but can
    instead hand out pointers into the mmapped region directly.

Using mmap comes with a significant drawback on Windows though, because
mmapped files cannot be deleted and neither is it possible to rename
files onto an mmapped file. But for one, the reftable library
gracefully handles the case where auto-compaction cannot delete a
still-open stack already and ignores any such errors. Also,
`reftable_stack_clean()` will prune stale tables which are not
referenced by "tables.list" anymore so that those files can eventually
be pruned. And second, we never rewrite already-written stacks, so it
does not matter that we cannot rename a file over an mmapped file,
either.

Another unfortunate property of mmap is that it is not supported by all
systems. But given that the size of reftables should typically be
rather limited (megabytes at most in the vast majority of
repositories), we can use the fallback implementation provided by
`git_mmap()` which reads the whole file into memory instead. This is
the same strategy that the "packed" backend uses.

While this change doesn't significantly improve performance in the case
where we're seeking through stacks only once (like e.g.
git-for-each-ref(1) would), it does speed up use cases where there is a
lot of random access to refs, e.g. when writing. The following
benchmark demonstrates these savings with git-update-ref(1) creating N
refs in an otherwise empty repository:

  Benchmark 1: update-ref: create many refs (refcount = 1, revision = HEAD~)
    Time (mean ± σ):       5.1 ms ±  0.2 ms    [User: 2.5 ms, System: 2.5 ms]
    Range (min … max):     4.8 ms …  7.1 ms    111 runs

  Benchmark 2: update-ref: create many refs (refcount = 100, revision = HEAD~)
    Time (mean ± σ):      14.8 ms ±  0.5 ms    [User: 7.1 ms, System: 7.5 ms]
    Range (min … max):    14.1 ms … 18.7 ms    84 runs

  Benchmark 3: update-ref: create many refs (refcount = 10000, revision = HEAD~)
    Time (mean ± σ):     926.4 ms ±  5.6 ms    [User: 448.5 ms, System: 477.7 ms]
    Range (min … max):   920.0 ms … 936.1 ms    10 runs

  Benchmark 4: update-ref: create many refs (refcount = 1, revision = HEAD)
    Time (mean ± σ):       5.0 ms ±  0.2 ms    [User: 2.4 ms, System: 2.5 ms]
    Range (min … max):     4.7 ms …  5.4 ms    111 runs

  Benchmark 5: update-ref: create many refs (refcount = 100, revision = HEAD)
    Time (mean ± σ):      10.5 ms ±  0.2 ms    [User: 5.0 ms, System: 5.3 ms]
    Range (min … max):    10.0 ms … 10.9 ms    93 runs

  Benchmark 6: update-ref: create many refs (refcount = 10000, revision = HEAD)
    Time (mean ± σ):     529.6 ms ±  9.1 ms    [User: 268.0 ms, System: 261.4 ms]
    Range (min … max):   522.4 ms … 547.1 ms    10 runs

  Summary
    update-ref: create many refs (refcount = 1, revision = HEAD) ran
      1.01 ± 0.06 times faster than update-ref: create many refs (refcount = 1, revision = HEAD~)
      2.08 ± 0.07 times faster than update-ref: create many refs (refcount = 100, revision = HEAD)
      2.95 ± 0.14 times faster than update-ref: create many refs (refcount = 100, revision = HEAD~)
    105.33 ± 3.76 times faster than update-ref: create many refs (refcount = 10000, revision = HEAD)
    184.24 ± 5.89 times faster than update-ref: create many refs (refcount = 10000, revision = HEAD~)

Theoretically, we could also replicate the strategy of the "packed"
backend where small tables are read into memory instead of using mmap.
Benchmarks did not confirm that this has a performance benefit though.

Signed-off-by: Patrick Steinhardt
Signed-off-by: Junio C Hamano
---
 reftable/blocksource.c | 22 +++++++---------------
 1 file changed, 7 insertions(+), 15 deletions(-)

diff --git a/reftable/blocksource.c b/reftable/blocksource.c
index 1e2fb5e9fd..8c41e3c70f 100644
--- a/reftable/blocksource.c
+++ b/reftable/blocksource.c
@@ -76,8 +76,8 @@ struct reftable_block_source malloc_block_source(void)
 }
 
 struct file_block_source {
-	int fd;
 	uint64_t size;
+	unsigned char *data;
 };
 
 static uint64_t file_size(void *b)
@@ -87,19 +87,12 @@ static uint64_t file_size(void *b)
 
 static void file_return_block(void *b, struct reftable_block *dest)
 {
-	if (dest->len)
-		memset(dest->data, 0xff, dest->len);
-	reftable_free(dest->data);
 }
 
-static void file_close(void *b)
+static void file_close(void *v)
 {
-	int fd = ((struct file_block_source *)b)->fd;
-	if (fd > 0) {
-		close(fd);
-		((struct file_block_source *)b)->fd = 0;
-	}
-
+	struct file_block_source *b = v;
+	munmap(b->data, b->size);
 	reftable_free(b);
 }
 
@@ -108,9 +101,7 @@ static int file_read_block(void *v, struct reftable_block *dest, uint64_t off,
 {
 	struct file_block_source *b = v;
 	assert(off + size <= b->size);
-	dest->data = reftable_malloc(size);
-	if (pread_in_full(b->fd, dest->data, size, off) != size)
-		return -1;
+	dest->data = b->data + off;
 	dest->len = size;
 	return size;
 }
@@ -143,7 +134,8 @@ int reftable_block_source_from_file(struct reftable_block_source *bs,
 
 	p = reftable_calloc(sizeof(*p));
 	p->size = st.st_size;
-	p->fd = fd;
+	p->data = xmmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
+	close(fd);
 
 	assert(!bs->ops);
 	bs->ops = &file_vtable;

From 456333eb4d00a2bc8a71134bedfe5d3bc13a1932 Mon Sep 17 00:00:00 2001
From: Patrick Steinhardt
Date: Thu, 18 Jan 2024 14:41:51 +0100
Subject: [PATCH 6/7] reftable/stack: unconditionally reload stack after
 commit

After we have committed an addition to the reftable stack we call
`reftable_stack_reload()` to reload the stack and thus reflect the
changes that were just added. This function will only conditionally
reload the stack in case `stack_uptodate()` tells us that the stack
needs reloading. This check is wasteful though because we already know
that the stack needs reloading.

Call `reftable_stack_reload_maybe_reuse()` instead, which will
unconditionally reload the stack. This is merely a conceptual fix; the
code in question was not found to cause any problems in practice.

Signed-off-by: Patrick Steinhardt
Signed-off-by: Junio C Hamano
---
 reftable/stack.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/reftable/stack.c b/reftable/stack.c
index c28d82299d..705cfb6caa 100644
--- a/reftable/stack.c
+++ b/reftable/stack.c
@@ -585,7 +585,7 @@ int reftable_addition_commit(struct reftable_addition *add)
 	add->new_tables = NULL;
 	add->new_tables_len = 0;
 
-	err = reftable_stack_reload(add->stack);
+	err = reftable_stack_reload_maybe_reuse(add->stack, 1);
 	if (err)
 		goto done;

From 4f36b8597c8f1c20e465ade4159896fb592e34c0 Mon Sep 17 00:00:00 2001
From: Patrick Steinhardt
Date: Thu, 18 Jan 2024 14:41:56 +0100
Subject: [PATCH 7/7] reftable/stack: fix race in up-to-date check
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In 6fdfaf15a0 (reftable/stack: use stat info to avoid re-reading stack
list, 2024-01-11) we have introduced a new mechanism to avoid
re-reading the table list in case stat(3P) figures out that the stack
didn't change since the last time we read it.

While this change significantly improved performance when writing many
refs, it can unfortunately lead to false negatives in very specific
scenarios. Given two processes A and B, there is a feasible sequence of
events that causes us to accidentally treat the table list as
up-to-date even though it changed:

  1. A reads the reftable stack and caches its stat info.

  2. B updates the stack, appending a new table to "tables.list". This
     will both use a new inode and result in a different file size,
     thus invalidating A's cache in theory.

  3. B decides to auto-compact the stack and merges two tables. The
     file size now matches what A has cached again. Furthermore, the
     filesystem may decide to recycle the inode number of the file we
     have replaced in (2) because it is not in use anymore.

  4. A reloads the reftable stack. Neither the inode number nor the
     file size changed. If the timestamps did not change either then we
     think the cached copy of our stack is up-to-date.

In fact, the commit introduced three related issues:

  - Non-POSIX compliant systems may not report proper `st_dev` and
    `st_ino` values in stat(3P), which made us rely solely on the
    file's potentially coarse-grained mtime and ctime.

  - `stat_validity_check()` and friends may end up not comparing
    `st_dev` and `st_ino` depending on the "core.checkstat" config,
    again reducing the signal to the mtime and ctime.

  - `st_ino` can be recycled, rendering the check moot even on
    POSIX-compliant systems.

Given that POSIX defines that "The st_ino and st_dev fields taken
together uniquely identify the file within the system", these issues
caused the most important signal for establishing file identity to be
ignored or to become useless in some cases.

Refactor the code to stop using `stat_validity_check()`. Instead, we
manually stat(3P) the file descriptors to make relevant information
available. On Windows and MSYS2 the result will have both `st_dev` and
`st_ino` set to 0, which allows us to address the first issue by not
using the stat-based cache in that case. It also allows us to make sure
that we always compare `st_dev` and `st_ino`, addressing the second
issue.

The third issue of inode recycling can be addressed by keeping the file
descriptor of "tables.list" open during the lifetime of the reftable
stack. As the file will still exist on disk even though it has been
unlinked it is impossible for its inode to be recycled as long as the
file descriptor is still open.

This should address the race in a POSIX-compliant way. The only real
downside is that this mechanism cannot be used on non-POSIX-compliant
systems like Windows. But we at least have the second-level caching
mechanism in place that compares contents of "tables.list" with the
currently loaded list of tables.

This new mechanism performs roughly the same as the previous one that
relied on `stat_validity_check()`:

  Benchmark 1: update-ref: create many refs (HEAD~)
    Time (mean ± σ):      4.754 s ±  0.026 s    [User: 2.204 s, System: 2.549 s]
    Range (min … max):    4.694 s …  4.802 s    20 runs

  Benchmark 2: update-ref: create many refs (HEAD)
    Time (mean ± σ):      4.721 s ±  0.020 s    [User: 2.194 s, System: 2.527 s]
    Range (min … max):    4.691 s …  4.753 s    20 runs

  Summary
    update-ref: create many refs (HEAD~) ran
      1.01 ± 0.01 times faster than update-ref: create many refs (HEAD)

Signed-off-by: Patrick Steinhardt
Signed-off-by: Junio C Hamano
---
 reftable/stack.c  | 99 +++++++++++++++++++++++++++++++++++++++++++----
 reftable/stack.h  |  4 +-
 reftable/system.h |  1 -
 3 files changed, 95 insertions(+), 9 deletions(-)

diff --git a/reftable/stack.c b/reftable/stack.c
index 705cfb6caa..77a387a86c 100644
--- a/reftable/stack.c
+++ b/reftable/stack.c
@@ -66,6 +66,7 @@ int reftable_new_stack(struct reftable_stack **dest, const char *dir,
 	strbuf_addstr(&list_file_name, "/tables.list");
 
 	p->list_file = strbuf_detach(&list_file_name, NULL);
+	p->list_fd = -1;
 	p->reftable_dir = xstrdup(dir);
 	p->config = config;
 
@@ -175,7 +176,12 @@ void reftable_stack_destroy(struct reftable_stack *st)
 		st->readers_len = 0;
 		FREE_AND_NULL(st->readers);
 	}
-	stat_validity_clear(&st->list_validity);
+
+	if (st->list_fd >= 0) {
+		close(st->list_fd);
+		st->list_fd = -1;
+	}
+
 	FREE_AND_NULL(st->list_file);
 	FREE_AND_NULL(st->reftable_dir);
 	reftable_free(st);
@@ -375,11 +381,59 @@ static int reftable_stack_reload_maybe_reuse(struct reftable_stack *st,
 		sleep_millisec(delay);
 	}
 
-	stat_validity_update(&st->list_validity, fd);
-
 out:
-	if (err)
-		stat_validity_clear(&st->list_validity);
+	/*
+	 * Invalidate the stat cache. It is sufficient to only close the file
+	 * descriptor and keep the cached stat info because we never use the
+	 * latter when the former is negative.
+	 */
+	if (st->list_fd >= 0) {
+		close(st->list_fd);
+		st->list_fd = -1;
+	}
+
+	/*
+	 * Cache stat information in case it provides a useful signal to us.
+	 * According to POSIX, "The st_ino and st_dev fields taken together
+	 * uniquely identify the file within the system." That being said,
+	 * Windows is not POSIX compliant and we do not have these fields
+	 * available. So the information we have there is insufficient to
+	 * determine whether two file descriptors point to the same file.
+	 *
+	 * While we could fall back to using other signals like the file's
+	 * mtime, those are not sufficient to avoid races. We thus refrain from
+	 * using the stat cache on such systems and fall back to the secondary
+	 * caching mechanism, which is to check whether contents of the file
+	 * have changed.
+	 *
+	 * On other systems which are POSIX compliant we must keep the file
+	 * descriptor open. This is to avoid a race condition where two
+	 * processes access the reftable stack at the same point in time:
+	 *
+	 * 1. A reads the reftable stack and caches its stat info.
+	 *
+	 * 2. B updates the stack, appending a new table to "tables.list".
+	 *    This will both use a new inode and result in a different file
+	 *    size, thus invalidating A's cache in theory.
+	 *
+	 * 3. B decides to auto-compact the stack and merges two tables. The
+	 *    file size now matches what A has cached again. Furthermore, the
+	 *    filesystem may decide to recycle the inode number of the file
+	 *    we have replaced in (2) because it is not in use anymore.
+	 *
+	 * 4. A reloads the reftable stack. Neither the inode number nor the
+	 *    file size changed.
+	 *    If the timestamps did not change either then we think the
+	 *    cached copy of our stack is up-to-date.
+	 *
+	 * By keeping the file descriptor open the inode number cannot be
+	 * recycled, mitigating the race.
+	 */
+	if (!err && fd >= 0 && !fstat(fd, &st->list_st) &&
+	    st->list_st.st_dev && st->list_st.st_ino) {
+		st->list_fd = fd;
+		fd = -1;
+	}
+
 	if (fd >= 0)
 		close(fd);
 	free_names(names);
@@ -396,8 +450,39 @@ static int stack_uptodate(struct reftable_stack *st)
 	int err;
 	int i = 0;
 
-	if (stat_validity_check(&st->list_validity, st->list_file))
-		return 0;
+	/*
+	 * When we have cached stat information available then we use it to
+	 * verify whether the file has been rewritten.
+	 *
+	 * Note that we explicitly do not want to use `stat_validity_check()`
+	 * and friends here because they may end up not comparing the `st_dev`
+	 * and `st_ino` fields. These functions thus cannot guarantee that we
+	 * indeed still have the same file.
+	 */
+	if (st->list_fd >= 0) {
+		struct stat list_st;
+
+		if (stat(st->list_file, &list_st) < 0) {
+			/*
+			 * It's fine for "tables.list" to not exist. In that
+			 * case, we have to refresh when the loaded stack has
+			 * any readers.
+			 */
+			if (errno == ENOENT)
+				return !!st->readers_len;
+			return REFTABLE_IO_ERROR;
+		}
+
+		/*
+		 * When "tables.list" refers to the same file we can assume
+		 * that it didn't change. This is because we always use
+		 * rename(3P) to update the file and never write to it
+		 * directly.
+		 */
+		if (st->list_st.st_dev == list_st.st_dev &&
+		    st->list_st.st_ino == list_st.st_ino)
+			return 0;
+	}
 
 	err = read_lines(st->list_file, &names);
 	if (err < 0)
diff --git a/reftable/stack.h b/reftable/stack.h
index 3f80cc598a..c1e3efa899 100644
--- a/reftable/stack.h
+++ b/reftable/stack.h
@@ -14,8 +14,10 @@ https://developers.google.com/open-source/licenses/bsd
 #include "reftable-stack.h"
 
 struct reftable_stack {
-	struct stat_validity list_validity;
+	struct stat list_st;
 	char *list_file;
+	int list_fd;
+
 	char *reftable_dir;
 	int disable_auto_compact;
diff --git a/reftable/system.h b/reftable/system.h
index 2cc7adf271..6b74a81514 100644
--- a/reftable/system.h
+++ b/reftable/system.h
@@ -12,7 +12,6 @@ https://developers.google.com/open-source/licenses/bsd
 /* This header glues the reftable library to the rest of Git */
 
 #include "git-compat-util.h"
-#include "statinfo.h"
 #include "strbuf.h"
 #include "hash-ll.h" /* hash ID, sizes.*/
 #include "dir.h" /* remove_dir_recursively, for tests.*/
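
The identity check that the final patch relies on can also be exercised in
isolation. The following rough sketch is not part of the patches above and
uses made-up names (`list_is_same_file()`, a hard-coded "tables.list" path);
it only demonstrates the plain POSIX behaviour the series depends on: a file
replaced via rename(3P) gets a different `st_dev`/`st_ino` pair, while a
descriptor that is kept open pins the old inode so it cannot be recycled.

  #include <stdio.h>
  #include <fcntl.h>
  #include <unistd.h>
  #include <sys/stat.h>

  /*
   * Sketch only: returns 1 if the file currently at `path` is still the
   * same file (same st_dev/st_ino) as the one referred to by the
   * held-open descriptor `fd`, 0 if it was replaced (e.g. by an atomic
   * rename), and -1 on error.
   */
  static int list_is_same_file(int fd, const char *path)
  {
  	struct stat cached, current;

  	if (fstat(fd, &cached) < 0 || stat(path, &current) < 0)
  		return -1;

  	/*
  	 * POSIX: st_dev and st_ino taken together uniquely identify the
  	 * file within the system.
  	 */
  	return cached.st_dev == current.st_dev &&
  	       cached.st_ino == current.st_ino;
  }

  int main(void)
  {
  	int fd = open("tables.list", O_RDONLY);
  	if (fd < 0) {
  		perror("open");
  		return 1;
  	}

  	/* Rename another file over "tables.list" to see this flip to 0. */
  	printf("still the same file: %d\n",
  	       list_is_same_file(fd, "tables.list"));
  	close(fd);
  	return 0;
  }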