From 3c94bd8dfb43d1e6f38184e34f4d5a0046dd00d7 Mon Sep 17 00:00:00 2001
From: Patrick Steinhardt
Date: Thu, 11 Jan 2024 11:06:39 +0100
Subject: [PATCH 1/7] reftable/stack: refactor stack reloading to have common
 exit path

The `reftable_stack_reload_maybe_reuse()` function is responsible for
reloading the reftable list from disk. The function is quite hard to
follow though because it has a bunch of different exit paths, many of
which have to free the same set of resources.

Refactor the function to have a common exit path. While at it, touch up
the style of this function a bit to match our usual coding style better.

Signed-off-by: Patrick Steinhardt
Signed-off-by: Junio C Hamano
---
 reftable/stack.c | 86 +++++++++++++++++++++++-------------------------
 1 file changed, 42 insertions(+), 44 deletions(-)

diff --git a/reftable/stack.c b/reftable/stack.c
index 16bab82063..bf869a6772 100644
--- a/reftable/stack.c
+++ b/reftable/stack.c
@@ -304,69 +304,67 @@ static int tv_cmp(struct timeval *a, struct timeval *b)
 static int reftable_stack_reload_maybe_reuse(struct reftable_stack *st,
 					     int reuse_open)
 {
-	struct timeval deadline = { 0 };
-	int err = gettimeofday(&deadline, NULL);
+	char **names = NULL, **names_after = NULL;
+	struct timeval deadline;
 	int64_t delay = 0;
-	int tries = 0;
+	int tries = 0, err;
+
+	err = gettimeofday(&deadline, NULL);
 	if (err < 0)
-		return err;
-
+		goto out;
 	deadline.tv_sec += 3;
-	while (1) {
-		char **names = NULL;
-		char **names_after = NULL;
-		struct timeval now = { 0 };
-		int err = gettimeofday(&now, NULL);
-		int err2 = 0;
-		if (err < 0) {
-			return err;
-		}
-		/* Only look at deadlines after the first few times. This
-		   simplifies debugging in GDB */
+
+	while (1) {
+		struct timeval now;
+
+		err = gettimeofday(&now, NULL);
+		if (err < 0)
+			goto out;
+
+		/*
+		 * Only look at deadlines after the first few times. This
+		 * simplifies debugging in GDB.
+		 */
 		tries++;
-		if (tries > 3 && tv_cmp(&now, &deadline) >= 0) {
-			break;
-		}
+		if (tries > 3 && tv_cmp(&now, &deadline) >= 0)
+			goto out;
 
 		err = read_lines(st->list_file, &names);
-		if (err < 0) {
-			free_names(names);
-			return err;
-		}
+		if (err < 0)
+			goto out;
+
 		err = reftable_stack_reload_once(st, names, reuse_open);
-		if (err == 0) {
-			free_names(names);
+		if (!err)
 			break;
-		}
-		if (err != REFTABLE_NOT_EXIST_ERROR) {
-			free_names(names);
-			return err;
-		}
-
-		/* err == REFTABLE_NOT_EXIST_ERROR can be caused by a concurrent
-		   writer. Check if there was one by checking if the name list
-		   changed.
-		*/
-		err2 = read_lines(st->list_file, &names_after);
-		if (err2 < 0) {
-			free_names(names);
-			return err2;
-		}
+		if (err != REFTABLE_NOT_EXIST_ERROR)
+			goto out;
+		/*
+		 * REFTABLE_NOT_EXIST_ERROR can be caused by a concurrent
+		 * writer. Check if there was one by checking if the name list
+		 * changed.
+		 */
+		err = read_lines(st->list_file, &names_after);
+		if (err < 0)
+			goto out;
 		if (names_equal(names_after, names)) {
-			free_names(names);
-			free_names(names_after);
-			return err;
+			err = REFTABLE_NOT_EXIST_ERROR;
+			goto out;
 		}
+
 		free_names(names);
+		names = NULL;
 		free_names(names_after);
+		names_after = NULL;
 		delay = delay + (delay * rand()) / RAND_MAX + 1;
 		sleep_millisec(delay);
 	}
 
-	return 0;
+out:
+	free_names(names);
+	free_names(names_after);
+	return err;
 }
 
 /* -1 = error

From c5b5d5fbbc43364a3d3c0aedf9e984a0ffe04537 Mon Sep 17 00:00:00 2001
From: Patrick Steinhardt
Date: Thu, 11 Jan 2024 11:06:43 +0100
Subject: [PATCH 2/7] reftable/stack: refactor reloading to use file descriptor

We're about to introduce a stat(3P)-based caching mechanism to reload
the list of stacks only when it has changed. In order to avoid race
conditions this requires us to have a file descriptor available that we
can use to call fstat(3P) on.

Prepare for this by converting the code to use `fd_read_lines()` so
that we have the file descriptor readily available.

Signed-off-by: Patrick Steinhardt
Signed-off-by: Junio C Hamano
---
 reftable/stack.c | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/reftable/stack.c b/reftable/stack.c
index bf869a6772..b1ee247601 100644
--- a/reftable/stack.c
+++ b/reftable/stack.c
@@ -308,6 +308,7 @@ static int reftable_stack_reload_maybe_reuse(struct reftable_stack *st,
 	struct timeval deadline;
 	int64_t delay = 0;
 	int tries = 0, err;
+	int fd = -1;
 
 	err = gettimeofday(&deadline, NULL);
 	if (err < 0)
@@ -329,9 +330,19 @@ static int reftable_stack_reload_maybe_reuse(struct reftable_stack *st,
 		if (tries > 3 && tv_cmp(&now, &deadline) >= 0)
 			goto out;
 
-		err = read_lines(st->list_file, &names);
-		if (err < 0)
-			goto out;
+		fd = open(st->list_file, O_RDONLY);
+		if (fd < 0) {
+			if (errno != ENOENT) {
+				err = REFTABLE_IO_ERROR;
+				goto out;
+			}
+
+			names = reftable_calloc(sizeof(char *));
+		} else {
+			err = fd_read_lines(fd, &names);
+			if (err < 0)
+				goto out;
+		}
 
 		err = reftable_stack_reload_once(st, names, reuse_open);
 		if (!err)
@@ -356,12 +367,16 @@ static int reftable_stack_reload_maybe_reuse(struct reftable_stack *st,
 		names = NULL;
 		free_names(names_after);
 		names_after = NULL;
+		close(fd);
+		fd = -1;
 		delay = delay + (delay * rand()) / RAND_MAX + 1;
 		sleep_millisec(delay);
 	}
 
 out:
+	if (fd >= 0)
+		close(fd);
 	free_names(names);
 	free_names(names_after);
 	return err;

From 6fdfaf15a0c60572ac58c979a46e34634051e12f Mon Sep 17 00:00:00 2001
From: Patrick Steinhardt
Date: Thu, 11 Jan 2024 11:06:48 +0100
Subject: [PATCH 3/7] reftable/stack: use stat info to avoid re-reading stack
 list
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Whenever we call into the refs interfaces we potentially have to reload
refs in case they have been concurrently modified, either in-process or
externally. While this happens somewhat automatically for loose refs
because we simply try to re-read the files, the "packed" backend will
reload its snapshot of the packed-refs file in case its stat info has
changed since last reading it.

In the reftable backend we have a similar mechanism that is provided by
`reftable_stack_reload()`. This function will read the list of stacks
from "tables.list" and, if they have changed from the currently stored
list, reload the stacks.

This is heavily inefficient though, as we have to check whether the
stack is up-to-date on basically every read and thus keep on re-reading
the file all the time even if it didn't change at all.

We can do better and use the same stat(3P)-based mechanism that the
"packed" backend uses. Instead of reading the file, we will only open
the file descriptor, fstat(3P) it, and then compare the info against
the cached value from the last time we have updated the stack. This
should always work alright because "tables.list" is updated atomically
via a rename, so even if the ctime or mtime wasn't granular enough to
identify a change, at least the inode number or file size should have
changed.

This change significantly speeds up operations where many refs are
read, like when using git-update-ref(1). The following benchmark
creates N refs in an otherwise-empty repository via
`git update-ref --stdin`:

  Benchmark 1: update-ref: create many refs (refcount = 1, revision = HEAD~)
    Time (mean ± σ):       5.1 ms ±  0.2 ms    [User: 2.4 ms, System: 2.6 ms]
    Range (min … max):     4.8 ms …  7.2 ms    109 runs

  Benchmark 2: update-ref: create many refs (refcount = 100, revision = HEAD~)
    Time (mean ± σ):      19.1 ms ±  0.9 ms    [User: 8.9 ms, System: 9.9 ms]
    Range (min … max):    18.4 ms … 26.7 ms    72 runs

  Benchmark 3: update-ref: create many refs (refcount = 10000, revision = HEAD~)
    Time (mean ± σ):      1.336 s ±  0.018 s    [User: 0.590 s, System: 0.724 s]
    Range (min … max):    1.314 s …  1.373 s    10 runs

  Benchmark 4: update-ref: create many refs (refcount = 1, revision = HEAD)
    Time (mean ± σ):       5.1 ms ±  0.2 ms    [User: 2.4 ms, System: 2.6 ms]
    Range (min … max):     4.8 ms …  7.2 ms    109 runs

  Benchmark 5: update-ref: create many refs (refcount = 100, revision = HEAD)
    Time (mean ± σ):      14.8 ms ±  0.2 ms    [User: 7.1 ms, System: 7.5 ms]
    Range (min … max):    14.2 ms … 15.2 ms    82 runs

  Benchmark 6: update-ref: create many refs (refcount = 10000, revision = HEAD)
    Time (mean ± σ):     927.6 ms ±  5.3 ms    [User: 437.8 ms, System: 489.5 ms]
    Range (min … max):   919.4 ms … 936.4 ms    10 runs

  Summary
    update-ref: create many refs (refcount = 1, revision = HEAD) ran
      1.00 ± 0.07 times faster than update-ref: create many refs (refcount = 1, revision = HEAD~)
      2.89 ± 0.14 times faster than update-ref: create many refs (refcount = 100, revision = HEAD)
      3.74 ± 0.25 times faster than update-ref: create many refs (refcount = 100, revision = HEAD~)
    181.26 ± 8.30 times faster than update-ref: create many refs (refcount = 10000, revision = HEAD)
    261.01 ± 12.35 times faster than update-ref: create many refs (refcount = 10000, revision = HEAD~)

Signed-off-by: Patrick Steinhardt
Signed-off-by: Junio C Hamano
---
 reftable/stack.c  | 12 +++++++++++-
 reftable/stack.h  |  1 +
 reftable/system.h |  1 +
 3 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/reftable/stack.c b/reftable/stack.c
index b1ee247601..c28d82299d 100644
--- a/reftable/stack.c
+++ b/reftable/stack.c
@@ -175,6 +175,7 @@ void reftable_stack_destroy(struct reftable_stack *st)
 		st->readers_len = 0;
 		FREE_AND_NULL(st->readers);
 	}
+	stat_validity_clear(&st->list_validity);
 	FREE_AND_NULL(st->list_file);
 	FREE_AND_NULL(st->reftable_dir);
 	reftable_free(st);
@@ -374,7 +375,11 @@ static int reftable_stack_reload_maybe_reuse(struct reftable_stack *st,
 		sleep_millisec(delay);
 	}
 
+	stat_validity_update(&st->list_validity, fd);
+
 out:
+	if (err)
+		stat_validity_clear(&st->list_validity);
 	if (fd >= 0)
 		close(fd);
 	free_names(names);
@@ -388,8 +393,13 @@ out:
 static int stack_uptodate(struct reftable_stack *st)
 {
 	char **names = NULL;
-	int err = read_lines(st->list_file, &names);
+	int err;
 	int i = 0;
+
+	if (stat_validity_check(&st->list_validity, st->list_file))
+		return 0;
+
+	err = read_lines(st->list_file, &names);
 	if (err < 0)
 		return err;
 
diff --git a/reftable/stack.h b/reftable/stack.h
index f57005846e..3f80cc598a 100644
--- a/reftable/stack.h
+++ b/reftable/stack.h
@@ -14,6 +14,7 @@ https://developers.google.com/open-source/licenses/bsd
 #include "reftable-stack.h"
 
 struct reftable_stack {
+	struct stat_validity list_validity;
 	char *list_file;
 	char *reftable_dir;
 	int disable_auto_compact;
diff --git a/reftable/system.h b/reftable/system.h
index 6b74a81514..2cc7adf271 100644
--- a/reftable/system.h
+++ b/reftable/system.h
@@ -12,6 +12,7 @@ https://developers.google.com/open-source/licenses/bsd
 /* This header glues the reftable library to the rest of Git */
 
 #include "git-compat-util.h"
+#include "statinfo.h"
 #include "strbuf.h"
 #include "hash-ll.h" /* hash ID, sizes.*/
 #include "dir.h" /* remove_dir_recursively, for tests.*/

From 85e72be15d9eed0fc9373a8f4f8bc0fcc75041b3 Mon Sep 17 00:00:00 2001
From: Patrick Steinhardt
Date: Thu, 11 Jan 2024 11:06:52 +0100
Subject: [PATCH 4/7] reftable/blocksource: refactor code to match our coding
 style

Refactor `reftable_block_source_from_file()` to match our coding style
better.

Signed-off-by: Patrick Steinhardt
Signed-off-by: Junio C Hamano
---
 reftable/blocksource.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/reftable/blocksource.c b/reftable/blocksource.c
index a1ea304429..1e2fb5e9fd 100644
--- a/reftable/blocksource.c
+++ b/reftable/blocksource.c
@@ -125,24 +125,23 @@ static struct reftable_block_source_vtable file_vtable = {
 int reftable_block_source_from_file(struct reftable_block_source *bs,
 				    const char *name)
 {
-	struct stat st = { 0 };
-	int err = 0;
-	int fd = open(name, O_RDONLY);
-	struct file_block_source *p = NULL;
+	struct file_block_source *p;
+	struct stat st;
+	int fd;
+
+	fd = open(name, O_RDONLY);
 	if (fd < 0) {
-		if (errno == ENOENT) {
+		if (errno == ENOENT)
 			return REFTABLE_NOT_EXIST_ERROR;
-		}
 		return -1;
 	}
 
-	err = fstat(fd, &st);
-	if (err < 0) {
+	if (fstat(fd, &st) < 0) {
 		close(fd);
 		return REFTABLE_IO_ERROR;
 	}
 
-	p = reftable_calloc(sizeof(struct file_block_source));
+	p = reftable_calloc(sizeof(*p));
 	p->size = st.st_size;
 	p->fd = fd;

From 718a93ecc06ed59dda4e6a5d91b1c2169275694f Mon Sep 17 00:00:00 2001
From: Patrick Steinhardt
Date: Thu, 11 Jan 2024 11:06:56 +0100
Subject: [PATCH 5/7] reftable/blocksource: use mmap to read tables
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The blocksource interface provides an interface to read blocks from a
reftable table. This interface is implemented using read(3P) calls on
the underlying file descriptor. While this works alright, this pattern
is very inefficient when repeatedly querying the reftable stack for one
or more refs. This inefficiency can mostly be attributed to the fact
that we often need to re-read the same blocks over and over again, and
every single time we need to call read(3P) again.

A natural fit in this context is to use mmap(3P) instead of read(3P),
which has a bunch of benefits:

  - We do not need to come up with a caching strategy for some of the
    blocks as this will be handled by the kernel already.

  - We can avoid the overhead of having to call into the read(3P)
    syscall repeatedly.

  - We do not need to allocate returned blocks repeatedly, but can
    instead hand out pointers into the mmapped region directly.

Using mmap comes with a significant drawback on Windows though, because
mmapped files cannot be deleted and neither is it possible to rename
files onto an mmapped file. But for one, the reftable library
gracefully handles the case where auto-compaction cannot delete a
still-open stack already and ignores any such errors. Also,
`reftable_stack_clean()` will prune stale tables which are not
referenced by "tables.list" anymore so that those files can eventually
be pruned. And second, we never rewrite already-written stacks, so it
does not matter that we cannot rename a file over an mmapped file,
either.

Another unfortunate property of mmap is that it is not supported by all
systems. But given that the size of reftables should typically be
rather limited (megabytes at most in the vast majority of
repositories), we can use the fallback implementation provided by
`git_mmap()` which reads the whole file into memory instead. This is
the same strategy that the "packed" backend uses.

While this change doesn't significantly improve performance in the case
where we're seeking through stacks only once (like e.g.
git-for-each-ref(1) would), it does speed up use cases where there is a
lot of random access to refs, e.g. when writing. The following
benchmark demonstrates these savings with git-update-ref(1) creating N
refs in an otherwise empty repository:

  Benchmark 1: update-ref: create many refs (refcount = 1, revision = HEAD~)
    Time (mean ± σ):       5.1 ms ±  0.2 ms    [User: 2.5 ms, System: 2.5 ms]
    Range (min … max):     4.8 ms …  7.1 ms    111 runs

  Benchmark 2: update-ref: create many refs (refcount = 100, revision = HEAD~)
    Time (mean ± σ):      14.8 ms ±  0.5 ms    [User: 7.1 ms, System: 7.5 ms]
    Range (min … max):    14.1 ms … 18.7 ms    84 runs

  Benchmark 3: update-ref: create many refs (refcount = 10000, revision = HEAD~)
    Time (mean ± σ):     926.4 ms ±  5.6 ms    [User: 448.5 ms, System: 477.7 ms]
    Range (min … max):   920.0 ms … 936.1 ms    10 runs

  Benchmark 4: update-ref: create many refs (refcount = 1, revision = HEAD)
    Time (mean ± σ):       5.0 ms ±  0.2 ms    [User: 2.4 ms, System: 2.5 ms]
    Range (min … max):     4.7 ms …  5.4 ms    111 runs

  Benchmark 5: update-ref: create many refs (refcount = 100, revision = HEAD)
    Time (mean ± σ):      10.5 ms ±  0.2 ms    [User: 5.0 ms, System: 5.3 ms]
    Range (min … max):    10.0 ms … 10.9 ms    93 runs

  Benchmark 6: update-ref: create many refs (refcount = 10000, revision = HEAD)
    Time (mean ± σ):     529.6 ms ±  9.1 ms    [User: 268.0 ms, System: 261.4 ms]
    Range (min … max):   522.4 ms … 547.1 ms    10 runs

  Summary
    update-ref: create many refs (refcount = 1, revision = HEAD) ran
      1.01 ± 0.06 times faster than update-ref: create many refs (refcount = 1, revision = HEAD~)
      2.08 ± 0.07 times faster than update-ref: create many refs (refcount = 100, revision = HEAD)
      2.95 ± 0.14 times faster than update-ref: create many refs (refcount = 100, revision = HEAD~)
    105.33 ± 3.76 times faster than update-ref: create many refs (refcount = 10000, revision = HEAD)
    184.24 ± 5.89 times faster than update-ref: create many refs (refcount = 10000, revision = HEAD~)

Theoretically, we could also replicate the strategy of the "packed"
backend where small tables are read into memory instead of using mmap.
Benchmarks did not confirm that this has a performance benefit though.

Signed-off-by: Patrick Steinhardt
Signed-off-by: Junio C Hamano
---
 reftable/blocksource.c | 22 +++++++---------------
 1 file changed, 7 insertions(+), 15 deletions(-)

diff --git a/reftable/blocksource.c b/reftable/blocksource.c
index 1e2fb5e9fd..8c41e3c70f 100644
--- a/reftable/blocksource.c
+++ b/reftable/blocksource.c
@@ -76,8 +76,8 @@ struct reftable_block_source malloc_block_source(void)
 }
 
 struct file_block_source {
-	int fd;
 	uint64_t size;
+	unsigned char *data;
 };
 
 static uint64_t file_size(void *b)
@@ -87,19 +87,12 @@ static uint64_t file_size(void *b)
 
 static void file_return_block(void *b, struct reftable_block *dest)
 {
-	if (dest->len)
-		memset(dest->data, 0xff, dest->len);
-	reftable_free(dest->data);
 }
 
-static void file_close(void *b)
+static void file_close(void *v)
 {
-	int fd = ((struct file_block_source *)b)->fd;
-	if (fd > 0) {
-		close(fd);
-		((struct file_block_source *)b)->fd = 0;
-	}
-
+	struct file_block_source *b = v;
+	munmap(b->data, b->size);
 	reftable_free(b);
 }
 
@@ -108,9 +101,7 @@ static int file_read_block(void *v, struct reftable_block *dest, uint64_t off,
 {
 	struct file_block_source *b = v;
 	assert(off + size <= b->size);
-	dest->data = reftable_malloc(size);
-	if (pread_in_full(b->fd, dest->data, size, off) != size)
-		return -1;
+	dest->data = b->data + off;
 	dest->len = size;
 	return size;
 }
@@ -143,7 +134,8 @@ int reftable_block_source_from_file(struct reftable_block_source *bs,
 
 	p = reftable_calloc(sizeof(*p));
 	p->size = st.st_size;
-	p->fd = fd;
+	p->data = xmmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
+	close(fd);
 
 	assert(!bs->ops);
 	bs->ops = &file_vtable;

From 456333eb4d00a2bc8a71134bedfe5d3bc13a1932 Mon Sep 17 00:00:00 2001
From: Patrick Steinhardt
Date: Thu, 18 Jan 2024 14:41:51 +0100
Subject: [PATCH 6/7] reftable/stack: unconditionally reload stack after
 commit

After we have committed an addition to the reftable stack we call
`reftable_stack_reload()` to reload the stack and thus reflect the
changes that were just added. This function will only conditionally
reload the stack in case `stack_uptodate()` tells us that the stack
needs reloading. This check is wasteful though because we already know
that the stack needs reloading.

Call `reftable_stack_reload_maybe_reuse()` instead, which will
unconditionally reload the stack. This is merely a conceptual fix; the
code in question was not found to cause any problems in practice.

Signed-off-by: Patrick Steinhardt
Signed-off-by: Junio C Hamano
---
 reftable/stack.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/reftable/stack.c b/reftable/stack.c
index c28d82299d..705cfb6caa 100644
--- a/reftable/stack.c
+++ b/reftable/stack.c
@@ -585,7 +585,7 @@ int reftable_addition_commit(struct reftable_addition *add)
 	add->new_tables = NULL;
 	add->new_tables_len = 0;
 
-	err = reftable_stack_reload(add->stack);
+	err = reftable_stack_reload_maybe_reuse(add->stack, 1);
 	if (err)
 		goto done;

From 4f36b8597c8f1c20e465ade4159896fb592e34c0 Mon Sep 17 00:00:00 2001
From: Patrick Steinhardt
Date: Thu, 18 Jan 2024 14:41:56 +0100
Subject: [PATCH 7/7] reftable/stack: fix race in up-to-date check
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In 6fdfaf15a0 (reftable/stack: use stat info to avoid re-reading stack
list, 2024-01-11) we have introduced a new mechanism to avoid
re-reading the table list in case stat(3P) figures out that the stack
didn't change since the last time we read it.

While this change significantly improved performance when writing many
refs, it can unfortunately lead to false negatives in very specific
scenarios. Given two processes A and B, there is a feasible sequence of
events that causes us to accidentally treat the table list as
up-to-date even though it changed:

  1. A reads the reftable stack and caches its stat info.

  2. B updates the stack, appending a new table to "tables.list". This
     will both use a new inode and result in a different file size,
     thus invalidating A's cache in theory.

  3. B decides to auto-compact the stack and merges two tables. The
     file size now matches what A has cached again. Furthermore, the
     filesystem may decide to recycle the inode number of the file we
     have replaced in (2) because it is not in use anymore.

  4. A reloads the reftable stack. Neither the inode number nor the
     file size changed. If the timestamps did not change either then we
     think the cached copy of our stack is up-to-date.

In fact, the commit introduced three related issues:

  - Non-POSIX compliant systems may not report proper `st_dev` and
    `st_ino` values in stat(3P), which made us rely solely on the
    file's potentially coarse-grained mtime and ctime.

  - `stat_validity_check()` and friends may end up not comparing
    `st_dev` and `st_ino` depending on the "core.checkstat" config,
    again reducing the signal to the mtime and ctime.

  - `st_ino` can be recycled, rendering the check moot even on
    POSIX-compliant systems.

Given that POSIX defines that "The st_ino and st_dev fields taken
together uniquely identify the file within the system", these issues
caused the most important signal for establishing file identity to be
ignored or to become useless in some cases.

Refactor the code to stop using `stat_validity_check()`. Instead, we
manually stat(3P) the file descriptors to make relevant information
available. On Windows and MSYS2 the result will have both `st_dev` and
`st_ino` set to 0, which allows us to address the first issue by not
using the stat-based cache in that case. It also allows us to make sure
that we always compare `st_dev` and `st_ino`, addressing the second
issue.

The third issue of inode recycling can be addressed by keeping the file
descriptor of "tables.list" open during the lifetime of the reftable
stack. As the file will still exist on disk even though it has been
unlinked it is impossible for its inode to be recycled as long as the
file descriptor is still open.

This should address the race in a POSIX-compliant way. The only real
downside is that this mechanism cannot be used on non-POSIX-compliant
systems like Windows. But we at least have the second-level caching
mechanism in place that compares contents of "tables.list" with the
currently loaded list of tables.

This new mechanism performs roughly the same as the previous one that
relied on `stat_validity_check()`:

  Benchmark 1: update-ref: create many refs (HEAD~)
    Time (mean ± σ):      4.754 s ±  0.026 s    [User: 2.204 s, System: 2.549 s]
    Range (min … max):    4.694 s …  4.802 s    20 runs

  Benchmark 2: update-ref: create many refs (HEAD)
    Time (mean ± σ):      4.721 s ±  0.020 s    [User: 2.194 s, System: 2.527 s]
    Range (min … max):    4.691 s …  4.753 s    20 runs

  Summary
    update-ref: create many refs (HEAD~) ran
      1.01 ± 0.01 times faster than update-ref: create many refs (HEAD)

Signed-off-by: Patrick Steinhardt
Signed-off-by: Junio C Hamano
---
 reftable/stack.c  | 99 +++++++++++++++++++++++++++++++++++++++++++----
 reftable/stack.h  |  4 +-
 reftable/system.h |  1 -
 3 files changed, 95 insertions(+), 9 deletions(-)

diff --git a/reftable/stack.c b/reftable/stack.c
index 705cfb6caa..77a387a86c 100644
--- a/reftable/stack.c
+++ b/reftable/stack.c
@@ -66,6 +66,7 @@ int reftable_new_stack(struct reftable_stack **dest, const char *dir,
 	strbuf_addstr(&list_file_name, "/tables.list");
 
 	p->list_file = strbuf_detach(&list_file_name, NULL);
+	p->list_fd = -1;
 	p->reftable_dir = xstrdup(dir);
 	p->config = config;
 
@@ -175,7 +176,12 @@ void reftable_stack_destroy(struct reftable_stack *st)
 		st->readers_len = 0;
 		FREE_AND_NULL(st->readers);
 	}
-	stat_validity_clear(&st->list_validity);
+
+	if (st->list_fd >= 0) {
+		close(st->list_fd);
+		st->list_fd = -1;
+	}
+
 	FREE_AND_NULL(st->list_file);
 	FREE_AND_NULL(st->reftable_dir);
 	reftable_free(st);
@@ -375,11 +381,59 @@ static int reftable_stack_reload_maybe_reuse(struct reftable_stack *st,
 		sleep_millisec(delay);
 	}
 
-	stat_validity_update(&st->list_validity, fd);
-
 out:
-	if (err)
-		stat_validity_clear(&st->list_validity);
+	/*
+	 * Invalidate the stat cache. It is sufficient to only close the file
+	 * descriptor and keep the cached stat info because we never use the
+	 * latter when the former is negative.
+	 */
+	if (st->list_fd >= 0) {
+		close(st->list_fd);
+		st->list_fd = -1;
+	}
+
+	/*
+	 * Cache stat information in case it provides a useful signal to us.
+	 * According to POSIX, "The st_ino and st_dev fields taken together
+	 * uniquely identify the file within the system." That being said,
+	 * Windows is not POSIX compliant and we do not have these fields
+	 * available. So the information we have there is insufficient to
+	 * determine whether two file descriptors point to the same file.
+	 *
+	 * While we could fall back to using other signals like the file's
+	 * mtime, those are not sufficient to avoid races. We thus refrain from
+	 * using the stat cache on such systems and fall back to the secondary
+	 * caching mechanism, which is to check whether contents of the file
+	 * have changed.
+	 *
+	 * On other systems which are POSIX compliant we must keep the file
+	 * descriptor open. This is to avoid a race condition where two
+	 * processes access the reftable stack at the same point in time:
+	 *
+	 * 1. A reads the reftable stack and caches its stat info.
+	 *
+	 * 2. B updates the stack, appending a new table to "tables.list".
+	 *    This will both use a new inode and result in a different file
+	 *    size, thus invalidating A's cache in theory.
+	 *
+	 * 3. B decides to auto-compact the stack and merges two tables. The
+	 *    file size now matches what A has cached again. Furthermore, the
+	 *    filesystem may decide to recycle the inode number of the file
+	 *    we have replaced in (2) because it is not in use anymore.
+	 *
+	 * 4. A reloads the reftable stack. Neither the inode number nor the
+	 *    file size changed.
+	 *    If the timestamps did not change either then we think the
+	 *    cached copy of our stack is up-to-date.
+	 *
+	 * By keeping the file descriptor open the inode number cannot be
+	 * recycled, mitigating the race.
+	 */
+	if (!err && fd >= 0 && !fstat(fd, &st->list_st) &&
+	    st->list_st.st_dev && st->list_st.st_ino) {
+		st->list_fd = fd;
+		fd = -1;
+	}
+
 	if (fd >= 0)
 		close(fd);
 	free_names(names);
@@ -396,8 +450,39 @@ static int stack_uptodate(struct reftable_stack *st)
 	int err;
 	int i = 0;
 
-	if (stat_validity_check(&st->list_validity, st->list_file))
-		return 0;
+	/*
+	 * When we have cached stat information available then we use it to
+	 * verify whether the file has been rewritten.
+	 *
+	 * Note that we explicitly do not want to use `stat_validity_check()`
+	 * and friends here because they may end up not comparing the `st_dev`
+	 * and `st_ino` fields. These functions thus cannot guarantee that we
+	 * indeed still have the same file.
+	 */
+	if (st->list_fd >= 0) {
+		struct stat list_st;
+
+		if (stat(st->list_file, &list_st) < 0) {
+			/*
+			 * It's fine for "tables.list" to not exist. In that
+			 * case, we have to refresh when the loaded stack has
+			 * any readers.
+			 */
+			if (errno == ENOENT)
+				return !!st->readers_len;
+			return REFTABLE_IO_ERROR;
+		}
+
+		/*
+		 * When "tables.list" refers to the same file we can assume
+		 * that it didn't change. This is because we always use
+		 * rename(3P) to update the file and never write to it
+		 * directly.
+		 */
+		if (st->list_st.st_dev == list_st.st_dev &&
+		    st->list_st.st_ino == list_st.st_ino)
+			return 0;
+	}
 
 	err = read_lines(st->list_file, &names);
 	if (err < 0)
diff --git a/reftable/stack.h b/reftable/stack.h
index 3f80cc598a..c1e3efa899 100644
--- a/reftable/stack.h
+++ b/reftable/stack.h
@@ -14,8 +14,10 @@ https://developers.google.com/open-source/licenses/bsd
 #include "reftable-stack.h"
 
 struct reftable_stack {
-	struct stat_validity list_validity;
+	struct stat list_st;
 	char *list_file;
+	int list_fd;
+
 	char *reftable_dir;
 	int disable_auto_compact;
diff --git a/reftable/system.h b/reftable/system.h
index 2cc7adf271..6b74a81514 100644
--- a/reftable/system.h
+++ b/reftable/system.h
@@ -12,7 +12,6 @@ https://developers.google.com/open-source/licenses/bsd
 /* This header glues the reftable library to the rest of Git */
 
 #include "git-compat-util.h"
-#include "statinfo.h"
 #include "strbuf.h"
 #include "hash-ll.h" /* hash ID, sizes.*/
 #include "dir.h" /* remove_dir_recursively, for tests.*/
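
The identity check that the final patch relies on can also be exercised in
isolation. The following rough sketch is not part of the patches above and
uses made-up names (`list_is_same_file()`, a hard-coded "tables.list" path);
it only demonstrates the plain POSIX behaviour the series depends on: a file
replaced via rename(3P) gets a different `st_dev`/`st_ino` pair, while a
descriptor that is kept open pins the old inode so it cannot be recycled.

  #include <stdio.h>
  #include <fcntl.h>
  #include <unistd.h>
  #include <sys/stat.h>

  /*
   * Sketch only: returns 1 if the file currently at `path` is still the
   * same file (same st_dev/st_ino) as the one referred to by the
   * held-open descriptor `fd`, 0 if it was replaced (e.g. by an atomic
   * rename), and -1 on error.
   */
  static int list_is_same_file(int fd, const char *path)
  {
  	struct stat cached, current;

  	if (fstat(fd, &cached) < 0 || stat(path, &current) < 0)
  		return -1;

  	/*
  	 * POSIX: st_dev and st_ino taken together uniquely identify the
  	 * file within the system.
  	 */
  	return cached.st_dev == current.st_dev &&
  	       cached.st_ino == current.st_ino;
  }

  int main(void)
  {
  	int fd = open("tables.list", O_RDONLY);
  	if (fd < 0) {
  		perror("open");
  		return 1;
  	}

  	/* Rename another file over "tables.list" to see this flip to 0. */
  	printf("still the same file: %d\n",
  	       list_is_same_file(fd, "tables.list"));
  	close(fd);
  	return 0;
  }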