From 7ba3a48c3d0835386599ff844b23c817e2f605d3 Mon Sep 17 00:00:00 2001 From: Ezekiel Newren Date: Fri, 2 Jan 2026 18:52:15 +0000 Subject: [PATCH 01/11] ivec: introduce the C side of ivec Trying to use Rust's Vec in C, or git's ALLOC_GROW() macros (via wrapper functions) in Rust is painful because: * C doesn't define its own vector type, and even though Rust does have Vec its painful to use on the C side (more on that below). However its still not viable to use Rust's Vec type because Git needs to be able to compile without Rust. So ivec was created expressley to be interoperable between C and Rust without needing Rust. * C doing vector things the Rust way would require wrapper functions, and Rust doing vector things the C way would require wrapper functions, so ivec was created to ensure a consistent contract between the 2 languages for how to manipulate a vector. * Currently, Rust defines its own 'Vec' type that is generic, but its memory allocator and struct layout weren't designed for interoperability with C (or any language for that matter), meaning that the C side cannot push to or expand a 'Vec' without defining wrapper functions in Rust that C can call. Without special care, the two languages might use different allocators (malloc/free on the C side, and possibly something else in Rust), which would make it difficult for a function in one language to free elements allocated by a call from a function in the other language. * Similarly, git defines ALLOC_GROW() and related macros in git-compat-util.h. While we could add functions allowing Rust to invoke something similar to those macros, passing three variables (pointer, length, allocated_size) instead of a single variable (vector) across the language boundary requires more cognitive overhead for readers to keep track of and makes it easier to make mistakes. Further, for low-level components that we want to eventually convert to pure Rust, such triplets would feel very out of place. To address these issue, introduce a new type, ivec -- short for interoperable vector. (We refer to it as 'ivec' generally, though on the Rust side the struct is called IVec to match Rust style.) This new type is specifically designed for FFI purposes, so that both languages handle the vector in the same way, though it could be used on either side independently. This type is designed such that it can easily be replaced by a Rust 'Vec' once interoperability is no longer a concern. One particular item to note is that Git's macros to handle vec operations infer the amount that a vec needs to grow from the size of a pointer, but that makes it somewhat specific to the macros used in C. To avoid defining every ivec function as a macro I opted to also include an element_size field that allows concrete functions like push() to know how much to grow the memory. This element_size also helps in verifying that the ivec is correct when passing from C to Rust. Signed-off-by: Ezekiel Newren Signed-off-by: Junio C Hamano --- Makefile | 1 + compat/ivec.c | 113 ++++++++++++++++++++++++++++++++++++++++++++++++++ compat/ivec.h | 52 +++++++++++++++++++++++ meson.build | 1 + 4 files changed, 167 insertions(+) create mode 100644 compat/ivec.c create mode 100644 compat/ivec.h diff --git a/Makefile b/Makefile index 89d8d73ec0..f923b307d6 100644 --- a/Makefile +++ b/Makefile @@ -1107,6 +1107,7 @@ LIB_OBJS += commit-reach.o LIB_OBJS += commit.o LIB_OBJS += common-exit.o LIB_OBJS += common-init.o +LIB_OBJS += compat/ivec.o LIB_OBJS += compat/nonblock.o LIB_OBJS += compat/obstack.o LIB_OBJS += compat/open.o diff --git a/compat/ivec.c b/compat/ivec.c new file mode 100644 index 0000000000..0a777e78dc --- /dev/null +++ b/compat/ivec.c @@ -0,0 +1,113 @@ +#include "ivec.h" + +struct IVec_c_void { + void *ptr; + size_t length; + size_t capacity; + size_t element_size; +}; + +static void _set_capacity(void *self_, size_t new_capacity) +{ + struct IVec_c_void *self = self_; + + if (new_capacity == self->capacity) { + return; + } + if (new_capacity == 0) { + free(self->ptr); + self->ptr = NULL; + } else { + self->ptr = realloc(self->ptr, new_capacity * self->element_size); + } + self->capacity = new_capacity; +} + + +void ivec_init(void *self_, size_t element_size) +{ + struct IVec_c_void *self = self_; + + self->ptr = NULL; + self->length = 0; + self->capacity = 0; + self->element_size = element_size; +} + +void ivec_zero(void *self_, size_t capacity) +{ + struct IVec_c_void *self = self_; + + self->ptr = calloc(capacity, self->element_size); + self->length = capacity; + self->capacity = capacity; + // DO NOT MODIFY element_size!!! +} + +void ivec_reserve_exact(void *self_, size_t additional) +{ + struct IVec_c_void *self = self_; + + _set_capacity(self, self->capacity + additional); +} + +void ivec_reserve(void *self_, size_t additional) +{ + struct IVec_c_void *self = self_; + + size_t growby = 128; + if (self->capacity > growby) + growby = self->capacity; + if (additional > growby) + growby = additional; + + _set_capacity(self, self->capacity + growby); +} + +void ivec_shrink_to_fit(void *self_) +{ + struct IVec_c_void *self = self_; + + _set_capacity(self, self->length); +} + +void ivec_push(void *self_, const void *value) +{ + struct IVec_c_void *self = self_; + void *dst = NULL; + + if (self->length == self->capacity) + ivec_reserve(self, 1); + + dst = (uint8_t*)self->ptr + self->length * self->element_size; + memcpy(dst, value, self->element_size); + self->length++; +} + +void ivec_free(void *self_) +{ + struct IVec_c_void *self = self_; + + free(self->ptr); + self->ptr = NULL; + self->length = 0; + self->capacity = 0; + // DO NOT MODIFY element_size!!! +} + +void ivec_move(void *src_, void *dst_) +{ + struct IVec_c_void *src = src_; + struct IVec_c_void *dst = dst_; + + ivec_free(dst); + dst->ptr = src->ptr; + dst->length = src->length; + dst->capacity = src->capacity; + // DO NOT MODIFY element_size!!! + + src->ptr = NULL; + src->length = 0; + src->capacity = 0; + // DO NOT MODIFY element_size!!! +} diff --git a/compat/ivec.h b/compat/ivec.h new file mode 100644 index 0000000000..654a05c506 --- /dev/null +++ b/compat/ivec.h @@ -0,0 +1,52 @@ +#ifndef IVEC_H +#define IVEC_H + +#include + +#define IVEC_INIT(variable) ivec_init(&(variable), sizeof(*(variable).ptr)) + +#ifndef CBINDGEN +#define DEFINE_IVEC_TYPE(type, suffix) \ +struct IVec_##suffix { \ + type* ptr; \ + size_t length; \ + size_t capacity; \ + size_t element_size; \ +} + +DEFINE_IVEC_TYPE(bool, bool); + +DEFINE_IVEC_TYPE(uint8_t, u8); +DEFINE_IVEC_TYPE(uint16_t, u16); +DEFINE_IVEC_TYPE(uint32_t, u32); +DEFINE_IVEC_TYPE(uint64_t, u64); + +DEFINE_IVEC_TYPE(int8_t, i8); +DEFINE_IVEC_TYPE(int16_t, i16); +DEFINE_IVEC_TYPE(int32_t, i32); +DEFINE_IVEC_TYPE(int64_t, i64); + +DEFINE_IVEC_TYPE(float, f32); +DEFINE_IVEC_TYPE(double, f64); + +DEFINE_IVEC_TYPE(size_t, usize); +DEFINE_IVEC_TYPE(ssize_t, isize); +#endif + +void ivec_init(void *self_, size_t element_size); + +void ivec_zero(void *self_, size_t capacity); + +void ivec_reserve_exact(void *self_, size_t additional); + +void ivec_reserve(void *self_, size_t additional); + +void ivec_shrink_to_fit(void *self_); + +void ivec_push(void *self_, const void *value); + +void ivec_free(void *self_); + +void ivec_move(void *src, void *dst); + +#endif /* IVEC_H */ diff --git a/meson.build b/meson.build index dd52efd1c8..42ac0c8c42 100644 --- a/meson.build +++ b/meson.build @@ -302,6 +302,7 @@ libgit_sources = [ 'commit.c', 'common-exit.c', 'common-init.c', + 'compat/ivec.c', 'compat/nonblock.c', 'compat/obstack.c', 'compat/open.c', From bcf8fd63453d9a988e14248b23260a8cf4a03f63 Mon Sep 17 00:00:00 2001 From: Ezekiel Newren Date: Fri, 2 Jan 2026 18:52:16 +0000 Subject: [PATCH 02/11] xdiff: make classic diff explicit by creating xdl_do_classic_diff() Later patches will prepare xdl_cleanup_records() to be moved into xdiffi.c since only the classic diff uses that function. Signed-off-by: Ezekiel Newren Signed-off-by: Junio C Hamano --- xdiff/xdiffi.c | 43 +++++++++++++++++++++++++++---------------- xdiff/xdiffi.h | 1 + 2 files changed, 28 insertions(+), 16 deletions(-) diff --git a/xdiff/xdiffi.c b/xdiff/xdiffi.c index 4376f943db..e3196c7245 100644 --- a/xdiff/xdiffi.c +++ b/xdiff/xdiffi.c @@ -311,26 +311,13 @@ int xdl_recs_cmp(xdfile_t *xdf1, long off1, long lim1, } -int xdl_do_diff(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp, - xdfenv_t *xe) { +int xdl_do_classic_diff(xdfenv_t *xe, uint64_t flags) +{ long ndiags; long *kvd, *kvdf, *kvdb; xdalgoenv_t xenv; int res; - if (xdl_prepare_env(mf1, mf2, xpp, xe) < 0) - return -1; - - if (XDF_DIFF_ALG(xpp->flags) == XDF_PATIENCE_DIFF) { - res = xdl_do_patience_diff(xpp, xe); - goto out; - } - - if (XDF_DIFF_ALG(xpp->flags) == XDF_HISTOGRAM_DIFF) { - res = xdl_do_histogram_diff(xpp, xe); - goto out; - } - /* * Allocate and setup K vectors to be used by the differential * algorithm. @@ -355,9 +342,33 @@ int xdl_do_diff(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp, xenv.heur_min = XDL_HEUR_MIN_COST; res = xdl_recs_cmp(&xe->xdf1, 0, xe->xdf1.nreff, &xe->xdf2, 0, xe->xdf2.nreff, - kvdf, kvdb, (xpp->flags & XDF_NEED_MINIMAL) != 0, + kvdf, kvdb, (flags & XDF_NEED_MINIMAL) != 0, &xenv); + xdl_free(kvd); + + return res; +} + + +int xdl_do_diff(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp, + xdfenv_t *xe) { + int res; + + if (xdl_prepare_env(mf1, mf2, xpp, xe) < 0) + return -1; + + if (XDF_DIFF_ALG(xpp->flags) == XDF_PATIENCE_DIFF) { + res = xdl_do_patience_diff(xpp, xe); + goto out; + } + + if (XDF_DIFF_ALG(xpp->flags) == XDF_HISTOGRAM_DIFF) { + res = xdl_do_histogram_diff(xpp, xe); + goto out; + } + + res = xdl_do_classic_diff(xe, xpp->flags); out: if (res < 0) xdl_free_env(xe); diff --git a/xdiff/xdiffi.h b/xdiff/xdiffi.h index 49e52c67f9..8bf4c20373 100644 --- a/xdiff/xdiffi.h +++ b/xdiff/xdiffi.h @@ -42,6 +42,7 @@ typedef struct s_xdchange { int xdl_recs_cmp(xdfile_t *xdf1, long off1, long lim1, xdfile_t *xdf2, long off2, long lim2, long *kvdf, long *kvdb, int need_min, xdalgoenv_t *xenv); +int xdl_do_classic_diff(xdfenv_t *xe, uint64_t flags); int xdl_do_diff(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp, xdfenv_t *xe); int xdl_change_compact(xdfile_t *xdf, xdfile_t *xdfo, long flags); From db8a50ca6b92fd8222b90d6873b9f7e3766f224a Mon Sep 17 00:00:00 2001 From: Ezekiel Newren Date: Fri, 2 Jan 2026 18:52:17 +0000 Subject: [PATCH 03/11] xdiff: don't waste time guessing the number of lines All lines must be read anyway, so classify them after they're read in. Also move the memset() into xdl_init_classifier(). Signed-off-by: Ezekiel Newren Signed-off-by: Junio C Hamano --- xdiff/xprepare.c | 54 ++++++++++++++++++++---------------------------- xdiff/xutils.c | 20 ------------------ xdiff/xutils.h | 1 - 3 files changed, 22 insertions(+), 53 deletions(-) diff --git a/xdiff/xprepare.c b/xdiff/xprepare.c index 34c82e4f8e..96a32cc5e9 100644 --- a/xdiff/xprepare.c +++ b/xdiff/xprepare.c @@ -26,8 +26,6 @@ #define XDL_KPDIS_RUN 4 #define XDL_MAX_EQLIMIT 1024 #define XDL_SIMSCAN_WINDOW 100 -#define XDL_GUESS_NLINES1 256 -#define XDL_GUESS_NLINES2 20 #define DISCARD 0 #define KEEP 1 @@ -55,6 +53,8 @@ typedef struct s_xdlclassifier { static int xdl_init_classifier(xdlclassifier_t *cf, long size, long flags) { + memset(cf, 0, sizeof(xdlclassifier_t)); + cf->flags = flags; cf->hbits = xdl_hashbits((unsigned int) size); @@ -134,12 +134,12 @@ static void xdl_free_ctx(xdfile_t *xdf) } -static int xdl_prepare_ctx(unsigned int pass, mmfile_t *mf, long narec, xpparam_t const *xpp, - xdlclassifier_t *cf, xdfile_t *xdf) { +static int xdl_prepare_ctx(mmfile_t *mf, xdfile_t *xdf, uint64_t flags) { long bsize; uint64_t hav; uint8_t const *blk, *cur, *top, *prev; xrecord_t *crec; + long narec = 8; xdf->reference_index = NULL; xdf->changed = NULL; @@ -152,23 +152,21 @@ static int xdl_prepare_ctx(unsigned int pass, mmfile_t *mf, long narec, xpparam_ if ((cur = blk = xdl_mmfile_first(mf, &bsize))) { for (top = blk + bsize; cur < top; ) { prev = cur; - hav = xdl_hash_record(&cur, top, xpp->flags); + hav = xdl_hash_record(&cur, top, flags); if (XDL_ALLOC_GROW(xdf->recs, (long)xdf->nrec + 1, narec)) goto abort; crec = &xdf->recs[xdf->nrec++]; crec->ptr = prev; crec->size = cur - prev; crec->line_hash = hav; - if (xdl_classify_record(pass, cf, crec) < 0) - goto abort; } } if (!XDL_CALLOC_ARRAY(xdf->changed, xdf->nrec + 2)) goto abort; - if ((XDF_DIFF_ALG(xpp->flags) != XDF_PATIENCE_DIFF) && - (XDF_DIFF_ALG(xpp->flags) != XDF_HISTOGRAM_DIFF)) { + if ((XDF_DIFF_ALG(flags) != XDF_PATIENCE_DIFF) && + (XDF_DIFF_ALG(flags) != XDF_HISTOGRAM_DIFF)) { if (!XDL_ALLOC_ARRAY(xdf->reference_index, xdf->nrec + 1)) goto abort; } @@ -381,39 +379,31 @@ static int xdl_optimize_ctxs(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xdf2 int xdl_prepare_env(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp, xdfenv_t *xe) { - long enl1, enl2, sample; xdlclassifier_t cf; - memset(&cf, 0, sizeof(cf)); + if (xdl_prepare_ctx(mf1, &xe->xdf1, xpp->flags) < 0) { - /* - * For histogram diff, we can afford a smaller sample size and - * thus a poorer estimate of the number of lines, as the hash - * table (rhash) won't be filled up/grown. The number of lines - * (nrecs) will be updated correctly anyway by - * xdl_prepare_ctx(). - */ - sample = (XDF_DIFF_ALG(xpp->flags) == XDF_HISTOGRAM_DIFF - ? XDL_GUESS_NLINES2 : XDL_GUESS_NLINES1); - - enl1 = xdl_guess_lines(mf1, sample) + 1; - enl2 = xdl_guess_lines(mf2, sample) + 1; - - if (xdl_init_classifier(&cf, enl1 + enl2 + 1, xpp->flags) < 0) - return -1; - - if (xdl_prepare_ctx(1, mf1, enl1, xpp, &cf, &xe->xdf1) < 0) { - - xdl_free_classifier(&cf); return -1; } - if (xdl_prepare_ctx(2, mf2, enl2, xpp, &cf, &xe->xdf2) < 0) { + if (xdl_prepare_ctx(mf2, &xe->xdf2, xpp->flags) < 0) { xdl_free_ctx(&xe->xdf1); - xdl_free_classifier(&cf); return -1; } + if (xdl_init_classifier(&cf, xe->xdf1.nrec + xe->xdf2.nrec + 1, xpp->flags) < 0) + return -1; + + for (size_t i = 0; i < xe->xdf1.nrec; i++) { + xrecord_t *rec = &xe->xdf1.recs[i]; + xdl_classify_record(1, &cf, rec); + } + + for (size_t i = 0; i < xe->xdf2.nrec; i++) { + xrecord_t *rec = &xe->xdf2.recs[i]; + xdl_classify_record(2, &cf, rec); + } + if ((XDF_DIFF_ALG(xpp->flags) != XDF_PATIENCE_DIFF) && (XDF_DIFF_ALG(xpp->flags) != XDF_HISTOGRAM_DIFF) && xdl_optimize_ctxs(&cf, &xe->xdf1, &xe->xdf2) < 0) { diff --git a/xdiff/xutils.c b/xdiff/xutils.c index 77ee1ad9c8..b3d51197c1 100644 --- a/xdiff/xutils.c +++ b/xdiff/xutils.c @@ -118,26 +118,6 @@ void *xdl_cha_alloc(chastore_t *cha) { return data; } -long xdl_guess_lines(mmfile_t *mf, long sample) { - long nl = 0, size, tsize = 0; - char const *data, *cur, *top; - - if ((cur = data = xdl_mmfile_first(mf, &size))) { - for (top = data + size; nl < sample && cur < top; ) { - nl++; - if (!(cur = memchr(cur, '\n', top - cur))) - cur = top; - else - cur++; - } - tsize += (long) (cur - data); - } - - if (nl && tsize) - nl = xdl_mmfile_size(mf) / (tsize / nl); - - return nl + 1; -} int xdl_blankline(const char *line, long size, long flags) { diff --git a/xdiff/xutils.h b/xdiff/xutils.h index 615b4a9d35..d800840dd0 100644 --- a/xdiff/xutils.h +++ b/xdiff/xutils.h @@ -31,7 +31,6 @@ int xdl_emit_diffrec(char const *rec, long size, char const *pre, long psize, int xdl_cha_init(chastore_t *cha, long isize, long icount); void xdl_cha_free(chastore_t *cha); void *xdl_cha_alloc(chastore_t *cha); -long xdl_guess_lines(mmfile_t *mf, long sample); int xdl_blankline(const char *line, long size, long flags); int xdl_recmatch(const char *l1, long s1, const char *l2, long s2, long flags); uint64_t xdl_hash_record_verbatim(uint8_t const **data, uint8_t const *top); From 319871c177c7273710ff32c66d9ed10a2ee5d2c8 Mon Sep 17 00:00:00 2001 From: Ezekiel Newren Date: Fri, 2 Jan 2026 18:52:18 +0000 Subject: [PATCH 04/11] xdiff: let patience and histogram benefit from xdl_trim_ends() The patience diff is set up the exact same way as histogram, see xdl_do_historgram_diff() in xhistogram.c. xdl_optimize_ctxs() is redundant now, delete it. Signed-off-by: Ezekiel Newren Signed-off-by: Junio C Hamano --- xdiff/xpatience.c | 4 +++- xdiff/xprepare.c | 14 ++------------ 2 files changed, 5 insertions(+), 13 deletions(-) diff --git a/xdiff/xpatience.c b/xdiff/xpatience.c index 9580d18032..2bce07cf48 100644 --- a/xdiff/xpatience.c +++ b/xdiff/xpatience.c @@ -373,5 +373,7 @@ static int patience_diff(xpparam_t const *xpp, xdfenv_t *env, int xdl_do_patience_diff(xpparam_t const *xpp, xdfenv_t *env) { - return patience_diff(xpp, env, 1, (int)env->xdf1.nrec, 1, (int)env->xdf2.nrec); + return patience_diff(xpp, env, + env->xdf1.dstart + 1, env->xdf1.dend - env->xdf1.dstart + 1, + env->xdf2.dstart + 1, env->xdf2.dend - env->xdf2.dstart + 1); } diff --git a/xdiff/xprepare.c b/xdiff/xprepare.c index 96a32cc5e9..0d7d9f6146 100644 --- a/xdiff/xprepare.c +++ b/xdiff/xprepare.c @@ -366,17 +366,6 @@ static int xdl_trim_ends(xdfile_t *xdf1, xdfile_t *xdf2) { } -static int xdl_optimize_ctxs(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xdf2) { - - if (xdl_trim_ends(xdf1, xdf2) < 0 || - xdl_cleanup_records(cf, xdf1, xdf2) < 0) { - - return -1; - } - - return 0; -} - int xdl_prepare_env(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp, xdfenv_t *xe) { xdlclassifier_t cf; @@ -404,9 +393,10 @@ int xdl_prepare_env(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp, xdl_classify_record(2, &cf, rec); } + xdl_trim_ends(&xe->xdf1, &xe->xdf2); if ((XDF_DIFF_ALG(xpp->flags) != XDF_PATIENCE_DIFF) && (XDF_DIFF_ALG(xpp->flags) != XDF_HISTOGRAM_DIFF) && - xdl_optimize_ctxs(&cf, &xe->xdf1, &xe->xdf2) < 0) { + xdl_cleanup_records(&cf, &xe->xdf1, &xe->xdf2) < 0) { xdl_free_ctx(&xe->xdf2); xdl_free_ctx(&xe->xdf1); From fde7f6ebb113b9a27307d2e51dd20f8e78c81ea4 Mon Sep 17 00:00:00 2001 From: Ezekiel Newren Date: Fri, 2 Jan 2026 18:52:19 +0000 Subject: [PATCH 05/11] xdiff: use xdfenv_t in xdl_trim_ends() and xdl_cleanup_records() View with --color-words. Prepare these functions to use the fields: delta_start, delta_end. A future patch will add these fields to xdfenv_t. Signed-off-by: Ezekiel Newren Signed-off-by: Junio C Hamano --- xdiff/xprepare.c | 60 ++++++++++++++++++++++++------------------------ 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/xdiff/xprepare.c b/xdiff/xprepare.c index 0d7d9f6146..0acb3437d4 100644 --- a/xdiff/xprepare.c +++ b/xdiff/xprepare.c @@ -261,7 +261,7 @@ static bool xdl_clean_mmatch(uint8_t const *action, long i, long s, long e) { * matches on the other file. Also, lines that have multiple matches * might be potentially discarded if they appear in a run of discardable. */ -static int xdl_cleanup_records(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xdf2) { +static int xdl_cleanup_records(xdlclassifier_t *cf, xdfenv_t *xe) { long i, nm, mlim; xrecord_t *recs; xdlclass_t *rcrec; @@ -273,11 +273,11 @@ static int xdl_cleanup_records(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xd * Create temporary arrays that will help us decide if * changed[i] should remain false, or become true. */ - if (!XDL_CALLOC_ARRAY(action1, xdf1->nrec + 1)) { + if (!XDL_CALLOC_ARRAY(action1, xe->xdf1.nrec + 1)) { ret = -1; goto cleanup; } - if (!XDL_CALLOC_ARRAY(action2, xdf2->nrec + 1)) { + if (!XDL_CALLOC_ARRAY(action2, xe->xdf2.nrec + 1)) { ret = -1; goto cleanup; } @@ -285,17 +285,17 @@ static int xdl_cleanup_records(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xd /* * Initialize temporary arrays with DISCARD, KEEP, or INVESTIGATE. */ - if ((mlim = xdl_bogosqrt((long)xdf1->nrec)) > XDL_MAX_EQLIMIT) + if ((mlim = xdl_bogosqrt((long)xe->xdf1.nrec)) > XDL_MAX_EQLIMIT) mlim = XDL_MAX_EQLIMIT; - for (i = xdf1->dstart, recs = &xdf1->recs[xdf1->dstart]; i <= xdf1->dend; i++, recs++) { + for (i = xe->xdf1.dstart, recs = &xe->xdf1.recs[xe->xdf1.dstart]; i <= xe->xdf1.dend; i++, recs++) { rcrec = cf->rcrecs[recs->minimal_perfect_hash]; nm = rcrec ? rcrec->len2 : 0; action1[i] = (nm == 0) ? DISCARD: (nm >= mlim && !need_min) ? INVESTIGATE: KEEP; } - if ((mlim = xdl_bogosqrt((long)xdf2->nrec)) > XDL_MAX_EQLIMIT) + if ((mlim = xdl_bogosqrt((long)xe->xdf2.nrec)) > XDL_MAX_EQLIMIT) mlim = XDL_MAX_EQLIMIT; - for (i = xdf2->dstart, recs = &xdf2->recs[xdf2->dstart]; i <= xdf2->dend; i++, recs++) { + for (i = xe->xdf2.dstart, recs = &xe->xdf2.recs[xe->xdf2.dstart]; i <= xe->xdf2.dend; i++, recs++) { rcrec = cf->rcrecs[recs->minimal_perfect_hash]; nm = rcrec ? rcrec->len1 : 0; action2[i] = (nm == 0) ? DISCARD: (nm >= mlim && !need_min) ? INVESTIGATE: KEEP; @@ -305,27 +305,27 @@ static int xdl_cleanup_records(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xd * Use temporary arrays to decide if changed[i] should remain * false, or become true. */ - xdf1->nreff = 0; - for (i = xdf1->dstart, recs = &xdf1->recs[xdf1->dstart]; - i <= xdf1->dend; i++, recs++) { + xe->xdf1.nreff = 0; + for (i = xe->xdf1.dstart, recs = &xe->xdf1.recs[xe->xdf1.dstart]; + i <= xe->xdf1.dend; i++, recs++) { if (action1[i] == KEEP || - (action1[i] == INVESTIGATE && !xdl_clean_mmatch(action1, i, xdf1->dstart, xdf1->dend))) { - xdf1->reference_index[xdf1->nreff++] = i; + (action1[i] == INVESTIGATE && !xdl_clean_mmatch(action1, i, xe->xdf1.dstart, xe->xdf1.dend))) { + xe->xdf1.reference_index[xe->xdf1.nreff++] = i; /* changed[i] remains false, i.e. keep */ } else - xdf1->changed[i] = true; + xe->xdf1.changed[i] = true; /* i.e. discard */ } - xdf2->nreff = 0; - for (i = xdf2->dstart, recs = &xdf2->recs[xdf2->dstart]; - i <= xdf2->dend; i++, recs++) { + xe->xdf2.nreff = 0; + for (i = xe->xdf2.dstart, recs = &xe->xdf2.recs[xe->xdf2.dstart]; + i <= xe->xdf2.dend; i++, recs++) { if (action2[i] == KEEP || - (action2[i] == INVESTIGATE && !xdl_clean_mmatch(action2, i, xdf2->dstart, xdf2->dend))) { - xdf2->reference_index[xdf2->nreff++] = i; + (action2[i] == INVESTIGATE && !xdl_clean_mmatch(action2, i, xe->xdf2.dstart, xe->xdf2.dend))) { + xe->xdf2.reference_index[xe->xdf2.nreff++] = i; /* changed[i] remains false, i.e. keep */ } else - xdf2->changed[i] = true; + xe->xdf2.changed[i] = true; /* i.e. discard */ } @@ -340,27 +340,27 @@ cleanup: /* * Early trim initial and terminal matching records. */ -static int xdl_trim_ends(xdfile_t *xdf1, xdfile_t *xdf2) { +static int xdl_trim_ends(xdfenv_t *xe) { long i, lim; xrecord_t *recs1, *recs2; - recs1 = xdf1->recs; - recs2 = xdf2->recs; - for (i = 0, lim = (long)XDL_MIN(xdf1->nrec, xdf2->nrec); i < lim; + recs1 = xe->xdf1.recs; + recs2 = xe->xdf2.recs; + for (i = 0, lim = (long)XDL_MIN(xe->xdf1.nrec, xe->xdf2.nrec); i < lim; i++, recs1++, recs2++) if (recs1->minimal_perfect_hash != recs2->minimal_perfect_hash) break; - xdf1->dstart = xdf2->dstart = i; + xe->xdf1.dstart = xe->xdf2.dstart = i; - recs1 = xdf1->recs + xdf1->nrec - 1; - recs2 = xdf2->recs + xdf2->nrec - 1; + recs1 = xe->xdf1.recs + xe->xdf1.nrec - 1; + recs2 = xe->xdf2.recs + xe->xdf2.nrec - 1; for (lim -= i, i = 0; i < lim; i++, recs1--, recs2--) if (recs1->minimal_perfect_hash != recs2->minimal_perfect_hash) break; - xdf1->dend = (long)xdf1->nrec - i - 1; - xdf2->dend = (long)xdf2->nrec - i - 1; + xe->xdf1.dend = (long)xe->xdf1.nrec - i - 1; + xe->xdf2.dend = (long)xe->xdf2.nrec - i - 1; return 0; } @@ -393,10 +393,10 @@ int xdl_prepare_env(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp, xdl_classify_record(2, &cf, rec); } - xdl_trim_ends(&xe->xdf1, &xe->xdf2); + xdl_trim_ends(xe); if ((XDF_DIFF_ALG(xpp->flags) != XDF_PATIENCE_DIFF) && (XDF_DIFF_ALG(xpp->flags) != XDF_HISTOGRAM_DIFF) && - xdl_cleanup_records(&cf, &xe->xdf1, &xe->xdf2) < 0) { + xdl_cleanup_records(&cf, xe) < 0) { xdl_free_ctx(&xe->xdf2); xdl_free_ctx(&xe->xdf1); From 0e67fd8b4b702408afae82fcc70dbb70a4452db7 Mon Sep 17 00:00:00 2001 From: Ezekiel Newren Date: Fri, 2 Jan 2026 18:52:20 +0000 Subject: [PATCH 06/11] xdiff: cleanup xdl_trim_ends() This patch is best viewed with a before and after of the whole function. Rather than using 2 pointers and walking them. Use direct indexing with local variables of what is being compared to make it easier to follow along. Signed-off-by: Ezekiel Newren Signed-off-by: Junio C Hamano --- xdiff/xprepare.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/xdiff/xprepare.c b/xdiff/xprepare.c index 0acb3437d4..06b6a6f804 100644 --- a/xdiff/xprepare.c +++ b/xdiff/xprepare.c @@ -340,29 +340,29 @@ cleanup: /* * Early trim initial and terminal matching records. */ -static int xdl_trim_ends(xdfenv_t *xe) { - long i, lim; - xrecord_t *recs1, *recs2; +static void xdl_trim_ends(xdfenv_t *xe) +{ + size_t lim = XDL_MIN(xe->xdf1.nrec, xe->xdf2.nrec); - recs1 = xe->xdf1.recs; - recs2 = xe->xdf2.recs; - for (i = 0, lim = (long)XDL_MIN(xe->xdf1.nrec, xe->xdf2.nrec); i < lim; - i++, recs1++, recs2++) - if (recs1->minimal_perfect_hash != recs2->minimal_perfect_hash) + for (size_t i = 0; i < lim; i++) { + size_t mph1 = xe->xdf1.recs[i].minimal_perfect_hash; + size_t mph2 = xe->xdf2.recs[i].minimal_perfect_hash; + if (mph1 != mph2) { + xe->xdf1.dstart = xe->xdf2.dstart = (ssize_t)i; + lim -= i; break; + } + } - xe->xdf1.dstart = xe->xdf2.dstart = i; - - recs1 = xe->xdf1.recs + xe->xdf1.nrec - 1; - recs2 = xe->xdf2.recs + xe->xdf2.nrec - 1; - for (lim -= i, i = 0; i < lim; i++, recs1--, recs2--) - if (recs1->minimal_perfect_hash != recs2->minimal_perfect_hash) + for (size_t i = 0; i < lim; i++) { + size_t mph1 = xe->xdf1.recs[xe->xdf1.nrec - 1 - i].minimal_perfect_hash; + size_t mph2 = xe->xdf2.recs[xe->xdf2.nrec - 1 - i].minimal_perfect_hash; + if (mph1 != mph2) { + xe->xdf1.dend = xe->xdf1.nrec - 1 - i; + xe->xdf2.dend = xe->xdf2.nrec - 1 - i; break; - - xe->xdf1.dend = (long)xe->xdf1.nrec - i - 1; - xe->xdf2.dend = (long)xe->xdf2.nrec - i - 1; - - return 0; + } + } } From d71dac1e231765c687aa8c93c707c1f50aff2091 Mon Sep 17 00:00:00 2001 From: Ezekiel Newren Date: Fri, 2 Jan 2026 18:52:21 +0000 Subject: [PATCH 07/11] xdiff: replace xdfile_t.dstart with xdfenv_t.delta_start Placing delta_start in xdfenv_t instead of xdfile_t provides a more appropriate context since this variable only makes sense with a pair of files. View with --color-words. Signed-off-by: Ezekiel Newren Signed-off-by: Junio C Hamano --- xdiff/xhistogram.c | 4 ++-- xdiff/xpatience.c | 4 ++-- xdiff/xprepare.c | 17 +++++++++-------- xdiff/xtypes.h | 3 ++- 4 files changed, 15 insertions(+), 13 deletions(-) diff --git a/xdiff/xhistogram.c b/xdiff/xhistogram.c index 5ae1282c27..eb6a52d9ba 100644 --- a/xdiff/xhistogram.c +++ b/xdiff/xhistogram.c @@ -365,6 +365,6 @@ out: int xdl_do_histogram_diff(xpparam_t const *xpp, xdfenv_t *env) { return histogram_diff(xpp, env, - env->xdf1.dstart + 1, env->xdf1.dend - env->xdf1.dstart + 1, - env->xdf2.dstart + 1, env->xdf2.dend - env->xdf2.dstart + 1); + env->delta_start + 1, env->xdf1.dend - env->delta_start + 1, + env->delta_start + 1, env->xdf2.dend - env->delta_start + 1); } diff --git a/xdiff/xpatience.c b/xdiff/xpatience.c index 2bce07cf48..bd0ffbb417 100644 --- a/xdiff/xpatience.c +++ b/xdiff/xpatience.c @@ -374,6 +374,6 @@ static int patience_diff(xpparam_t const *xpp, xdfenv_t *env, int xdl_do_patience_diff(xpparam_t const *xpp, xdfenv_t *env) { return patience_diff(xpp, env, - env->xdf1.dstart + 1, env->xdf1.dend - env->xdf1.dstart + 1, - env->xdf2.dstart + 1, env->xdf2.dend - env->xdf2.dstart + 1); + env->delta_start + 1, env->xdf1.dend - env->delta_start + 1, + env->delta_start + 1, env->xdf2.dend - env->delta_start + 1); } diff --git a/xdiff/xprepare.c b/xdiff/xprepare.c index 06b6a6f804..e88468e74c 100644 --- a/xdiff/xprepare.c +++ b/xdiff/xprepare.c @@ -173,7 +173,6 @@ static int xdl_prepare_ctx(mmfile_t *mf, xdfile_t *xdf, uint64_t flags) { xdf->changed += 1; xdf->nreff = 0; - xdf->dstart = 0; xdf->dend = xdf->nrec - 1; return 0; @@ -287,7 +286,7 @@ static int xdl_cleanup_records(xdlclassifier_t *cf, xdfenv_t *xe) { */ if ((mlim = xdl_bogosqrt((long)xe->xdf1.nrec)) > XDL_MAX_EQLIMIT) mlim = XDL_MAX_EQLIMIT; - for (i = xe->xdf1.dstart, recs = &xe->xdf1.recs[xe->xdf1.dstart]; i <= xe->xdf1.dend; i++, recs++) { + for (i = xe->delta_start, recs = &xe->xdf1.recs[xe->delta_start]; i <= xe->xdf1.dend; i++, recs++) { rcrec = cf->rcrecs[recs->minimal_perfect_hash]; nm = rcrec ? rcrec->len2 : 0; action1[i] = (nm == 0) ? DISCARD: (nm >= mlim && !need_min) ? INVESTIGATE: KEEP; @@ -295,7 +294,7 @@ static int xdl_cleanup_records(xdlclassifier_t *cf, xdfenv_t *xe) { if ((mlim = xdl_bogosqrt((long)xe->xdf2.nrec)) > XDL_MAX_EQLIMIT) mlim = XDL_MAX_EQLIMIT; - for (i = xe->xdf2.dstart, recs = &xe->xdf2.recs[xe->xdf2.dstart]; i <= xe->xdf2.dend; i++, recs++) { + for (i = xe->delta_start, recs = &xe->xdf2.recs[xe->delta_start]; i <= xe->xdf2.dend; i++, recs++) { rcrec = cf->rcrecs[recs->minimal_perfect_hash]; nm = rcrec ? rcrec->len1 : 0; action2[i] = (nm == 0) ? DISCARD: (nm >= mlim && !need_min) ? INVESTIGATE: KEEP; @@ -306,10 +305,10 @@ static int xdl_cleanup_records(xdlclassifier_t *cf, xdfenv_t *xe) { * false, or become true. */ xe->xdf1.nreff = 0; - for (i = xe->xdf1.dstart, recs = &xe->xdf1.recs[xe->xdf1.dstart]; + for (i = xe->delta_start, recs = &xe->xdf1.recs[xe->delta_start]; i <= xe->xdf1.dend; i++, recs++) { if (action1[i] == KEEP || - (action1[i] == INVESTIGATE && !xdl_clean_mmatch(action1, i, xe->xdf1.dstart, xe->xdf1.dend))) { + (action1[i] == INVESTIGATE && !xdl_clean_mmatch(action1, i, xe->delta_start, xe->xdf1.dend))) { xe->xdf1.reference_index[xe->xdf1.nreff++] = i; /* changed[i] remains false, i.e. keep */ } else @@ -318,10 +317,10 @@ static int xdl_cleanup_records(xdlclassifier_t *cf, xdfenv_t *xe) { } xe->xdf2.nreff = 0; - for (i = xe->xdf2.dstart, recs = &xe->xdf2.recs[xe->xdf2.dstart]; + for (i = xe->delta_start, recs = &xe->xdf2.recs[xe->delta_start]; i <= xe->xdf2.dend; i++, recs++) { if (action2[i] == KEEP || - (action2[i] == INVESTIGATE && !xdl_clean_mmatch(action2, i, xe->xdf2.dstart, xe->xdf2.dend))) { + (action2[i] == INVESTIGATE && !xdl_clean_mmatch(action2, i, xe->delta_start, xe->xdf2.dend))) { xe->xdf2.reference_index[xe->xdf2.nreff++] = i; /* changed[i] remains false, i.e. keep */ } else @@ -348,7 +347,7 @@ static void xdl_trim_ends(xdfenv_t *xe) size_t mph1 = xe->xdf1.recs[i].minimal_perfect_hash; size_t mph2 = xe->xdf2.recs[i].minimal_perfect_hash; if (mph1 != mph2) { - xe->xdf1.dstart = xe->xdf2.dstart = (ssize_t)i; + xe->delta_start = (ssize_t)i; lim -= i; break; } @@ -370,6 +369,8 @@ int xdl_prepare_env(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp, xdfenv_t *xe) { xdlclassifier_t cf; + xe->delta_start = 0; + if (xdl_prepare_ctx(mf1, &xe->xdf1, xpp->flags) < 0) { return -1; diff --git a/xdiff/xtypes.h b/xdiff/xtypes.h index 979586f20a..bda1f85eb0 100644 --- a/xdiff/xtypes.h +++ b/xdiff/xtypes.h @@ -48,7 +48,7 @@ typedef struct s_xrecord { typedef struct s_xdfile { xrecord_t *recs; size_t nrec; - ptrdiff_t dstart, dend; + ptrdiff_t dend; bool *changed; size_t *reference_index; size_t nreff; @@ -56,6 +56,7 @@ typedef struct s_xdfile { typedef struct s_xdfenv { xdfile_t xdf1, xdf2; + size_t delta_start; } xdfenv_t; From fee078ad4ced742f436a86dfcf8d10820fbe7b2e Mon Sep 17 00:00:00 2001 From: Ezekiel Newren Date: Fri, 2 Jan 2026 18:52:22 +0000 Subject: [PATCH 08/11] xdiff: replace xdfile_t.dend with xdfenv_t.delta_end View with --color-words. Same argument as delta_start. Signed-off-by: Ezekiel Newren Signed-off-by: Junio C Hamano --- xdiff/xhistogram.c | 7 +++++-- xdiff/xpatience.c | 7 +++++-- xdiff/xprepare.c | 19 ++++++++++--------- xdiff/xtypes.h | 3 +-- 4 files changed, 21 insertions(+), 15 deletions(-) diff --git a/xdiff/xhistogram.c b/xdiff/xhistogram.c index eb6a52d9ba..b4d6f88748 100644 --- a/xdiff/xhistogram.c +++ b/xdiff/xhistogram.c @@ -364,7 +364,10 @@ out: int xdl_do_histogram_diff(xpparam_t const *xpp, xdfenv_t *env) { + ptrdiff_t dend1 = env->xdf1.nrec - 1 - env->delta_end; + ptrdiff_t dend2 = env->xdf2.nrec - 1 - env->delta_end; + return histogram_diff(xpp, env, - env->delta_start + 1, env->xdf1.dend - env->delta_start + 1, - env->delta_start + 1, env->xdf2.dend - env->delta_start + 1); + env->delta_start + 1, dend1 - env->delta_start + 1, + env->delta_start + 1, dend2 - env->delta_start + 1); } diff --git a/xdiff/xpatience.c b/xdiff/xpatience.c index bd0ffbb417..5b8bb34d2b 100644 --- a/xdiff/xpatience.c +++ b/xdiff/xpatience.c @@ -373,7 +373,10 @@ static int patience_diff(xpparam_t const *xpp, xdfenv_t *env, int xdl_do_patience_diff(xpparam_t const *xpp, xdfenv_t *env) { + ptrdiff_t dend1 = env->xdf1.nrec - 1 - env->delta_end; + ptrdiff_t dend2 = env->xdf2.nrec - 1 - env->delta_end; + return patience_diff(xpp, env, - env->delta_start + 1, env->xdf1.dend - env->delta_start + 1, - env->delta_start + 1, env->xdf2.dend - env->delta_start + 1); + env->delta_start + 1, dend1 - env->delta_start + 1, + env->delta_start + 1, dend2 - env->delta_start + 1); } diff --git a/xdiff/xprepare.c b/xdiff/xprepare.c index e88468e74c..d3cdb6ac02 100644 --- a/xdiff/xprepare.c +++ b/xdiff/xprepare.c @@ -173,7 +173,6 @@ static int xdl_prepare_ctx(mmfile_t *mf, xdfile_t *xdf, uint64_t flags) { xdf->changed += 1; xdf->nreff = 0; - xdf->dend = xdf->nrec - 1; return 0; @@ -267,6 +266,8 @@ static int xdl_cleanup_records(xdlclassifier_t *cf, xdfenv_t *xe) { uint8_t *action1 = NULL, *action2 = NULL; bool need_min = !!(cf->flags & XDF_NEED_MINIMAL); int ret = 0; + ptrdiff_t dend1 = xe->xdf1.nrec - 1 - xe->delta_end; + ptrdiff_t dend2 = xe->xdf2.nrec - 1 - xe->delta_end; /* * Create temporary arrays that will help us decide if @@ -286,7 +287,7 @@ static int xdl_cleanup_records(xdlclassifier_t *cf, xdfenv_t *xe) { */ if ((mlim = xdl_bogosqrt((long)xe->xdf1.nrec)) > XDL_MAX_EQLIMIT) mlim = XDL_MAX_EQLIMIT; - for (i = xe->delta_start, recs = &xe->xdf1.recs[xe->delta_start]; i <= xe->xdf1.dend; i++, recs++) { + for (i = xe->delta_start, recs = &xe->xdf1.recs[xe->delta_start]; i <= dend1; i++, recs++) { rcrec = cf->rcrecs[recs->minimal_perfect_hash]; nm = rcrec ? rcrec->len2 : 0; action1[i] = (nm == 0) ? DISCARD: (nm >= mlim && !need_min) ? INVESTIGATE: KEEP; @@ -294,7 +295,7 @@ static int xdl_cleanup_records(xdlclassifier_t *cf, xdfenv_t *xe) { if ((mlim = xdl_bogosqrt((long)xe->xdf2.nrec)) > XDL_MAX_EQLIMIT) mlim = XDL_MAX_EQLIMIT; - for (i = xe->delta_start, recs = &xe->xdf2.recs[xe->delta_start]; i <= xe->xdf2.dend; i++, recs++) { + for (i = xe->delta_start, recs = &xe->xdf2.recs[xe->delta_start]; i <= dend2; i++, recs++) { rcrec = cf->rcrecs[recs->minimal_perfect_hash]; nm = rcrec ? rcrec->len1 : 0; action2[i] = (nm == 0) ? DISCARD: (nm >= mlim && !need_min) ? INVESTIGATE: KEEP; @@ -306,9 +307,9 @@ static int xdl_cleanup_records(xdlclassifier_t *cf, xdfenv_t *xe) { */ xe->xdf1.nreff = 0; for (i = xe->delta_start, recs = &xe->xdf1.recs[xe->delta_start]; - i <= xe->xdf1.dend; i++, recs++) { + i <= dend1; i++, recs++) { if (action1[i] == KEEP || - (action1[i] == INVESTIGATE && !xdl_clean_mmatch(action1, i, xe->delta_start, xe->xdf1.dend))) { + (action1[i] == INVESTIGATE && !xdl_clean_mmatch(action1, i, xe->delta_start, dend1))) { xe->xdf1.reference_index[xe->xdf1.nreff++] = i; /* changed[i] remains false, i.e. keep */ } else @@ -318,9 +319,9 @@ static int xdl_cleanup_records(xdlclassifier_t *cf, xdfenv_t *xe) { xe->xdf2.nreff = 0; for (i = xe->delta_start, recs = &xe->xdf2.recs[xe->delta_start]; - i <= xe->xdf2.dend; i++, recs++) { + i <= dend2; i++, recs++) { if (action2[i] == KEEP || - (action2[i] == INVESTIGATE && !xdl_clean_mmatch(action2, i, xe->delta_start, xe->xdf2.dend))) { + (action2[i] == INVESTIGATE && !xdl_clean_mmatch(action2, i, xe->delta_start, dend2))) { xe->xdf2.reference_index[xe->xdf2.nreff++] = i; /* changed[i] remains false, i.e. keep */ } else @@ -357,8 +358,7 @@ static void xdl_trim_ends(xdfenv_t *xe) size_t mph1 = xe->xdf1.recs[xe->xdf1.nrec - 1 - i].minimal_perfect_hash; size_t mph2 = xe->xdf2.recs[xe->xdf2.nrec - 1 - i].minimal_perfect_hash; if (mph1 != mph2) { - xe->xdf1.dend = xe->xdf1.nrec - 1 - i; - xe->xdf2.dend = xe->xdf2.nrec - 1 - i; + xe->delta_end = i; break; } } @@ -370,6 +370,7 @@ int xdl_prepare_env(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp, xdlclassifier_t cf; xe->delta_start = 0; + xe->delta_end = 0; if (xdl_prepare_ctx(mf1, &xe->xdf1, xpp->flags) < 0) { diff --git a/xdiff/xtypes.h b/xdiff/xtypes.h index bda1f85eb0..a939396064 100644 --- a/xdiff/xtypes.h +++ b/xdiff/xtypes.h @@ -48,7 +48,6 @@ typedef struct s_xrecord { typedef struct s_xdfile { xrecord_t *recs; size_t nrec; - ptrdiff_t dend; bool *changed; size_t *reference_index; size_t nreff; @@ -56,7 +55,7 @@ typedef struct s_xdfile { typedef struct s_xdfenv { xdfile_t xdf1, xdf2; - size_t delta_start; + size_t delta_start, delta_end; } xdfenv_t; From efd314b8e8526fec7df9e5f8a7a2ebd4d6c423d2 Mon Sep 17 00:00:00 2001 From: Ezekiel Newren Date: Fri, 2 Jan 2026 18:52:23 +0000 Subject: [PATCH 09/11] xdiff: remove dependence on xdlclassifier from xdl_cleanup_records() Disentangle xdl_cleanup_records() from the classifier so that it can be moved from xprepare.c into xdiffi.c. The classic diff is the only algorithm that needs to count the number of times each line occurs in each file. Make xdl_cleanup_records() count the number of lines instead of the classifier so it won't slow down patience or histogram. Signed-off-by: Ezekiel Newren Signed-off-by: Junio C Hamano --- xdiff/xprepare.c | 52 +++++++++++++++++++++++++++++++++--------------- xdiff/xtypes.h | 1 + 2 files changed, 37 insertions(+), 16 deletions(-) diff --git a/xdiff/xprepare.c b/xdiff/xprepare.c index d3cdb6ac02..b53a3b80c4 100644 --- a/xdiff/xprepare.c +++ b/xdiff/xprepare.c @@ -21,6 +21,7 @@ */ #include "xinclude.h" +#include "compat/ivec.h" #define XDL_KPDIS_RUN 4 @@ -35,7 +36,6 @@ typedef struct s_xdlclass { struct s_xdlclass *next; xrecord_t rec; long idx; - long len1, len2; } xdlclass_t; typedef struct s_xdlclassifier { @@ -92,7 +92,7 @@ static void xdl_free_classifier(xdlclassifier_t *cf) { } -static int xdl_classify_record(unsigned int pass, xdlclassifier_t *cf, xrecord_t *rec) { +static int xdl_classify_record(xdlclassifier_t *cf, xrecord_t *rec) { size_t hi; xdlclass_t *rcrec; @@ -113,13 +113,10 @@ static int xdl_classify_record(unsigned int pass, xdlclassifier_t *cf, xrecord_t return -1; cf->rcrecs[rcrec->idx] = rcrec; rcrec->rec = *rec; - rcrec->len1 = rcrec->len2 = 0; rcrec->next = cf->rchash[hi]; cf->rchash[hi] = rcrec; } - (pass == 1) ? rcrec->len1++ : rcrec->len2++; - rec->minimal_perfect_hash = (size_t)rcrec->idx; return 0; @@ -253,22 +250,44 @@ static bool xdl_clean_mmatch(uint8_t const *action, long i, long s, long e) { return rpdis1 * XDL_KPDIS_RUN < (rpdis1 + rdis1); } +struct xoccurrence +{ + size_t file1, file2; +}; + + +DEFINE_IVEC_TYPE(struct xoccurrence, xoccurrence); + /* * Try to reduce the problem complexity, discard records that have no * matches on the other file. Also, lines that have multiple matches * might be potentially discarded if they appear in a run of discardable. */ -static int xdl_cleanup_records(xdlclassifier_t *cf, xdfenv_t *xe) { - long i, nm, mlim; +static int xdl_cleanup_records(xdfenv_t *xe, uint64_t flags) { + long i; + size_t nm, mlim; xrecord_t *recs; - xdlclass_t *rcrec; uint8_t *action1 = NULL, *action2 = NULL; - bool need_min = !!(cf->flags & XDF_NEED_MINIMAL); + struct IVec_xoccurrence occ; + bool need_min = !!(flags & XDF_NEED_MINIMAL); int ret = 0; ptrdiff_t dend1 = xe->xdf1.nrec - 1 - xe->delta_end; ptrdiff_t dend2 = xe->xdf2.nrec - 1 - xe->delta_end; + IVEC_INIT(occ); + ivec_zero(&occ, xe->mph_size); + + for (size_t j = 0; j < xe->xdf1.nrec; j++) { + size_t mph1 = xe->xdf1.recs[j].minimal_perfect_hash; + occ.ptr[mph1].file1 += 1; + } + + for (size_t j = 0; j < xe->xdf2.nrec; j++) { + size_t mph2 = xe->xdf2.recs[j].minimal_perfect_hash; + occ.ptr[mph2].file2 += 1; + } + /* * Create temporary arrays that will help us decide if * changed[i] should remain false, or become true. @@ -288,16 +307,14 @@ static int xdl_cleanup_records(xdlclassifier_t *cf, xdfenv_t *xe) { if ((mlim = xdl_bogosqrt((long)xe->xdf1.nrec)) > XDL_MAX_EQLIMIT) mlim = XDL_MAX_EQLIMIT; for (i = xe->delta_start, recs = &xe->xdf1.recs[xe->delta_start]; i <= dend1; i++, recs++) { - rcrec = cf->rcrecs[recs->minimal_perfect_hash]; - nm = rcrec ? rcrec->len2 : 0; + nm = occ.ptr[recs->minimal_perfect_hash].file2; action1[i] = (nm == 0) ? DISCARD: (nm >= mlim && !need_min) ? INVESTIGATE: KEEP; } if ((mlim = xdl_bogosqrt((long)xe->xdf2.nrec)) > XDL_MAX_EQLIMIT) mlim = XDL_MAX_EQLIMIT; for (i = xe->delta_start, recs = &xe->xdf2.recs[xe->delta_start]; i <= dend2; i++, recs++) { - rcrec = cf->rcrecs[recs->minimal_perfect_hash]; - nm = rcrec ? rcrec->len1 : 0; + nm = occ.ptr[recs->minimal_perfect_hash].file1; action2[i] = (nm == 0) ? DISCARD: (nm >= mlim && !need_min) ? INVESTIGATE: KEEP; } @@ -332,6 +349,7 @@ static int xdl_cleanup_records(xdlclassifier_t *cf, xdfenv_t *xe) { cleanup: xdl_free(action1); xdl_free(action2); + ivec_free(&occ); return ret; } @@ -387,18 +405,20 @@ int xdl_prepare_env(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp, for (size_t i = 0; i < xe->xdf1.nrec; i++) { xrecord_t *rec = &xe->xdf1.recs[i]; - xdl_classify_record(1, &cf, rec); + xdl_classify_record(&cf, rec); } for (size_t i = 0; i < xe->xdf2.nrec; i++) { xrecord_t *rec = &xe->xdf2.recs[i]; - xdl_classify_record(2, &cf, rec); + xdl_classify_record(&cf, rec); } + xe->mph_size = cf.count; + xdl_trim_ends(xe); if ((XDF_DIFF_ALG(xpp->flags) != XDF_PATIENCE_DIFF) && (XDF_DIFF_ALG(xpp->flags) != XDF_HISTOGRAM_DIFF) && - xdl_cleanup_records(&cf, xe) < 0) { + xdl_cleanup_records(xe, xpp->flags) < 0) { xdl_free_ctx(&xe->xdf2); xdl_free_ctx(&xe->xdf1); diff --git a/xdiff/xtypes.h b/xdiff/xtypes.h index a939396064..2528bd37e8 100644 --- a/xdiff/xtypes.h +++ b/xdiff/xtypes.h @@ -56,6 +56,7 @@ typedef struct s_xdfile { typedef struct s_xdfenv { xdfile_t xdf1, xdf2; size_t delta_start, delta_end; + size_t mph_size; } xdfenv_t; From 083774849baa226e5377709261860ccc9a1a9c52 Mon Sep 17 00:00:00 2001 From: Ezekiel Newren Date: Fri, 2 Jan 2026 18:52:24 +0000 Subject: [PATCH 10/11] xdiff: move xdl_cleanup_records() from xprepare.c to xdiffi.c Only the classic diff uses xdl_cleanup_records(). Move it, xdl_clean_mmatch(), and the macros to xdiffi.c and call xdl_cleanup_records() inside of xdl_do_classic_diff(). This better organizes the code related to the classic diff. Signed-off-by: Ezekiel Newren Signed-off-by: Junio C Hamano --- xdiff/xdiffi.c | 180 ++++++++++++++++++++++++++++++++++++++++++++ xdiff/xprepare.c | 191 +---------------------------------------------- 2 files changed, 181 insertions(+), 190 deletions(-) diff --git a/xdiff/xdiffi.c b/xdiff/xdiffi.c index e3196c7245..0f1fd7cf80 100644 --- a/xdiff/xdiffi.c +++ b/xdiff/xdiffi.c @@ -21,6 +21,7 @@ */ #include "xinclude.h" +#include "compat/ivec.h" static size_t get_hash(xdfile_t *xdf, long index) { @@ -33,6 +34,14 @@ static size_t get_hash(xdfile_t *xdf, long index) #define XDL_SNAKE_CNT 20 #define XDL_K_HEUR 4 +#define XDL_KPDIS_RUN 4 +#define XDL_MAX_EQLIMIT 1024 +#define XDL_SIMSCAN_WINDOW 100 + +#define DISCARD 0 +#define KEEP 1 +#define INVESTIGATE 2 + typedef struct s_xdpsplit { long i1, i2; int min_lo, min_hi; @@ -311,6 +320,175 @@ int xdl_recs_cmp(xdfile_t *xdf1, long off1, long lim1, } +static bool xdl_clean_mmatch(uint8_t const *action, long i, long s, long e) { + long r, rdis0, rpdis0, rdis1, rpdis1; + + /* + * Limits the window that is examined during the similar-lines + * scan. The loops below stops when action[i - r] == KEEP + * (line that has no match), but there are corner cases where + * the loop proceed all the way to the extremities by causing + * huge performance penalties in case of big files. + */ + if (i - s > XDL_SIMSCAN_WINDOW) + s = i - XDL_SIMSCAN_WINDOW; + if (e - i > XDL_SIMSCAN_WINDOW) + e = i + XDL_SIMSCAN_WINDOW; + + /* + * Scans the lines before 'i' to find a run of lines that either + * have no match (action[j] == DISCARD) or have multiple matches + * (action[j] == INVESTIGATE). Note that we always call this + * function with action[i] == INVESTIGATE, so the current line + * (i) is already a multimatch line. + */ + for (r = 1, rdis0 = 0, rpdis0 = 1; (i - r) >= s; r++) { + if (action[i - r] == DISCARD) + rdis0++; + else if (action[i - r] == INVESTIGATE) + rpdis0++; + else if (action[i - r] == KEEP) + break; + else + BUG("Illegal value for action[i - r]"); + } + /* + * If the run before the line 'i' found only multimatch lines, + * we return false and hence we don't make the current line (i) + * discarded. We want to discard multimatch lines only when + * they appear in the middle of runs with nomatch lines + * (action[j] == DISCARD). + */ + if (rdis0 == 0) + return 0; + for (r = 1, rdis1 = 0, rpdis1 = 1; (i + r) <= e; r++) { + if (action[i + r] == DISCARD) + rdis1++; + else if (action[i + r] == INVESTIGATE) + rpdis1++; + else if (action[i + r] == KEEP) + break; + else + BUG("Illegal value for action[i + r]"); + } + /* + * If the run after the line 'i' found only multimatch lines, + * we return false and hence we don't make the current line (i) + * discarded. + */ + if (rdis1 == 0) + return false; + rdis1 += rdis0; + rpdis1 += rpdis0; + + return rpdis1 * XDL_KPDIS_RUN < (rpdis1 + rdis1); +} + +struct xoccurrence +{ + size_t file1, file2; +}; + + +DEFINE_IVEC_TYPE(struct xoccurrence, xoccurrence); + + +/* + * Try to reduce the problem complexity, discard records that have no + * matches on the other file. Also, lines that have multiple matches + * might be potentially discarded if they appear in a run of discardable. + */ +static int xdl_cleanup_records(xdfenv_t *xe, uint64_t flags) { + long i; + size_t nm, mlim; + xrecord_t *recs; + uint8_t *action1 = NULL, *action2 = NULL; + struct IVec_xoccurrence occ; + bool need_min = !!(flags & XDF_NEED_MINIMAL); + int ret = 0; + ptrdiff_t dend1 = xe->xdf1.nrec - 1 - xe->delta_end; + ptrdiff_t dend2 = xe->xdf2.nrec - 1 - xe->delta_end; + + IVEC_INIT(occ); + ivec_zero(&occ, xe->mph_size); + + for (size_t j = 0; j < xe->xdf1.nrec; j++) { + size_t mph1 = xe->xdf1.recs[j].minimal_perfect_hash; + occ.ptr[mph1].file1 += 1; + } + + for (size_t j = 0; j < xe->xdf2.nrec; j++) { + size_t mph2 = xe->xdf2.recs[j].minimal_perfect_hash; + occ.ptr[mph2].file2 += 1; + } + + /* + * Create temporary arrays that will help us decide if + * changed[i] should remain false, or become true. + */ + if (!XDL_CALLOC_ARRAY(action1, xe->xdf1.nrec + 1)) { + ret = -1; + goto cleanup; + } + if (!XDL_CALLOC_ARRAY(action2, xe->xdf2.nrec + 1)) { + ret = -1; + goto cleanup; + } + + /* + * Initialize temporary arrays with DISCARD, KEEP, or INVESTIGATE. + */ + if ((mlim = xdl_bogosqrt((long)xe->xdf1.nrec)) > XDL_MAX_EQLIMIT) + mlim = XDL_MAX_EQLIMIT; + for (i = xe->delta_start, recs = &xe->xdf1.recs[xe->delta_start]; i <= dend1; i++, recs++) { + nm = occ.ptr[recs->minimal_perfect_hash].file2; + action1[i] = (nm == 0) ? DISCARD: (nm >= mlim && !need_min) ? INVESTIGATE: KEEP; + } + + if ((mlim = xdl_bogosqrt((long)xe->xdf2.nrec)) > XDL_MAX_EQLIMIT) + mlim = XDL_MAX_EQLIMIT; + for (i = xe->delta_start, recs = &xe->xdf2.recs[xe->delta_start]; i <= dend2; i++, recs++) { + nm = occ.ptr[recs->minimal_perfect_hash].file1; + action2[i] = (nm == 0) ? DISCARD: (nm >= mlim && !need_min) ? INVESTIGATE: KEEP; + } + + /* + * Use temporary arrays to decide if changed[i] should remain + * false, or become true. + */ + xe->xdf1.nreff = 0; + for (i = xe->delta_start, recs = &xe->xdf1.recs[xe->delta_start]; + i <= dend1; i++, recs++) { + if (action1[i] == KEEP || + (action1[i] == INVESTIGATE && !xdl_clean_mmatch(action1, i, xe->delta_start, dend1))) { + xe->xdf1.reference_index[xe->xdf1.nreff++] = i; + /* changed[i] remains false, i.e. keep */ + } else + xe->xdf1.changed[i] = true; + /* i.e. discard */ + } + + xe->xdf2.nreff = 0; + for (i = xe->delta_start, recs = &xe->xdf2.recs[xe->delta_start]; + i <= dend2; i++, recs++) { + if (action2[i] == KEEP || + (action2[i] == INVESTIGATE && !xdl_clean_mmatch(action2, i, xe->delta_start, dend2))) { + xe->xdf2.reference_index[xe->xdf2.nreff++] = i; + /* changed[i] remains false, i.e. keep */ + } else + xe->xdf2.changed[i] = true; + /* i.e. discard */ + } + +cleanup: + xdl_free(action1); + xdl_free(action2); + ivec_free(&occ); + + return ret; +} + + int xdl_do_classic_diff(xdfenv_t *xe, uint64_t flags) { long ndiags; @@ -318,6 +496,8 @@ int xdl_do_classic_diff(xdfenv_t *xe, uint64_t flags) xdalgoenv_t xenv; int res; + xdl_cleanup_records(xe, flags); + /* * Allocate and setup K vectors to be used by the differential * algorithm. diff --git a/xdiff/xprepare.c b/xdiff/xprepare.c index b53a3b80c4..3f555e29f4 100644 --- a/xdiff/xprepare.c +++ b/xdiff/xprepare.c @@ -24,14 +24,6 @@ #include "compat/ivec.h" -#define XDL_KPDIS_RUN 4 -#define XDL_MAX_EQLIMIT 1024 -#define XDL_SIMSCAN_WINDOW 100 - -#define DISCARD 0 -#define KEEP 1 -#define INVESTIGATE 2 - typedef struct s_xdlclass { struct s_xdlclass *next; xrecord_t rec; @@ -50,8 +42,6 @@ typedef struct s_xdlclassifier { } xdlclassifier_t; - - static int xdl_init_classifier(xdlclassifier_t *cf, long size, long flags) { memset(cf, 0, sizeof(xdlclassifier_t)); @@ -186,175 +176,6 @@ void xdl_free_env(xdfenv_t *xe) { } -static bool xdl_clean_mmatch(uint8_t const *action, long i, long s, long e) { - long r, rdis0, rpdis0, rdis1, rpdis1; - - /* - * Limits the window that is examined during the similar-lines - * scan. The loops below stops when action[i - r] == KEEP - * (line that has no match), but there are corner cases where - * the loop proceed all the way to the extremities by causing - * huge performance penalties in case of big files. - */ - if (i - s > XDL_SIMSCAN_WINDOW) - s = i - XDL_SIMSCAN_WINDOW; - if (e - i > XDL_SIMSCAN_WINDOW) - e = i + XDL_SIMSCAN_WINDOW; - - /* - * Scans the lines before 'i' to find a run of lines that either - * have no match (action[j] == DISCARD) or have multiple matches - * (action[j] == INVESTIGATE). Note that we always call this - * function with action[i] == INVESTIGATE, so the current line - * (i) is already a multimatch line. - */ - for (r = 1, rdis0 = 0, rpdis0 = 1; (i - r) >= s; r++) { - if (action[i - r] == DISCARD) - rdis0++; - else if (action[i - r] == INVESTIGATE) - rpdis0++; - else if (action[i - r] == KEEP) - break; - else - BUG("Illegal value for action[i - r]"); - } - /* - * If the run before the line 'i' found only multimatch lines, - * we return false and hence we don't make the current line (i) - * discarded. We want to discard multimatch lines only when - * they appear in the middle of runs with nomatch lines - * (action[j] == DISCARD). - */ - if (rdis0 == 0) - return 0; - for (r = 1, rdis1 = 0, rpdis1 = 1; (i + r) <= e; r++) { - if (action[i + r] == DISCARD) - rdis1++; - else if (action[i + r] == INVESTIGATE) - rpdis1++; - else if (action[i + r] == KEEP) - break; - else - BUG("Illegal value for action[i + r]"); - } - /* - * If the run after the line 'i' found only multimatch lines, - * we return false and hence we don't make the current line (i) - * discarded. - */ - if (rdis1 == 0) - return false; - rdis1 += rdis0; - rpdis1 += rpdis0; - - return rpdis1 * XDL_KPDIS_RUN < (rpdis1 + rdis1); -} - -struct xoccurrence -{ - size_t file1, file2; -}; - - -DEFINE_IVEC_TYPE(struct xoccurrence, xoccurrence); - - -/* - * Try to reduce the problem complexity, discard records that have no - * matches on the other file. Also, lines that have multiple matches - * might be potentially discarded if they appear in a run of discardable. - */ -static int xdl_cleanup_records(xdfenv_t *xe, uint64_t flags) { - long i; - size_t nm, mlim; - xrecord_t *recs; - uint8_t *action1 = NULL, *action2 = NULL; - struct IVec_xoccurrence occ; - bool need_min = !!(flags & XDF_NEED_MINIMAL); - int ret = 0; - ptrdiff_t dend1 = xe->xdf1.nrec - 1 - xe->delta_end; - ptrdiff_t dend2 = xe->xdf2.nrec - 1 - xe->delta_end; - - IVEC_INIT(occ); - ivec_zero(&occ, xe->mph_size); - - for (size_t j = 0; j < xe->xdf1.nrec; j++) { - size_t mph1 = xe->xdf1.recs[j].minimal_perfect_hash; - occ.ptr[mph1].file1 += 1; - } - - for (size_t j = 0; j < xe->xdf2.nrec; j++) { - size_t mph2 = xe->xdf2.recs[j].minimal_perfect_hash; - occ.ptr[mph2].file2 += 1; - } - - /* - * Create temporary arrays that will help us decide if - * changed[i] should remain false, or become true. - */ - if (!XDL_CALLOC_ARRAY(action1, xe->xdf1.nrec + 1)) { - ret = -1; - goto cleanup; - } - if (!XDL_CALLOC_ARRAY(action2, xe->xdf2.nrec + 1)) { - ret = -1; - goto cleanup; - } - - /* - * Initialize temporary arrays with DISCARD, KEEP, or INVESTIGATE. - */ - if ((mlim = xdl_bogosqrt((long)xe->xdf1.nrec)) > XDL_MAX_EQLIMIT) - mlim = XDL_MAX_EQLIMIT; - for (i = xe->delta_start, recs = &xe->xdf1.recs[xe->delta_start]; i <= dend1; i++, recs++) { - nm = occ.ptr[recs->minimal_perfect_hash].file2; - action1[i] = (nm == 0) ? DISCARD: (nm >= mlim && !need_min) ? INVESTIGATE: KEEP; - } - - if ((mlim = xdl_bogosqrt((long)xe->xdf2.nrec)) > XDL_MAX_EQLIMIT) - mlim = XDL_MAX_EQLIMIT; - for (i = xe->delta_start, recs = &xe->xdf2.recs[xe->delta_start]; i <= dend2; i++, recs++) { - nm = occ.ptr[recs->minimal_perfect_hash].file1; - action2[i] = (nm == 0) ? DISCARD: (nm >= mlim && !need_min) ? INVESTIGATE: KEEP; - } - - /* - * Use temporary arrays to decide if changed[i] should remain - * false, or become true. - */ - xe->xdf1.nreff = 0; - for (i = xe->delta_start, recs = &xe->xdf1.recs[xe->delta_start]; - i <= dend1; i++, recs++) { - if (action1[i] == KEEP || - (action1[i] == INVESTIGATE && !xdl_clean_mmatch(action1, i, xe->delta_start, dend1))) { - xe->xdf1.reference_index[xe->xdf1.nreff++] = i; - /* changed[i] remains false, i.e. keep */ - } else - xe->xdf1.changed[i] = true; - /* i.e. discard */ - } - - xe->xdf2.nreff = 0; - for (i = xe->delta_start, recs = &xe->xdf2.recs[xe->delta_start]; - i <= dend2; i++, recs++) { - if (action2[i] == KEEP || - (action2[i] == INVESTIGATE && !xdl_clean_mmatch(action2, i, xe->delta_start, dend2))) { - xe->xdf2.reference_index[xe->xdf2.nreff++] = i; - /* changed[i] remains false, i.e. keep */ - } else - xe->xdf2.changed[i] = true; - /* i.e. discard */ - } - -cleanup: - xdl_free(action1); - xdl_free(action2); - ivec_free(&occ); - - return ret; -} - - /* * Early trim initial and terminal matching records. */ @@ -414,19 +235,9 @@ int xdl_prepare_env(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp, } xe->mph_size = cf.count; + xdl_free_classifier(&cf); xdl_trim_ends(xe); - if ((XDF_DIFF_ALG(xpp->flags) != XDF_PATIENCE_DIFF) && - (XDF_DIFF_ALG(xpp->flags) != XDF_HISTOGRAM_DIFF) && - xdl_cleanup_records(xe, xpp->flags) < 0) { - - xdl_free_ctx(&xe->xdf2); - xdl_free_ctx(&xe->xdf1); - xdl_free_classifier(&cf); - return -1; - } - - xdl_free_classifier(&cf); return 0; } From 8bbe574720009fadd2a930020b7d2b9d0674416b Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Sun, 4 Jan 2026 14:33:02 +0900 Subject: [PATCH 11/11] SQUASH??? cocci --- compat/ivec.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/compat/ivec.c b/compat/ivec.c index 0a777e78dc..6d47a9ddd5 100644 --- a/compat/ivec.c +++ b/compat/ivec.c @@ -15,8 +15,7 @@ static void _set_capacity(void *self_, size_t new_capacity) return; } if (new_capacity == 0) { - free(self->ptr); - self->ptr = NULL; + FREE_AND_NULL(self->ptr); } else { self->ptr = realloc(self->ptr, new_capacity * self->element_size); } @@ -88,8 +87,7 @@ void ivec_free(void *self_) { struct IVec_c_void *self = self_; - free(self->ptr); - self->ptr = NULL; + FREE_AND_NULL(self->ptr); self->length = 0; self->capacity = 0; // DO NOT MODIFY element_size!!!