From db8a50ca6b92fd8222b90d6873b9f7e3766f224a Mon Sep 17 00:00:00 2001 From: Ezekiel Newren Date: Fri, 2 Jan 2026 18:52:17 +0000 Subject: [PATCH] xdiff: don't waste time guessing the number of lines All lines must be read anyway, so classify them after they're read in. Also move the memset() into xdl_init_classifier(). Signed-off-by: Ezekiel Newren Signed-off-by: Junio C Hamano --- xdiff/xprepare.c | 54 ++++++++++++++++++++---------------------------- xdiff/xutils.c | 20 ------------------ xdiff/xutils.h | 1 - 3 files changed, 22 insertions(+), 53 deletions(-) diff --git a/xdiff/xprepare.c b/xdiff/xprepare.c index 34c82e4f8e..96a32cc5e9 100644 --- a/xdiff/xprepare.c +++ b/xdiff/xprepare.c @@ -26,8 +26,6 @@ #define XDL_KPDIS_RUN 4 #define XDL_MAX_EQLIMIT 1024 #define XDL_SIMSCAN_WINDOW 100 -#define XDL_GUESS_NLINES1 256 -#define XDL_GUESS_NLINES2 20 #define DISCARD 0 #define KEEP 1 @@ -55,6 +53,8 @@ typedef struct s_xdlclassifier { static int xdl_init_classifier(xdlclassifier_t *cf, long size, long flags) { + memset(cf, 0, sizeof(xdlclassifier_t)); + cf->flags = flags; cf->hbits = xdl_hashbits((unsigned int) size); @@ -134,12 +134,12 @@ static void xdl_free_ctx(xdfile_t *xdf) } -static int xdl_prepare_ctx(unsigned int pass, mmfile_t *mf, long narec, xpparam_t const *xpp, - xdlclassifier_t *cf, xdfile_t *xdf) { +static int xdl_prepare_ctx(mmfile_t *mf, xdfile_t *xdf, uint64_t flags) { long bsize; uint64_t hav; uint8_t const *blk, *cur, *top, *prev; xrecord_t *crec; + long narec = 8; xdf->reference_index = NULL; xdf->changed = NULL; @@ -152,23 +152,21 @@ static int xdl_prepare_ctx(unsigned int pass, mmfile_t *mf, long narec, xpparam_ if ((cur = blk = xdl_mmfile_first(mf, &bsize))) { for (top = blk + bsize; cur < top; ) { prev = cur; - hav = xdl_hash_record(&cur, top, xpp->flags); + hav = xdl_hash_record(&cur, top, flags); if (XDL_ALLOC_GROW(xdf->recs, (long)xdf->nrec + 1, narec)) goto abort; crec = &xdf->recs[xdf->nrec++]; crec->ptr = prev; crec->size = cur - prev; crec->line_hash = hav; - if (xdl_classify_record(pass, cf, crec) < 0) - goto abort; } } if (!XDL_CALLOC_ARRAY(xdf->changed, xdf->nrec + 2)) goto abort; - if ((XDF_DIFF_ALG(xpp->flags) != XDF_PATIENCE_DIFF) && - (XDF_DIFF_ALG(xpp->flags) != XDF_HISTOGRAM_DIFF)) { + if ((XDF_DIFF_ALG(flags) != XDF_PATIENCE_DIFF) && + (XDF_DIFF_ALG(flags) != XDF_HISTOGRAM_DIFF)) { if (!XDL_ALLOC_ARRAY(xdf->reference_index, xdf->nrec + 1)) goto abort; } @@ -381,39 +379,31 @@ static int xdl_optimize_ctxs(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xdf2 int xdl_prepare_env(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp, xdfenv_t *xe) { - long enl1, enl2, sample; xdlclassifier_t cf; - memset(&cf, 0, sizeof(cf)); + if (xdl_prepare_ctx(mf1, &xe->xdf1, xpp->flags) < 0) { - /* - * For histogram diff, we can afford a smaller sample size and - * thus a poorer estimate of the number of lines, as the hash - * table (rhash) won't be filled up/grown. The number of lines - * (nrecs) will be updated correctly anyway by - * xdl_prepare_ctx(). - */ - sample = (XDF_DIFF_ALG(xpp->flags) == XDF_HISTOGRAM_DIFF - ? XDL_GUESS_NLINES2 : XDL_GUESS_NLINES1); - - enl1 = xdl_guess_lines(mf1, sample) + 1; - enl2 = xdl_guess_lines(mf2, sample) + 1; - - if (xdl_init_classifier(&cf, enl1 + enl2 + 1, xpp->flags) < 0) - return -1; - - if (xdl_prepare_ctx(1, mf1, enl1, xpp, &cf, &xe->xdf1) < 0) { - - xdl_free_classifier(&cf); return -1; } - if (xdl_prepare_ctx(2, mf2, enl2, xpp, &cf, &xe->xdf2) < 0) { + if (xdl_prepare_ctx(mf2, &xe->xdf2, xpp->flags) < 0) { xdl_free_ctx(&xe->xdf1); - xdl_free_classifier(&cf); return -1; } + if (xdl_init_classifier(&cf, xe->xdf1.nrec + xe->xdf2.nrec + 1, xpp->flags) < 0) + return -1; + + for (size_t i = 0; i < xe->xdf1.nrec; i++) { + xrecord_t *rec = &xe->xdf1.recs[i]; + xdl_classify_record(1, &cf, rec); + } + + for (size_t i = 0; i < xe->xdf2.nrec; i++) { + xrecord_t *rec = &xe->xdf2.recs[i]; + xdl_classify_record(2, &cf, rec); + } + if ((XDF_DIFF_ALG(xpp->flags) != XDF_PATIENCE_DIFF) && (XDF_DIFF_ALG(xpp->flags) != XDF_HISTOGRAM_DIFF) && xdl_optimize_ctxs(&cf, &xe->xdf1, &xe->xdf2) < 0) { diff --git a/xdiff/xutils.c b/xdiff/xutils.c index 77ee1ad9c8..b3d51197c1 100644 --- a/xdiff/xutils.c +++ b/xdiff/xutils.c @@ -118,26 +118,6 @@ void *xdl_cha_alloc(chastore_t *cha) { return data; } -long xdl_guess_lines(mmfile_t *mf, long sample) { - long nl = 0, size, tsize = 0; - char const *data, *cur, *top; - - if ((cur = data = xdl_mmfile_first(mf, &size))) { - for (top = data + size; nl < sample && cur < top; ) { - nl++; - if (!(cur = memchr(cur, '\n', top - cur))) - cur = top; - else - cur++; - } - tsize += (long) (cur - data); - } - - if (nl && tsize) - nl = xdl_mmfile_size(mf) / (tsize / nl); - - return nl + 1; -} int xdl_blankline(const char *line, long size, long flags) { diff --git a/xdiff/xutils.h b/xdiff/xutils.h index 615b4a9d35..d800840dd0 100644 --- a/xdiff/xutils.h +++ b/xdiff/xutils.h @@ -31,7 +31,6 @@ int xdl_emit_diffrec(char const *rec, long size, char const *pre, long psize, int xdl_cha_init(chastore_t *cha, long isize, long icount); void xdl_cha_free(chastore_t *cha); void *xdl_cha_alloc(chastore_t *cha); -long xdl_guess_lines(mmfile_t *mf, long sample); int xdl_blankline(const char *line, long size, long flags); int xdl_recmatch(const char *l1, long s1, const char *l2, long s2, long flags); uint64_t xdl_hash_record_verbatim(uint8_t const **data, uint8_t const *top);