Merge branch 'en/xdiff-cleanup-3' into seen

Preparation of xdiff/ codebase to work with Rust

Comments?

* en/xdiff-cleanup-3:
  SQUASH??? cocci
  xdiff: move xdl_cleanup_records() from xprepare.c to xdiffi.c
  xdiff: remove dependence on xdlclassifier from xdl_cleanup_records()
  xdiff: replace xdfile_t.dend with xdfenv_t.delta_end
  xdiff: replace xdfile_t.dstart with xdfenv_t.delta_start
  xdiff: cleanup xdl_trim_ends()
  xdiff: use xdfenv_t in xdl_trim_ends() and xdl_cleanup_records()
  xdiff: let patience and histogram benefit from xdl_trim_ends()
  xdiff: don't waste time guessing the number of lines
  xdiff: make classic diff explicit by creating xdl_do_classic_diff()
  ivec: introduce the C side of ivec
This commit is contained in:
Junio C Hamano 2026-01-09 06:55:52 -08:00
commit 6b24017454
12 changed files with 430 additions and 272 deletions

View File

@ -1116,6 +1116,7 @@ LIB_OBJS += commit-reach.o
LIB_OBJS += commit.o
LIB_OBJS += common-exit.o
LIB_OBJS += common-init.o
LIB_OBJS += compat/ivec.o
LIB_OBJS += compat/nonblock.o
LIB_OBJS += compat/obstack.o
LIB_OBJS += compat/open.o

111
compat/ivec.c Normal file
View File

@ -0,0 +1,111 @@
#include "ivec.h"
/*
 * Type-erased view of an interoperable vector.  Every ivec_*() function in
 * this file receives a `void *` and reinterprets it as this struct; the
 * typed structs produced by DEFINE_IVEC_TYPE() in ivec.h declare the same
 * four fields in the same order.
 */
struct IVec_c_void {
/* heap buffer holding the elements; NULL after ivec_init()/ivec_free() */
void *ptr;
/* number of elements currently stored */
size_t length;
/* number of elements the buffer has room for */
size_t capacity;
/* size in bytes of one element; set by ivec_init() and never changed after */
size_t element_size;
};
/*
 * Resize the backing buffer so that it has room for exactly `new_capacity`
 * elements.
 *
 * Nothing happens when the capacity is already `new_capacity`.  A capacity
 * of zero releases the buffer and leaves `ptr` NULL.  `length` is never
 * touched here, so callers must not shrink below the current length.
 *
 * The byte count is computed with st_mult() and the buffer is resized with
 * xrealloc(): a size_t overflow or an out-of-memory condition therefore
 * die()s, instead of leaking the old buffer and leaving `ptr` NULL while
 * `capacity` claims that storage exists.
 */
static void _set_capacity(void *self_, size_t new_capacity)
{
	struct IVec_c_void *self = self_;

	if (new_capacity == self->capacity)
		return;

	if (!new_capacity)
		FREE_AND_NULL(self->ptr);
	else
		self->ptr = xrealloc(self->ptr,
				     st_mult(new_capacity, self->element_size));
	self->capacity = new_capacity;
}
/*
 * Put the vector into its empty state: no buffer, zero length, zero
 * capacity.  `element_size` is recorded here and is the only place in this
 * file where it is written.
 */
void ivec_init(void *self_, size_t element_size)
{
	struct IVec_c_void *vec = self_;

	vec->element_size = element_size;
	vec->capacity = 0;
	vec->length = 0;
	vec->ptr = NULL;
}
/*
 * Replace the contents of the vector with `capacity` zero-filled elements.
 *
 * On return both `length` and `capacity` equal `capacity`.  The vector must
 * already have been set up with ivec_init() (or IVEC_INIT()): `element_size`
 * is read but never written here, and any buffer the vector currently owns
 * is released first so that re-zeroing a populated vector does not leak.
 *
 * Allocation goes through xcalloc(), so running out of memory die()s rather
 * than leaving a NULL `ptr` behind a non-zero `length`.  A request for zero
 * elements leaves `ptr` NULL.
 */
void ivec_zero(void *self_, size_t capacity)
{
	struct IVec_c_void *self = self_;

	free(self->ptr);
	self->ptr = capacity ? xcalloc(capacity, self->element_size) : NULL;
	self->length = capacity;
	self->capacity = capacity;
	/* element_size is deliberately left untouched */
}
/*
 * Grow the capacity by exactly `additional` elements.
 *
 * The increment is relative to the current capacity, not to the current
 * length.  The sum is computed with st_add() so that a size_t wrap-around
 * die()s instead of silently shrinking the buffer below `length`.
 */
void ivec_reserve_exact(void *self_, size_t additional)
{
	struct IVec_c_void *self = self_;

	_set_capacity(self, st_add(self->capacity, additional));
}
/*
 * Grow the capacity by at least `additional` elements, with amortization.
 *
 * The increment is the largest of: 128 elements, the current capacity
 * (i.e. doubling), and `additional`.  The new capacity is computed with
 * st_add() so that a size_t wrap-around die()s instead of silently
 * shrinking the buffer below `length`.
 */
void ivec_reserve(void *self_, size_t additional)
{
	struct IVec_c_void *self = self_;
	size_t growby = 128;

	if (growby < self->capacity)
		growby = self->capacity;
	if (growby < additional)
		growby = additional;
	_set_capacity(self, st_add(self->capacity, growby));
}
/*
 * Give back unused storage by making the capacity equal to the number of
 * elements currently stored.
 */
void ivec_shrink_to_fit(void *self_)
{
	struct IVec_c_void *vec = self_;
	size_t in_use = vec->length;

	_set_capacity(vec, in_use);
}
/*
 * Append one element by copying `element_size` bytes from `value` to the
 * end of the vector.  When the vector is full, room is made first through
 * ivec_reserve().
 */
void ivec_push(void *self_, const void *value)
{
	struct IVec_c_void *vec = self_;
	uint8_t *slot;

	if (vec->capacity == vec->length)
		ivec_reserve(vec, 1);

	/* Look up the buffer only after the possible reallocation above. */
	slot = (uint8_t *)vec->ptr;
	slot += vec->length * vec->element_size;
	memcpy(slot, value, vec->element_size);
	vec->length += 1;
}
/*
 * Release the element buffer and return the vector to the empty state
 * (NULL buffer, zero length, zero capacity).
 */
void ivec_free(void *self_)
{
	struct IVec_c_void *vec = self_;

	vec->capacity = 0;
	vec->length = 0;
	FREE_AND_NULL(vec->ptr);
	/* element_size is deliberately left untouched */
}
/*
 * Transfer ownership of the element buffer from `src_` to `dst_`.
 *
 * Whatever `dst_` held before is freed; afterwards `dst_` owns the buffer,
 * length and capacity that `src_` had, and `src_` is left empty.  The
 * `element_size` of both vectors is left untouched, so both are expected to
 * describe the same element type.
 *
 * Moving a vector onto itself is a no-op.  Without that guard the buffer
 * would be freed and the vector would then "receive" its own, already
 * cleared, fields.
 */
void ivec_move(void *src_, void *dst_)
{
	struct IVec_c_void *src = src_;
	struct IVec_c_void *dst = dst_;

	if (src == dst)
		return;

	ivec_free(dst);
	dst->ptr = src->ptr;
	dst->length = src->length;
	dst->capacity = src->capacity;

	src->ptr = NULL;
	src->length = 0;
	src->capacity = 0;
}

52
compat/ivec.h Normal file
View File

@ -0,0 +1,52 @@
#ifndef IVEC_H
#define IVEC_H
#include <git-compat-util.h>
/*
 * Initialize a typed IVec variable (not a pointer to one): the element
 * size is taken from the type that the variable's `ptr` field points to.
 */
#define IVEC_INIT(variable) ivec_init(&(variable), sizeof(*(variable).ptr))
/*
 * NOTE(review): presumably CBINDGEN is defined while cbindgen processes
 * this header, so that the typed struct definitions below are skipped for
 * that tool -- confirm against the build setup.
 */
#ifndef CBINDGEN
/*
 * Declare `struct IVec_<suffix>`, a vector whose elements are of `type`.
 * The four fields mirror, in the same order, the type-erased struct that
 * compat/ivec.c casts every `void *self_` argument to.
 */
#define DEFINE_IVEC_TYPE(type, suffix) \
struct IVec_##suffix { \
type* ptr; \
size_t length; \
size_t capacity; \
size_t element_size; \
}
/* Vector types for the fixed-width and pointer-sized primitive types. */
DEFINE_IVEC_TYPE(bool, bool);
DEFINE_IVEC_TYPE(uint8_t, u8);
DEFINE_IVEC_TYPE(uint16_t, u16);
DEFINE_IVEC_TYPE(uint32_t, u32);
DEFINE_IVEC_TYPE(uint64_t, u64);
DEFINE_IVEC_TYPE(int8_t, i8);
DEFINE_IVEC_TYPE(int16_t, i16);
DEFINE_IVEC_TYPE(int32_t, i32);
DEFINE_IVEC_TYPE(int64_t, i64);
DEFINE_IVEC_TYPE(float, f32);
DEFINE_IVEC_TYPE(double, f64);
DEFINE_IVEC_TYPE(size_t, usize);
DEFINE_IVEC_TYPE(ssize_t, isize);
#endif
/*
 * All functions below take a pointer to any `struct IVec_<suffix>` as
 * `self_`; the element type is described solely by `element_size`.
 */

/* Reset to the empty state (NULL buffer, length 0, capacity 0) and record `element_size`. */
void ivec_init(void *self_, size_t element_size);
/*
 * Make the vector hold `capacity` zero-initialized elements; both length
 * and capacity become `capacity`.  Requires a prior ivec_init().
 */
void ivec_zero(void *self_, size_t capacity);
/* Increase the capacity by exactly `additional` elements (relative to the current capacity). */
void ivec_reserve_exact(void *self_, size_t additional);
/* Increase the capacity by the largest of 128, the current capacity and `additional`. */
void ivec_reserve(void *self_, size_t additional);
/* Reduce the capacity to the current length. */
void ivec_shrink_to_fit(void *self_);
/*
 * Append a copy of the `element_size` bytes at `value`; grows the vector
 * through ivec_reserve() when it is full.
 */
void ivec_push(void *self_, const void *value);
/* Free the buffer and reset length and capacity to 0; `element_size` is kept. */
void ivec_free(void *self_);
/*
 * Free what `dst` holds, hand it the buffer, length and capacity of `src`,
 * and leave `src` empty.  `element_size` of neither vector is changed.
 */
void ivec_move(void *src, void *dst);
#endif /* IVEC_H */

View File

@ -302,6 +302,7 @@ libgit_sources = [
'commit.c',
'common-exit.c',
'common-init.c',
'compat/ivec.c',
'compat/nonblock.c',
'compat/obstack.c',
'compat/open.c',

View File

@ -21,6 +21,7 @@
*/
#include "xinclude.h"
#include "compat/ivec.h"
static size_t get_hash(xdfile_t *xdf, long index)
{
@ -33,6 +34,14 @@ static size_t get_hash(xdfile_t *xdf, long index)
#define XDL_SNAKE_CNT 20
#define XDL_K_HEUR 4
#define XDL_KPDIS_RUN 4
#define XDL_MAX_EQLIMIT 1024
#define XDL_SIMSCAN_WINDOW 100
#define DISCARD 0
#define KEEP 1
#define INVESTIGATE 2
typedef struct s_xdpsplit {
long i1, i2;
int min_lo, min_hi;
@ -311,25 +320,183 @@ int xdl_recs_cmp(xdfile_t *xdf1, long off1, long lim1,
}
int xdl_do_diff(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp,
xdfenv_t *xe) {
/*
 * Decide whether the multi-match line at index 'i' should be discarded.
 *
 * 'action' holds DISCARD / KEEP / INVESTIGATE for every line, and 's' and
 * 'e' are the first and last index that may be examined.  The function
 * returns true (discard) only when line 'i' sits inside a run that has
 * no-match lines on both sides and in which multi-match lines are rare
 * compared to no-match lines; the caller marks such a line as changed.
 */
static bool xdl_clean_mmatch(uint8_t const *action, long i, long s, long e) {
	long nomatch_before = 0, multi_before = 1;
	long nomatch_after = 0, multi_after = 1;
	long nomatch_total, multi_total;
	long pos;

	/*
	 * Limit the window that is examined during the similar-lines
	 * scan. The scans below stop at the first KEEP line (a line that
	 * does have a usable match), but there are corner cases where
	 * they would otherwise run all the way to the extremities,
	 * causing huge performance penalties on big files.
	 */
	if (i - s > XDL_SIMSCAN_WINDOW)
		s = i - XDL_SIMSCAN_WINDOW;
	if (e - i > XDL_SIMSCAN_WINDOW)
		e = i + XDL_SIMSCAN_WINDOW;

	/*
	 * Walk backwards from 'i' over the run of lines that either have
	 * no match (DISCARD) or have multiple matches (INVESTIGATE).
	 * This function is always called with action[i] == INVESTIGATE,
	 * which is why the multi-match counter starts at one.
	 */
	for (pos = i - 1; pos >= s; pos--) {
		uint8_t what = action[pos];

		if (what == KEEP)
			break;
		if (what == DISCARD)
			nomatch_before++;
		else if (what == INVESTIGATE)
			multi_before++;
		else
			BUG("Illegal value for action[i - r]");
	}

	/*
	 * A preceding run made only of multi-match lines is no reason to
	 * discard line 'i': multi-match lines are dropped only when they
	 * appear in the middle of runs of no-match lines.
	 */
	if (!nomatch_before)
		return false;

	/* Same scan, walking forwards from 'i'. */
	for (pos = i + 1; pos <= e; pos++) {
		uint8_t what = action[pos];

		if (what == KEEP)
			break;
		if (what == DISCARD)
			nomatch_after++;
		else if (what == INVESTIGATE)
			multi_after++;
		else
			BUG("Illegal value for action[i + r]");
	}

	/* Likewise, keep the line if no no-match line follows it. */
	if (!nomatch_after)
		return false;

	nomatch_total = nomatch_before + nomatch_after;
	multi_total = multi_before + multi_after;

	return multi_total * XDL_KPDIS_RUN < multi_total + nomatch_total;
}
/*
 * Occurrence counters for one minimal_perfect_hash value: how many records
 * of the first and of the second file carry that value.
 */
struct xoccurrence
{
size_t file1, file2;
};
/* Declares struct IVec_xoccurrence, a vector of the counters above. */
DEFINE_IVEC_TYPE(struct xoccurrence, xoccurrence);
/*
 * Try to reduce the problem complexity, discard records that have no
 * matches on the other file. Also, lines that have multiple matches
 * might be potentially discarded if they appear in a run of discardable.
 *
 * Occurrences are counted over all records of both files, but only the
 * records between xe->delta_start and nrec - 1 - xe->delta_end (inclusive)
 * are classified.  Kept records are appended to reference_index[] (with
 * nreff counting them); discarded records get changed[i] set to true.
 * With XDF_NEED_MINIMAL in 'flags' no record is ever put up for
 * investigation, so only records without any match are discarded.
 *
 * Returns 0 on success and -1 when a temporary allocation fails.
 */
static int xdl_cleanup_records(xdfenv_t *xe, uint64_t flags) {
	long i;
	size_t nm, mlim;
	xrecord_t *recs;
	uint8_t *action1 = NULL, *action2 = NULL;
	struct IVec_xoccurrence occ;
	bool need_min = !!(flags & XDF_NEED_MINIMAL);
	int ret = 0;
	ptrdiff_t dend1 = xe->xdf1.nrec - 1 - xe->delta_end;
	ptrdiff_t dend2 = xe->xdf2.nrec - 1 - xe->delta_end;

	IVEC_INIT(occ);
	ivec_zero(&occ, xe->mph_size);

	/*
	 * Guard against a failed allocation inside ivec_zero(): indexing
	 * a NULL occ.ptr below would crash instead of reporting an error.
	 */
	if (xe->mph_size && !occ.ptr) {
		ret = -1;
		goto cleanup;
	}

	/* Count how often every record class occurs in each file. */
	for (size_t j = 0; j < xe->xdf1.nrec; j++) {
		size_t mph1 = xe->xdf1.recs[j].minimal_perfect_hash;
		occ.ptr[mph1].file1 += 1;
	}

	for (size_t j = 0; j < xe->xdf2.nrec; j++) {
		size_t mph2 = xe->xdf2.recs[j].minimal_perfect_hash;
		occ.ptr[mph2].file2 += 1;
	}

	/*
	 * Create temporary arrays that will help us decide if
	 * changed[i] should remain false, or become true.
	 */
	if (!XDL_CALLOC_ARRAY(action1, xe->xdf1.nrec + 1)) {
		ret = -1;
		goto cleanup;
	}
	if (!XDL_CALLOC_ARRAY(action2, xe->xdf2.nrec + 1)) {
		ret = -1;
		goto cleanup;
	}

	/*
	 * Initialize temporary arrays with DISCARD, KEEP, or INVESTIGATE.
	 */
	if ((mlim = xdl_bogosqrt((long)xe->xdf1.nrec)) > XDL_MAX_EQLIMIT)
		mlim = XDL_MAX_EQLIMIT;
	for (i = xe->delta_start, recs = &xe->xdf1.recs[xe->delta_start]; i <= dend1; i++, recs++) {
		nm = occ.ptr[recs->minimal_perfect_hash].file2;
		action1[i] = (nm == 0) ? DISCARD: (nm >= mlim && !need_min) ? INVESTIGATE: KEEP;
	}

	if ((mlim = xdl_bogosqrt((long)xe->xdf2.nrec)) > XDL_MAX_EQLIMIT)
		mlim = XDL_MAX_EQLIMIT;
	for (i = xe->delta_start, recs = &xe->xdf2.recs[xe->delta_start]; i <= dend2; i++, recs++) {
		nm = occ.ptr[recs->minimal_perfect_hash].file1;
		action2[i] = (nm == 0) ? DISCARD: (nm >= mlim && !need_min) ? INVESTIGATE: KEEP;
	}

	/*
	 * Use temporary arrays to decide if changed[i] should remain
	 * false, or become true.
	 */
	xe->xdf1.nreff = 0;
	for (i = xe->delta_start; i <= dend1; i++) {
		if (action1[i] == KEEP ||
		    (action1[i] == INVESTIGATE &&
		     !xdl_clean_mmatch(action1, i, xe->delta_start, dend1))) {
			/* changed[i] remains false, i.e. keep */
			xe->xdf1.reference_index[xe->xdf1.nreff++] = i;
		} else {
			/* i.e. discard */
			xe->xdf1.changed[i] = true;
		}
	}

	xe->xdf2.nreff = 0;
	for (i = xe->delta_start; i <= dend2; i++) {
		if (action2[i] == KEEP ||
		    (action2[i] == INVESTIGATE &&
		     !xdl_clean_mmatch(action2, i, xe->delta_start, dend2))) {
			/* changed[i] remains false, i.e. keep */
			xe->xdf2.reference_index[xe->xdf2.nreff++] = i;
		} else {
			/* i.e. discard */
			xe->xdf2.changed[i] = true;
		}
	}

cleanup:
	xdl_free(action1);
	xdl_free(action2);
	ivec_free(&occ);

	return ret;
}
int xdl_do_classic_diff(xdfenv_t *xe, uint64_t flags)
{
long ndiags;
long *kvd, *kvdf, *kvdb;
xdalgoenv_t xenv;
int res;
if (xdl_prepare_env(mf1, mf2, xpp, xe) < 0)
return -1;
if (XDF_DIFF_ALG(xpp->flags) == XDF_PATIENCE_DIFF) {
res = xdl_do_patience_diff(xpp, xe);
goto out;
}
if (XDF_DIFF_ALG(xpp->flags) == XDF_HISTOGRAM_DIFF) {
res = xdl_do_histogram_diff(xpp, xe);
goto out;
}
xdl_cleanup_records(xe, flags);
/*
* Allocate and setup K vectors to be used by the differential
@ -355,9 +522,33 @@ int xdl_do_diff(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp,
xenv.heur_min = XDL_HEUR_MIN_COST;
res = xdl_recs_cmp(&xe->xdf1, 0, xe->xdf1.nreff, &xe->xdf2, 0, xe->xdf2.nreff,
kvdf, kvdb, (xpp->flags & XDF_NEED_MINIMAL) != 0,
kvdf, kvdb, (flags & XDF_NEED_MINIMAL) != 0,
&xenv);
xdl_free(kvd);
return res;
}
int xdl_do_diff(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp,
xdfenv_t *xe) {
int res;
if (xdl_prepare_env(mf1, mf2, xpp, xe) < 0)
return -1;
if (XDF_DIFF_ALG(xpp->flags) == XDF_PATIENCE_DIFF) {
res = xdl_do_patience_diff(xpp, xe);
goto out;
}
if (XDF_DIFF_ALG(xpp->flags) == XDF_HISTOGRAM_DIFF) {
res = xdl_do_histogram_diff(xpp, xe);
goto out;
}
res = xdl_do_classic_diff(xe, xpp->flags);
out:
if (res < 0)
xdl_free_env(xe);

View File

@ -42,6 +42,7 @@ typedef struct s_xdchange {
int xdl_recs_cmp(xdfile_t *xdf1, long off1, long lim1,
xdfile_t *xdf2, long off2, long lim2,
long *kvdf, long *kvdb, int need_min, xdalgoenv_t *xenv);
int xdl_do_classic_diff(xdfenv_t *xe, uint64_t flags);
int xdl_do_diff(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp,
xdfenv_t *xe);
int xdl_change_compact(xdfile_t *xdf, xdfile_t *xdfo, long flags);

View File

@ -364,7 +364,10 @@ out:
int xdl_do_histogram_diff(xpparam_t const *xpp, xdfenv_t *env)
{
ptrdiff_t dend1 = env->xdf1.nrec - 1 - env->delta_end;
ptrdiff_t dend2 = env->xdf2.nrec - 1 - env->delta_end;
return histogram_diff(xpp, env,
env->xdf1.dstart + 1, env->xdf1.dend - env->xdf1.dstart + 1,
env->xdf2.dstart + 1, env->xdf2.dend - env->xdf2.dstart + 1);
env->delta_start + 1, dend1 - env->delta_start + 1,
env->delta_start + 1, dend2 - env->delta_start + 1);
}

View File

@ -373,5 +373,10 @@ static int patience_diff(xpparam_t const *xpp, xdfenv_t *env,
int xdl_do_patience_diff(xpparam_t const *xpp, xdfenv_t *env)
{
return patience_diff(xpp, env, 1, (int)env->xdf1.nrec, 1, (int)env->xdf2.nrec);
ptrdiff_t dend1 = env->xdf1.nrec - 1 - env->delta_end;
ptrdiff_t dend2 = env->xdf2.nrec - 1 - env->delta_end;
return patience_diff(xpp, env,
env->delta_start + 1, dend1 - env->delta_start + 1,
env->delta_start + 1, dend2 - env->delta_start + 1);
}

View File

@ -21,23 +21,13 @@
*/
#include "xinclude.h"
#include "compat/ivec.h"
#define XDL_KPDIS_RUN 4
#define XDL_MAX_EQLIMIT 1024
#define XDL_SIMSCAN_WINDOW 100
#define XDL_GUESS_NLINES1 256
#define XDL_GUESS_NLINES2 20
#define DISCARD 0
#define KEEP 1
#define INVESTIGATE 2
typedef struct s_xdlclass {
struct s_xdlclass *next;
xrecord_t rec;
long idx;
long len1, len2;
} xdlclass_t;
typedef struct s_xdlclassifier {
@ -52,9 +42,9 @@ typedef struct s_xdlclassifier {
} xdlclassifier_t;
static int xdl_init_classifier(xdlclassifier_t *cf, long size, long flags) {
memset(cf, 0, sizeof(xdlclassifier_t));
cf->flags = flags;
cf->hbits = xdl_hashbits((unsigned int) size);
@ -92,7 +82,7 @@ static void xdl_free_classifier(xdlclassifier_t *cf) {
}
static int xdl_classify_record(unsigned int pass, xdlclassifier_t *cf, xrecord_t *rec) {
static int xdl_classify_record(xdlclassifier_t *cf, xrecord_t *rec) {
size_t hi;
xdlclass_t *rcrec;
@ -113,13 +103,10 @@ static int xdl_classify_record(unsigned int pass, xdlclassifier_t *cf, xrecord_t
return -1;
cf->rcrecs[rcrec->idx] = rcrec;
rcrec->rec = *rec;
rcrec->len1 = rcrec->len2 = 0;
rcrec->next = cf->rchash[hi];
cf->rchash[hi] = rcrec;
}
(pass == 1) ? rcrec->len1++ : rcrec->len2++;
rec->minimal_perfect_hash = (size_t)rcrec->idx;
return 0;
@ -134,12 +121,12 @@ static void xdl_free_ctx(xdfile_t *xdf)
}
static int xdl_prepare_ctx(unsigned int pass, mmfile_t *mf, long narec, xpparam_t const *xpp,
xdlclassifier_t *cf, xdfile_t *xdf) {
static int xdl_prepare_ctx(mmfile_t *mf, xdfile_t *xdf, uint64_t flags) {
long bsize;
uint64_t hav;
uint8_t const *blk, *cur, *top, *prev;
xrecord_t *crec;
long narec = 8;
xdf->reference_index = NULL;
xdf->changed = NULL;
@ -152,31 +139,27 @@ static int xdl_prepare_ctx(unsigned int pass, mmfile_t *mf, long narec, xpparam_
if ((cur = blk = xdl_mmfile_first(mf, &bsize))) {
for (top = blk + bsize; cur < top; ) {
prev = cur;
hav = xdl_hash_record(&cur, top, xpp->flags);
hav = xdl_hash_record(&cur, top, flags);
if (XDL_ALLOC_GROW(xdf->recs, (long)xdf->nrec + 1, narec))
goto abort;
crec = &xdf->recs[xdf->nrec++];
crec->ptr = prev;
crec->size = cur - prev;
crec->line_hash = hav;
if (xdl_classify_record(pass, cf, crec) < 0)
goto abort;
}
}
if (!XDL_CALLOC_ARRAY(xdf->changed, xdf->nrec + 2))
goto abort;
if ((XDF_DIFF_ALG(xpp->flags) != XDF_PATIENCE_DIFF) &&
(XDF_DIFF_ALG(xpp->flags) != XDF_HISTOGRAM_DIFF)) {
if ((XDF_DIFF_ALG(flags) != XDF_PATIENCE_DIFF) &&
(XDF_DIFF_ALG(flags) != XDF_HISTOGRAM_DIFF)) {
if (!XDL_ALLOC_ARRAY(xdf->reference_index, xdf->nrec + 1))
goto abort;
}
xdf->changed += 1;
xdf->nreff = 0;
xdf->dstart = 0;
xdf->dend = xdf->nrec - 1;
return 0;
@ -193,238 +176,68 @@ void xdl_free_env(xdfenv_t *xe) {
}
/*
 * Decide whether the multi-match line at index 'i' should be discarded.
 *
 * 'action' holds DISCARD / KEEP / INVESTIGATE for every line, and 's' and
 * 'e' are the first and last index that may be examined.  The function
 * returns true (discard) only when line 'i' sits inside a run that has
 * no-match lines on both sides and in which multi-match lines are rare
 * compared to no-match lines; the caller marks such a line as changed.
 */
static bool xdl_clean_mmatch(uint8_t const *action, long i, long s, long e) {
	long nomatch_before = 0, multi_before = 1;
	long nomatch_after = 0, multi_after = 1;
	long nomatch_total, multi_total;
	long pos;

	/*
	 * Limit the window that is examined during the similar-lines
	 * scan. The scans below stop at the first KEEP line (a line that
	 * does have a usable match), but there are corner cases where
	 * they would otherwise run all the way to the extremities,
	 * causing huge performance penalties on big files.
	 */
	if (i - s > XDL_SIMSCAN_WINDOW)
		s = i - XDL_SIMSCAN_WINDOW;
	if (e - i > XDL_SIMSCAN_WINDOW)
		e = i + XDL_SIMSCAN_WINDOW;

	/*
	 * Walk backwards from 'i' over the run of lines that either have
	 * no match (DISCARD) or have multiple matches (INVESTIGATE).
	 * This function is always called with action[i] == INVESTIGATE,
	 * which is why the multi-match counter starts at one.
	 */
	for (pos = i - 1; pos >= s; pos--) {
		uint8_t what = action[pos];

		if (what == KEEP)
			break;
		if (what == DISCARD)
			nomatch_before++;
		else if (what == INVESTIGATE)
			multi_before++;
		else
			BUG("Illegal value for action[i - r]");
	}

	/*
	 * A preceding run made only of multi-match lines is no reason to
	 * discard line 'i': multi-match lines are dropped only when they
	 * appear in the middle of runs of no-match lines.
	 */
	if (!nomatch_before)
		return false;

	/* Same scan, walking forwards from 'i'. */
	for (pos = i + 1; pos <= e; pos++) {
		uint8_t what = action[pos];

		if (what == KEEP)
			break;
		if (what == DISCARD)
			nomatch_after++;
		else if (what == INVESTIGATE)
			multi_after++;
		else
			BUG("Illegal value for action[i + r]");
	}

	/* Likewise, keep the line if no no-match line follows it. */
	if (!nomatch_after)
		return false;

	nomatch_total = nomatch_before + nomatch_after;
	multi_total = multi_before + multi_after;

	return multi_total * XDL_KPDIS_RUN < multi_total + nomatch_total;
}
/*
 * Simplify the diff problem before the real algorithm runs: records that
 * have no counterpart in the other file are discarded, and records with
 * many counterparts may be discarded too when they sit in a run of
 * discardable records.
 *
 * Only the records between dstart and dend (inclusive) of each file are
 * looked at.  Kept records are appended to reference_index[] (nreff counts
 * them); discarded records get changed[i] set to true.
 *
 * Returns 0 on success and -1 when a temporary allocation fails.
 */
static int xdl_cleanup_records(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xdf2) {
	uint8_t *act1 = NULL, *act2 = NULL;
	bool minimal = (cf->flags & XDF_NEED_MINIMAL) != 0;
	xdlclass_t *cls;
	long idx, hits, limit;
	int status = -1;

	/* Scratch arrays holding one DISCARD/KEEP/INVESTIGATE verdict per record. */
	if (!XDL_CALLOC_ARRAY(act1, xdf1->nrec + 1))
		goto done;
	if (!XDL_CALLOC_ARRAY(act2, xdf2->nrec + 1))
		goto done;

	/* Classify the records of the first file by their matches in the second. */
	limit = xdl_bogosqrt((long)xdf1->nrec);
	if (limit > XDL_MAX_EQLIMIT)
		limit = XDL_MAX_EQLIMIT;
	for (idx = xdf1->dstart; idx <= xdf1->dend; idx++) {
		cls = cf->rcrecs[xdf1->recs[idx].minimal_perfect_hash];
		hits = cls ? cls->len2 : 0;
		if (!hits)
			act1[idx] = DISCARD;
		else if (hits >= limit && !minimal)
			act1[idx] = INVESTIGATE;
		else
			act1[idx] = KEEP;
	}

	/* And the records of the second file by their matches in the first. */
	limit = xdl_bogosqrt((long)xdf2->nrec);
	if (limit > XDL_MAX_EQLIMIT)
		limit = XDL_MAX_EQLIMIT;
	for (idx = xdf2->dstart; idx <= xdf2->dend; idx++) {
		cls = cf->rcrecs[xdf2->recs[idx].minimal_perfect_hash];
		hits = cls ? cls->len1 : 0;
		if (!hits)
			act2[idx] = DISCARD;
		else if (hits >= limit && !minimal)
			act2[idx] = INVESTIGATE;
		else
			act2[idx] = KEEP;
	}

	/*
	 * Turn the verdicts into the final state: a kept record is added
	 * to reference_index[] and its changed[] flag stays false, any
	 * other record is marked as changed.
	 */
	xdf1->nreff = 0;
	for (idx = xdf1->dstart; idx <= xdf1->dend; idx++) {
		bool keep = act1[idx] == KEEP;

		if (!keep && act1[idx] == INVESTIGATE)
			keep = !xdl_clean_mmatch(act1, idx, xdf1->dstart, xdf1->dend);
		if (keep)
			xdf1->reference_index[xdf1->nreff++] = idx;
		else
			xdf1->changed[idx] = true;
	}

	xdf2->nreff = 0;
	for (idx = xdf2->dstart; idx <= xdf2->dend; idx++) {
		bool keep = act2[idx] == KEEP;

		if (!keep && act2[idx] == INVESTIGATE)
			keep = !xdl_clean_mmatch(act2, idx, xdf2->dstart, xdf2->dend);
		if (keep)
			xdf2->reference_index[xdf2->nreff++] = idx;
		else
			xdf2->changed[idx] = true;
	}

	status = 0;

done:
	xdl_free(act1);
	xdl_free(act2);

	return status;
}
/*
* Early trim initial and terminal matching records.
*/
static int xdl_trim_ends(xdfile_t *xdf1, xdfile_t *xdf2) {
long i, lim;
xrecord_t *recs1, *recs2;
static void xdl_trim_ends(xdfenv_t *xe)
{
size_t lim = XDL_MIN(xe->xdf1.nrec, xe->xdf2.nrec);
recs1 = xdf1->recs;
recs2 = xdf2->recs;
for (i = 0, lim = (long)XDL_MIN(xdf1->nrec, xdf2->nrec); i < lim;
i++, recs1++, recs2++)
if (recs1->minimal_perfect_hash != recs2->minimal_perfect_hash)
for (size_t i = 0; i < lim; i++) {
size_t mph1 = xe->xdf1.recs[i].minimal_perfect_hash;
size_t mph2 = xe->xdf2.recs[i].minimal_perfect_hash;
if (mph1 != mph2) {
xe->delta_start = (ssize_t)i;
lim -= i;
break;
xdf1->dstart = xdf2->dstart = i;
recs1 = xdf1->recs + xdf1->nrec - 1;
recs2 = xdf2->recs + xdf2->nrec - 1;
for (lim -= i, i = 0; i < lim; i++, recs1--, recs2--)
if (recs1->minimal_perfect_hash != recs2->minimal_perfect_hash)
break;
xdf1->dend = (long)xdf1->nrec - i - 1;
xdf2->dend = (long)xdf2->nrec - i - 1;
return 0;
}
static int xdl_optimize_ctxs(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xdf2) {
if (xdl_trim_ends(xdf1, xdf2) < 0 ||
xdl_cleanup_records(cf, xdf1, xdf2) < 0) {
return -1;
}
}
return 0;
for (size_t i = 0; i < lim; i++) {
size_t mph1 = xe->xdf1.recs[xe->xdf1.nrec - 1 - i].minimal_perfect_hash;
size_t mph2 = xe->xdf2.recs[xe->xdf2.nrec - 1 - i].minimal_perfect_hash;
if (mph1 != mph2) {
xe->delta_end = i;
break;
}
}
}
int xdl_prepare_env(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp,
xdfenv_t *xe) {
long enl1, enl2, sample;
xdlclassifier_t cf;
memset(&cf, 0, sizeof(cf));
xe->delta_start = 0;
xe->delta_end = 0;
/*
* For histogram diff, we can afford a smaller sample size and
* thus a poorer estimate of the number of lines, as the hash
* table (rhash) won't be filled up/grown. The number of lines
* (nrecs) will be updated correctly anyway by
* xdl_prepare_ctx().
*/
sample = (XDF_DIFF_ALG(xpp->flags) == XDF_HISTOGRAM_DIFF
? XDL_GUESS_NLINES2 : XDL_GUESS_NLINES1);
if (xdl_prepare_ctx(mf1, &xe->xdf1, xpp->flags) < 0) {
enl1 = xdl_guess_lines(mf1, sample) + 1;
enl2 = xdl_guess_lines(mf2, sample) + 1;
if (xdl_init_classifier(&cf, enl1 + enl2 + 1, xpp->flags) < 0)
return -1;
if (xdl_prepare_ctx(1, mf1, enl1, xpp, &cf, &xe->xdf1) < 0) {
xdl_free_classifier(&cf);
return -1;
}
if (xdl_prepare_ctx(2, mf2, enl2, xpp, &cf, &xe->xdf2) < 0) {
if (xdl_prepare_ctx(mf2, &xe->xdf2, xpp->flags) < 0) {
xdl_free_ctx(&xe->xdf1);
xdl_free_classifier(&cf);
return -1;
}
if ((XDF_DIFF_ALG(xpp->flags) != XDF_PATIENCE_DIFF) &&
(XDF_DIFF_ALG(xpp->flags) != XDF_HISTOGRAM_DIFF) &&
xdl_optimize_ctxs(&cf, &xe->xdf1, &xe->xdf2) < 0) {
xdl_free_ctx(&xe->xdf2);
xdl_free_ctx(&xe->xdf1);
xdl_free_classifier(&cf);
if (xdl_init_classifier(&cf, xe->xdf1.nrec + xe->xdf2.nrec + 1, xpp->flags) < 0)
return -1;
for (size_t i = 0; i < xe->xdf1.nrec; i++) {
xrecord_t *rec = &xe->xdf1.recs[i];
xdl_classify_record(&cf, rec);
}
for (size_t i = 0; i < xe->xdf2.nrec; i++) {
xrecord_t *rec = &xe->xdf2.recs[i];
xdl_classify_record(&cf, rec);
}
xe->mph_size = cf.count;
xdl_free_classifier(&cf);
xdl_trim_ends(xe);
return 0;
}

View File

@ -48,7 +48,6 @@ typedef struct s_xrecord {
typedef struct s_xdfile {
xrecord_t *recs;
size_t nrec;
ptrdiff_t dstart, dend;
bool *changed;
size_t *reference_index;
size_t nreff;
@ -56,6 +55,8 @@ typedef struct s_xdfile {
typedef struct s_xdfenv {
xdfile_t xdf1, xdf2;
size_t delta_start, delta_end;
size_t mph_size;
} xdfenv_t;

View File

@ -118,26 +118,6 @@ void *xdl_cha_alloc(chastore_t *cha) {
return data;
}
/*
 * Estimate the number of lines in 'mf' without reading all of it.
 *
 * At most 'sample' lines are counted from the start of the first block;
 * the total size of the file is then divided by the average length of the
 * sampled lines.  The result is always one more than that estimate, so it
 * is never zero.
 */
long xdl_guess_lines(mmfile_t *mf, long sample) {
	long size;
	long lines = 0, sampled_bytes = 0;
	char const *start = xdl_mmfile_first(mf, &size);

	if (start) {
		char const *end = start + size;
		char const *pos = start;

		while (lines < sample && pos < end) {
			char const *eol = memchr(pos, '\n', end - pos);

			/* A final line without a newline ends at the buffer end. */
			pos = eol ? eol + 1 : end;
			lines++;
		}
		sampled_bytes = (long)(pos - start);
	}

	if (lines && sampled_bytes)
		lines = xdl_mmfile_size(mf) / (sampled_bytes / lines);

	return lines + 1;
}
int xdl_blankline(const char *line, long size, long flags)
{

View File

@ -31,7 +31,6 @@ int xdl_emit_diffrec(char const *rec, long size, char const *pre, long psize,
int xdl_cha_init(chastore_t *cha, long isize, long icount);
void xdl_cha_free(chastore_t *cha);
void *xdl_cha_alloc(chastore_t *cha);
long xdl_guess_lines(mmfile_t *mf, long sample);
int xdl_blankline(const char *line, long size, long flags);
int xdl_recmatch(const char *l1, long s1, const char *l2, long s2, long flags);
uint64_t xdl_hash_record_verbatim(uint8_t const **data, uint8_t const *top);