mirror of
https://github.com/git/git.git
synced 2026-01-11 13:23:12 +09:00
Merge branch 'en/xdiff-cleanup-3' into seen
Preparation of xdiff/ codebase to work with Rust Comments? * en/xdiff-cleanup-3: SQUASH??? cocci xdiff: move xdl_cleanup_records() from xprepare.c to xdiffi.c xdiff: remove dependence on xdlclassifier from xdl_cleanup_records() xdiff: replace xdfile_t.dend with xdfenv_t.delta_end xdiff: replace xdfile_t.dstart with xdfenv_t.delta_start xdiff: cleanup xdl_trim_ends() xdiff: use xdfenv_t in xdl_trim_ends() and xdl_cleanup_records() xdiff: let patience and histogram benefit from xdl_trim_ends() xdiff: don't waste time guessing the number of lines xdiff: make classic diff explicit by creating xdl_do_classic_diff() ivec: introduce the C side of ivec
This commit is contained in:
commit
6b24017454
1
Makefile
1
Makefile
@ -1116,6 +1116,7 @@ LIB_OBJS += commit-reach.o
|
||||
LIB_OBJS += commit.o
|
||||
LIB_OBJS += common-exit.o
|
||||
LIB_OBJS += common-init.o
|
||||
LIB_OBJS += compat/ivec.o
|
||||
LIB_OBJS += compat/nonblock.o
|
||||
LIB_OBJS += compat/obstack.o
|
||||
LIB_OBJS += compat/open.o
|
||||
|
||||
111
compat/ivec.c
Normal file
111
compat/ivec.c
Normal file
@ -0,0 +1,111 @@
|
||||
#include "ivec.h"
|
||||
|
||||
/*
 * Type-erased view of an interoperable vector.
 *
 * It has the same four members, in the same order, as the structs
 * produced by DEFINE_IVEC_TYPE() in ivec.h, with `ptr` as a void
 * pointer. Every function in this file casts its `void *self_`
 * argument to this struct.
 *
 *   ptr          - backing buffer, or NULL when there is none
 *   length       - number of elements in use
 *   capacity     - number of elements the buffer has room for
 *   element_size - size in bytes of one element; written only by
 *                  ivec_init()
 */
struct IVec_c_void {
|
||||
void *ptr;
|
||||
size_t length;
|
||||
size_t capacity;
|
||||
size_t element_size;
|
||||
};
|
||||
|
||||
static void _set_capacity(void *self_, size_t new_capacity)
|
||||
{
|
||||
struct IVec_c_void *self = self_;
|
||||
|
||||
if (new_capacity == self->capacity) {
|
||||
return;
|
||||
}
|
||||
if (new_capacity == 0) {
|
||||
FREE_AND_NULL(self->ptr);
|
||||
} else {
|
||||
self->ptr = realloc(self->ptr, new_capacity * self->element_size);
|
||||
}
|
||||
self->capacity = new_capacity;
|
||||
}
|
||||
|
||||
|
||||
void ivec_init(void *self_, size_t element_size)
|
||||
{
|
||||
struct IVec_c_void *self = self_;
|
||||
|
||||
self->ptr = NULL;
|
||||
self->length = 0;
|
||||
self->capacity = 0;
|
||||
self->element_size = element_size;
|
||||
}
|
||||
|
||||
void ivec_zero(void *self_, size_t capacity)
|
||||
{
|
||||
struct IVec_c_void *self = self_;
|
||||
|
||||
self->ptr = calloc(capacity, self->element_size);
|
||||
self->length = capacity;
|
||||
self->capacity = capacity;
|
||||
// DO NOT MODIFY element_size!!!
|
||||
}
|
||||
|
||||
void ivec_reserve_exact(void *self_, size_t additional)
|
||||
{
|
||||
struct IVec_c_void *self = self_;
|
||||
|
||||
_set_capacity(self, self->capacity + additional);
|
||||
}
|
||||
|
||||
void ivec_reserve(void *self_, size_t additional)
|
||||
{
|
||||
struct IVec_c_void *self = self_;
|
||||
|
||||
size_t growby = 128;
|
||||
if (self->capacity > growby)
|
||||
growby = self->capacity;
|
||||
if (additional > growby)
|
||||
growby = additional;
|
||||
|
||||
_set_capacity(self, self->capacity + growby);
|
||||
}
|
||||
|
||||
void ivec_shrink_to_fit(void *self_)
|
||||
{
|
||||
struct IVec_c_void *self = self_;
|
||||
|
||||
_set_capacity(self, self->length);
|
||||
}
|
||||
|
||||
void ivec_push(void *self_, const void *value)
|
||||
{
|
||||
struct IVec_c_void *self = self_;
|
||||
void *dst = NULL;
|
||||
|
||||
if (self->length == self->capacity)
|
||||
ivec_reserve(self, 1);
|
||||
|
||||
dst = (uint8_t*)self->ptr + self->length * self->element_size;
|
||||
memcpy(dst, value, self->element_size);
|
||||
self->length++;
|
||||
}
|
||||
|
||||
void ivec_free(void *self_)
|
||||
{
|
||||
struct IVec_c_void *self = self_;
|
||||
|
||||
FREE_AND_NULL(self->ptr);
|
||||
self->length = 0;
|
||||
self->capacity = 0;
|
||||
// DO NOT MODIFY element_size!!!
|
||||
}
|
||||
|
||||
void ivec_move(void *src_, void *dst_)
|
||||
{
|
||||
struct IVec_c_void *src = src_;
|
||||
struct IVec_c_void *dst = dst_;
|
||||
|
||||
ivec_free(dst);
|
||||
dst->ptr = src->ptr;
|
||||
dst->length = src->length;
|
||||
dst->capacity = src->capacity;
|
||||
// DO NOT MODIFY element_size!!!
|
||||
|
||||
src->ptr = NULL;
|
||||
src->length = 0;
|
||||
src->capacity = 0;
|
||||
// DO NOT MODIFY element_size!!!
|
||||
}
|
||||
52
compat/ivec.h
Normal file
52
compat/ivec.h
Normal file
@ -0,0 +1,52 @@
|
||||
#ifndef IVEC_H
|
||||
#define IVEC_H
|
||||
|
||||
#include <git-compat-util.h>
|
||||
|
||||
/*
 * Initialize an IVec_<suffix> variable as an empty vector. The element
 * size handed to ivec_init() is taken from the type that the
 * variable's `ptr` member points to. `variable` is expanded twice.
 */
#define IVEC_INIT(variable) ivec_init(&(variable), sizeof(*(variable).ptr))
|
||||
|
||||
#ifndef CBINDGEN
|
||||
/*
 * Declare `struct IVec_<suffix>`, a vector whose elements have type
 * `type`. The struct has the members ptr, length, capacity and
 * element_size, in that order; the macro expands to the struct
 * definition without a trailing semicolon.
 */
#define DEFINE_IVEC_TYPE(type, suffix) \
|
||||
struct IVec_##suffix { \
|
||||
type* ptr; \
|
||||
size_t length; \
|
||||
size_t capacity; \
|
||||
size_t element_size; \
|
||||
}
|
||||
|
||||
DEFINE_IVEC_TYPE(bool, bool);
|
||||
|
||||
DEFINE_IVEC_TYPE(uint8_t, u8);
|
||||
DEFINE_IVEC_TYPE(uint16_t, u16);
|
||||
DEFINE_IVEC_TYPE(uint32_t, u32);
|
||||
DEFINE_IVEC_TYPE(uint64_t, u64);
|
||||
|
||||
DEFINE_IVEC_TYPE(int8_t, i8);
|
||||
DEFINE_IVEC_TYPE(int16_t, i16);
|
||||
DEFINE_IVEC_TYPE(int32_t, i32);
|
||||
DEFINE_IVEC_TYPE(int64_t, i64);
|
||||
|
||||
DEFINE_IVEC_TYPE(float, f32);
|
||||
DEFINE_IVEC_TYPE(double, f64);
|
||||
|
||||
DEFINE_IVEC_TYPE(size_t, usize);
|
||||
DEFINE_IVEC_TYPE(ssize_t, isize);
|
||||
#endif
|
||||
|
||||
/*
 * Set the vector to the empty state (no buffer, zero length and
 * capacity) and record `element_size` as the size of one element.
 */
void ivec_init(void *self_, size_t element_size);
|
||||
|
||||
/*
 * Give the vector a zero-filled buffer of `capacity` elements and set
 * both its length and its capacity to `capacity`. The element size
 * recorded by ivec_init() is used and left unchanged.
 */
void ivec_zero(void *self_, size_t capacity);
|
||||
|
||||
/*
 * Grow the capacity by exactly `additional` elements, counted from
 * the current capacity.
 */
void ivec_reserve_exact(void *self_, size_t additional);
|
||||
|
||||
/*
 * Grow the capacity by the largest of 128, the current capacity and
 * `additional` elements, counted from the current capacity.
 */
void ivec_reserve(void *self_, size_t additional);
|
||||
|
||||
/* Set the capacity to the current length. */
void ivec_shrink_to_fit(void *self_);
|
||||
|
||||
/*
 * Append a copy of the element at `value` (element_size bytes),
 * growing the buffer first if length equals capacity.
 */
void ivec_push(void *self_, const void *value);
|
||||
|
||||
/*
 * Free the buffer and reset length and capacity to zero. The element
 * size is kept.
 */
void ivec_free(void *self_);
|
||||
|
||||
/*
 * Free the buffer of `dst`, hand the buffer, length and capacity of
 * `src` over to `dst`, and leave `src` empty. The element size of
 * neither vector is changed.
 */
void ivec_move(void *src, void *dst);
|
||||
|
||||
#endif /* IVEC_H */
|
||||
@ -302,6 +302,7 @@ libgit_sources = [
|
||||
'commit.c',
|
||||
'common-exit.c',
|
||||
'common-init.c',
|
||||
'compat/ivec.c',
|
||||
'compat/nonblock.c',
|
||||
'compat/obstack.c',
|
||||
'compat/open.c',
|
||||
|
||||
221
xdiff/xdiffi.c
221
xdiff/xdiffi.c
@ -21,6 +21,7 @@
|
||||
*/
|
||||
|
||||
#include "xinclude.h"
|
||||
#include "compat/ivec.h"
|
||||
|
||||
static size_t get_hash(xdfile_t *xdf, long index)
|
||||
{
|
||||
@ -33,6 +34,14 @@ static size_t get_hash(xdfile_t *xdf, long index)
|
||||
#define XDL_SNAKE_CNT 20
|
||||
#define XDL_K_HEUR 4
|
||||
|
||||
#define XDL_KPDIS_RUN 4
|
||||
#define XDL_MAX_EQLIMIT 1024
|
||||
#define XDL_SIMSCAN_WINDOW 100
|
||||
|
||||
#define DISCARD 0
|
||||
#define KEEP 1
|
||||
#define INVESTIGATE 2
|
||||
|
||||
typedef struct s_xdpsplit {
|
||||
long i1, i2;
|
||||
int min_lo, min_hi;
|
||||
@ -311,25 +320,183 @@ int xdl_recs_cmp(xdfile_t *xdf1, long off1, long lim1,
|
||||
}
|
||||
|
||||
|
||||
int xdl_do_diff(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp,
|
||||
xdfenv_t *xe) {
|
||||
static bool xdl_clean_mmatch(uint8_t const *action, long i, long s, long e) {
|
||||
long r, rdis0, rpdis0, rdis1, rpdis1;
|
||||
|
||||
/*
|
||||
* Limits the window that is examined during the similar-lines
|
||||
* scan. The loops below stops when action[i - r] == KEEP
|
||||
* (line that has no match), but there are corner cases where
|
||||
* the loop proceed all the way to the extremities by causing
|
||||
* huge performance penalties in case of big files.
|
||||
*/
|
||||
if (i - s > XDL_SIMSCAN_WINDOW)
|
||||
s = i - XDL_SIMSCAN_WINDOW;
|
||||
if (e - i > XDL_SIMSCAN_WINDOW)
|
||||
e = i + XDL_SIMSCAN_WINDOW;
|
||||
|
||||
/*
|
||||
* Scans the lines before 'i' to find a run of lines that either
|
||||
* have no match (action[j] == DISCARD) or have multiple matches
|
||||
* (action[j] == INVESTIGATE). Note that we always call this
|
||||
* function with action[i] == INVESTIGATE, so the current line
|
||||
* (i) is already a multimatch line.
|
||||
*/
|
||||
for (r = 1, rdis0 = 0, rpdis0 = 1; (i - r) >= s; r++) {
|
||||
if (action[i - r] == DISCARD)
|
||||
rdis0++;
|
||||
else if (action[i - r] == INVESTIGATE)
|
||||
rpdis0++;
|
||||
else if (action[i - r] == KEEP)
|
||||
break;
|
||||
else
|
||||
BUG("Illegal value for action[i - r]");
|
||||
}
|
||||
/*
|
||||
* If the run before the line 'i' found only multimatch lines,
|
||||
* we return false and hence we don't make the current line (i)
|
||||
* discarded. We want to discard multimatch lines only when
|
||||
* they appear in the middle of runs with nomatch lines
|
||||
* (action[j] == DISCARD).
|
||||
*/
|
||||
if (rdis0 == 0)
|
||||
return 0;
|
||||
for (r = 1, rdis1 = 0, rpdis1 = 1; (i + r) <= e; r++) {
|
||||
if (action[i + r] == DISCARD)
|
||||
rdis1++;
|
||||
else if (action[i + r] == INVESTIGATE)
|
||||
rpdis1++;
|
||||
else if (action[i + r] == KEEP)
|
||||
break;
|
||||
else
|
||||
BUG("Illegal value for action[i + r]");
|
||||
}
|
||||
/*
|
||||
* If the run after the line 'i' found only multimatch lines,
|
||||
* we return false and hence we don't make the current line (i)
|
||||
* discarded.
|
||||
*/
|
||||
if (rdis1 == 0)
|
||||
return false;
|
||||
rdis1 += rdis0;
|
||||
rpdis1 += rpdis0;
|
||||
|
||||
return rpdis1 * XDL_KPDIS_RUN < (rpdis1 + rdis1);
|
||||
}
|
||||
|
||||
/*
 * Occurrence counters for one minimal perfect hash value: how many
 * records with that hash were counted in each of the two files.
 */
struct xoccurrence {
	size_t file1;	/* count in the first file */
	size_t file2;	/* count in the second file */
};
|
||||
|
||||
|
||||
DEFINE_IVEC_TYPE(struct xoccurrence, xoccurrence);
|
||||
|
||||
|
||||
/*
|
||||
* Try to reduce the problem complexity, discard records that have no
|
||||
* matches on the other file. Also, lines that have multiple matches
|
||||
* might be potentially discarded if they appear in a run of discardable.
|
||||
*/
|
||||
static int xdl_cleanup_records(xdfenv_t *xe, uint64_t flags) {
|
||||
long i;
|
||||
size_t nm, mlim;
|
||||
xrecord_t *recs;
|
||||
uint8_t *action1 = NULL, *action2 = NULL;
|
||||
struct IVec_xoccurrence occ;
|
||||
bool need_min = !!(flags & XDF_NEED_MINIMAL);
|
||||
int ret = 0;
|
||||
ptrdiff_t dend1 = xe->xdf1.nrec - 1 - xe->delta_end;
|
||||
ptrdiff_t dend2 = xe->xdf2.nrec - 1 - xe->delta_end;
|
||||
|
||||
IVEC_INIT(occ);
|
||||
ivec_zero(&occ, xe->mph_size);
|
||||
|
||||
for (size_t j = 0; j < xe->xdf1.nrec; j++) {
|
||||
size_t mph1 = xe->xdf1.recs[j].minimal_perfect_hash;
|
||||
occ.ptr[mph1].file1 += 1;
|
||||
}
|
||||
|
||||
for (size_t j = 0; j < xe->xdf2.nrec; j++) {
|
||||
size_t mph2 = xe->xdf2.recs[j].minimal_perfect_hash;
|
||||
occ.ptr[mph2].file2 += 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* Create temporary arrays that will help us decide if
|
||||
* changed[i] should remain false, or become true.
|
||||
*/
|
||||
if (!XDL_CALLOC_ARRAY(action1, xe->xdf1.nrec + 1)) {
|
||||
ret = -1;
|
||||
goto cleanup;
|
||||
}
|
||||
if (!XDL_CALLOC_ARRAY(action2, xe->xdf2.nrec + 1)) {
|
||||
ret = -1;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/*
|
||||
* Initialize temporary arrays with DISCARD, KEEP, or INVESTIGATE.
|
||||
*/
|
||||
if ((mlim = xdl_bogosqrt((long)xe->xdf1.nrec)) > XDL_MAX_EQLIMIT)
|
||||
mlim = XDL_MAX_EQLIMIT;
|
||||
for (i = xe->delta_start, recs = &xe->xdf1.recs[xe->delta_start]; i <= dend1; i++, recs++) {
|
||||
nm = occ.ptr[recs->minimal_perfect_hash].file2;
|
||||
action1[i] = (nm == 0) ? DISCARD: (nm >= mlim && !need_min) ? INVESTIGATE: KEEP;
|
||||
}
|
||||
|
||||
if ((mlim = xdl_bogosqrt((long)xe->xdf2.nrec)) > XDL_MAX_EQLIMIT)
|
||||
mlim = XDL_MAX_EQLIMIT;
|
||||
for (i = xe->delta_start, recs = &xe->xdf2.recs[xe->delta_start]; i <= dend2; i++, recs++) {
|
||||
nm = occ.ptr[recs->minimal_perfect_hash].file1;
|
||||
action2[i] = (nm == 0) ? DISCARD: (nm >= mlim && !need_min) ? INVESTIGATE: KEEP;
|
||||
}
|
||||
|
||||
/*
|
||||
* Use temporary arrays to decide if changed[i] should remain
|
||||
* false, or become true.
|
||||
*/
|
||||
xe->xdf1.nreff = 0;
|
||||
for (i = xe->delta_start, recs = &xe->xdf1.recs[xe->delta_start];
|
||||
i <= dend1; i++, recs++) {
|
||||
if (action1[i] == KEEP ||
|
||||
(action1[i] == INVESTIGATE && !xdl_clean_mmatch(action1, i, xe->delta_start, dend1))) {
|
||||
xe->xdf1.reference_index[xe->xdf1.nreff++] = i;
|
||||
/* changed[i] remains false, i.e. keep */
|
||||
} else
|
||||
xe->xdf1.changed[i] = true;
|
||||
/* i.e. discard */
|
||||
}
|
||||
|
||||
xe->xdf2.nreff = 0;
|
||||
for (i = xe->delta_start, recs = &xe->xdf2.recs[xe->delta_start];
|
||||
i <= dend2; i++, recs++) {
|
||||
if (action2[i] == KEEP ||
|
||||
(action2[i] == INVESTIGATE && !xdl_clean_mmatch(action2, i, xe->delta_start, dend2))) {
|
||||
xe->xdf2.reference_index[xe->xdf2.nreff++] = i;
|
||||
/* changed[i] remains false, i.e. keep */
|
||||
} else
|
||||
xe->xdf2.changed[i] = true;
|
||||
/* i.e. discard */
|
||||
}
|
||||
|
||||
cleanup:
|
||||
xdl_free(action1);
|
||||
xdl_free(action2);
|
||||
ivec_free(&occ);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
int xdl_do_classic_diff(xdfenv_t *xe, uint64_t flags)
|
||||
{
|
||||
long ndiags;
|
||||
long *kvd, *kvdf, *kvdb;
|
||||
xdalgoenv_t xenv;
|
||||
int res;
|
||||
|
||||
if (xdl_prepare_env(mf1, mf2, xpp, xe) < 0)
|
||||
return -1;
|
||||
|
||||
if (XDF_DIFF_ALG(xpp->flags) == XDF_PATIENCE_DIFF) {
|
||||
res = xdl_do_patience_diff(xpp, xe);
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (XDF_DIFF_ALG(xpp->flags) == XDF_HISTOGRAM_DIFF) {
|
||||
res = xdl_do_histogram_diff(xpp, xe);
|
||||
goto out;
|
||||
}
|
||||
xdl_cleanup_records(xe, flags);
|
||||
|
||||
/*
|
||||
* Allocate and setup K vectors to be used by the differential
|
||||
@ -355,9 +522,33 @@ int xdl_do_diff(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp,
|
||||
xenv.heur_min = XDL_HEUR_MIN_COST;
|
||||
|
||||
res = xdl_recs_cmp(&xe->xdf1, 0, xe->xdf1.nreff, &xe->xdf2, 0, xe->xdf2.nreff,
|
||||
kvdf, kvdb, (xpp->flags & XDF_NEED_MINIMAL) != 0,
|
||||
kvdf, kvdb, (flags & XDF_NEED_MINIMAL) != 0,
|
||||
&xenv);
|
||||
|
||||
xdl_free(kvd);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
|
||||
int xdl_do_diff(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp,
|
||||
xdfenv_t *xe) {
|
||||
int res;
|
||||
|
||||
if (xdl_prepare_env(mf1, mf2, xpp, xe) < 0)
|
||||
return -1;
|
||||
|
||||
if (XDF_DIFF_ALG(xpp->flags) == XDF_PATIENCE_DIFF) {
|
||||
res = xdl_do_patience_diff(xpp, xe);
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (XDF_DIFF_ALG(xpp->flags) == XDF_HISTOGRAM_DIFF) {
|
||||
res = xdl_do_histogram_diff(xpp, xe);
|
||||
goto out;
|
||||
}
|
||||
|
||||
res = xdl_do_classic_diff(xe, xpp->flags);
|
||||
out:
|
||||
if (res < 0)
|
||||
xdl_free_env(xe);
|
||||
|
||||
@ -42,6 +42,7 @@ typedef struct s_xdchange {
|
||||
int xdl_recs_cmp(xdfile_t *xdf1, long off1, long lim1,
|
||||
xdfile_t *xdf2, long off2, long lim2,
|
||||
long *kvdf, long *kvdb, int need_min, xdalgoenv_t *xenv);
|
||||
int xdl_do_classic_diff(xdfenv_t *xe, uint64_t flags);
|
||||
int xdl_do_diff(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp,
|
||||
xdfenv_t *xe);
|
||||
int xdl_change_compact(xdfile_t *xdf, xdfile_t *xdfo, long flags);
|
||||
|
||||
@ -364,7 +364,10 @@ out:
|
||||
|
||||
/*
 * Run the histogram diff on the part of both files that lies between
 * the trimmed ends recorded in `env` (delta_start records at the top,
 * delta_end records at the bottom).
 *
 * The stale argument lines that still used the per-file dstart/dend
 * fields are gone; only the delta_start/delta_end based range is
 * passed on.
 *
 * NOTE(review): the "+ 1" presumably turns the 0-based record index
 * into the 1-based line number histogram_diff() expects, followed by
 * a line count -- confirm against histogram_diff().
 */
int xdl_do_histogram_diff(xpparam_t const *xpp, xdfenv_t *env)
{
	ptrdiff_t dend1 = env->xdf1.nrec - 1 - env->delta_end;
	ptrdiff_t dend2 = env->xdf2.nrec - 1 - env->delta_end;

	return histogram_diff(xpp, env,
		env->delta_start + 1, dend1 - env->delta_start + 1,
		env->delta_start + 1, dend2 - env->delta_start + 1);
}
|
||||
|
||||
@ -373,5 +373,10 @@ static int patience_diff(xpparam_t const *xpp, xdfenv_t *env,
|
||||
|
||||
/*
 * Run the patience diff on the part of both files that lies between
 * the trimmed ends recorded in `env` (delta_start records at the top,
 * delta_end records at the bottom).
 *
 * The stale leading "return patience_diff(..., 1, nrec, 1, nrec)"
 * is gone: it made the trimmed-range call below unreachable.
 *
 * NOTE(review): the "+ 1" presumably turns the 0-based record index
 * into the 1-based line number patience_diff() expects, followed by
 * a line count -- confirm against patience_diff().
 */
int xdl_do_patience_diff(xpparam_t const *xpp, xdfenv_t *env)
{
	ptrdiff_t dend1 = env->xdf1.nrec - 1 - env->delta_end;
	ptrdiff_t dend2 = env->xdf2.nrec - 1 - env->delta_end;

	return patience_diff(xpp, env,
		env->delta_start + 1, dend1 - env->delta_start + 1,
		env->delta_start + 1, dend2 - env->delta_start + 1);
}
|
||||
|
||||
277
xdiff/xprepare.c
277
xdiff/xprepare.c
@ -21,23 +21,13 @@
|
||||
*/
|
||||
|
||||
#include "xinclude.h"
|
||||
#include "compat/ivec.h"
|
||||
|
||||
|
||||
#define XDL_KPDIS_RUN 4
|
||||
#define XDL_MAX_EQLIMIT 1024
|
||||
#define XDL_SIMSCAN_WINDOW 100
|
||||
#define XDL_GUESS_NLINES1 256
|
||||
#define XDL_GUESS_NLINES2 20
|
||||
|
||||
#define DISCARD 0
|
||||
#define KEEP 1
|
||||
#define INVESTIGATE 2
|
||||
|
||||
typedef struct s_xdlclass {
|
||||
struct s_xdlclass *next;
|
||||
xrecord_t rec;
|
||||
long idx;
|
||||
long len1, len2;
|
||||
} xdlclass_t;
|
||||
|
||||
typedef struct s_xdlclassifier {
|
||||
@ -52,9 +42,9 @@ typedef struct s_xdlclassifier {
|
||||
} xdlclassifier_t;
|
||||
|
||||
|
||||
|
||||
|
||||
static int xdl_init_classifier(xdlclassifier_t *cf, long size, long flags) {
|
||||
memset(cf, 0, sizeof(xdlclassifier_t));
|
||||
|
||||
cf->flags = flags;
|
||||
|
||||
cf->hbits = xdl_hashbits((unsigned int) size);
|
||||
@ -92,7 +82,7 @@ static void xdl_free_classifier(xdlclassifier_t *cf) {
|
||||
}
|
||||
|
||||
|
||||
static int xdl_classify_record(unsigned int pass, xdlclassifier_t *cf, xrecord_t *rec) {
|
||||
static int xdl_classify_record(xdlclassifier_t *cf, xrecord_t *rec) {
|
||||
size_t hi;
|
||||
xdlclass_t *rcrec;
|
||||
|
||||
@ -113,13 +103,10 @@ static int xdl_classify_record(unsigned int pass, xdlclassifier_t *cf, xrecord_t
|
||||
return -1;
|
||||
cf->rcrecs[rcrec->idx] = rcrec;
|
||||
rcrec->rec = *rec;
|
||||
rcrec->len1 = rcrec->len2 = 0;
|
||||
rcrec->next = cf->rchash[hi];
|
||||
cf->rchash[hi] = rcrec;
|
||||
}
|
||||
|
||||
(pass == 1) ? rcrec->len1++ : rcrec->len2++;
|
||||
|
||||
rec->minimal_perfect_hash = (size_t)rcrec->idx;
|
||||
|
||||
return 0;
|
||||
@ -134,12 +121,12 @@ static void xdl_free_ctx(xdfile_t *xdf)
|
||||
}
|
||||
|
||||
|
||||
static int xdl_prepare_ctx(unsigned int pass, mmfile_t *mf, long narec, xpparam_t const *xpp,
|
||||
xdlclassifier_t *cf, xdfile_t *xdf) {
|
||||
static int xdl_prepare_ctx(mmfile_t *mf, xdfile_t *xdf, uint64_t flags) {
|
||||
long bsize;
|
||||
uint64_t hav;
|
||||
uint8_t const *blk, *cur, *top, *prev;
|
||||
xrecord_t *crec;
|
||||
long narec = 8;
|
||||
|
||||
xdf->reference_index = NULL;
|
||||
xdf->changed = NULL;
|
||||
@ -152,31 +139,27 @@ static int xdl_prepare_ctx(unsigned int pass, mmfile_t *mf, long narec, xpparam_
|
||||
if ((cur = blk = xdl_mmfile_first(mf, &bsize))) {
|
||||
for (top = blk + bsize; cur < top; ) {
|
||||
prev = cur;
|
||||
hav = xdl_hash_record(&cur, top, xpp->flags);
|
||||
hav = xdl_hash_record(&cur, top, flags);
|
||||
if (XDL_ALLOC_GROW(xdf->recs, (long)xdf->nrec + 1, narec))
|
||||
goto abort;
|
||||
crec = &xdf->recs[xdf->nrec++];
|
||||
crec->ptr = prev;
|
||||
crec->size = cur - prev;
|
||||
crec->line_hash = hav;
|
||||
if (xdl_classify_record(pass, cf, crec) < 0)
|
||||
goto abort;
|
||||
}
|
||||
}
|
||||
|
||||
if (!XDL_CALLOC_ARRAY(xdf->changed, xdf->nrec + 2))
|
||||
goto abort;
|
||||
|
||||
if ((XDF_DIFF_ALG(xpp->flags) != XDF_PATIENCE_DIFF) &&
|
||||
(XDF_DIFF_ALG(xpp->flags) != XDF_HISTOGRAM_DIFF)) {
|
||||
if ((XDF_DIFF_ALG(flags) != XDF_PATIENCE_DIFF) &&
|
||||
(XDF_DIFF_ALG(flags) != XDF_HISTOGRAM_DIFF)) {
|
||||
if (!XDL_ALLOC_ARRAY(xdf->reference_index, xdf->nrec + 1))
|
||||
goto abort;
|
||||
}
|
||||
|
||||
xdf->changed += 1;
|
||||
xdf->nreff = 0;
|
||||
xdf->dstart = 0;
|
||||
xdf->dend = xdf->nrec - 1;
|
||||
|
||||
return 0;
|
||||
|
||||
@ -193,238 +176,68 @@ void xdl_free_env(xdfenv_t *xe) {
|
||||
}
|
||||
|
||||
|
||||
static bool xdl_clean_mmatch(uint8_t const *action, long i, long s, long e) {
|
||||
long r, rdis0, rpdis0, rdis1, rpdis1;
|
||||
|
||||
/*
|
||||
* Limits the window that is examined during the similar-lines
|
||||
* scan. The loops below stops when action[i - r] == KEEP
|
||||
* (line that has no match), but there are corner cases where
|
||||
* the loop proceed all the way to the extremities by causing
|
||||
* huge performance penalties in case of big files.
|
||||
*/
|
||||
if (i - s > XDL_SIMSCAN_WINDOW)
|
||||
s = i - XDL_SIMSCAN_WINDOW;
|
||||
if (e - i > XDL_SIMSCAN_WINDOW)
|
||||
e = i + XDL_SIMSCAN_WINDOW;
|
||||
|
||||
/*
|
||||
* Scans the lines before 'i' to find a run of lines that either
|
||||
* have no match (action[j] == DISCARD) or have multiple matches
|
||||
* (action[j] == INVESTIGATE). Note that we always call this
|
||||
* function with action[i] == INVESTIGATE, so the current line
|
||||
* (i) is already a multimatch line.
|
||||
*/
|
||||
for (r = 1, rdis0 = 0, rpdis0 = 1; (i - r) >= s; r++) {
|
||||
if (action[i - r] == DISCARD)
|
||||
rdis0++;
|
||||
else if (action[i - r] == INVESTIGATE)
|
||||
rpdis0++;
|
||||
else if (action[i - r] == KEEP)
|
||||
break;
|
||||
else
|
||||
BUG("Illegal value for action[i - r]");
|
||||
}
|
||||
/*
|
||||
* If the run before the line 'i' found only multimatch lines,
|
||||
* we return false and hence we don't make the current line (i)
|
||||
* discarded. We want to discard multimatch lines only when
|
||||
* they appear in the middle of runs with nomatch lines
|
||||
* (action[j] == DISCARD).
|
||||
*/
|
||||
if (rdis0 == 0)
|
||||
return 0;
|
||||
for (r = 1, rdis1 = 0, rpdis1 = 1; (i + r) <= e; r++) {
|
||||
if (action[i + r] == DISCARD)
|
||||
rdis1++;
|
||||
else if (action[i + r] == INVESTIGATE)
|
||||
rpdis1++;
|
||||
else if (action[i + r] == KEEP)
|
||||
break;
|
||||
else
|
||||
BUG("Illegal value for action[i + r]");
|
||||
}
|
||||
/*
|
||||
* If the run after the line 'i' found only multimatch lines,
|
||||
* we return false and hence we don't make the current line (i)
|
||||
* discarded.
|
||||
*/
|
||||
if (rdis1 == 0)
|
||||
return false;
|
||||
rdis1 += rdis0;
|
||||
rpdis1 += rpdis0;
|
||||
|
||||
return rpdis1 * XDL_KPDIS_RUN < (rpdis1 + rdis1);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Try to reduce the problem complexity, discard records that have no
|
||||
* matches on the other file. Also, lines that have multiple matches
|
||||
* might be potentially discarded if they appear in a run of discardable.
|
||||
*/
|
||||
static int xdl_cleanup_records(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xdf2) {
|
||||
long i, nm, mlim;
|
||||
xrecord_t *recs;
|
||||
xdlclass_t *rcrec;
|
||||
uint8_t *action1 = NULL, *action2 = NULL;
|
||||
bool need_min = !!(cf->flags & XDF_NEED_MINIMAL);
|
||||
int ret = 0;
|
||||
|
||||
/*
|
||||
* Create temporary arrays that will help us decide if
|
||||
* changed[i] should remain false, or become true.
|
||||
*/
|
||||
if (!XDL_CALLOC_ARRAY(action1, xdf1->nrec + 1)) {
|
||||
ret = -1;
|
||||
goto cleanup;
|
||||
}
|
||||
if (!XDL_CALLOC_ARRAY(action2, xdf2->nrec + 1)) {
|
||||
ret = -1;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/*
|
||||
* Initialize temporary arrays with DISCARD, KEEP, or INVESTIGATE.
|
||||
*/
|
||||
if ((mlim = xdl_bogosqrt((long)xdf1->nrec)) > XDL_MAX_EQLIMIT)
|
||||
mlim = XDL_MAX_EQLIMIT;
|
||||
for (i = xdf1->dstart, recs = &xdf1->recs[xdf1->dstart]; i <= xdf1->dend; i++, recs++) {
|
||||
rcrec = cf->rcrecs[recs->minimal_perfect_hash];
|
||||
nm = rcrec ? rcrec->len2 : 0;
|
||||
action1[i] = (nm == 0) ? DISCARD: (nm >= mlim && !need_min) ? INVESTIGATE: KEEP;
|
||||
}
|
||||
|
||||
if ((mlim = xdl_bogosqrt((long)xdf2->nrec)) > XDL_MAX_EQLIMIT)
|
||||
mlim = XDL_MAX_EQLIMIT;
|
||||
for (i = xdf2->dstart, recs = &xdf2->recs[xdf2->dstart]; i <= xdf2->dend; i++, recs++) {
|
||||
rcrec = cf->rcrecs[recs->minimal_perfect_hash];
|
||||
nm = rcrec ? rcrec->len1 : 0;
|
||||
action2[i] = (nm == 0) ? DISCARD: (nm >= mlim && !need_min) ? INVESTIGATE: KEEP;
|
||||
}
|
||||
|
||||
/*
|
||||
* Use temporary arrays to decide if changed[i] should remain
|
||||
* false, or become true.
|
||||
*/
|
||||
xdf1->nreff = 0;
|
||||
for (i = xdf1->dstart, recs = &xdf1->recs[xdf1->dstart];
|
||||
i <= xdf1->dend; i++, recs++) {
|
||||
if (action1[i] == KEEP ||
|
||||
(action1[i] == INVESTIGATE && !xdl_clean_mmatch(action1, i, xdf1->dstart, xdf1->dend))) {
|
||||
xdf1->reference_index[xdf1->nreff++] = i;
|
||||
/* changed[i] remains false, i.e. keep */
|
||||
} else
|
||||
xdf1->changed[i] = true;
|
||||
/* i.e. discard */
|
||||
}
|
||||
|
||||
xdf2->nreff = 0;
|
||||
for (i = xdf2->dstart, recs = &xdf2->recs[xdf2->dstart];
|
||||
i <= xdf2->dend; i++, recs++) {
|
||||
if (action2[i] == KEEP ||
|
||||
(action2[i] == INVESTIGATE && !xdl_clean_mmatch(action2, i, xdf2->dstart, xdf2->dend))) {
|
||||
xdf2->reference_index[xdf2->nreff++] = i;
|
||||
/* changed[i] remains false, i.e. keep */
|
||||
} else
|
||||
xdf2->changed[i] = true;
|
||||
/* i.e. discard */
|
||||
}
|
||||
|
||||
cleanup:
|
||||
xdl_free(action1);
|
||||
xdl_free(action2);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Early trim initial and terminal matching records.
|
||||
*/
|
||||
static int xdl_trim_ends(xdfile_t *xdf1, xdfile_t *xdf2) {
|
||||
long i, lim;
|
||||
xrecord_t *recs1, *recs2;
|
||||
static void xdl_trim_ends(xdfenv_t *xe)
|
||||
{
|
||||
size_t lim = XDL_MIN(xe->xdf1.nrec, xe->xdf2.nrec);
|
||||
|
||||
recs1 = xdf1->recs;
|
||||
recs2 = xdf2->recs;
|
||||
for (i = 0, lim = (long)XDL_MIN(xdf1->nrec, xdf2->nrec); i < lim;
|
||||
i++, recs1++, recs2++)
|
||||
if (recs1->minimal_perfect_hash != recs2->minimal_perfect_hash)
|
||||
for (size_t i = 0; i < lim; i++) {
|
||||
size_t mph1 = xe->xdf1.recs[i].minimal_perfect_hash;
|
||||
size_t mph2 = xe->xdf2.recs[i].minimal_perfect_hash;
|
||||
if (mph1 != mph2) {
|
||||
xe->delta_start = (ssize_t)i;
|
||||
lim -= i;
|
||||
break;
|
||||
|
||||
xdf1->dstart = xdf2->dstart = i;
|
||||
|
||||
recs1 = xdf1->recs + xdf1->nrec - 1;
|
||||
recs2 = xdf2->recs + xdf2->nrec - 1;
|
||||
for (lim -= i, i = 0; i < lim; i++, recs1--, recs2--)
|
||||
if (recs1->minimal_perfect_hash != recs2->minimal_perfect_hash)
|
||||
break;
|
||||
|
||||
xdf1->dend = (long)xdf1->nrec - i - 1;
|
||||
xdf2->dend = (long)xdf2->nrec - i - 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
static int xdl_optimize_ctxs(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xdf2) {
|
||||
|
||||
if (xdl_trim_ends(xdf1, xdf2) < 0 ||
|
||||
xdl_cleanup_records(cf, xdf1, xdf2) < 0) {
|
||||
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
for (size_t i = 0; i < lim; i++) {
|
||||
size_t mph1 = xe->xdf1.recs[xe->xdf1.nrec - 1 - i].minimal_perfect_hash;
|
||||
size_t mph2 = xe->xdf2.recs[xe->xdf2.nrec - 1 - i].minimal_perfect_hash;
|
||||
if (mph1 != mph2) {
|
||||
xe->delta_end = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int xdl_prepare_env(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp,
|
||||
xdfenv_t *xe) {
|
||||
long enl1, enl2, sample;
|
||||
xdlclassifier_t cf;
|
||||
|
||||
memset(&cf, 0, sizeof(cf));
|
||||
xe->delta_start = 0;
|
||||
xe->delta_end = 0;
|
||||
|
||||
/*
|
||||
* For histogram diff, we can afford a smaller sample size and
|
||||
* thus a poorer estimate of the number of lines, as the hash
|
||||
* table (rhash) won't be filled up/grown. The number of lines
|
||||
* (nrecs) will be updated correctly anyway by
|
||||
* xdl_prepare_ctx().
|
||||
*/
|
||||
sample = (XDF_DIFF_ALG(xpp->flags) == XDF_HISTOGRAM_DIFF
|
||||
? XDL_GUESS_NLINES2 : XDL_GUESS_NLINES1);
|
||||
if (xdl_prepare_ctx(mf1, &xe->xdf1, xpp->flags) < 0) {
|
||||
|
||||
enl1 = xdl_guess_lines(mf1, sample) + 1;
|
||||
enl2 = xdl_guess_lines(mf2, sample) + 1;
|
||||
|
||||
if (xdl_init_classifier(&cf, enl1 + enl2 + 1, xpp->flags) < 0)
|
||||
return -1;
|
||||
|
||||
if (xdl_prepare_ctx(1, mf1, enl1, xpp, &cf, &xe->xdf1) < 0) {
|
||||
|
||||
xdl_free_classifier(&cf);
|
||||
return -1;
|
||||
}
|
||||
if (xdl_prepare_ctx(2, mf2, enl2, xpp, &cf, &xe->xdf2) < 0) {
|
||||
if (xdl_prepare_ctx(mf2, &xe->xdf2, xpp->flags) < 0) {
|
||||
|
||||
xdl_free_ctx(&xe->xdf1);
|
||||
xdl_free_classifier(&cf);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if ((XDF_DIFF_ALG(xpp->flags) != XDF_PATIENCE_DIFF) &&
|
||||
(XDF_DIFF_ALG(xpp->flags) != XDF_HISTOGRAM_DIFF) &&
|
||||
xdl_optimize_ctxs(&cf, &xe->xdf1, &xe->xdf2) < 0) {
|
||||
|
||||
xdl_free_ctx(&xe->xdf2);
|
||||
xdl_free_ctx(&xe->xdf1);
|
||||
xdl_free_classifier(&cf);
|
||||
if (xdl_init_classifier(&cf, xe->xdf1.nrec + xe->xdf2.nrec + 1, xpp->flags) < 0)
|
||||
return -1;
|
||||
|
||||
for (size_t i = 0; i < xe->xdf1.nrec; i++) {
|
||||
xrecord_t *rec = &xe->xdf1.recs[i];
|
||||
xdl_classify_record(&cf, rec);
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < xe->xdf2.nrec; i++) {
|
||||
xrecord_t *rec = &xe->xdf2.recs[i];
|
||||
xdl_classify_record(&cf, rec);
|
||||
}
|
||||
|
||||
xe->mph_size = cf.count;
|
||||
xdl_free_classifier(&cf);
|
||||
|
||||
xdl_trim_ends(xe);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -48,7 +48,6 @@ typedef struct s_xrecord {
|
||||
typedef struct s_xdfile {
|
||||
xrecord_t *recs;
|
||||
size_t nrec;
|
||||
ptrdiff_t dstart, dend;
|
||||
bool *changed;
|
||||
size_t *reference_index;
|
||||
size_t nreff;
|
||||
@ -56,6 +55,8 @@ typedef struct s_xdfile {
|
||||
|
||||
typedef struct s_xdfenv {
|
||||
xdfile_t xdf1, xdf2;
|
||||
size_t delta_start, delta_end;
|
||||
size_t mph_size;
|
||||
} xdfenv_t;
|
||||
|
||||
|
||||
|
||||
@ -118,26 +118,6 @@ void *xdl_cha_alloc(chastore_t *cha) {
|
||||
return data;
|
||||
}
|
||||
|
||||
long xdl_guess_lines(mmfile_t *mf, long sample) {
|
||||
long nl = 0, size, tsize = 0;
|
||||
char const *data, *cur, *top;
|
||||
|
||||
if ((cur = data = xdl_mmfile_first(mf, &size))) {
|
||||
for (top = data + size; nl < sample && cur < top; ) {
|
||||
nl++;
|
||||
if (!(cur = memchr(cur, '\n', top - cur)))
|
||||
cur = top;
|
||||
else
|
||||
cur++;
|
||||
}
|
||||
tsize += (long) (cur - data);
|
||||
}
|
||||
|
||||
if (nl && tsize)
|
||||
nl = xdl_mmfile_size(mf) / (tsize / nl);
|
||||
|
||||
return nl + 1;
|
||||
}
|
||||
|
||||
int xdl_blankline(const char *line, long size, long flags)
|
||||
{
|
||||
|
||||
@ -31,7 +31,6 @@ int xdl_emit_diffrec(char const *rec, long size, char const *pre, long psize,
|
||||
int xdl_cha_init(chastore_t *cha, long isize, long icount);
|
||||
void xdl_cha_free(chastore_t *cha);
|
||||
void *xdl_cha_alloc(chastore_t *cha);
|
||||
long xdl_guess_lines(mmfile_t *mf, long sample);
|
||||
int xdl_blankline(const char *line, long size, long flags);
|
||||
int xdl_recmatch(const char *l1, long s1, const char *l2, long s2, long flags);
|
||||
uint64_t xdl_hash_record_verbatim(uint8_t const **data, uint8_t const *top);
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user