git/sha1-array.h
Jeff King 600bee4e70 oid_array: use size_t for count and allocation
The oid_array object uses an "int" to store the number of items and the
allocated size. It's rather unlikely for somebody to have more than 2^31
objects in a repository (the sha1's alone would be 40GB!), but if they
do, we'd overflow our alloc variable.

You can reproduce this case with something like:

  git init repo
  cd repo

  # make a pack with 2^24 objects
  perl -e '
    my $nr = 2**24;

    for (my $i = 0; $i < $nr; $i++) {
	    print "blob\n";
	    print "data 4\n";
	    print pack("N", $i);
    }
  ' | git fast-import

  # now make 256 copies of it; most of these objects will be duplicates,
  # but oid_array doesn't de-dup until all values are read and it can
  # sort the result.
  cd .git/objects/pack/
  pack=$(echo *.pack)
  idx=$(echo *.idx)
  for i in $(seq 0 255); do
    # no need to waste disk space
    ln "$pack" "pack-extra-$i.pack"
    ln "$idx" "pack-extra-$i.idx"
  done

  # and now force an oid_array to store all of it
  git cat-file --batch-all-objects --batch-check

which results in:

  fatal: size_t overflow: 32 * 18446744071562067968

So the good news is that st_mult() sees the problem (the large number is
because our int wraps negative, and then that gets cast to a size_t),
doing the job it was meant to: bailing in crazy situations rather than
causing an undersized buffer.

But we should avoid hitting this case at all, and instead limit
ourselves based on what malloc() is willing to give us. We can easily do
that by switching to size_t.

The cat-file process above made it to ~120GB virtual set size before the
integer overflow (our internal hash storage is 32 bytes now in
preparation for sha256, so we'd expect ~128GB total needed, plus
potentially more to copy from one realloc'd block to another). After
this patch (and about 130GB of RAM+swap), it does eventually read in the
whole set. No test for obvious reasons.

Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-03-30 10:59:08 -07:00

110 lines
3.6 KiB
C

#ifndef SHA1_ARRAY_H
#define SHA1_ARRAY_H
/**
* The API provides storage and manipulation of sets of object identifiers.
* The emphasis is on storage and processing efficiency, making them suitable
* for large lists. Note that the ordering of items is not preserved over some
* operations.
*
* Examples
* --------
* -----------------------------------------
* int print_callback(const struct object_id *oid,
* void *data)
* {
* printf("%s\n", oid_to_hex(oid));
* return 0; // always continue
* }
*
* void some_func(void)
* {
* struct oid_array hashes = OID_ARRAY_INIT;
* struct object_id oid;
*
* // Read objects into our set
* while (read_object_from_stdin(oid.hash))
* oid_array_append(&hashes, &oid);
*
* // Check if some objects are in our set
* while (read_object_from_stdin(oid.hash)) {
* if (oid_array_lookup(&hashes, &oid) >= 0)
* printf("it's in there!\n");
* }
*
* // Print the unique set of objects. We could also have
* // avoided adding duplicate objects in the first place,
* // but we would end up re-sorting the array repeatedly.
* // Instead, this will sort once and then skip duplicates
* // in linear time.
*
* oid_array_for_each_unique(&hashes, print_callback, NULL);
* }
*/
/**
 * A single array of object IDs.
 *
 * Always initialize an instance by assigning `OID_ARRAY_INIT` to it.
 * API callers are expected to touch only `oid` (the actual data) and `nr`
 * (the number of items in the set); `alloc` and `sorted` are used
 * internally and should not be needed by callers.
 */
struct oid_array {
	struct object_id *oid;	/* the actual data */
	size_t nr;		/* number of items in the set */
	size_t alloc;		/* internal: allocation bookkeeping */
	int sorted;		/* internal: presumably non-zero once `oid` is sorted */
};

/* Initializer for an empty `struct oid_array`: no storage, no items. */
#define OID_ARRAY_INIT { .oid = NULL, .nr = 0, .alloc = 0, .sorted = 0 }
/**
 * Add an item to the set.
 *
 * `array` is the set to add to and `oid` is the object ID to add. The
 * object ID will be placed at the end of the array (but note that some
 * operations below may lose this ordering).
 *
 * NOTE(review): duplicates do not appear to be rejected at append time
 * (callers that want each ID once use oid_array_for_each_unique()) --
 * confirm against the implementation.
 */
void oid_array_append(struct oid_array *array, const struct object_id *oid);
/**
 * Perform a binary search of the array for a specific object ID.
 *
 * `array` is the set to search and `oid` is the object ID to look for.
 *
 * If found, returns the offset (in number of elements) of the object ID.
 * If not found, returns a negative integer. If the array is not sorted,
 * this function has the side effect of sorting it, so any insertion order
 * is lost.
 *
 * NOTE(review): the return type is `int` while `struct oid_array` counts
 * its items in `size_t`, so an offset above INT_MAX cannot be represented
 * here -- confirm how the implementation behaves for arrays that large.
 */
int oid_array_lookup(struct oid_array *array, const struct object_id *oid);
/**
 * Free all memory associated with the array and return it to the initial,
 * empty state (presumably the same state `OID_ARRAY_INIT` produces --
 * confirm against the implementation).
 */
void oid_array_clear(struct oid_array *array);
/**
 * Callback type taken by oid_array_for_each(), oid_array_for_each_unique()
 * and oid_array_filter().
 *
 * `oid` is the entry currently being visited. `data` is presumably the
 * opaque pointer the caller handed to the iterating function -- verify
 * against the implementation.
 *
 * The meaning of the return value depends on the function the callback is
 * passed to: the iterators stop on a non-zero return and propagate it,
 * while the filter keeps the entry when the return is true.
 */
typedef int (*for_each_oid_fn)(const struct object_id *oid,
void *data);
/**
 * Iterate over each element of the list, executing the callback function
 * `fn` for each one, with `data` supplied for the callback's use.
 *
 * Does not sort the list, so any custom hash order is retained.
 *
 * If the callback returns a non-zero value, the iteration ends immediately
 * and the callback's return is propagated; otherwise, 0 is returned.
 */
int oid_array_for_each(struct oid_array *array,
for_each_oid_fn fn,
void *data);
/**
 * Iterate over each unique element of the list in sorted order, executing
 * the callback function `fn` for each one.
 *
 * Otherwise behaves like `oid_array_for_each`: a non-zero return from the
 * callback ends the iteration immediately and is propagated; otherwise 0
 * is returned.
 *
 * If the array is not sorted, this function has the side effect of sorting
 * it, so any insertion order is lost.
 */
int oid_array_for_each_unique(struct oid_array *array,
for_each_oid_fn fn,
void *data);
/**
 * Apply the callback function `want` to each entry in the array, retaining
 * only the entries for which the function returns true. Entries for which
 * it returns false are dropped from the array.
 *
 * The order of the entries that are retained is preserved.
 *
 * `cbdata` is presumably handed to `want` as its `data` argument -- verify
 * against the implementation.
 */
void oid_array_filter(struct oid_array *array,
for_each_oid_fn want,
void *cbdata);
#endif /* SHA1_ARRAY_H */