From b103881d4f4b157d86813ba5f91acd7ed6c888d0 Mon Sep 17 00:00:00 2001 From: Phillip Wood Date: Thu, 22 May 2025 16:55:20 +0100 Subject: [PATCH 1/4] midx repack: avoid integer overflow on 32 bit systems On a 32 bit system "git multi-pack-index --repack --batch-size=120M" failed with fatal: size_t overflow: 6038786 * 1289 The calculation to estimated size of the objects in the pack referenced by the multi-pack-index uses st_mult() to multiply the pack size by the number of referenced objects before dividing by the total number of objects in the pack. As size_t is 32 bits on 32 bit systems this calculation easily overflows. Fix this by using 64bit arithmetic instead. Also fix a potential overflow when caluculating the total size of the objects referenced by the multipack index with a batch size larger than SIZE_MAX / 2. In that case total_size += estimated_size can overflow as both total_size and estimated_size can be greater that SIZE_MAX / 2. This is addressed by using saturating arithmetic for the addition. Although estimated_size is of type uint64_t by the time we reach this sum it is bounded by the batch size which is of type size_t and so casting estimated_size to size_t does not truncate the value. Signed-off-by: Phillip Wood Signed-off-by: Junio C Hamano --- git-compat-util.h | 16 ++++++++++++++++ midx-write.c | 12 ++++++++---- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/git-compat-util.h b/git-compat-util.h index 36b9577c8d..4678e21c4c 100644 --- a/git-compat-util.h +++ b/git-compat-util.h @@ -668,6 +668,22 @@ static inline int cast_size_t_to_int(size_t a) return (int)a; } +static inline uint64_t u64_mult(uint64_t a, uint64_t b) +{ + if (unsigned_mult_overflows(a, b)) + die("uint64_t overflow: %"PRIuMAX" * %"PRIuMAX, + (uintmax_t)a, (uintmax_t)b); + return a * b; +} + +static inline uint64_t u64_add(uint64_t a, uint64_t b) +{ + if (unsigned_add_overflows(a, b)) + die("uint64_t overflow: %"PRIuMAX" + %"PRIuMAX, + (uintmax_t)a, (uintmax_t)b); + return a + b; +} + /* * Limit size of IO chunks, because huge chunks only cause pain. OS X * 64-bit is buggy, returning EINVAL if len >= INT_MAX; and even in diff --git a/midx-write.c b/midx-write.c index dd3b3070e5..105014a279 100644 --- a/midx-write.c +++ b/midx-write.c @@ -1699,19 +1699,23 @@ static void fill_included_packs_batch(struct repository *r, for (i = 0; total_size < batch_size && i < m->num_packs; i++) { int pack_int_id = pack_info[i].pack_int_id; struct packed_git *p = m->packs[pack_int_id]; - size_t expected_size; + uint64_t expected_size; if (!want_included_pack(r, m, pack_kept_objects, pack_int_id)) continue; - expected_size = st_mult(p->pack_size, - pack_info[i].referenced_objects); + expected_size = uint64_mult(p->pack_size, + pack_info[i].referenced_objects); expected_size /= p->num_objects; if (expected_size >= batch_size) continue; - total_size += expected_size; + if (unsigned_add_overflows(total_size, (size_t)expected_size)) + total_size = SIZE_MAX; + else + total_size += expected_size; + include_pack[pack_int_id] = 1; } From f874c0ed90c63276e0ebc445ad6fee5dcbfacb86 Mon Sep 17 00:00:00 2001 From: Phillip Wood Date: Thu, 22 May 2025 16:55:21 +0100 Subject: [PATCH 2/4] midx repack: avoid potential integer overflow on 64 bit systems On a 64 bit system the calculation p->pack_size * pack_info[i].referenced_objects could overflow. If a pack file contains 2^28 objects with an average compressed size of 1KB then the pack size will be 2^38B. If all of the objects are referenced by the multi-pack index the sum above will overflow. Avoid this by using shifted integer arithmetic and changing the order of the calculation so that the pack size is divided by the total number of objects in the pack before multiplying by the number of objects referenced by the multi-pack index. Using a shift of 14 bits should give reasonable accuracy while avoiding overflow for pack sizes less that 1PB. Signed-off-by: Phillip Wood Signed-off-by: Junio C Hamano --- midx-write.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/midx-write.c b/midx-write.c index 105014a279..8121e96f4f 100644 --- a/midx-write.c +++ b/midx-write.c @@ -1704,9 +1704,15 @@ static void fill_included_packs_batch(struct repository *r, if (!want_included_pack(r, m, pack_kept_objects, pack_int_id)) continue; - expected_size = uint64_mult(p->pack_size, - pack_info[i].referenced_objects); + /* + * Use shifted integer arithmetic to calculate the + * expected pack size to ~4 significant digits without + * overflow for packsizes less that 1PB. + */ + expected_size = (uint64_t)pack_info[i].referenced_objects << 14; expected_size /= p->num_objects; + expected_size = u64_mult(expected_size, p->pack_size); + expected_size = u64_add(expected_size, 1u << 13) >> 14; if (expected_size >= batch_size) continue; From 3aa98a61da6e1403081b4dfaa0c644614d228bac Mon Sep 17 00:00:00 2001 From: Phillip Wood Date: Thu, 22 May 2025 16:55:22 +0100 Subject: [PATCH 3/4] midx: avoid negative array index nth_midxed_pack_int_id() returns the index of the pack file in the multi pack index's list of packfiles that the specified object. The index is returned as a uint32_t. Storing this in an int will make the index negative if the most significant bit is set. Fix this by using uint32_t as the rest of the code does. This is unlikely to be a practical problem as it requires the multipack index to reference 2^31 packfiles. Signed-off-by: Phillip Wood Signed-off-by: Junio C Hamano --- midx-write.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/midx-write.c b/midx-write.c index 8121e96f4f..ba4a94950a 100644 --- a/midx-write.c +++ b/midx-write.c @@ -1566,7 +1566,7 @@ int expire_midx_packs(struct repository *r, const char *object_dir, unsigned fla _("Counting referenced objects"), m->num_objects); for (i = 0; i < m->num_objects; i++) { - int pack_int_id = nth_midxed_pack_int_id(m, i); + uint32_t pack_int_id = nth_midxed_pack_int_id(m, i); count[pack_int_id]++; display_progress(progress, i + 1); } @@ -1697,7 +1697,7 @@ static void fill_included_packs_batch(struct repository *r, total_size = 0; for (i = 0; total_size < batch_size && i < m->num_packs; i++) { - int pack_int_id = pack_info[i].pack_int_id; + uint32_t pack_int_id = pack_info[i].pack_int_id; struct packed_git *p = m->packs[pack_int_id]; uint64_t expected_size; From 70b128c57635183761df8a52a56f43dc30468ded Mon Sep 17 00:00:00 2001 From: Phillip Wood Date: Thu, 22 May 2025 16:55:23 +0100 Subject: [PATCH 4/4] midx docs: clarify tie breaking Clarify what happens when an object exists in more than one pack, but not in the preferred pack. "git multi-pack-index repack" relies on ties for objects that are not in the preferred pack being resolved in favor of the newest pack that contains a copy of the object. If ties were resolved in favor of the oldest pack as the current documentation suggests the multi-pack index would not reference any of the objects in the pack created by "git multi-pack-index repack". Helped-by: Taylor Blau Signed-off-by: Phillip Wood Signed-off-by: Junio C Hamano --- Documentation/git-multi-pack-index.adoc | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/Documentation/git-multi-pack-index.adoc b/Documentation/git-multi-pack-index.adoc index 631d5c7d15..b6cd0d7f85 100644 --- a/Documentation/git-multi-pack-index.adoc +++ b/Documentation/git-multi-pack-index.adoc @@ -38,10 +38,13 @@ write:: + -- --preferred-pack=:: - Optionally specify the tie-breaking pack used when - multiple packs contain the same object. `` must - contain at least one object. If not given, ties are - broken in favor of the pack with the lowest mtime. + When specified, break ties in favor of this pack when + there are additional copies of its objects in other + packs. Ties for objects not found in the preferred + pack are always resolved in favor of the copy in the + pack with the highest mtime. If unspecified, the pack + with the lowest mtime is used by default. The + preferred pack must have at least one object. --[no-]bitmap:: Control whether or not a multi-pack bitmap is written.