From dc660106ea8511e6adc44d2b70e9a4ae8b18090e Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Tue, 27 Jan 2026 15:56:44 +0200 Subject: [PATCH 01/11] index-pack, unpack-objects: use size_t for object size When unpacking objects from a packfile, the object size is decoded from a variable-length encoding. On platforms where unsigned long is 32-bit (such as Windows, even in 64-bit builds), the shift operation overflows when decoding sizes larger than 4GB. The result is a truncated size value, causing the unpacked object to be corrupted or rejected. Fix this by changing the size variable to size_t, which is 64-bit on 64-bit platforms, and ensuring the shift arithmetic occurs in 64-bit space. This was originally authored by LordKiRon , who preferred not to reveal their real name and therefore agreed that I take over authorship. Signed-off-by: Johannes Schindelin --- builtin/index-pack.c | 9 +++++---- builtin/unpack-objects.c | 5 +++-- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/builtin/index-pack.c b/builtin/index-pack.c index ca7784dc2c4969..cc660582e97a4d 100644 --- a/builtin/index-pack.c +++ b/builtin/index-pack.c @@ -37,7 +37,7 @@ static const char index_pack_usage[] = struct object_entry { struct pack_idx_entry idx; - unsigned long size; + size_t size; unsigned char hdr_size; signed char type; signed char real_type; @@ -469,7 +469,7 @@ static int is_delta_type(enum object_type type) return (type == OBJ_REF_DELTA || type == OBJ_OFS_DELTA); } -static void *unpack_entry_data(off_t offset, unsigned long size, +static void *unpack_entry_data(off_t offset, size_t size, enum object_type type, struct object_id *oid) { static char fixed_buf[8192]; @@ -524,7 +524,8 @@ static void *unpack_raw_entry(struct object_entry *obj, struct object_id *oid) { unsigned char *p; - unsigned long size, c; + size_t size; + unsigned long c; off_t base_offset; unsigned shift; void *data; @@ -542,7 +543,7 @@ static void *unpack_raw_entry(struct object_entry *obj, p = 
fill(1); c = *p; use(1); - size += (c & 0x7f) << shift; + size += ((size_t)c & 0x7f) << shift; shift += 7; } obj->size = size; diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c index e01cf6e360f6d1..59a36c2481a457 100644 --- a/builtin/unpack-objects.c +++ b/builtin/unpack-objects.c @@ -533,7 +533,8 @@ static void unpack_one(unsigned nr) { unsigned shift; unsigned char *pack; - unsigned long size, c; + size_t size; + unsigned long c; enum object_type type; obj_list[nr].offset = consumed_bytes; @@ -548,7 +549,7 @@ static void unpack_one(unsigned nr) pack = fill(1); c = *pack; use(1); - size += (c & 0x7f) << shift; + size += ((size_t)c & 0x7f) << shift; shift += 7; } From 92f4327b1fe09126dd6421b071a9071ce5530371 Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Thu, 29 Jan 2026 14:07:48 +0100 Subject: [PATCH 02/11] git-zlib: handle data streams larger than 4GB On Windows, zlib's `uLong` type is 32-bit even on 64-bit systems. When processing data streams larger than 4GB, the `total_in` and `total_out` fields in zlib's `z_stream` structure wrap around, which caused the sanity checks in `zlib_post_call()` to trigger `BUG()` assertions. The git_zstream wrapper now tracks its own 64-bit totals rather than copying them from zlib. The sanity checks compare only the low bits, using `maximum_unsigned_value_of_type(uLong)` to mask appropriately for the platform's `uLong` size. This is based on work by LordKiRon in git-for-windows#6076. 
Signed-off-by: Johannes Schindelin --- git-zlib.c | 25 +++++++++++++++++-------- git-zlib.h | 4 ++-- object-file.c | 2 +- 3 files changed, 20 insertions(+), 11 deletions(-) diff --git a/git-zlib.c b/git-zlib.c index df9604910e3fdf..b91cb323aee916 100644 --- a/git-zlib.c +++ b/git-zlib.c @@ -30,6 +30,9 @@ static const char *zerr_to_string(int status) */ /* #define ZLIB_BUF_MAX ((uInt)-1) */ #define ZLIB_BUF_MAX ((uInt) 1024 * 1024 * 1024) /* 1GB */ + +/* uLong is 32-bit on Windows, even on 64-bit systems */ +#define ULONG_MAX_VALUE maximum_unsigned_value_of_type(uLong) static inline uInt zlib_buf_cap(unsigned long len) { return (ZLIB_BUF_MAX < len) ? ZLIB_BUF_MAX : len; @@ -39,31 +42,37 @@ static void zlib_pre_call(git_zstream *s) { s->z.next_in = s->next_in; s->z.next_out = s->next_out; - s->z.total_in = s->total_in; - s->z.total_out = s->total_out; + s->z.total_in = (uLong)(s->total_in & ULONG_MAX_VALUE); + s->z.total_out = (uLong)(s->total_out & ULONG_MAX_VALUE); s->z.avail_in = zlib_buf_cap(s->avail_in); s->z.avail_out = zlib_buf_cap(s->avail_out); } static void zlib_post_call(git_zstream *s, int status) { - unsigned long bytes_consumed; - unsigned long bytes_produced; + size_t bytes_consumed; + size_t bytes_produced; bytes_consumed = s->z.next_in - s->next_in; bytes_produced = s->z.next_out - s->next_out; - if (s->z.total_out != s->total_out + bytes_produced) + /* + * zlib's total_out/total_in are uLong which may wrap for >4GB. + * We track our own totals and verify only the low bits match. + */ + if ((s->z.total_out & ULONG_MAX_VALUE) != + ((s->total_out + bytes_produced) & ULONG_MAX_VALUE)) BUG("total_out mismatch"); /* * zlib does not update total_in when it returns Z_NEED_DICT, * causing a mismatch here. Skip the sanity check in that case. 
*/ if (status != Z_NEED_DICT && - s->z.total_in != s->total_in + bytes_consumed) + (s->z.total_in & ULONG_MAX_VALUE) != + ((s->total_in + bytes_consumed) & ULONG_MAX_VALUE)) BUG("total_in mismatch"); - s->total_out = s->z.total_out; - s->total_in = s->z.total_in; + s->total_out += bytes_produced; + s->total_in += bytes_consumed; /* zlib-ng marks `next_in` as `const`, so we have to cast it away. */ s->next_in = (unsigned char *) s->z.next_in; s->next_out = s->z.next_out; diff --git a/git-zlib.h b/git-zlib.h index 0e66fefa8c9f05..44380e8ad38305 100644 --- a/git-zlib.h +++ b/git-zlib.h @@ -7,8 +7,8 @@ typedef struct git_zstream { struct z_stream_s z; unsigned long avail_in; unsigned long avail_out; - unsigned long total_in; - unsigned long total_out; + size_t total_in; + size_t total_out; unsigned char *next_in; unsigned char *next_out; } git_zstream; diff --git a/object-file.c b/object-file.c index 2acc9522df2daa..086b2b65ffe65e 100644 --- a/object-file.c +++ b/object-file.c @@ -1118,7 +1118,7 @@ int odb_source_loose_write_stream(struct odb_source *source, } while (ret == Z_OK || ret == Z_BUF_ERROR); if (stream.total_in != len + hdrlen) - die(_("write stream object %ld != %"PRIuMAX), stream.total_in, + die(_("write stream object %"PRIuMAX" != %"PRIuMAX), (uintmax_t)stream.total_in, (uintmax_t)len + hdrlen); /* From 3a539061c5f62c65d46bd0eb774bb1b1239463ff Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Wed, 28 Jan 2026 01:01:23 +0200 Subject: [PATCH 03/11] odb, packfile: use size_t for streaming object sizes The odb_read_stream structure uses unsigned long for the size field, which is 32-bit on Windows even in 64-bit builds. When streaming objects larger than 4GB, the size would be truncated to zero or an incorrect value, resulting in empty files being written to disk. Change the size field in odb_read_stream to size_t, and change unpack_object_header() and unpack_object_header_buffer() to return sizes via a size_t pointer.
Since object_info.sizep remains unsigned long for API compatibility, use temporary variables where the types differ, with comments noting the truncation limitation for code paths that still use unsigned long. This was originally authored by LordKiRon , who preferred not to reveal their real name and therefore agreed that I take over authorship. Signed-off-by: Johannes Schindelin --- builtin/pack-objects.c | 23 ++++++++++++++++------- object-file.c | 10 +++++++++- odb/streaming.c | 13 ++++++++++++- odb/streaming.h | 2 +- oss-fuzz/fuzz-pack-headers.c | 2 +- pack-bitmap.c | 2 +- pack-check.c | 6 ++++-- packfile.c | 24 +++++++++++++++--------- packfile.h | 4 ++-- 9 files changed, 61 insertions(+), 25 deletions(-) diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index dd2480a73d2edf..aa4b1cb9b8a6c1 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -629,14 +629,21 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry, struct packed_git *p = IN_PACK(entry); struct pack_window *w_curs = NULL; uint32_t pos; - off_t offset; + off_t offset, cur; enum object_type type = oe_type(entry); + enum object_type in_pack_type; off_t datalen; unsigned char header[MAX_PACK_OBJECT_HEADER], dheader[MAX_PACK_OBJECT_HEADER]; unsigned hdrlen; const unsigned hashsz = the_hash_algo->rawsz; - unsigned long entry_size = SIZE(entry); + size_t entry_size; + + cur = entry->in_pack_offset; + in_pack_type = unpack_object_header(p, &w_curs, &cur, &entry_size); + if (in_pack_type < 0) + die(_("write_reuse_object: unable to parse object header of %s"), + oid_to_hex(&entry->idx.oid)); if (DELTA(entry)) type = (allow_ofs_delta && DELTA(entry)->idx.offset) ? 
@@ -1087,7 +1094,7 @@ static void write_reused_pack_one(struct packed_git *reuse_packfile, { off_t offset, next, cur; enum object_type type; - unsigned long size; + size_t size; offset = pack_pos_to_offset(reuse_packfile, pos); next = pack_pos_to_offset(reuse_packfile, pos + 1); @@ -2243,7 +2250,7 @@ static void check_object(struct object_entry *entry, uint32_t object_index) off_t ofs; unsigned char *buf, c; enum object_type type; - unsigned long in_pack_size; + size_t in_pack_size; buf = use_pack(p, &w_curs, entry->in_pack_offset, &avail); @@ -2734,16 +2741,18 @@ unsigned long oe_get_size_slow(struct packing_data *pack, struct pack_window *w_curs; unsigned char *buf; enum object_type type; - unsigned long used, avail, size; + unsigned long used, avail; + size_t size; if (e->type_ != OBJ_OFS_DELTA && e->type_ != OBJ_REF_DELTA) { + unsigned long sz; packing_data_lock(&to_pack); if (odb_read_object_info(the_repository->objects, - &e->idx.oid, &size) < 0) + &e->idx.oid, &sz) < 0) die(_("unable to get size of %s"), oid_to_hex(&e->idx.oid)); packing_data_unlock(&to_pack); - return size; + return sz; } p = oe_in_pack(pack, e); diff --git a/object-file.c b/object-file.c index 086b2b65ffe65e..0be2981c7a1f43 100644 --- a/object-file.c +++ b/object-file.c @@ -2326,6 +2326,7 @@ int odb_source_loose_read_object_stream(struct odb_read_stream **out, struct object_info oi = OBJECT_INFO_INIT; struct odb_loose_read_stream *st; unsigned long mapsize; + unsigned long size_ul; void *mapped; mapped = odb_source_loose_map_object(source, oid, &mapsize); @@ -2349,11 +2350,18 @@ int odb_source_loose_read_object_stream(struct odb_read_stream **out, goto error; } - oi.sizep = &st->base.size; + /* + * object_info.sizep is unsigned long* (32-bit on Windows), but + * st->base.size is size_t (64-bit). Use temporary variable. + * Note: loose objects >4GB would still truncate here, but such + * large loose objects are uncommon (they'd normally be packed). 
+ */ + oi.sizep = &size_ul; oi.typep = &st->base.type; if (parse_loose_header(st->hdr, &oi) < 0 || st->base.type < 0) goto error; + st->base.size = size_ul; st->mapped = mapped; st->mapsize = mapsize; diff --git a/odb/streaming.c b/odb/streaming.c index 5927a12954ba59..af2adf5ce786d6 100644 --- a/odb/streaming.c +++ b/odb/streaming.c @@ -157,15 +157,26 @@ static int open_istream_incore(struct odb_read_stream **out, .base.read = read_istream_incore, }; struct odb_incore_read_stream *st; + unsigned long size_ul; int ret; oi.typep = &stream.base.type; - oi.sizep = &stream.base.size; + /* + * object_info.sizep is unsigned long* (32-bit on Windows), but + * stream.base.size is size_t (64-bit). We use a temporary variable + * because the types are incompatible. Note: this path still truncates + * for >4GB objects, but large objects should use pack streaming + * (packfile_store_read_object_stream) which handles size_t properly. + * This incore fallback is only used for small objects or when pack + * streaming is unavailable. 
+ */ + oi.sizep = &size_ul; oi.contentp = (void **)&stream.buf; ret = odb_read_object_info_extended(odb, oid, &oi, OBJECT_INFO_DIE_IF_CORRUPT); if (ret) return ret; + stream.base.size = size_ul; CALLOC_ARRAY(st, 1); *st = stream; diff --git a/odb/streaming.h b/odb/streaming.h index c7861f7e13c606..517e2ea2d3f5c3 100644 --- a/odb/streaming.h +++ b/odb/streaming.h @@ -21,7 +21,7 @@ struct odb_read_stream { odb_read_stream_close_fn close; odb_read_stream_read_fn read; enum object_type type; - unsigned long size; /* inflated size of full object */ + size_t size; /* inflated size of full object */ }; /* diff --git a/oss-fuzz/fuzz-pack-headers.c b/oss-fuzz/fuzz-pack-headers.c index 150c0f5fa2d7ec..ef61ab577c5098 100644 --- a/oss-fuzz/fuzz-pack-headers.c +++ b/oss-fuzz/fuzz-pack-headers.c @@ -6,7 +6,7 @@ int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size); int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { enum object_type type; - unsigned long len; + size_t len; unpack_object_header_buffer((const unsigned char *)data, (unsigned long)size, &type, &len); diff --git a/pack-bitmap.c b/pack-bitmap.c index f6ec18d83afe21..f9af8a96bdf4ee 100644 --- a/pack-bitmap.c +++ b/pack-bitmap.c @@ -2270,7 +2270,7 @@ static int try_partial_reuse(struct bitmap_index *bitmap_git, { off_t delta_obj_offset; enum object_type type; - unsigned long size; + size_t size; if (pack_pos >= pack->p->num_objects) return -1; /* not actually in the pack */ diff --git a/pack-check.c b/pack-check.c index 79992bb509f473..2792f34d2595bf 100644 --- a/pack-check.c +++ b/pack-check.c @@ -110,7 +110,7 @@ static int verify_packfile(struct repository *r, void *data; struct object_id oid; enum object_type type; - unsigned long size; + size_t size; off_t curpos; int data_valid; @@ -143,7 +143,9 @@ static int verify_packfile(struct repository *r, data = NULL; data_valid = 0; } else { - data = unpack_entry(r, p, entries[i].offset, &type, &size); + unsigned long sz; + data = unpack_entry(r, p, 
entries[i].offset, &type, &sz); + size = sz; data_valid = 1; } diff --git a/packfile.c b/packfile.c index b012d648adaf2e..fdae91dd110682 100644 --- a/packfile.c +++ b/packfile.c @@ -1133,7 +1133,7 @@ int packfile_store_count_objects(struct packfile_store *store, } unsigned long unpack_object_header_buffer(const unsigned char *buf, - unsigned long len, enum object_type *type, unsigned long *sizep) + unsigned long len, enum object_type *type, size_t *sizep) { unsigned shift; size_t size, c; @@ -1144,7 +1144,11 @@ unsigned long unpack_object_header_buffer(const unsigned char *buf, size = c & 15; shift = 4; while (c & 0x80) { - if (len <= used || (bitsizeof(long) - 7) < shift) { + /* + * Each continuation byte adds 7 bits. Ensure shift won't + * overflow size_t (use size_t not long for 64-bit on Windows). + */ + if (len <= used || (bitsizeof(size_t) - 7) < shift) { error("bad object header"); size = used = 0; break; @@ -1153,7 +1157,7 @@ unsigned long unpack_object_header_buffer(const unsigned char *buf, size = st_add(size, st_left_shift(c & 0x7f, shift)); shift += 7; } - *sizep = cast_size_t_to_ulong(size); + *sizep = size; return used; } @@ -1215,7 +1219,7 @@ unsigned long get_size_from_delta(struct packed_git *p, int unpack_object_header(struct packed_git *p, struct pack_window **w_curs, off_t *curpos, - unsigned long *sizep) + size_t *sizep) { unsigned char *base; unsigned long left; @@ -1367,7 +1371,7 @@ static enum object_type packed_to_object_type(struct repository *r, while (type == OBJ_OFS_DELTA || type == OBJ_REF_DELTA) { off_t base_offset; - unsigned long size; + size_t size; /* Push the object we're going to leave behind */ if (poi_stack_nr >= poi_stack_alloc && poi_stack == small_poi_stack) { poi_stack_alloc = alloc_nr(poi_stack_nr); @@ -1586,7 +1590,7 @@ static int packed_object_info_with_index_pos(struct packed_git *p, off_t obj_off uint32_t *maybe_index_pos, struct object_info *oi) { struct pack_window *w_curs = NULL; - unsigned long size; + size_t 
size; off_t curpos = obj_offset; enum object_type type = OBJ_NONE; uint32_t pack_pos; @@ -1778,7 +1782,7 @@ void *unpack_entry(struct repository *r, struct packed_git *p, off_t obj_offset, struct pack_window *w_curs = NULL; off_t curpos = obj_offset; void *data = NULL; - unsigned long size; + size_t size; enum object_type type; struct unpack_entry_stack_ent small_delta_stack[UNPACK_ENTRY_STACK_PREALLOC]; struct unpack_entry_stack_ent *delta_stack = small_delta_stack; @@ -1943,8 +1947,10 @@ void *unpack_entry(struct repository *r, struct packed_git *p, off_t obj_offset, (uintmax_t)curpos, p->pack_name); data = NULL; } else { + unsigned long sz; data = patch_delta(base, base_size, delta_data, - delta_size, &size); + delta_size, &sz); + size = sz; /* * We could not apply the delta; warn the user, but @@ -2929,7 +2935,7 @@ int packfile_read_object_stream(struct odb_read_stream **out, struct odb_packed_read_stream *stream; struct pack_window *window = NULL; enum object_type in_pack_type; - unsigned long size; + size_t size; in_pack_type = unpack_object_header(pack, &window, &offset, &size); unuse_pack(&window); diff --git a/packfile.h b/packfile.h index 9b647da7dda7c1..49d6bdecf6ea18 100644 --- a/packfile.h +++ b/packfile.h @@ -456,9 +456,9 @@ off_t find_pack_entry_one(const struct object_id *oid, struct packed_git *); int is_pack_valid(struct packed_git *); void *unpack_entry(struct repository *r, struct packed_git *, off_t, enum object_type *, unsigned long *); -unsigned long unpack_object_header_buffer(const unsigned char *buf, unsigned long len, enum object_type *type, unsigned long *sizep); +unsigned long unpack_object_header_buffer(const unsigned char *buf, unsigned long len, enum object_type *type, size_t *sizep); unsigned long get_size_from_delta(struct packed_git *, struct pack_window **, off_t); -int unpack_object_header(struct packed_git *, struct pack_window **, off_t *, unsigned long *); +int unpack_object_header(struct packed_git *, struct pack_window **, 
off_t *, size_t *); off_t get_delta_base(struct packed_git *p, struct pack_window **w_curs, off_t *curpos, enum object_type type, off_t delta_obj_offset); From 3274cba862ae42a6813710410274a692ec0f5d29 Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Thu, 29 Jan 2026 10:18:01 +0200 Subject: [PATCH 04/11] delta, packfile: use size_t for delta header sizes The delta header decoding functions return unsigned long, which truncates on Windows for objects larger than 4GB. Introduce size_t variants get_delta_hdr_size_sz() and get_size_from_delta_sz() that preserve the full 64-bit size, and use them in packed_object_info_with_index_pos() where the size is needed for streaming decisions. This was originally authored by LordKiRon , who preferred not to reveal their real name and therefore agreed that I take over authorship. Signed-off-by: Johannes Schindelin --- delta.h | 14 ++++++++++++-- packfile.c | 33 ++++++++++++++++++++++++--------- 2 files changed, 36 insertions(+), 11 deletions(-) diff --git a/delta.h b/delta.h index 8a56ec07992c75..fad68cfc45f6f4 100644 --- a/delta.h +++ b/delta.h @@ -86,8 +86,11 @@ void *patch_delta(const void *src_buf, unsigned long src_size, * This must be called twice on the delta data buffer, first to get the * expected source buffer size, and again to get the target buffer size. */ -static inline unsigned long get_delta_hdr_size(const unsigned char **datap, - const unsigned char *top) +/* + * Size_t variant that doesn't truncate - use for >4GB objects on Windows.
+ */ +static inline size_t get_delta_hdr_size_sz(const unsigned char **datap, + const unsigned char *top) { const unsigned char *data = *datap; size_t cmd, size = 0; @@ -98,6 +101,13 @@ static inline unsigned long get_delta_hdr_size(const unsigned char **datap, i += 7; } while (cmd & 0x80 && data < top); *datap = data; + return size; +} + +static inline unsigned long get_delta_hdr_size(const unsigned char **datap, + const unsigned char *top) +{ + size_t size = get_delta_hdr_size_sz(datap, top); return cast_size_t_to_ulong(size); } diff --git a/packfile.c b/packfile.c index fdae91dd110682..4208f53046b630 100644 --- a/packfile.c +++ b/packfile.c @@ -1161,9 +1161,12 @@ unsigned long unpack_object_header_buffer(const unsigned char *buf, return used; } -unsigned long get_size_from_delta(struct packed_git *p, - struct pack_window **w_curs, - off_t curpos) +/* + * Size_t variant for >4GB delta results on Windows. + */ +static size_t get_size_from_delta_sz(struct packed_git *p, + struct pack_window **w_curs, + off_t curpos) { const unsigned char *data; unsigned char delta_head[20], *in; @@ -1210,10 +1213,18 @@ unsigned long get_size_from_delta(struct packed_git *p, data = delta_head; /* ignore base size */ - get_delta_hdr_size(&data, delta_head+sizeof(delta_head)); + get_delta_hdr_size_sz(&data, delta_head+sizeof(delta_head)); /* Read the result size */ - return get_delta_hdr_size(&data, delta_head+sizeof(delta_head)); + return get_delta_hdr_size_sz(&data, delta_head+sizeof(delta_head)); +} + +unsigned long get_size_from_delta(struct packed_git *p, + struct pack_window **w_curs, + off_t curpos) +{ + size_t size = get_size_from_delta_sz(p, w_curs, curpos); + return cast_size_t_to_ulong(size); } int unpack_object_header(struct packed_git *p, @@ -1618,14 +1629,18 @@ static int packed_object_info_with_index_pos(struct packed_git *p, off_t obj_off ret = -1; goto out; } - *oi->sizep = get_size_from_delta(p, &w_curs, tmp_pos); - if (*oi->sizep == 0) { + /* + * Use size_t variant 
to avoid die() on >4GB deltas. + * oi->sizep is unsigned long, so truncation may occur, + * but streaming code uses its own size_t tracking. + */ + size = get_size_from_delta_sz(p, &w_curs, tmp_pos); + if (size == 0) { ret = -1; goto out; } - } else { - *oi->sizep = size; } + *oi->sizep = (unsigned long)size; } if (oi->disk_sizep || (oi->mtimep && p->is_cruft)) { From afa74a3a2b9caf9989055a9311309f590729d6c1 Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Thu, 6 Jul 2023 13:37:46 +0200 Subject: [PATCH 05/11] test-tool: add a helper to synthesize large packfiles To test Git's behavior with very large pack files, we need a way to generate such files quickly. A naive approach using only readily-available Git commands would take over 10 hours for a 4GB pack file, which is prohibitive. Side-stepping Git's machinery and actual zlib compression by writing uncompressed content with the appropriate zlib header makes things much faster. The fastest method using this approach generates many small, unreachable blob objects and takes about 1.5 minutes for 4GB. However, this cannot be used because we need to test git clone, which requires a reachable commit history. Generating many reachable commits with small, uncompressed blobs takes about 4 minutes for 4GB. But this approach 1) does not reproduce the issues we want to fix (which require individual objects larger than 4GB) and 2) is comparatively slow because of the many SHA-1 calculations. The approach taken here generates a single large blob (filled with NUL bytes), along with the trees and commits needed to make it reachable. This takes about 2.5 minutes for 4.5GB, which is the fastest option that produces a valid, clonable repository with an object large enough to trigger the bugs we want to test. 
Signed-off-by: Johannes Schindelin --- Makefile | 1 + compat/zlib-compat.h | 2 + t/helper/meson.build | 1 + t/helper/test-synthesize.c | 250 +++++++++++++++++++++++++++++++++++++ t/helper/test-tool.c | 1 + t/helper/test-tool.h | 1 + 6 files changed, 256 insertions(+) create mode 100644 t/helper/test-synthesize.c diff --git a/Makefile b/Makefile index cedc234173e377..85405cb5b8d148 100644 --- a/Makefile +++ b/Makefile @@ -872,6 +872,7 @@ TEST_BUILTINS_OBJS += test-submodule-config.o TEST_BUILTINS_OBJS += test-submodule-nested-repo-config.o TEST_BUILTINS_OBJS += test-submodule.o TEST_BUILTINS_OBJS += test-subprocess.o +TEST_BUILTINS_OBJS += test-synthesize.o TEST_BUILTINS_OBJS += test-trace2.o TEST_BUILTINS_OBJS += test-truncate.o TEST_BUILTINS_OBJS += test-userdiff.o diff --git a/compat/zlib-compat.h b/compat/zlib-compat.h index ac0827662298af..5078c5ef6ce0e8 100644 --- a/compat/zlib-compat.h +++ b/compat/zlib-compat.h @@ -7,6 +7,8 @@ # define z_stream_s zng_stream_s # define gz_header_s zng_gz_header_s +# define adler32(adler, buf, len) zng_adler32(adler, buf, len) + # define crc32(crc, buf, len) zng_crc32(crc, buf, len) # define inflate(strm, bits) zng_inflate(strm, bits) diff --git a/t/helper/meson.build b/t/helper/meson.build index 675e64c0101b61..3235f10ab8aae1 100644 --- a/t/helper/meson.build +++ b/t/helper/meson.build @@ -69,6 +69,7 @@ test_tool_sources = [ 'test-submodule-nested-repo-config.c', 'test-submodule.c', 'test-subprocess.c', + 'test-synthesize.c', 'test-tool.c', 'test-trace2.c', 'test-truncate.c', diff --git a/t/helper/test-synthesize.c b/t/helper/test-synthesize.c new file mode 100644 index 00000000000000..3ce7078078efff --- /dev/null +++ b/t/helper/test-synthesize.c @@ -0,0 +1,250 @@ +#define USE_THE_REPOSITORY_VARIABLE + +#include "test-tool.h" +#include "git-compat-util.h" +#include "git-zlib.h" +#include "hash.h" +#include "hex.h" +#include "object-file.h" +#include "object.h" +#include "pack.h" +#include "parse-options.h" +#include "parse.h" 
+#include "repository.h" +#include "setup.h" +#include "strbuf.h" +#include "write-or-die.h" + +#define BLOCK_SIZE 0xffff +static const unsigned char zeros[BLOCK_SIZE]; + +/* + * Write data as an uncompressed zlib stream. + * For data larger than 64KB, writes multiple uncompressed blocks. + * If data is NULL, writes zeros. + * Updates the pack checksum context. + */ +static void write_uncompressed_zlib(FILE *f, struct git_hash_ctx *pack_ctx, + const void *data, size_t len, + const struct git_hash_algo *algo) +{ + unsigned char zlib_header[2] = { 0x78, 0x01 }; /* CMF, FLG */ + unsigned char block_header[5]; + const unsigned char *p = data; + size_t remaining = len; + uint32_t adler = 1L; /* adler32 initial value */ + unsigned char adler_buf[4]; + + /* Write zlib header */ + fwrite_or_die(f, zlib_header, sizeof(zlib_header)); + algo->update_fn(pack_ctx, zlib_header, 2); + + /* Write uncompressed blocks (max 64KB each) */ + do { + size_t block_len = remaining > BLOCK_SIZE ? BLOCK_SIZE : remaining; + int is_final = (block_len == remaining); + const unsigned char *block_data = data ? p : zeros; + + block_header[0] = is_final ? 0x01 : 0x00; + block_header[1] = block_len & 0xff; + block_header[2] = (block_len >> 8) & 0xff; + block_header[3] = block_header[1] ^ 0xff; + block_header[4] = block_header[2] ^ 0xff; + + fwrite_or_die(f, block_header, sizeof(block_header)); + algo->update_fn(pack_ctx, block_header, 5); + + if (block_len) { + fwrite_or_die(f, block_data, block_len); + algo->update_fn(pack_ctx, block_data, block_len); + adler = adler32(adler, block_data, block_len); + } + + if (data) + p += block_len; + remaining -= block_len; + } while (remaining > 0); + + /* Write adler32 checksum */ + put_be32(adler_buf, adler); + fwrite_or_die(f, adler_buf, sizeof(adler_buf)); + algo->update_fn(pack_ctx, adler_buf, 4); +} + +/* + * Write an uncompressed object to the pack file. + * If `data == NULL`, it is treated like a buffer to NUL bytes. 
+ * Updates the pack checksum context. + */ +static void write_pack_object(FILE *f, struct git_hash_ctx *pack_ctx, + enum object_type type, + const void *data, size_t len, + struct object_id *oid, + const struct git_hash_algo *algo) +{ + unsigned char pack_header[MAX_PACK_OBJECT_HEADER]; + char object_header[32]; + int pack_header_len, object_header_len; + struct git_hash_ctx ctx; + + /* Write pack object header */ + pack_header_len = encode_in_pack_object_header(pack_header, + sizeof(pack_header), + type, len); + fwrite_or_die(f, pack_header, pack_header_len); + algo->update_fn(pack_ctx, pack_header, pack_header_len); + + /* Write the data as uncompressed zlib */ + write_uncompressed_zlib(f, pack_ctx, data, len, algo); + + algo->init_fn(&ctx); + object_header_len = format_object_header(object_header, + sizeof(object_header), + type, len); + algo->update_fn(&ctx, object_header, object_header_len); + if (data) + algo->update_fn(&ctx, data, len); + else { + for (size_t i = len / BLOCK_SIZE; i; i--) + algo->update_fn(&ctx, zeros, BLOCK_SIZE); + algo->update_fn(&ctx, zeros, len % BLOCK_SIZE); + } + algo->final_oid_fn(oid, &ctx); +} + +/* + * Generate a pack file with a single large (>4GB) reachable object. + * + * Creates: + * 1. A large blob (all NUL bytes) + * 2. A tree containing that blob as "file" + * 3. A commit using that tree + * 4. The empty tree + * 5. A child commit using the empty tree + * + * This is useful for testing that Git can handle objects larger than 4GB. 
+ */ +static int generate_pack_with_large_object(const char *path, size_t blob_size, + const struct git_hash_algo *algo) +{ + FILE *f = xfopen(path, "wb"); + struct git_hash_ctx pack_ctx; + unsigned char pack_hash[GIT_MAX_RAWSZ]; + struct object_id blob_oid, tree_oid, commit_oid, empty_tree_oid, final_commit_oid; + struct strbuf buf = STRBUF_INIT; + const uint32_t object_count = 5; + struct pack_header pack_header = { + .hdr_signature = htonl(PACK_SIGNATURE), + .hdr_version = htonl(PACK_VERSION), + .hdr_entries = htonl(object_count), + }; + + algo->init_fn(&pack_ctx); + + /* Write pack header */ + fwrite_or_die(f, &pack_header, sizeof(pack_header)); + algo->update_fn(&pack_ctx, &pack_header, sizeof(pack_header)); + + /* 1. Write the large blob */ + write_pack_object(f, &pack_ctx, OBJ_BLOB, NULL, blob_size, &blob_oid, algo); + + /* 2. Write tree containing the blob as "file" */ + strbuf_addf(&buf, "100644 file%c", '\0'); + strbuf_add(&buf, blob_oid.hash, algo->rawsz); + write_pack_object(f, &pack_ctx, OBJ_TREE, buf.buf, buf.len, &tree_oid, algo); + + /* 3. Write commit using that tree */ + strbuf_reset(&buf); + strbuf_addf(&buf, + "tree %s\n" + "author A U Thor 1234567890 +0000\n" + "committer C O Mitter 1234567890 +0000\n" + "\n" + "Large blob commit\n", + oid_to_hex(&tree_oid)); + write_pack_object(f, &pack_ctx, OBJ_COMMIT, buf.buf, buf.len, &commit_oid, algo); + + /* 4. Write the empty tree */ + write_pack_object(f, &pack_ctx, OBJ_TREE, "", 0, &empty_tree_oid, algo); + + /* 5. 
Write final commit using empty tree, with previous commit as parent */ + strbuf_reset(&buf); + strbuf_addf(&buf, + "tree %s\n" + "parent %s\n" + "author A U Thor 1234567890 +0000\n" + "committer C O Mitter 1234567890 +0000\n" + "\n" + "Empty tree commit\n", + oid_to_hex(&empty_tree_oid), + oid_to_hex(&commit_oid)); + write_pack_object(f, &pack_ctx, OBJ_COMMIT, buf.buf, buf.len, &final_commit_oid, algo); + + /* Write pack trailer (checksum) */ + algo->final_fn(pack_hash, &pack_ctx); + fwrite_or_die(f, pack_hash, algo->rawsz); + if (fclose(f)) + die_errno(_("could not close '%s'"), path); + + strbuf_release(&buf); + + /* Print the final commit OID so caller can set up refs */ + printf("%s\n", oid_to_hex(&final_commit_oid)); + + return 0; +} + +static int cmd__synthesize__pack(int argc, const char **argv, + const char *prefix UNUSED, + struct repository *repo) +{ + int non_git; + int reachable_large = 0; + const struct git_hash_algo *algo; + size_t blob_size; + uintmax_t blob_size_u; + const char *path; + const char * const usage[] = { + "test-tool synthesize pack " + "--reachable-large ", + NULL + }; + struct option options[] = { + OPT_BOOL(0, "reachable-large", &reachable_large, + N_("write a pack with a single reachable large blob")), + OPT_END() + }; + + setup_git_directory_gently(&non_git); + repo = the_repository; + algo = repo->hash_algo; + + argc = parse_options(argc, argv, NULL, options, usage, + PARSE_OPT_KEEP_ARGV0); + if (argc != 3 || !reachable_large) + usage_with_options(usage, options); + + if (!git_parse_unsigned(argv[1], &blob_size_u, + maximum_unsigned_value_of_type(size_t))) + die(_("'%s' is not a valid blob size"), argv[1]); + blob_size = blob_size_u; + path = argv[2]; + + return !!generate_pack_with_large_object(path, blob_size, algo); +} + +int cmd__synthesize(int argc, const char **argv) +{ + const char *prefix = NULL; + char const * const synthesize_usage[] = { + "test-tool synthesize pack ", + NULL, + }; + parse_opt_subcommand_fn *fn = NULL; + 
struct option options[] = { + OPT_SUBCOMMAND("pack", &fn, cmd__synthesize__pack), + OPT_END() + }; + argc = parse_options(argc, argv, prefix, options, synthesize_usage, 0); + return !!fn(argc, argv, prefix, NULL); +} diff --git a/t/helper/test-tool.c b/t/helper/test-tool.c index a7abc618b3887e..b71a22b43bbc9e 100644 --- a/t/helper/test-tool.c +++ b/t/helper/test-tool.c @@ -82,6 +82,7 @@ static struct test_cmd cmds[] = { { "submodule-config", cmd__submodule_config }, { "submodule-nested-repo-config", cmd__submodule_nested_repo_config }, { "subprocess", cmd__subprocess }, + { "synthesize", cmd__synthesize }, { "trace2", cmd__trace2 }, { "truncate", cmd__truncate }, { "userdiff", cmd__userdiff }, diff --git a/t/helper/test-tool.h b/t/helper/test-tool.h index 7f150fa1eb9ad2..f2885b33d58aa8 100644 --- a/t/helper/test-tool.h +++ b/t/helper/test-tool.h @@ -75,6 +75,7 @@ int cmd__submodule(int argc, const char **argv); int cmd__submodule_config(int argc, const char **argv); int cmd__submodule_nested_repo_config(int argc, const char **argv); int cmd__subprocess(int argc, const char **argv); +int cmd__synthesize(int argc, const char **argv); int cmd__trace2(int argc, const char **argv); int cmd__truncate(int argc, const char **argv); int cmd__userdiff(int argc, const char **argv); From a3019888d8465e0f77926a91a20db170fef6989d Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Thu, 29 Jan 2026 14:07:14 +0100 Subject: [PATCH 06/11] t5608: add regression test for >4GB object clone The shift overflow bug in index-pack and unpack-objects caused incorrect object size calculation when the encoded size required more than 32 bits of shift. This would result in corrupted or failed unpacking of objects larger than 4GB. Add a test that creates a pack file containing a 4GB+ blob using the new 'test-tool synthesize pack --reachable-large' command, then clones the repository to verify the fix works correctly. 
Signed-off-by: Johannes Schindelin --- t/t5608-clone-2gb.sh | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/t/t5608-clone-2gb.sh b/t/t5608-clone-2gb.sh index 87a8cd9f98381a..af93302ddec1cf 100755 --- a/t/t5608-clone-2gb.sh +++ b/t/t5608-clone-2gb.sh @@ -49,4 +49,41 @@ test_expect_success 'clone - with worktree, file:// protocol' ' ' +test_expect_success SIZE_T_IS_64BIT 'set up repo with >4GB object' ' + large_blob_size=$((4*1024*1024*1024+1)) && + git init --bare 4gb-repo && + head_oid=$(test-tool synthesize pack \ + --reachable-large "$large_blob_size" \ + 4gb-repo/objects/pack/test.pack) && + git -C 4gb-repo index-pack objects/pack/test.pack && + git -C 4gb-repo update-ref refs/heads/main $head_oid && + git -C 4gb-repo symbolic-ref HEAD refs/heads/main +' + +test_expect_success SIZE_T_IS_64BIT 'clone >4GB object via unpack-objects' ' + # The synthesized pack has five objects, so a large unpack limit keeps + # fetch-pack on the unpack-objects path. + git -c fetch.unpackLimit=100 clone --bare \ + "file://$(pwd)/4gb-repo" 4gb-clone-unpack && + + # Verify the large blob survived the clone by comparing its OID + # between source and clone. We cannot use "cat-file -s" because + # object_info.sizep is still unsigned long, which truncates >4GB + # sizes on Windows. OID equality proves content integrity since + # the clone already verified checksums via index-pack/unpack-objects. + source_blob=$(git -C 4gb-repo rev-parse main^:file) && + clone_blob=$(git -C 4gb-clone-unpack rev-parse main^:file) && + test "$source_blob" = "$clone_blob" +' + +test_expect_success SIZE_T_IS_64BIT 'clone with >4GB object via index-pack' ' + # Force fetch-pack to hand the pack to index-pack instead. 
+ git -c fetch.unpackLimit=1 clone --bare \ + "file://$(pwd)/4gb-repo" 4gb-clone-index && + + source_blob=$(git -C 4gb-repo rev-parse main^:file) && + clone_blob=$(git -C 4gb-clone-index rev-parse main^:file) && + test "$source_blob" = "$clone_blob" +' + test_done From 859e93e7a9f1d5ba965dc2b9891b5885a6c167ef Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Sun, 3 May 2026 09:56:06 +0200 Subject: [PATCH 07/11] test-tool synthesize: use the unsafe hash for speed Jeff King pointed out on the mailing list [1] that t5608's new >4GB test cases dominate the entire test suite runtime: 160 seconds on his laptop when the rest of the suite finishes in under 90 seconds, and 305-850 seconds across CI jobs. The bottleneck is that the synthesize helper hashes roughly 8 GB of data through SHA-1 (4 GB for the pack checksum plus 4 GB for the blob OID) for a 4 GB+1 blob. Since the helper generates known test data, collision detection is unnecessary. Switch from repo->hash_algo to unsafe_hash_algo(), which uses hardware-accelerated SHA-1 (via OpenSSL or Apple CommonCrypto) when available. Benchmarks on an x86_64 machine generating a 4 GB+1 pack (2 runs each, interleaved): SHA-1 backend Run 1 Run 2 SHA1DC (safe) 75s 80s OpenSSL (unsafe) 21s 19s The effect scales linearly. At 64 MB with 10 randomized interleaved runs, the OpenSSL unsafe backend shows a 5.4x improvement (median 0.202s vs 1.088s) with tight variance (stdev 0.028s vs 0.095s). The speedup is only realized when the build has a fast unsafe backend compiled in. The CI's linux-TEST-vars job already sets OPENSSL_SHA1_UNSAFE=YesPlease; macOS benefits from Apple CommonCrypto when configured. On builds without a separate unsafe backend (such as the default Windows builds), unsafe_hash_algo() returns the regular collision-detecting implementation and the change is a no-op. 
[1] https://lore.kernel.org/git/20260501063805.GA2038915@coredump.intra.peff.net/ Assisted-by: Claude Opus 4.6 Signed-off-by: Johannes Schindelin --- t/helper/test-synthesize.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/t/helper/test-synthesize.c b/t/helper/test-synthesize.c index 3ce7078078efff..e2faaad7b436df 100644 --- a/t/helper/test-synthesize.c +++ b/t/helper/test-synthesize.c @@ -217,7 +217,7 @@ static int cmd__synthesize__pack(int argc, const char **argv, setup_git_directory_gently(&non_git); repo = the_repository; - algo = repo->hash_algo; + algo = unsafe_hash_algo(repo->hash_algo); argc = parse_options(argc, argv, NULL, options, usage, PARSE_OPT_KEEP_ARGV0); From 29b9a74e915e6200ac2b4d98e446c1e73964cbd2 Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Sun, 3 May 2026 16:23:05 +0200 Subject: [PATCH 08/11] test-tool synthesize: precompute pack for 4 GiB + 1 The synthesize helper hashes roughly 8 GiB of data through SHA-1 to produce a 4 GiB + 1 pack (4 GiB for the pack checksum, 4 GiB for the blob OID). Since the blob content is all NUL bytes, every byte in the resulting pack file is deterministic for a given blob size and hash algorithm. Add a fast path that writes the pack from precomputed constants: a 25-byte prefix (pack header, object header, zlib header, first block header), the zero-filled bulk with periodic 5-byte deflate block headers, and a 513-byte suffix (tree, two commits, empty tree, pack SHA-1 checksum). This eliminates all SHA-1 and adler32 computation, making the helper purely I/O-bound. The precomputed constants are stored in a struct fast_pack array keyed by hash algorithm format_id, so that adding SHA-256 support later requires only adding another array entry with its suffix. The constants were generated by running the generic path and extracting the non-zero bytes from the resulting pack file. 
Benchmarks generating a 4 GiB + 1 pack (3 runs each, SHA1DC on x86_64): generic path: 88s / 81s / 140s fast path: 14s / 13s / 15s On CI, where t5608 currently takes 200-850 seconds depending on the job, the fast path cuts the pack-generation phase from minutes to seconds, leaving only the clone operations themselves. Assisted-by: Claude Opus 4.6 Signed-off-by: Johannes Schindelin --- t/helper/test-synthesize.c | 202 ++++++++++++++++++++++++++++++++++++- 1 file changed, 201 insertions(+), 1 deletion(-) diff --git a/t/helper/test-synthesize.c b/t/helper/test-synthesize.c index e2faaad7b436df..83c40ee02abf05 100644 --- a/t/helper/test-synthesize.c +++ b/t/helper/test-synthesize.c @@ -112,6 +112,201 @@ static void write_pack_object(FILE *f, struct git_hash_ctx *pack_ctx, algo->final_oid_fn(oid, &ctx); } +/* + * Fast path: precomputed pack data for a 4 GiB + 1 all-NUL blob. + * + * The generated pack is almost entirely zeros with a small constant + * prefix, periodic deflate block headers, and a constant suffix + * containing the tree, two commits, and the pack checksum. Because + * every byte is deterministic for a given blob size and hash algorithm, + * we can write the pack without computing any hashes at all, reducing + * runtime from minutes of hash computation to seconds of pure I/O. + * + * The blob is stored as an uncompressed deflate stream: a two-byte + * zlib header, then 65538 blocks of up to 0xffff bytes each, followed + * by an adler32 checksum. The pack header and deflate framing are + * shared across hash algorithms; only the suffix (which contains OIDs + * and the pack checksum) differs. + * + * Constants were generated by running the generic path and extracting + * the non-zero bytes from the resulting pack file. + */ + +#define FAST_PACK_4G1_BLOB_SIZE ((size_t)4 * 1024 * 1024 * 1024 + 1) +#define FAST_PACK_4G1_N_FULL_BLOCKS 65537 + +/* + * Per-hash-algorithm constants for the fast path. 
The prefix and + * deflate block structure are identical across algorithms; only the + * suffix (tree, commits, pack checksum) and the commit OID differ. + */ +struct fast_pack { + uint32_t format_id; + const unsigned char *suffix; + size_t suffix_len; + const char *commit_oid; +}; + +/* Pack header + pack object header + zlib header + first block header */ +static const unsigned char fast_pack_prefix[] = { + /* PACK header: signature, version 2, 5 objects */ + 0x50, 0x41, 0x43, 0x4b, 0x00, 0x00, 0x00, 0x02, + 0x00, 0x00, 0x00, 0x05, + /* pack object header: blob, size = 4294967297 */ + 0xb1, 0x80, 0x80, 0x80, 0x80, 0x01, + /* zlib header: CMF=0x78, FLG=0x01 */ + 0x78, 0x01, + /* first non-final block header: BFINAL=0, LEN=0xffff, NLEN=0x0000 */ + 0x00, 0xff, 0xff, 0x00, 0x00 +}; + +/* Every non-final deflate block header is identical */ +static const unsigned char fast_pack_block_header[] = { + 0x00, 0xff, 0xff, 0x00, 0x00 +}; + +/* Final block (2 data bytes) + adler32 of 4294967297 NUL bytes */ +static const unsigned char fast_pack_final_block[] = { + /* BFINAL=1, LEN=2, NLEN=0xfffd */ + 0x01, 0x02, 0x00, 0xfd, 0xff, + /* 2 NUL data bytes */ + 0x00, 0x00, + /* adler32 */ + 0x00, 0xe2, 0x00, 0x01 +}; + +/* + * SHA-1 suffix: tree, commit, empty tree, final commit, pack checksum. 
+ */ +static const unsigned char fast_pack_sha1_suffix[] = { + 0xa0, 0x02, 0x78, 0x01, 0x01, 0x20, 0x00, 0xdf, + 0xff, 0x31, 0x30, 0x30, 0x36, 0x34, 0x34, 0x20, + 0x66, 0x69, 0x6c, 0x65, 0x00, 0x3e, 0xb7, 0xfe, + 0xb1, 0x41, 0x3c, 0x75, 0x7f, 0x0d, 0x81, 0x81, + 0xde, 0xb2, 0x8d, 0x1d, 0xab, 0x03, 0xd6, 0x48, + 0x46, 0xb4, 0xb4, 0x0c, 0x60, 0x95, 0x0b, 0x78, + 0x01, 0x01, 0xb5, 0x00, 0x4a, 0xff, 0x74, 0x72, + 0x65, 0x65, 0x20, 0x63, 0x36, 0x38, 0x33, 0x66, + 0x63, 0x63, 0x37, 0x64, 0x31, 0x64, 0x38, 0x33, + 0x65, 0x66, 0x32, 0x66, 0x65, 0x31, 0x61, 0x66, + 0x35, 0x35, 0x32, 0x31, 0x35, 0x64, 0x30, 0x31, + 0x36, 0x38, 0x64, 0x62, 0x35, 0x32, 0x61, 0x33, + 0x61, 0x33, 0x62, 0x0a, 0x61, 0x75, 0x74, 0x68, + 0x6f, 0x72, 0x20, 0x41, 0x20, 0x55, 0x20, 0x54, + 0x68, 0x6f, 0x72, 0x20, 0x3c, 0x61, 0x75, 0x74, + 0x68, 0x6f, 0x72, 0x40, 0x65, 0x78, 0x61, 0x6d, + 0x70, 0x6c, 0x65, 0x2e, 0x63, 0x6f, 0x6d, 0x3e, + 0x20, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x38, 0x39, 0x30, 0x20, 0x2b, 0x30, 0x30, 0x30, + 0x30, 0x0a, 0x63, 0x6f, 0x6d, 0x6d, 0x69, 0x74, + 0x74, 0x65, 0x72, 0x20, 0x43, 0x20, 0x4f, 0x20, + 0x4d, 0x69, 0x74, 0x74, 0x65, 0x72, 0x20, 0x3c, + 0x63, 0x6f, 0x6d, 0x6d, 0x69, 0x74, 0x74, 0x65, + 0x72, 0x40, 0x65, 0x78, 0x61, 0x6d, 0x70, 0x6c, + 0x65, 0x2e, 0x63, 0x6f, 0x6d, 0x3e, 0x20, 0x31, + 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, + 0x30, 0x20, 0x2b, 0x30, 0x30, 0x30, 0x30, 0x0a, + 0x0a, 0x4c, 0x61, 0x72, 0x67, 0x65, 0x20, 0x62, + 0x6c, 0x6f, 0x62, 0x20, 0x63, 0x6f, 0x6d, 0x6d, + 0x69, 0x74, 0x0a, 0xc6, 0x55, 0x37, 0x6b, 0x20, + 0x78, 0x01, 0x01, 0x00, 0x00, 0xff, 0xff, 0x00, + 0x00, 0x00, 0x01, 0x95, 0x0e, 0x78, 0x01, 0x01, + 0xe5, 0x00, 0x1a, 0xff, 0x74, 0x72, 0x65, 0x65, + 0x20, 0x34, 0x62, 0x38, 0x32, 0x35, 0x64, 0x63, + 0x36, 0x34, 0x32, 0x63, 0x62, 0x36, 0x65, 0x62, + 0x39, 0x61, 0x30, 0x36, 0x30, 0x65, 0x35, 0x34, + 0x62, 0x66, 0x38, 0x64, 0x36, 0x39, 0x32, 0x38, + 0x38, 0x66, 0x62, 0x65, 0x65, 0x34, 0x39, 0x30, + 0x34, 0x0a, 0x70, 0x61, 0x72, 0x65, 
0x6e, 0x74, + 0x20, 0x63, 0x35, 0x62, 0x32, 0x31, 0x63, 0x36, + 0x31, 0x31, 0x61, 0x61, 0x35, 0x39, 0x34, 0x65, + 0x63, 0x39, 0x66, 0x64, 0x37, 0x65, 0x39, 0x32, + 0x63, 0x66, 0x39, 0x36, 0x34, 0x38, 0x39, 0x31, + 0x34, 0x63, 0x61, 0x34, 0x63, 0x32, 0x34, 0x31, + 0x32, 0x0a, 0x61, 0x75, 0x74, 0x68, 0x6f, 0x72, + 0x20, 0x41, 0x20, 0x55, 0x20, 0x54, 0x68, 0x6f, + 0x72, 0x20, 0x3c, 0x61, 0x75, 0x74, 0x68, 0x6f, + 0x72, 0x40, 0x65, 0x78, 0x61, 0x6d, 0x70, 0x6c, + 0x65, 0x2e, 0x63, 0x6f, 0x6d, 0x3e, 0x20, 0x31, + 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, + 0x30, 0x20, 0x2b, 0x30, 0x30, 0x30, 0x30, 0x0a, + 0x63, 0x6f, 0x6d, 0x6d, 0x69, 0x74, 0x74, 0x65, + 0x72, 0x20, 0x43, 0x20, 0x4f, 0x20, 0x4d, 0x69, + 0x74, 0x74, 0x65, 0x72, 0x20, 0x3c, 0x63, 0x6f, + 0x6d, 0x6d, 0x69, 0x74, 0x74, 0x65, 0x72, 0x40, + 0x65, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x2e, + 0x63, 0x6f, 0x6d, 0x3e, 0x20, 0x31, 0x32, 0x33, + 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x30, 0x20, + 0x2b, 0x30, 0x30, 0x30, 0x30, 0x0a, 0x0a, 0x45, + 0x6d, 0x70, 0x74, 0x79, 0x20, 0x74, 0x72, 0x65, + 0x65, 0x20, 0x63, 0x6f, 0x6d, 0x6d, 0x69, 0x74, + 0x0a, 0xaa, 0xb8, 0x45, 0x01, 0x8e, 0xfc, 0xf0, + 0x2f, 0x9c, 0xc5, 0xcc, 0x4f, 0x6a, 0x1a, 0xc9, + 0x2b, 0x23, 0xa9, 0xff, 0x91, 0x06, 0xc2, 0x70, + 0xe3 +}; + +static const struct fast_pack fast_packs[] = { + { + .format_id = GIT_SHA1_FORMAT_ID, + .suffix = fast_pack_sha1_suffix, + .suffix_len = sizeof(fast_pack_sha1_suffix), + .commit_oid = "aac43daf40d0377af31aa9c798a4ae8a31b55c1d", + }, +}; + +/* + * Try the fast path for known blob sizes. Returns 1 if the pack was + * written from precomputed constants, 0 if the caller should fall + * through to the generic path. 
+ */ +static int generate_fast_pack(const char *path, size_t blob_size, + const struct git_hash_algo *algo) +{ + const struct fast_pack *fp = NULL; + FILE *f; + size_t i; + + if (blob_size != FAST_PACK_4G1_BLOB_SIZE) + return 0; + + for (i = 0; i < ARRAY_SIZE(fast_packs); i++) { + if (fast_packs[i].format_id == algo->format_id) { + fp = &fast_packs[i]; + break; + } + } + if (!fp) + return 0; + + f = xfopen(path, "wb"); + + fwrite_or_die(f, fast_pack_prefix, sizeof(fast_pack_prefix)); + + /* First full block: 0xffff zero bytes (header already in prefix) */ + fwrite_or_die(f, zeros, BLOCK_SIZE); + + /* Remaining non-final full blocks */ + for (i = 1; i < FAST_PACK_4G1_N_FULL_BLOCKS; i++) { + fwrite_or_die(f, fast_pack_block_header, + sizeof(fast_pack_block_header)); + fwrite_or_die(f, zeros, BLOCK_SIZE); + } + + /* Final block (2 data bytes) + adler32 */ + fwrite_or_die(f, fast_pack_final_block, + sizeof(fast_pack_final_block)); + + /* Tree, commits, and pack checksum */ + fwrite_or_die(f, fp->suffix, fp->suffix_len); + + if (fclose(f)) + die_errno(_("could not close '%s'"), path); + + printf("%s\n", fp->commit_oid); + return 1; +} + /* * Generate a pack file with a single large (>4GB) reachable object. 
* @@ -127,7 +322,7 @@ static void write_pack_object(FILE *f, struct git_hash_ctx *pack_ctx, static int generate_pack_with_large_object(const char *path, size_t blob_size, const struct git_hash_algo *algo) { - FILE *f = xfopen(path, "wb"); + FILE *f; struct git_hash_ctx pack_ctx; unsigned char pack_hash[GIT_MAX_RAWSZ]; struct object_id blob_oid, tree_oid, commit_oid, empty_tree_oid, final_commit_oid; @@ -139,6 +334,11 @@ static int generate_pack_with_large_object(const char *path, size_t blob_size, .hdr_entries = htonl(object_count), }; + if (generate_fast_pack(path, blob_size, algo)) + return 0; + + f = xfopen(path, "wb"); + algo->init_fn(&pack_ctx); /* Write pack header */ From 8e6e7208040917a254379fd6c63d432f5e2f6f59 Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Sun, 3 May 2026 16:29:18 +0200 Subject: [PATCH 09/11] test-tool synthesize: add precomputed SHA-256 pack for 4 GiB + 1 Add a SHA-256 entry to the fast_packs[] table. The pack prefix and deflate block structure are identical to SHA-1 (the pack format does not encode the hash algorithm in its header). Only the suffix differs: SHA-256 OIDs are 32 bytes instead of 20, giving a 609-byte suffix compared to 513 for SHA-1, and a different pack checksum. The constants were generated by running the generic path inside a repository initialized with --object-format=sha256. Assisted-by: Claude Opus 4.6 Signed-off-by: Johannes Schindelin --- t/helper/test-synthesize.c | 91 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) diff --git a/t/helper/test-synthesize.c b/t/helper/test-synthesize.c index 83c40ee02abf05..1f28ecf0f2c9ee 100644 --- a/t/helper/test-synthesize.c +++ b/t/helper/test-synthesize.c @@ -246,6 +246,90 @@ static const unsigned char fast_pack_sha1_suffix[] = { 0xe3 }; +/* + * SHA-256 suffix: same structure, but with 32-byte OIDs and SHA-256 + * pack checksum (609 bytes vs 513 for SHA-1). 
+ */ +static const unsigned char fast_pack_sha256_suffix[] = { + 0xac, 0x02, 0x78, 0x01, 0x01, 0x2c, 0x00, 0xd3, + 0xff, 0x31, 0x30, 0x30, 0x36, 0x34, 0x34, 0x20, + 0x66, 0x69, 0x6c, 0x65, 0x00, 0x42, 0x53, 0xc1, + 0x8a, 0x9f, 0x5e, 0xc3, 0xbb, 0x47, 0xb0, 0x83, + 0x8a, 0x19, 0xdb, 0x31, 0xbb, 0x7b, 0x0f, 0x3b, + 0x80, 0xa4, 0xbc, 0x2f, 0xaf, 0x72, 0x6b, 0xdb, + 0x62, 0xaa, 0xba, 0xdd, 0xde, 0x77, 0xc6, 0x13, + 0xeb, 0x9d, 0x0c, 0x78, 0x01, 0x01, 0xcd, 0x00, + 0x32, 0xff, 0x74, 0x72, 0x65, 0x65, 0x20, 0x62, + 0x36, 0x30, 0x39, 0x37, 0x37, 0x64, 0x37, 0x63, + 0x34, 0x63, 0x32, 0x64, 0x31, 0x65, 0x63, 0x63, + 0x33, 0x66, 0x62, 0x61, 0x31, 0x64, 0x39, 0x38, + 0x65, 0x65, 0x31, 0x32, 0x30, 0x61, 0x64, 0x63, + 0x32, 0x34, 0x38, 0x33, 0x34, 0x39, 0x35, 0x30, + 0x62, 0x65, 0x34, 0x31, 0x32, 0x64, 0x39, 0x34, + 0x63, 0x38, 0x30, 0x39, 0x34, 0x38, 0x30, 0x66, + 0x35, 0x38, 0x62, 0x61, 0x39, 0x64, 0x61, 0x0a, + 0x61, 0x75, 0x74, 0x68, 0x6f, 0x72, 0x20, 0x41, + 0x20, 0x55, 0x20, 0x54, 0x68, 0x6f, 0x72, 0x20, + 0x3c, 0x61, 0x75, 0x74, 0x68, 0x6f, 0x72, 0x40, + 0x65, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x2e, + 0x63, 0x6f, 0x6d, 0x3e, 0x20, 0x31, 0x32, 0x33, + 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x30, 0x20, + 0x2b, 0x30, 0x30, 0x30, 0x30, 0x0a, 0x63, 0x6f, + 0x6d, 0x6d, 0x69, 0x74, 0x74, 0x65, 0x72, 0x20, + 0x43, 0x20, 0x4f, 0x20, 0x4d, 0x69, 0x74, 0x74, + 0x65, 0x72, 0x20, 0x3c, 0x63, 0x6f, 0x6d, 0x6d, + 0x69, 0x74, 0x74, 0x65, 0x72, 0x40, 0x65, 0x78, + 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x2e, 0x63, 0x6f, + 0x6d, 0x3e, 0x20, 0x31, 0x32, 0x33, 0x34, 0x35, + 0x36, 0x37, 0x38, 0x39, 0x30, 0x20, 0x2b, 0x30, + 0x30, 0x30, 0x30, 0x0a, 0x0a, 0x4c, 0x61, 0x72, + 0x67, 0x65, 0x20, 0x62, 0x6c, 0x6f, 0x62, 0x20, + 0x63, 0x6f, 0x6d, 0x6d, 0x69, 0x74, 0x0a, 0xb7, + 0x80, 0x3d, 0xd7, 0x20, 0x78, 0x01, 0x01, 0x00, + 0x00, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0x95, + 0x11, 0x78, 0x01, 0x01, 0x15, 0x01, 0xea, 0xfe, + 0x74, 0x72, 0x65, 0x65, 0x20, 0x36, 0x65, 0x66, + 0x31, 0x39, 0x62, 0x34, 0x31, 
0x32, 0x32, 0x35, + 0x63, 0x35, 0x33, 0x36, 0x39, 0x66, 0x31, 0x63, + 0x31, 0x30, 0x34, 0x64, 0x34, 0x35, 0x64, 0x38, + 0x64, 0x38, 0x35, 0x65, 0x66, 0x61, 0x39, 0x62, + 0x30, 0x35, 0x37, 0x62, 0x35, 0x33, 0x62, 0x31, + 0x34, 0x62, 0x34, 0x62, 0x39, 0x62, 0x39, 0x33, + 0x39, 0x64, 0x64, 0x37, 0x34, 0x64, 0x65, 0x63, + 0x63, 0x35, 0x33, 0x32, 0x31, 0x0a, 0x70, 0x61, + 0x72, 0x65, 0x6e, 0x74, 0x20, 0x37, 0x35, 0x62, + 0x66, 0x30, 0x63, 0x34, 0x37, 0x61, 0x65, 0x34, + 0x62, 0x62, 0x33, 0x30, 0x38, 0x65, 0x37, 0x63, + 0x63, 0x32, 0x34, 0x38, 0x32, 0x65, 0x32, 0x32, + 0x65, 0x66, 0x61, 0x65, 0x33, 0x37, 0x38, 0x37, + 0x61, 0x39, 0x36, 0x38, 0x34, 0x38, 0x62, 0x64, + 0x31, 0x37, 0x34, 0x39, 0x35, 0x36, 0x37, 0x31, + 0x34, 0x37, 0x31, 0x35, 0x32, 0x34, 0x36, 0x64, + 0x64, 0x62, 0x64, 0x35, 0x34, 0x0a, 0x61, 0x75, + 0x74, 0x68, 0x6f, 0x72, 0x20, 0x41, 0x20, 0x55, + 0x20, 0x54, 0x68, 0x6f, 0x72, 0x20, 0x3c, 0x61, + 0x75, 0x74, 0x68, 0x6f, 0x72, 0x40, 0x65, 0x78, + 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x2e, 0x63, 0x6f, + 0x6d, 0x3e, 0x20, 0x31, 0x32, 0x33, 0x34, 0x35, + 0x36, 0x37, 0x38, 0x39, 0x30, 0x20, 0x2b, 0x30, + 0x30, 0x30, 0x30, 0x0a, 0x63, 0x6f, 0x6d, 0x6d, + 0x69, 0x74, 0x74, 0x65, 0x72, 0x20, 0x43, 0x20, + 0x4f, 0x20, 0x4d, 0x69, 0x74, 0x74, 0x65, 0x72, + 0x20, 0x3c, 0x63, 0x6f, 0x6d, 0x6d, 0x69, 0x74, + 0x74, 0x65, 0x72, 0x40, 0x65, 0x78, 0x61, 0x6d, + 0x70, 0x6c, 0x65, 0x2e, 0x63, 0x6f, 0x6d, 0x3e, + 0x20, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x38, 0x39, 0x30, 0x20, 0x2b, 0x30, 0x30, 0x30, + 0x30, 0x0a, 0x0a, 0x45, 0x6d, 0x70, 0x74, 0x79, + 0x20, 0x74, 0x72, 0x65, 0x65, 0x20, 0x63, 0x6f, + 0x6d, 0x6d, 0x69, 0x74, 0x0a, 0x6d, 0x6d, 0x51, + 0x9a, 0xc9, 0x11, 0x76, 0x61, 0xa3, 0x89, 0x49, + 0xb7, 0xa1, 0x58, 0xc6, 0x1d, 0x8c, 0x33, 0x75, + 0x8d, 0x7e, 0x4d, 0x8e, 0x58, 0x91, 0xf8, 0x5c, + 0x57, 0xd9, 0x89, 0x9e, 0xb8, 0xd2, 0x9a, 0xd8, + 0xc9 +}; + static const struct fast_pack fast_packs[] = { { .format_id = GIT_SHA1_FORMAT_ID, @@ -253,6 +337,13 @@ static const 
struct fast_pack fast_packs[] = { .suffix_len = sizeof(fast_pack_sha1_suffix), .commit_oid = "aac43daf40d0377af31aa9c798a4ae8a31b55c1d", }, + { + .format_id = GIT_SHA256_FORMAT_ID, + .suffix = fast_pack_sha256_suffix, + .suffix_len = sizeof(fast_pack_sha256_suffix), + .commit_oid = "63c46ca51267b1d45be69a044bb84b4bf0559f09" + "d727f861d2ae94ddebdddbc9", + }, }; /* From 5b44410b2f9b3ebf01d582445542b6aca9984c2e Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Mon, 4 May 2026 15:25:11 +0200 Subject: [PATCH 10/11] t5608: mark >4GB tests as EXPENSIVE Even with precomputed pack constants that reduced the helper's runtime from minutes to seconds, the >4GB clone tests still take 200-850 seconds across CI jobs. The bottleneck is no longer the pack generation but the clone operations themselves: transporting, unpacking, and indexing 4 GiB of data through unpack-objects and index-pack is inherently expensive. As Jeff King pointed out [1], t5608 alone takes 160 seconds on his laptop while the rest of the entire test suite finishes in under 90 seconds, and the test's disk footprint (4+ GiB source repo, then two clones) is problematic for developers who use RAM disks for their trash directories. Gate the >4GB tests on the EXPENSIVE prereq (which requires GIT_TEST_LONG to be set) in addition to SIZE_T_IS_64BIT, keeping them out of normal local test runs. 
[1] https://lore.kernel.org/git/20260501063805.GA2038915@coredump.intra.peff.net/ Assisted-by: Claude Opus 4.6 Signed-off-by: Johannes Schindelin --- t/t5608-clone-2gb.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/t/t5608-clone-2gb.sh b/t/t5608-clone-2gb.sh index af93302ddec1cf..4f8a95ddda411c 100755 --- a/t/t5608-clone-2gb.sh +++ b/t/t5608-clone-2gb.sh @@ -49,7 +49,7 @@ test_expect_success 'clone - with worktree, file:// protocol' ' ' -test_expect_success SIZE_T_IS_64BIT 'set up repo with >4GB object' ' +test_expect_success SIZE_T_IS_64BIT,EXPENSIVE 'set up repo with >4GB object' ' large_blob_size=$((4*1024*1024*1024+1)) && git init --bare 4gb-repo && head_oid=$(test-tool synthesize pack \ @@ -60,7 +60,7 @@ test_expect_success SIZE_T_IS_64BIT 'set up repo with >4GB object' ' git -C 4gb-repo symbolic-ref HEAD refs/heads/main ' -test_expect_success SIZE_T_IS_64BIT 'clone >4GB object via unpack-objects' ' +test_expect_success SIZE_T_IS_64BIT,EXPENSIVE 'clone >4GB object via unpack-objects' ' # The synthesized pack has five objects, so a large unpack limit keeps # fetch-pack on the unpack-objects path. git -c fetch.unpackLimit=100 clone --bare \ @@ -76,7 +76,7 @@ test_expect_success SIZE_T_IS_64BIT 'clone >4GB object via unpack-objects' ' test "$source_blob" = "$clone_blob" ' -test_expect_success SIZE_T_IS_64BIT 'clone with >4GB object via index-pack' ' +test_expect_success SIZE_T_IS_64BIT,EXPENSIVE 'clone with >4GB object via index-pack' ' # Force fetch-pack to hand the pack to index-pack instead. git -c fetch.unpackLimit=1 clone --bare \ "file://$(pwd)/4gb-repo" 4gb-clone-index && From 1eaaa7fad7a1432dd97ffdd7c45e8162f61bc302 Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Mon, 4 May 2026 18:06:32 +0200 Subject: [PATCH 11/11] ci: run expensive tests on push builds to integration branches Derrick Stolee suggested [1] that expensive tests should be run at a regular cadence rather than on every PR iteration. 
Gate GIT_TEST_LONG on push builds to the integration branches (next, master, main, maint) so that the EXPENSIVE prereq is satisfied there but not during PR validation, where the extra minutes of wall-clock time do not justify themselves. [1] https://lore.kernel.org/git/e1e8837f-7374-4079-ba87-ab95dd156e33@gmail.com/ Helped-by: Derrick Stolee Assisted-by: Claude Opus 4.6 Signed-off-by: Johannes Schindelin --- ci/lib.sh | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/ci/lib.sh b/ci/lib.sh index 42a2b6a318b874..a671994bdf511f 100755 --- a/ci/lib.sh +++ b/ci/lib.sh @@ -314,6 +314,15 @@ export DEFAULT_TEST_TARGET=prove export GIT_TEST_CLONE_2GB=true export SKIP_DASHED_BUILT_INS=YesPlease +# Enable expensive tests on push builds to integration branches, but +# not on PR builds where the extra time is not justified for every +# iteration. +case "$GITHUB_EVENT_NAME,$CI_BRANCH" in +push,*next*|push,*master*|push,*main*|push,*maint*) + export GIT_TEST_LONG=YesPlease + ;; +esac + case "$distro" in ubuntu-*) # Python 2 is end of life, and Ubuntu 23.04 and newer don't actually