From d52a1bb4c5b7c2d0ca143ca3762675089b7216ad Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Tue, 3 Mar 2026 16:00:16 +0100 Subject: [PATCH 01/10] upload-pack: fix debug statement when flushing packfile data When git-upload-pack(1) writes packfile data to the client we have some logic in place that buffers some partial lines. When that buffer still contains data after git-pack-objects(1) has finished we flush the buffer so that all remaining bytes are sent out. Curiously, when we do so we also print the string "flushed." to stderr. This statement has been introduced in b1c71b7281 (upload-pack: avoid sending an incomplete pack upon failure, 2006-06-20), so quite a while ago. What's interesting though is that stderr is typically spliced through to the client-side, and consequently the client would see this message. Munging the way how we do the caching indeed confirms this: $ git clone file:///home/pks/Development/linux/ Cloning into bare repository 'linux.git'... remote: Enumerating objects: 12980346, done. remote: Counting objects: 100% (131820/131820), done. remote: Compressing objects: 100% (50290/50290), done. remote: Total 12980346 (delta 96319), reused 104500 (delta 81217), pack-reused 12848526 (from 1) Receiving objects: 100% (12980346/12980346), 3.23 GiB | 57.44 MiB/s, done. flushed. Resolving deltas: 100% (10676718/10676718), done. It's quite clear that this string shouldn't ever be visible to the client, so it rather feels like this is a left-over debug statement. The mentioned commit doesn't mention this line, either. Remove the debug output to prepare for a change in how we do the buffering in the next commit. 
Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- upload-pack.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/upload-pack.c b/upload-pack.c index 2d2b70cbf2..c2643c0295 100644 --- a/upload-pack.c +++ b/upload-pack.c @@ -457,11 +457,9 @@ static void create_pack_file(struct upload_pack_data *pack_data, } /* flush the data */ - if (output_state->used > 0) { + if (output_state->used > 0) send_client_data(1, output_state->buffer, output_state->used, pack_data->use_sideband); - fprintf(stderr, "flushed.\n"); - } free(output_state); if (pack_data->use_sideband) packet_flush(1); From 2a00891ccc04142242a070ec2dc94ee75362f4c2 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Tue, 3 Mar 2026 16:00:17 +0100 Subject: [PATCH 02/10] upload-pack: adapt keepalives based on buffering The function `create_pack_file()` is responsible for sending the packfile data to the client of git-upload-pack(1). As generating the bytes may take significant computing resources we also have a mechanism in place that optionally sends keepalive pktlines in case we haven't sent out any data. The keepalive logic is purely based on poll(3p): we pass a timeout to that syscall, and if the call times out we send out the keepalive pktline. While reasonable, this logic isn't entirely sufficient: even if the call to poll(3p) ends because we have received data on any of the file descriptors we may not necessarily send data to the client. The most important edge case here happens in `relay_pack_data()`. When we haven't seen the initial "PACK" signature from git-pack-objects(1) yet we buffer incoming data. So in the worst case, if each of the bytes of that signature arrive shortly before the configured keepalive timeout, then we may not send out any data for a time period that is (almost) four times as long as the configured timeout. This edge case is rather unlikely to matter in practice. 
But in a subsequent commit we're going to adapt our buffering mechanism to become more aggressive, which makes it more likely that we don't send any data for an extended amount of time. Adapt the logic so that instead of using a fixed timeout on every call to poll(3p), we instead figure out how much time has passed since the last-sent data. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- upload-pack.c | 49 ++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 40 insertions(+), 9 deletions(-) diff --git a/upload-pack.c b/upload-pack.c index c2643c0295..04521e57c9 100644 --- a/upload-pack.c +++ b/upload-pack.c @@ -29,6 +29,7 @@ #include "commit-graph.h" #include "commit-reach.h" #include "shallow.h" +#include "trace.h" #include "write-or-die.h" #include "json-writer.h" #include "strmap.h" @@ -218,7 +219,8 @@ struct output_state { }; static int relay_pack_data(int pack_objects_out, struct output_state *os, - int use_sideband, int write_packfile_line) + int use_sideband, int write_packfile_line, + bool *did_send_data) { /* * We keep the last byte to ourselves @@ -232,6 +234,8 @@ static int relay_pack_data(int pack_objects_out, struct output_state *os, */ ssize_t readsz; + *did_send_data = false; + readsz = xread(pack_objects_out, os->buffer + os->used, sizeof(os->buffer) - os->used); if (readsz < 0) { @@ -247,6 +251,7 @@ static int relay_pack_data(int pack_objects_out, struct output_state *os, if (os->packfile_uris_started) packet_delim(1); packet_write_fmt(1, "\1packfile\n"); + *did_send_data = true; } break; } @@ -259,6 +264,7 @@ static int relay_pack_data(int pack_objects_out, struct output_state *os, } *p = '\0'; packet_write_fmt(1, "\1%s\n", os->buffer); + *did_send_data = true; os->used -= p - os->buffer + 1; memmove(os->buffer, p + 1, os->used); @@ -279,6 +285,7 @@ static int relay_pack_data(int pack_objects_out, struct output_state *os, os->used = 0; } + *did_send_data = true; return readsz; } @@ -290,6 +297,7 @@ static void 
create_pack_file(struct upload_pack_data *pack_data, char progress[128]; char abort_msg[] = "aborting due to possible repository " "corruption on the remote side."; + uint64_t last_sent_ms = 0; ssize_t sz; int i; FILE *pipe_fd; @@ -365,10 +373,14 @@ static void create_pack_file(struct upload_pack_data *pack_data, */ while (1) { + uint64_t now_ms = getnanotime() / 1000000; struct pollfd pfd[2]; - int pe, pu, pollsize, polltimeout; + int pe, pu, pollsize, polltimeout_ms; int ret; + if (!last_sent_ms) + last_sent_ms = now_ms; + reset_timeout(pack_data->timeout); pollsize = 0; @@ -390,11 +402,21 @@ static void create_pack_file(struct upload_pack_data *pack_data, if (!pollsize) break; - polltimeout = pack_data->keepalive < 0 - ? -1 - : 1000 * pack_data->keepalive; + if (pack_data->keepalive < 0) { + polltimeout_ms = -1; + } else { + /* + * The polling timeout needs to be adjusted based on + * the time we have sent our last package. The longer + * it's been in the past, the shorter the timeout + * becomes until we eventually don't block at all. + */ + polltimeout_ms = 1000 * pack_data->keepalive - (now_ms - last_sent_ms); + if (polltimeout_ms < 0) + polltimeout_ms = 0; + } - ret = poll(pfd, pollsize, polltimeout); + ret = poll(pfd, pollsize, polltimeout_ms); if (ret < 0) { if (errno != EINTR) { @@ -403,16 +425,18 @@ static void create_pack_file(struct upload_pack_data *pack_data, } continue; } + if (0 <= pe && (pfd[pe].revents & (POLLIN|POLLHUP))) { /* Status ready; we ship that in the side-band * or dump to the standard error. 
*/ sz = xread(pack_objects.err, progress, sizeof(progress)); - if (0 < sz) + if (0 < sz) { send_client_data(2, progress, sz, pack_data->use_sideband); - else if (sz == 0) { + last_sent_ms = now_ms; + } else if (sz == 0) { close(pack_objects.err); pack_objects.err = -1; } @@ -421,11 +445,14 @@ static void create_pack_file(struct upload_pack_data *pack_data, /* give priority to status messages */ continue; } + if (0 <= pu && (pfd[pu].revents & (POLLIN|POLLHUP))) { + bool did_send_data; int result = relay_pack_data(pack_objects.out, output_state, pack_data->use_sideband, - !!uri_protocols); + !!uri_protocols, + &did_send_data); if (result == 0) { close(pack_objects.out); @@ -433,6 +460,9 @@ static void create_pack_file(struct upload_pack_data *pack_data, } else if (result < 0) { goto fail; } + + if (did_send_data) + last_sent_ms = now_ms; } /* @@ -448,6 +478,7 @@ static void create_pack_file(struct upload_pack_data *pack_data, if (!ret && pack_data->use_sideband) { static const char buf[] = "0005\1"; write_or_die(1, buf, 5); + last_sent_ms = now_ms; } } From d0477dd05a2205b59f59ae0650633555cc795346 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Tue, 3 Mar 2026 16:00:18 +0100 Subject: [PATCH 03/10] upload-pack: reduce lock contention when writing packfile data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In our production systems we have recently observed write contention in git-upload-pack(1). The system in question was consistently streaming packfiles at a rate of dozens of gigabits per second, but curiously the system was neither bottlenecked on CPU, memory or IOPS. We eventually discovered that Git was spending 80% of its time in `pipe_write()`, out of which almost all of the time was spent in the `ep_poll_callback` function in the kernel. 
Quoting the reporter: This infrastructure is part of an event notification queue designed to allow for multiple producers to emit events, but that concurrency safety is guarded by 3 layers of locking. The layer we're hitting contention in uses a simple reader/writer lock mode (a.k.a. shared versus exclusive mode), where producers need shared-mode (read mode), and various other actions use exclusive (write) mode. The system in question generates workloads where we have hundreds of git-upload-pack(1) processes active at the same point in time. These processes end up contending around those locks, and the consequence is that the Git processes stall. Now git-upload-pack(1) already has the infrastructure in place to buffer some of the data it reads from git-pack-objects(1) before actually sending it out. We only use this infrastructure in very limited ways though, so we generally end up matching one read(3p) call with one write(3p) call. Even worse, when the sideband is enabled we end up matching one read with _two_ writes: one for the pkt-line length, and one for the packfile data. Extend our use of the buffering infrastructure so that we soak up bytes until the buffer is filled up at least 2/3rds of its capacity. The change is relatively simple to implement as we already know to flush the buffer in `create_pack_file()` after git-pack-objects(1) has finished. This significantly reduces the number of write(3p) syscalls we need to do. Before this change, cloning the Linux repository resulted in around 400,000 write(3p) syscalls. With the buffering in place we only do around 130,000 syscalls. Now we could of course go even further and make sure that we always fill up the whole buffer. But this might cause an increase in read(3p) syscalls, and some tests show that this only reduces the number of write(3p) syscalls from 130,000 to 100,000. So overall this doesn't seem worth it. 
Note that the issue could also be fixed by adapting the write buffer that we use in the downstream git-pack-objects(1) command, and such a change would have roughly the same result. But the command that generates the packfile data may not always be git-pack-objects(1) as it can be changed via "uploadpack.packObjectsHook", so such a fix would only help in _some_ cases. Regardless of that, we'll also adapt the write buffer size of git-pack-objects(1) in a subsequent commit. Helped-by: Matt Smiley Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- upload-pack.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/upload-pack.c b/upload-pack.c index 04521e57c9..1b1c81ea63 100644 --- a/upload-pack.c +++ b/upload-pack.c @@ -276,6 +276,13 @@ static int relay_pack_data(int pack_objects_out, struct output_state *os, } } + /* + * Make sure that we buffer some data before sending it to the client. + * This significantly reduces the number of write(3p) syscalls. + */ + if (readsz && os->used < (sizeof(os->buffer) * 2 / 3)) + return readsz; + if (os->used > 1) { send_client_data(1, os->buffer, os->used - 1, use_sideband); os->buffer[0] = os->buffer[os->used - 1]; From a677fed600872422508f63e84a3e258cac360f91 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Tue, 3 Mar 2026 16:00:19 +0100 Subject: [PATCH 04/10] git-compat-util: introduce `cast_size_t_to_ssize_t()` Introduce a new helper function `cast_size_t_to_ssize_t()`. This function will be used in the next commit. 
Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- git-compat-util.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/git-compat-util.h b/git-compat-util.h index bebcf9f698..c6af04cd7a 100644 --- a/git-compat-util.h +++ b/git-compat-util.h @@ -665,6 +665,14 @@ static inline int cast_size_t_to_int(size_t a) return (int)a; } +static inline ssize_t cast_size_t_to_ssize_t(size_t a) +{ + if (a > maximum_signed_value_of_type(ssize_t)) + die("number too large to represent as ssize_t on this platform: %"PRIuMAX, + (uintmax_t)a); + return (ssize_t)a; +} + static inline uint64_t u64_mult(uint64_t a, uint64_t b) { if (unsigned_mult_overflows(a, b)) From 50869da7e666747c97db8422b26c5bc968c70329 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Tue, 3 Mar 2026 16:00:20 +0100 Subject: [PATCH 05/10] compat/posix: introduce writev(3p) wrapper In a subsequent commit we're going to add the first caller to writev(3p). Introduce a compatibility wrapper for this syscall that we can use on systems that don't have this syscall. The syscall exists on modern Unixes like Linux and macOS, and seemingly even for NonStop according to [1]. It doesn't seem to exist on Windows though. 
[1]: http://nonstoptools.com/manuals/OSS-SystemCalls.pdf [2]: https://www.gnu.org/software/gnulib/manual/html_node/writev.html Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- Makefile | 4 ++++ compat/posix.h | 14 ++++++++++++++ compat/writev.c | 29 +++++++++++++++++++++++++++++ config.mak.uname | 2 ++ meson.build | 1 + 5 files changed, 50 insertions(+) create mode 100644 compat/writev.c diff --git a/Makefile b/Makefile index 8aa489f3b6..61c7dff942 100644 --- a/Makefile +++ b/Makefile @@ -2015,6 +2015,10 @@ ifdef NO_PREAD COMPAT_CFLAGS += -DNO_PREAD COMPAT_OBJS += compat/pread.o endif +ifdef NO_WRITEV + COMPAT_CFLAGS += -DNO_WRITEV + COMPAT_OBJS += compat/writev.o +endif ifdef NO_FAST_WORKING_DIRECTORY BASIC_CFLAGS += -DNO_FAST_WORKING_DIRECTORY endif diff --git a/compat/posix.h b/compat/posix.h index 245386fa4a..3c611d2736 100644 --- a/compat/posix.h +++ b/compat/posix.h @@ -137,6 +137,9 @@ #include #include #include +#ifndef NO_WRITEV +#include +#endif #include #ifndef NO_SYS_SELECT_H #include @@ -323,6 +326,17 @@ int git_lstat(const char *, struct stat *); ssize_t git_pread(int fd, void *buf, size_t count, off_t offset); #endif +#ifdef NO_WRITEV +#define writev git_writev +#define iovec git_iovec +struct git_iovec { + void *iov_base; + size_t iov_len; +}; + +ssize_t git_writev(int fd, const struct iovec *iov, int iovcnt); +#endif + #ifdef NO_SETENV #define setenv gitsetenv int gitsetenv(const char *, const char *, int); diff --git a/compat/writev.c b/compat/writev.c new file mode 100644 index 0000000000..b77e534d5d --- /dev/null +++ b/compat/writev.c @@ -0,0 +1,29 @@ +#include "../git-compat-util.h" +#include "../wrapper.h" + +ssize_t git_writev(int fd, const struct iovec *iov, int iovcnt) +{ + size_t total_written = 0; + + for (int i = 0; i < iovcnt; i++) { + const char *bytes = iov[i].iov_base; + size_t iovec_written = 0; + + while (iovec_written < iov[i].iov_len) { + ssize_t bytes_written = xwrite(fd, bytes + iovec_written, + iov[i].iov_len 
- iovec_written); + if (bytes_written < 0) { + if (total_written) + goto out; + return bytes_written; + } + if (!bytes_written) + goto out; + iovec_written += bytes_written; + total_written += bytes_written; + } + } + +out: + return cast_size_t_to_ssize_t(total_written); +} diff --git a/config.mak.uname b/config.mak.uname index 3c35ae33a3..8e66a1cc0a 100644 --- a/config.mak.uname +++ b/config.mak.uname @@ -457,6 +457,7 @@ ifeq ($(uname_S),Windows) SANE_TOOL_PATH ?= $(msvc_bin_dir_msys) HAVE_ALLOCA_H = YesPlease NO_PREAD = YesPlease + NO_WRITEV = YesPlease NEEDS_CRYPTO_WITH_SSL = YesPlease NO_LIBGEN_H = YesPlease NO_POLL = YesPlease @@ -672,6 +673,7 @@ ifeq ($(uname_S),MINGW) pathsep = ; HAVE_ALLOCA_H = YesPlease NO_PREAD = YesPlease + NO_WRITEV = YesPlease NEEDS_CRYPTO_WITH_SSL = YesPlease NO_LIBGEN_H = YesPlease NO_POLL = YesPlease diff --git a/meson.build b/meson.build index dd52efd1c8..f1cd89efc7 100644 --- a/meson.build +++ b/meson.build @@ -1405,6 +1405,7 @@ checkfuncs = { 'initgroups' : [], 'strtoumax' : ['strtoumax.c', 'strtoimax.c'], 'pread' : ['pread.c'], + 'writev' : ['writev.c'], } if host_machine.system() == 'windows' From 0b2112ccfc786b17aa8bda478cbbefdd9fe77141 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Tue, 3 Mar 2026 16:00:21 +0100 Subject: [PATCH 06/10] wrapper: introduce writev(3p) wrappers In the preceding commit we have added a compatibility wrapper for the writev(3p) syscall. Introduce some generic wrappers for this function that we nowadays take for granted in the Git codebase. 
Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- wrapper.c | 41 +++++++++++++++++++++++++++++++++++++++++ wrapper.h | 9 +++++++++ write-or-die.c | 8 ++++++++ write-or-die.h | 1 + 4 files changed, 59 insertions(+) diff --git a/wrapper.c b/wrapper.c index b794fb20e7..fa40f399b3 100644 --- a/wrapper.c +++ b/wrapper.c @@ -323,6 +323,47 @@ ssize_t write_in_full(int fd, const void *buf, size_t count) return total; } +ssize_t writev_in_full(int fd, struct iovec *iov, int iovcnt) +{ + ssize_t total_written = 0; + + while (iovcnt) { + ssize_t bytes_written = writev(fd, iov, iovcnt); + if (bytes_written < 0) { + if (errno == EINTR || errno == EAGAIN) + continue; + return -1; + } + if (!bytes_written) { + errno = ENOSPC; + return -1; + } + + total_written += bytes_written; + + /* + * We first need to discard any iovec entities that have been + * fully written. + */ + while (iovcnt && (size_t)bytes_written >= iov->iov_len) { + bytes_written -= iov->iov_len; + iov++; + iovcnt--; + } + + /* + * Finally, we need to adjust the last iovec in case we have + * performed a partial write. + */ + if (iovcnt && bytes_written) { + iov->iov_base = (char *) iov->iov_base + bytes_written; + iov->iov_len -= bytes_written; + } + } + + return total_written; +} + ssize_t pread_in_full(int fd, void *buf, size_t count, off_t offset) { char *p = buf; diff --git a/wrapper.h b/wrapper.h index 15ac3bab6e..27519b32d1 100644 --- a/wrapper.h +++ b/wrapper.h @@ -47,6 +47,15 @@ ssize_t read_in_full(int fd, void *buf, size_t count); ssize_t write_in_full(int fd, const void *buf, size_t count); ssize_t pread_in_full(int fd, void *buf, size_t count, off_t offset); +/* + * Try to write all iovecs. Returns -1 in case an error occurred with a proper + * errno set, the number of bytes written otherwise. + * + * Note that the iovec will be modified as a result of this call to adjust for + * partial writes! 
+ */ +ssize_t writev_in_full(int fd, struct iovec *iov, int iovcnt); + static inline ssize_t write_str_in_full(int fd, const char *str) { return write_in_full(fd, str, strlen(str)); diff --git a/write-or-die.c b/write-or-die.c index 01a9a51fa2..5f522fb728 100644 --- a/write-or-die.c +++ b/write-or-die.c @@ -96,6 +96,14 @@ void write_or_die(int fd, const void *buf, size_t count) } } +void writev_or_die(int fd, struct iovec *iov, int iovlen) +{ + if (writev_in_full(fd, iov, iovlen) < 0) { + check_pipe(errno); + die_errno("writev error"); + } +} + void fwrite_or_die(FILE *f, const void *buf, size_t count) { if (fwrite(buf, 1, count, f) != count) diff --git a/write-or-die.h b/write-or-die.h index 65a5c42a47..ae3d7d88b8 100644 --- a/write-or-die.h +++ b/write-or-die.h @@ -7,6 +7,7 @@ void fprintf_or_die(FILE *, const char *fmt, ...); void fwrite_or_die(FILE *f, const void *buf, size_t count); void fflush_or_die(FILE *f); void write_or_die(int fd, const void *buf, size_t count); +void writev_or_die(int fd, struct iovec *iov, int iovlen); /* * These values are used to help identify parts of a repository to fsync. From ba9ec7824892a1f9dfea42c2922ad318aeab96b3 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Tue, 3 Mar 2026 16:00:22 +0100 Subject: [PATCH 07/10] sideband: use writev(3p) to send pktlines Every pktline that we send out via `send_sideband()` currently requires two syscalls: one to write the pktline's length, and one to send its data. This typically isn't all that much of a problem, but under extreme load the syscalls may cause contention in the kernel. Refactor the code to instead use the newly introduced writev(3p) infra so that we can send out the data with a single syscall. This reduces the number of syscalls from around 133,000 calls to write(3p) to around 67,000 calls to writev(3p). 
Suggested-by: Jeff King Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- sideband.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/sideband.c b/sideband.c index ea7c25211e..1ed6614eaf 100644 --- a/sideband.c +++ b/sideband.c @@ -264,6 +264,7 @@ void send_sideband(int fd, int band, const char *data, ssize_t sz, int packet_ma const char *p = data; while (sz) { + struct iovec iov[2]; unsigned n; char hdr[5]; @@ -273,12 +274,19 @@ void send_sideband(int fd, int band, const char *data, ssize_t sz, int packet_ma if (0 <= band) { xsnprintf(hdr, sizeof(hdr), "%04x", n + 5); hdr[4] = band; - write_or_die(fd, hdr, 5); + iov[0].iov_base = hdr; + iov[0].iov_len = 5; } else { xsnprintf(hdr, sizeof(hdr), "%04x", n + 4); - write_or_die(fd, hdr, 4); + iov[0].iov_base = hdr; + iov[0].iov_len = 4; } - write_or_die(fd, p, n); + + iov[1].iov_base = (void *) p; + iov[1].iov_len = n; + + writev_or_die(fd, iov, ARRAY_SIZE(iov)); + p += n; sz -= n; } From 0cd8f12b23cb947981341f5ff98cb2d716b0f85c Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Tue, 3 Mar 2026 16:00:23 +0100 Subject: [PATCH 08/10] csum-file: introduce `hashfd_ext()` Introduce a new `hashfd_ext()` function that takes an options structure. This function will replace `hashfd_throughput()` in the next commit. 
Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- csum-file.c | 22 +++++++++++++--------- csum-file.h | 14 ++++++++++++++ 2 files changed, 27 insertions(+), 9 deletions(-) diff --git a/csum-file.c b/csum-file.c index 6e21e3cac8..a50416247e 100644 --- a/csum-file.c +++ b/csum-file.c @@ -161,17 +161,16 @@ struct hashfile *hashfd_check(const struct git_hash_algo *algop, return f; } -static struct hashfile *hashfd_internal(const struct git_hash_algo *algop, - int fd, const char *name, - struct progress *tp, - size_t buffer_len) +struct hashfile *hashfd_ext(const struct git_hash_algo *algop, + int fd, const char *name, + const struct hashfd_options *opts) { struct hashfile *f = xmalloc(sizeof(*f)); f->fd = fd; f->check_fd = -1; f->offset = 0; f->total = 0; - f->tp = tp; + f->tp = opts->progress; f->name = name; f->do_crc = 0; f->skip_hash = 0; @@ -179,8 +178,8 @@ static struct hashfile *hashfd_internal(const struct git_hash_algo *algop, f->algop = unsafe_hash_algo(algop); f->algop->init_fn(&f->ctx); - f->buffer_len = buffer_len; - f->buffer = xmalloc(buffer_len); + f->buffer_len = opts->buffer_len ? opts->buffer_len : 128 * 1024; + f->buffer = xmalloc(f->buffer_len); f->check_buffer = NULL; return f; @@ -194,7 +193,8 @@ struct hashfile *hashfd(const struct git_hash_algo *algop, * measure the rate of data passing through this hashfile, * use a larger buffer size to reduce fsync() calls. */ - return hashfd_internal(algop, fd, name, NULL, 128 * 1024); + struct hashfd_options opts = { 0 }; + return hashfd_ext(algop, fd, name, &opts); } struct hashfile *hashfd_throughput(const struct git_hash_algo *algop, @@ -206,7 +206,11 @@ struct hashfile *hashfd_throughput(const struct git_hash_algo *algop, * size so the progress indicators arrive at a more * frequent rate. 
 */ - return hashfd_internal(algop, fd, name, tp, 8 * 1024); + struct hashfd_options opts = { + .progress = tp, + .buffer_len = 8 * 1024, + }; + return hashfd_ext(algop, fd, name, &opts); } void hashfile_checkpoint_init(struct hashfile *f, diff --git a/csum-file.h b/csum-file.h index 07ae11024a..a03b60120d 100644 --- a/csum-file.h +++ b/csum-file.h @@ -45,6 +45,20 @@ int hashfile_truncate(struct hashfile *, struct hashfile_checkpoint *); #define CSUM_FSYNC 2 #define CSUM_HASH_IN_STREAM 4 +struct hashfd_options { + /* + * Throughput progress that counts the number of bytes that have been + * hashed. + */ + struct progress *progress; + + /* The length of the buffer that shall be used to read data. */ + size_t buffer_len; +}; + +struct hashfile *hashfd_ext(const struct git_hash_algo *algop, + int fd, const char *name, + const struct hashfd_options *opts); struct hashfile *hashfd(const struct git_hash_algo *algop, int fd, const char *name); struct hashfile *hashfd_check(const struct git_hash_algo *algop, From b7b5cde0639d8e827ec1112602539e21a6b7bf39 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Tue, 3 Mar 2026 16:00:24 +0100 Subject: [PATCH 09/10] csum-file: drop `hashfd_throughput()` The `hashfd_throughput()` function is used by a single callsite in git-pack-objects(1). In contrast to `hashfd()`, this function uses a progress meter to measure throughput and a smaller buffer length so that the progress meter can provide more granular metrics. We're going to change that caller in the next commit to be a bit more specific to packing objects. As such, `hashfd_throughput()` will be a somewhat unfitting mechanism for any potential new callers. Drop the function and replace it with a call to `hashfd_ext()`. 
Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- builtin/pack-objects.c | 19 +++++++++++++++---- csum-file.c | 16 ---------------- csum-file.h | 2 -- 3 files changed, 15 insertions(+), 22 deletions(-) diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index 5846b6a293..ba150a80ad 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -1331,11 +1331,22 @@ static void write_pack_file(void) unsigned char hash[GIT_MAX_RAWSZ]; char *pack_tmp_name = NULL; - if (pack_to_stdout) - f = hashfd_throughput(the_repository->hash_algo, 1, - "", progress_state); - else + if (pack_to_stdout) { + /* + * Since we are expecting to report progress of the + * write into this hashfile, use a smaller buffer + * size so the progress indicators arrive at a more + * frequent rate. + */ + struct hashfd_options opts = { + .progress = progress_state, + .buffer_len = 8 * 1024, + }; + f = hashfd_ext(the_repository->hash_algo, 1, + "", &opts); + } else { f = create_tmp_packfile(the_repository, &pack_tmp_name); + } offset = write_pack_header(f, nr_remaining); diff --git a/csum-file.c b/csum-file.c index a50416247e..5dfaca5543 100644 --- a/csum-file.c +++ b/csum-file.c @@ -197,22 +197,6 @@ struct hashfile *hashfd(const struct git_hash_algo *algop, return hashfd_ext(algop, fd, name, &opts); } -struct hashfile *hashfd_throughput(const struct git_hash_algo *algop, - int fd, const char *name, struct progress *tp) -{ - /* - * Since we are expecting to report progress of the - * write into this hashfile, use a smaller buffer - * size so the progress indicators arrive at a more - * frequent rate. 
- */ - struct hashfd_options opts = { - .progress = tp, - .buffer_len = 8 * 1024, - }; - return hashfd_ext(algop, fd, name, &opts); -} - void hashfile_checkpoint_init(struct hashfile *f, struct hashfile_checkpoint *checkpoint) { diff --git a/csum-file.h b/csum-file.h index a03b60120d..01472555c8 100644 --- a/csum-file.h +++ b/csum-file.h @@ -63,8 +63,6 @@ struct hashfile *hashfd(const struct git_hash_algo *algop, int fd, const char *name); struct hashfile *hashfd_check(const struct git_hash_algo *algop, const char *name); -struct hashfile *hashfd_throughput(const struct git_hash_algo *algop, - int fd, const char *name, struct progress *tp); /* * Free the hashfile without flushing its contents to disk. This only From b095803c90dac95a3d49af87dea509a38ffaf039 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Tue, 3 Mar 2026 16:00:25 +0100 Subject: [PATCH 10/10] builtin/pack-objects: reduce lock contention when writing packfile data When running `git pack-objects --stdout` we feed the data through `hashfd_ext()` with a progress meter and a smaller-than-usual buffer length of 8kB so that we can track throughput more granularly. But as packfiles tend to be on the larger side, this small buffer size may cause a ton of write(3p) syscalls. Originally, the buffer we used in `hashfd()` was 8kB for all use cases. This was changed though in 2ca245f8be (csum-file.h: increase hashfile buffer size, 2021-05-18) because we noticed that the number of writes can have an impact on performance. So the buffer size was increased to 128kB, which improved performance a bit for some use cases. But the commit didn't touch the buffer size for `hashd_throughput()`. The reasoning here was that callers expect the progress indicator to update frequently, and a larger buffer size would of course reduce the update frequency especially on slow networks. 
While that is of course true, there was (and still is, even though it's now a call to `hashfd_ext()`) only a single caller of this function in git-pack-objects(1). This command is responsible for writing packfiles, and those packfiles are often on the bigger side. So arguably: - The user won't care about increments of 8kB when packfiles tend to be megabytes or even gigabytes in size. - Reducing the number of syscalls would be even more valuable here than it would be for multi-pack indices, which was the benchmark done in the mentioned commit, as MIDXs are typically significantly smaller than packfiles. - Nowadays, many internet connections should be able to transfer data at a rate significantly higher than 8kB per second. Update the buffer to instead have a size of `LARGE_PACKET_DATA_MAX - 1`, which translates to ~64kB. This limit was chosen because `git pack-objects --stdout` is most often used when sending packfiles via git-upload-pack(1), where packfile data is chunked into pktlines when using the sideband. Furthermore, most internet connections should have a bandwidth significantly higher than 64kB/s, so we'd still be able to observe progress updates at a rate of at least once per second. This change significantly reduces the number of write(3p) syscalls from 355,000 to 44,000 when packing the Linux repository. While this results in a small performance improvement on an otherwise-unused system, this improvement is mostly negligible. More importantly though, it will reduce lock contention in the kernel on an extremely busy system where we have many processes writing data at once. 
Suggested-by: Jeff King Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- builtin/pack-objects.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index ba150a80ad..7301ed8c68 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -41,6 +41,7 @@ #include "promisor-remote.h" #include "pack-mtimes.h" #include "parse-options.h" +#include "pkt-line.h" #include "blob.h" #include "tree.h" #include "path-walk.h" @@ -1333,14 +1334,17 @@ static void write_pack_file(void) if (pack_to_stdout) { /* - * Since we are expecting to report progress of the - * write into this hashfile, use a smaller buffer - * size so the progress indicators arrive at a more - * frequent rate. + * This command is most often invoked via + * git-upload-pack(1), which will typically chunk data + * into pktlines. As such, we use the maximum data + * length of them as buffer length. + * + * Note that we need to subtract one though to + * accommodate for the sideband byte. */ struct hashfd_options opts = { .progress = progress_state, - .buffer_len = 8 * 1024, + .buffer_len = LARGE_PACKET_DATA_MAX - 1, }; f = hashfd_ext(the_repository->hash_algo, 1, "", &opts);