From ac61d3023ebddd6e401fc8f4080c165609c50c28 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Tue, 10 Mar 2026 14:24:57 +0100 Subject: [PATCH 01/10] upload-pack: fix debug statement when flushing packfile data When git-upload-pack(1) writes packfile data to the client we have some logic in place that buffers some partial lines. When that buffer still contains data after git-pack-objects(1) has finished we flush the buffer so that all remaining bytes are sent out. Curiously, when we do so we also print the string "flushed." to stderr. This statement has been introduced in b1c71b7281 (upload-pack: avoid sending an incomplete pack upon failure, 2006-06-20), so quite a while ago. What's interesting though is that stderr is typically spliced through to the client-side, and consequently the client would see this message. Munging the way how we do the caching indeed confirms this: $ git clone file:///home/pks/Development/linux/ Cloning into bare repository 'linux.git'... remote: Enumerating objects: 12980346, done. remote: Counting objects: 100% (131820/131820), done. remote: Compressing objects: 100% (50290/50290), done. remote: Total 12980346 (delta 96319), reused 104500 (delta 81217), pack-reused 12848526 (from 1) Receiving objects: 100% (12980346/12980346), 3.23 GiB | 57.44 MiB/s, done. flushed. Resolving deltas: 100% (10676718/10676718), done. It's quite clear that this string shouldn't ever be visible to the client, so it rather feels like this is a left-over debug statement. The menitoned commit doesn't mention this line, either. Remove the debug output to prepare for a change in how we do the buffering in the next commit. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- upload-pack.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/upload-pack.c b/upload-pack.c index 2d2b70cbf2..c2643c0295 100644 --- a/upload-pack.c +++ b/upload-pack.c @@ -457,11 +457,9 @@ static void create_pack_file(struct upload_pack_data *pack_data, } /* flush the data */ - if (output_state->used > 0) { + if (output_state->used > 0) send_client_data(1, output_state->buffer, output_state->used, pack_data->use_sideband); - fprintf(stderr, "flushed.\n"); - } free(output_state); if (pack_data->use_sideband) packet_flush(1); From 515cedfe1de1b91d71f1b01f6878cdf930e76db8 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Tue, 10 Mar 2026 14:24:58 +0100 Subject: [PATCH 02/10] upload-pack: adapt keepalives based on buffering The function `create_pack_file()` is responsible for sending the packfile data to the client of git-upload-pack(1). As generating the bytes may take significant computing resources we also have a mechanism in place that optionally sends keepalive pktlines in case we haven't sent out any data. The keepalive logic is purely based poll(3p): we pass a timeout to that syscall, and if the call times out we send out the keepalive pktline. While reasonable, this logic isn't entirely sufficient: even if the call to poll(3p) ends because we have received data on any of the file descriptors we may not necessarily send data to the client. The most important edge case here happens in `relay_pack_data()`. When we haven't seen the initial "PACK" signature from git-pack-objects(1) yet we buffer incoming data. So in the worst case, if each of the bytes of that signature arrive shortly before the configured keepalive timeout, then we may not send out any data for a time period that is (almost) four times as long as the configured timeout. This edge case is rather unlikely to matter in practice. But in a subsequent commit we're going to adapt our buffering mechanism to become more aggressive, which makes it more likely that we don't send any data for an extended amount of time. Adapt the logic so that instead of using a fixed timeout on every call to poll(3p), we instead figure out how much time has passed since the last-sent data. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- upload-pack.c | 49 ++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 40 insertions(+), 9 deletions(-) diff --git a/upload-pack.c b/upload-pack.c index c2643c0295..04521e57c9 100644 --- a/upload-pack.c +++ b/upload-pack.c @@ -29,6 +29,7 @@ #include "commit-graph.h" #include "commit-reach.h" #include "shallow.h" +#include "trace.h" #include "write-or-die.h" #include "json-writer.h" #include "strmap.h" @@ -218,7 +219,8 @@ struct output_state { }; static int relay_pack_data(int pack_objects_out, struct output_state *os, - int use_sideband, int write_packfile_line) + int use_sideband, int write_packfile_line, + bool *did_send_data) { /* * We keep the last byte to ourselves @@ -232,6 +234,8 @@ static int relay_pack_data(int pack_objects_out, struct output_state *os, */ ssize_t readsz; + *did_send_data = false; + readsz = xread(pack_objects_out, os->buffer + os->used, sizeof(os->buffer) - os->used); if (readsz < 0) { @@ -247,6 +251,7 @@ static int relay_pack_data(int pack_objects_out, struct output_state *os, if (os->packfile_uris_started) packet_delim(1); packet_write_fmt(1, "\1packfile\n"); + *did_send_data = true; } break; } @@ -259,6 +264,7 @@ static int relay_pack_data(int pack_objects_out, struct output_state *os, } *p = '\0'; packet_write_fmt(1, "\1%s\n", os->buffer); + *did_send_data = true; os->used -= p - os->buffer + 1; memmove(os->buffer, p + 1, os->used); @@ -279,6 +285,7 @@ static int relay_pack_data(int pack_objects_out, struct output_state *os, os->used = 0; } + *did_send_data = true; return readsz; } @@ -290,6 +297,7 @@ static void create_pack_file(struct upload_pack_data *pack_data, char progress[128]; char abort_msg[] = "aborting due to possible repository " "corruption on the remote side."; + uint64_t last_sent_ms = 0; ssize_t sz; int i; FILE *pipe_fd; @@ -365,10 +373,14 @@ static void create_pack_file(struct upload_pack_data *pack_data, */ while (1) { + uint64_t now_ms = getnanotime() / 1000000; struct pollfd pfd[2]; - int pe, pu, pollsize, polltimeout; + int pe, pu, pollsize, polltimeout_ms; int ret; + if (!last_sent_ms) + last_sent_ms = now_ms; + reset_timeout(pack_data->timeout); pollsize = 0; @@ -390,11 +402,21 @@ static void create_pack_file(struct upload_pack_data *pack_data, if (!pollsize) break; - polltimeout = pack_data->keepalive < 0 - ? -1 - : 1000 * pack_data->keepalive; + if (pack_data->keepalive < 0) { + polltimeout_ms = -1; + } else { + /* + * The polling timeout needs to be adjusted based on + * the time we have sent our last package. The longer + * it's been in the past, the shorter the timeout + * becomes until we eventually don't block at all. + */ + polltimeout_ms = 1000 * pack_data->keepalive - (now_ms - last_sent_ms); + if (polltimeout_ms < 0) + polltimeout_ms = 0; + } - ret = poll(pfd, pollsize, polltimeout); + ret = poll(pfd, pollsize, polltimeout_ms); if (ret < 0) { if (errno != EINTR) { @@ -403,16 +425,18 @@ static void create_pack_file(struct upload_pack_data *pack_data, } continue; } + if (0 <= pe && (pfd[pe].revents & (POLLIN|POLLHUP))) { /* Status ready; we ship that in the side-band * or dump to the standard error. */ sz = xread(pack_objects.err, progress, sizeof(progress)); - if (0 < sz) + if (0 < sz) { send_client_data(2, progress, sz, pack_data->use_sideband); - else if (sz == 0) { + last_sent_ms = now_ms; + } else if (sz == 0) { close(pack_objects.err); pack_objects.err = -1; } @@ -421,11 +445,14 @@ static void create_pack_file(struct upload_pack_data *pack_data, /* give priority to status messages */ continue; } + if (0 <= pu && (pfd[pu].revents & (POLLIN|POLLHUP))) { + bool did_send_data; int result = relay_pack_data(pack_objects.out, output_state, pack_data->use_sideband, - !!uri_protocols); + !!uri_protocols, + &did_send_data); if (result == 0) { close(pack_objects.out); @@ -433,6 +460,9 @@ static void create_pack_file(struct upload_pack_data *pack_data, } else if (result < 0) { goto fail; } + + if (did_send_data) + last_sent_ms = now_ms; } /* @@ -448,6 +478,7 @@ static void create_pack_file(struct upload_pack_data *pack_data, if (!ret && pack_data->use_sideband) { static const char buf[] = "0005\1"; write_or_die(1, buf, 5); + last_sent_ms = now_ms; } } From 3a6379aa77c76030aa4d6d88943cd263950db508 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Tue, 10 Mar 2026 14:24:59 +0100 Subject: [PATCH 03/10] upload-pack: prefer flushing data over sending keepalive When using the sideband in git-upload-pack(1) we know to send out keepalive packets in case generating the pack takes too long. These keepalives take the form of a simple empty pktline. In the preceding commit we have adapted git-upload-pack(1) to buffer data more aggressively before sending it to the client. This creates an obvious optimization opportunity: when we hit the keepalive timeout while we still hold on to some buffered data, then it makes more sense to flush out the data instead of sending the empty keepalive packet. This is overall not going to be a significant win. Most keepalives will come before the pack data starts, and once pack-objects starts producing data, it tends to do so pretty consistently. And of course we can't send data before we see the PACK header, because the whole point is to buffer the early bit waiting for packfile URIs. But the optimization is easy enough to realize. Do so and flush out data instead of sending an empty pktline. While at it, drop the useless Suggested-by: Jeff King Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- upload-pack.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/upload-pack.c b/upload-pack.c index 04521e57c9..ef8f8189c1 100644 --- a/upload-pack.c +++ b/upload-pack.c @@ -466,18 +466,27 @@ static void create_pack_file(struct upload_pack_data *pack_data, } /* - * We hit the keepalive timeout without saying anything; send - * an empty message on the data sideband just to let the other - * side know we're still working on it, but don't have any data - * yet. + * We hit the keepalive timeout without saying anything. If we + * have pending data we flush it out to the caller now. + * Otherwise, we send an empty message on the data sideband + * just to let the other side know we're still working on it, + * but don't have any data yet. * * If we don't have a sideband channel, there's no room in the * protocol to say anything, so those clients are just out of * luck. */ if (!ret && pack_data->use_sideband) { - static const char buf[] = "0005\1"; - write_or_die(1, buf, 5); + if (output_state->packfile_started && output_state->used > 1) { + send_client_data(1, output_state->buffer, output_state->used - 1, + pack_data->use_sideband); + output_state->buffer[0] = output_state->buffer[output_state->used - 1]; + output_state->used = 1; + } else { + static const char buf[] = "0005\1"; + write_or_die(1, buf, 5); + } + last_sent_ms = now_ms; } } From fb1af225033a21e91e12860a85d53c28d2d4c62c Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Tue, 10 Mar 2026 14:25:00 +0100 Subject: [PATCH 04/10] upload-pack: reduce lock contention when writing packfile data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In our production systems we have recently observed write contention in git-upload-pack(1). The system in question was consistently streaming packfiles at a rate of dozens of gigabits per second, but curiously the system was neither bottlenecked on CPU, memory or IOPS. We eventually discovered that Git was spending 80% of its time in `pipe_write()`, out of which almost all of the time was spent in the `ep_poll_callback` function in the kernel. Quoting the reporter: This infrastructure is part of an event notification queue designed to allow for multiple producers to emit events, but that concurrency safety is guarded by 3 layers of locking. The layer we're hitting contention in uses a simple reader/writer lock mode (a.k.a. shared versus exclusive mode), where producers need shared-mode (read mode), and various other actions use exclusive (write) mode. The system in question generates workloads where we have hundreds of git-upload-pack(1) processes active at the same point in time. These processes end up contending around those locks, and the consequence is that the Git processes stall. Now git-upload-pack(1) already has the infrastructure in place to buffer some of the data it reads from git-pack-objects(1) before actually sending it out. We only use this infrastructure in very limited ways though, so we generally end up matching one read(3p) call with one write(3p) call. Even worse, when the sideband is enabled we end up matching one read with _two_ writes: one for the pkt-line length, and one for the packfile data. Extend our use of the buffering infrastructure so that we soak up bytes until the buffer is filled up at least 2/3rds of its capacity. The change is relatively simple to implement as we already know to flush the buffer in `create_pack_file()` after git-pack-objects(1) has finished. This significantly reduces the number of write(3p) syscalls we need to do. Before this change, cloning the Linux repository resulted in around 400,000 write(3p) syscalls. With the buffering in place we only do around 130,000 syscalls. Now we could of course go even further and make sure that we always fill up the whole buffer. But this might cause an increase in read(3p) syscalls, and some tests show that this only reduces the number of write(3p) syscalls from 130,000 to 100,000. So overall this doesn't seem worth it. Note that the issue could also be fixed by adapting the write buffer that we use in the downstream git-pack-objects(1) command, and such a change would have roughly the same result. But the command that generates the packfile data may not always be git-pack-objects(1) as it can be changed via "uploadpack.packObjectsHook", so such a fix would only help in _some_ cases. Regardless of that, we'll also adapt the write buffer size of git-pack-objects(1) in a subsequent commit. Helped-by: Matt Smiley Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- upload-pack.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/upload-pack.c b/upload-pack.c index ef8f8189c1..92e1ff3ba2 100644 --- a/upload-pack.c +++ b/upload-pack.c @@ -276,6 +276,13 @@ static int relay_pack_data(int pack_objects_out, struct output_state *os, } } + /* + * Make sure that we buffer some data before sending it to the client. + * This significantly reduces the number of write(3p) syscalls. + */ + if (readsz && os->used < (sizeof(os->buffer) * 2 / 3)) + return readsz; + if (os->used > 1) { send_client_data(1, os->buffer, os->used - 1, use_sideband); os->buffer[0] = os->buffer[os->used - 1]; From 104a6e9788373aebd5e3d42b8d42f557d6de91a3 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Tue, 10 Mar 2026 14:25:01 +0100 Subject: [PATCH 05/10] compat/posix: introduce writev(3p) wrapper In a subsequent commit we're going to add the first caller to writev(3p). Introduce a compatibility wrapper for this syscall that we can use on systems that don't have this syscall. The syscall exists on modern Unixes like Linux and macOS, and seemingly even for NonStop according to [1]. It doesn't seem to exist on Windows though. [1]: http://nonstoptools.com/manuals/OSS-SystemCalls.pdf [2]: https://www.gnu.org/software/gnulib/manual/html_node/writev.html Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- Makefile | 4 ++++ compat/posix.h | 14 ++++++++++++++ compat/writev.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ config.mak.uname | 2 ++ meson.build | 1 + 5 files changed, 65 insertions(+) create mode 100644 compat/writev.c diff --git a/Makefile b/Makefile index 8aa489f3b6..61c7dff942 100644 --- a/Makefile +++ b/Makefile @@ -2015,6 +2015,10 @@ ifdef NO_PREAD COMPAT_CFLAGS += -DNO_PREAD COMPAT_OBJS += compat/pread.o endif +ifdef NO_WRITEV + COMPAT_CFLAGS += -DNO_WRITEV + COMPAT_OBJS += compat/writev.o +endif ifdef NO_FAST_WORKING_DIRECTORY BASIC_CFLAGS += -DNO_FAST_WORKING_DIRECTORY endif diff --git a/compat/posix.h b/compat/posix.h index 245386fa4a..3c611d2736 100644 --- a/compat/posix.h +++ b/compat/posix.h @@ -137,6 +137,9 @@ #include #include #include +#ifndef NO_WRITEV +#include +#endif #include #ifndef NO_SYS_SELECT_H #include @@ -323,6 +326,17 @@ int git_lstat(const char *, struct stat *); ssize_t git_pread(int fd, void *buf, size_t count, off_t offset); #endif +#ifdef NO_WRITEV +#define writev git_writev +#define iovec git_iovec +struct git_iovec { + void *iov_base; + size_t iov_len; +}; + +ssize_t git_writev(int fd, const struct iovec *iov, int iovcnt); +#endif + #ifdef NO_SETENV #define setenv gitsetenv int gitsetenv(const char *, const char *, int); diff --git a/compat/writev.c b/compat/writev.c new file mode 100644 index 0000000000..3a94870a2f --- /dev/null +++ b/compat/writev.c @@ -0,0 +1,44 @@ +#include "../git-compat-util.h" +#include "../wrapper.h" + +ssize_t git_writev(int fd, const struct iovec *iov, int iovcnt) +{ + size_t total_written = 0; + size_t sum = 0; + + /* + * According to writev(3p), the syscall shall error with EINVAL in case + * the sum of `iov_len` overflows `ssize_t`. + */ + for (int i = 0; i < iovcnt; i++) { + if (iov[i].iov_len > maximum_signed_value_of_type(ssize_t) || + iov[i].iov_len + sum > maximum_signed_value_of_type(ssize_t)) { + errno = EINVAL; + return -1; + } + + sum += iov[i].iov_len; + } + + for (int i = 0; i < iovcnt; i++) { + const char *bytes = iov[i].iov_base; + size_t iovec_written = 0; + + while (iovec_written < iov[i].iov_len) { + ssize_t bytes_written = xwrite(fd, bytes + iovec_written, + iov[i].iov_len - iovec_written); + if (bytes_written < 0) { + if (total_written) + goto out; + return bytes_written; + } + if (!bytes_written) + goto out; + iovec_written += bytes_written; + total_written += bytes_written; + } + } + +out: + return (ssize_t) total_written; +} diff --git a/config.mak.uname b/config.mak.uname index 3c35ae33a3..8e66a1cc0a 100644 --- a/config.mak.uname +++ b/config.mak.uname @@ -457,6 +457,7 @@ ifeq ($(uname_S),Windows) SANE_TOOL_PATH ?= $(msvc_bin_dir_msys) HAVE_ALLOCA_H = YesPlease NO_PREAD = YesPlease + NO_WRITEV = YesPlease NEEDS_CRYPTO_WITH_SSL = YesPlease NO_LIBGEN_H = YesPlease NO_POLL = YesPlease @@ -672,6 +673,7 @@ ifeq ($(uname_S),MINGW) pathsep = ; HAVE_ALLOCA_H = YesPlease NO_PREAD = YesPlease + NO_WRITEV = YesPlease NEEDS_CRYPTO_WITH_SSL = YesPlease NO_LIBGEN_H = YesPlease NO_POLL = YesPlease diff --git a/meson.build b/meson.build index dd52efd1c8..f1cd89efc7 100644 --- a/meson.build +++ b/meson.build @@ -1405,6 +1405,7 @@ checkfuncs = { 'initgroups' : [], 'strtoumax' : ['strtoumax.c', 'strtoimax.c'], 'pread' : ['pread.c'], + 'writev' : ['writev.c'], } if host_machine.system() == 'windows' From 6c8a62bc17cc4ae119ec6829225e897b873b59d6 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Tue, 10 Mar 2026 14:25:02 +0100 Subject: [PATCH 06/10] wrapper: introduce writev(3p) wrappers In the preceding commit we have added a compatibility wrapper for the writev(3p) syscall. Introduce some generic wrappers for this function that we nowadays take for granted in the Git codebase. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- wrapper.c | 41 +++++++++++++++++++++++++++++++++++++++++ wrapper.h | 9 +++++++++ write-or-die.c | 8 ++++++++ write-or-die.h | 1 + 4 files changed, 59 insertions(+) diff --git a/wrapper.c b/wrapper.c index b794fb20e7..fa40f399b3 100644 --- a/wrapper.c +++ b/wrapper.c @@ -323,6 +323,47 @@ ssize_t write_in_full(int fd, const void *buf, size_t count) return total; } +ssize_t writev_in_full(int fd, struct iovec *iov, int iovcnt) +{ + ssize_t total_written = 0; + + while (iovcnt) { + ssize_t bytes_written = writev(fd, iov, iovcnt); + if (bytes_written < 0) { + if (errno == EINTR || errno == EAGAIN) + continue; + return -1; + } + if (!bytes_written) { + errno = ENOSPC; + return -1; + } + + total_written += bytes_written; + + /* + * We first need to discard any iovec entities that have been + * fully written. + */ + while (iovcnt && (size_t)bytes_written >= iov->iov_len) { + bytes_written -= iov->iov_len; + iov++; + iovcnt--; + } + + /* + * Finally, we need to adjust the last iovec in case we have + * performed a partial write. + */ + if (iovcnt && bytes_written) { + iov->iov_base = (char *) iov->iov_base + bytes_written; + iov->iov_len -= bytes_written; + } + } + + return total_written; +} + ssize_t pread_in_full(int fd, void *buf, size_t count, off_t offset) { char *p = buf; diff --git a/wrapper.h b/wrapper.h index 15ac3bab6e..27519b32d1 100644 --- a/wrapper.h +++ b/wrapper.h @@ -47,6 +47,15 @@ ssize_t read_in_full(int fd, void *buf, size_t count); ssize_t write_in_full(int fd, const void *buf, size_t count); ssize_t pread_in_full(int fd, void *buf, size_t count, off_t offset); +/* + * Try to write all iovecs. Returns -1 in case an error occurred with a proper + * errno set, the number of bytes written otherwise. + * + * Note that the iovec will be modified as a result of this call to adjust for + * partial writes! + */ +ssize_t writev_in_full(int fd, struct iovec *iov, int iovcnt); + static inline ssize_t write_str_in_full(int fd, const char *str) { return write_in_full(fd, str, strlen(str)); diff --git a/write-or-die.c b/write-or-die.c index 01a9a51fa2..5f522fb728 100644 --- a/write-or-die.c +++ b/write-or-die.c @@ -96,6 +96,14 @@ void write_or_die(int fd, const void *buf, size_t count) } } +void writev_or_die(int fd, struct iovec *iov, int iovlen) +{ + if (writev_in_full(fd, iov, iovlen) < 0) { + check_pipe(errno); + die_errno("writev error"); + } +} + void fwrite_or_die(FILE *f, const void *buf, size_t count) { if (fwrite(buf, 1, count, f) != count) diff --git a/write-or-die.h b/write-or-die.h index 65a5c42a47..ae3d7d88b8 100644 --- a/write-or-die.h +++ b/write-or-die.h @@ -7,6 +7,7 @@ void fprintf_or_die(FILE *, const char *fmt, ...); void fwrite_or_die(FILE *f, const void *buf, size_t count); void fflush_or_die(FILE *f); void write_or_die(int fd, const void *buf, size_t count); +void writev_or_die(int fd, struct iovec *iov, int iovlen); /* * These values are used to help identify parts of a repository to fsync. From 6b9c9a191232716f51dafbbdfba9d164ed9e1397 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Tue, 10 Mar 2026 14:25:03 +0100 Subject: [PATCH 07/10] sideband: use writev(3p) to send pktlines Every pktline that we send out via `send_sideband()` currently requires two syscalls: one to write the pktline's length, and one to send its data. This typically isn't all that much of a problem, but under extreme load the syscalls may cause contention in the kernel. Refactor the code to instead use the newly introduced writev(3p) infra so that we can send out the data with a single syscall. This reduces the number of syscalls from around 133,000 calls to write(3p) to around 67,000 calls to writev(3p). Suggested-by: Jeff King Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- sideband.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/sideband.c b/sideband.c index ea7c25211e..1ed6614eaf 100644 --- a/sideband.c +++ b/sideband.c @@ -264,6 +264,7 @@ void send_sideband(int fd, int band, const char *data, ssize_t sz, int packet_ma const char *p = data; while (sz) { + struct iovec iov[2]; unsigned n; char hdr[5]; @@ -273,12 +274,19 @@ void send_sideband(int fd, int band, const char *data, ssize_t sz, int packet_ma if (0 <= band) { xsnprintf(hdr, sizeof(hdr), "%04x", n + 5); hdr[4] = band; - write_or_die(fd, hdr, 5); + iov[0].iov_base = hdr; + iov[0].iov_len = 5; } else { xsnprintf(hdr, sizeof(hdr), "%04x", n + 4); - write_or_die(fd, hdr, 4); + iov[0].iov_base = hdr; + iov[0].iov_len = 4; } - write_or_die(fd, p, n); + + iov[1].iov_base = (void *) p; + iov[1].iov_len = n; + + writev_or_die(fd, iov, ARRAY_SIZE(iov)); + p += n; sz -= n; } From 25d41000f541dc6c5f3e14dc03e2e825f023f467 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Tue, 10 Mar 2026 14:25:04 +0100 Subject: [PATCH 08/10] csum-file: introduce `hashfd_ext()` Introduce a new `hashfd_ext()` function that takes an options structure. This function will replace `hashd_throughput()` in the next commit. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- csum-file.c | 22 +++++++++++++--------- csum-file.h | 14 ++++++++++++++ 2 files changed, 27 insertions(+), 9 deletions(-) diff --git a/csum-file.c b/csum-file.c index 6e21e3cac8..a50416247e 100644 --- a/csum-file.c +++ b/csum-file.c @@ -161,17 +161,16 @@ struct hashfile *hashfd_check(const struct git_hash_algo *algop, return f; } -static struct hashfile *hashfd_internal(const struct git_hash_algo *algop, - int fd, const char *name, - struct progress *tp, - size_t buffer_len) +struct hashfile *hashfd_ext(const struct git_hash_algo *algop, + int fd, const char *name, + const struct hashfd_options *opts) { struct hashfile *f = xmalloc(sizeof(*f)); f->fd = fd; f->check_fd = -1; f->offset = 0; f->total = 0; - f->tp = tp; + f->tp = opts->progress; f->name = name; f->do_crc = 0; f->skip_hash = 0; @@ -179,8 +178,8 @@ static struct hashfile *hashfd_internal(const struct git_hash_algo *algop, f->algop = unsafe_hash_algo(algop); f->algop->init_fn(&f->ctx); - f->buffer_len = buffer_len; - f->buffer = xmalloc(buffer_len); + f->buffer_len = opts->buffer_len ? opts->buffer_len : 128 * 1024; + f->buffer = xmalloc(f->buffer_len); f->check_buffer = NULL; return f; @@ -194,7 +193,8 @@ struct hashfile *hashfd(const struct git_hash_algo *algop, * measure the rate of data passing through this hashfile, * use a larger buffer size to reduce fsync() calls. */ - return hashfd_internal(algop, fd, name, NULL, 128 * 1024); + struct hashfd_options opts = { 0 }; + return hashfd_ext(algop, fd, name, &opts); } struct hashfile *hashfd_throughput(const struct git_hash_algo *algop, @@ -206,7 +206,11 @@ struct hashfile *hashfd_throughput(const struct git_hash_algo *algop, * size so the progress indicators arrive at a more * frequent rate. */ - return hashfd_internal(algop, fd, name, tp, 8 * 1024); + struct hashfd_options opts = { + .progress = tp, + .buffer_len = 8 * 1024, + }; + return hashfd_ext(algop, fd, name, &opts); } void hashfile_checkpoint_init(struct hashfile *f, diff --git a/csum-file.h b/csum-file.h index 07ae11024a..a03b60120d 100644 --- a/csum-file.h +++ b/csum-file.h @@ -45,6 +45,20 @@ int hashfile_truncate(struct hashfile *, struct hashfile_checkpoint *); #define CSUM_FSYNC 2 #define CSUM_HASH_IN_STREAM 4 +struct hashfd_options { + /* + * Throughput progress that counts the number of bytes that have been + * hashed. + */ + struct progress *progress; + + /* The length of the buffer that shall be used read read data. */ + size_t buffer_len; +}; + +struct hashfile *hashfd_ext(const struct git_hash_algo *algop, + int fd, const char *name, + const struct hashfd_options *opts); struct hashfile *hashfd(const struct git_hash_algo *algop, int fd, const char *name); struct hashfile *hashfd_check(const struct git_hash_algo *algop, From 1d58d66c0e8d815e1638e567f3f59424dab91728 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Tue, 10 Mar 2026 14:25:05 +0100 Subject: [PATCH 09/10] csum-file: drop `hashfd_throughput()` The `hashfd_throughput()` function is used by a single callsite in git-pack-objects(1). In contrast to `hashfd()`, this function uses a progress meter to measure throughput and a smaller buffer length so that the progress meter can provide more granular metrics. We're going to change that caller in the next commit to be a bit more specific to packing objects. As such, `hashfd_throughput()` will be a somewhat unfitting mechanism for any potential new callers. Drop the function and replace it with a call to `hashfd_ext()`. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- builtin/pack-objects.c | 19 +++++++++++++++---- csum-file.c | 16 ---------------- csum-file.h | 2 -- 3 files changed, 15 insertions(+), 22 deletions(-) diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index 5846b6a293..ba150a80ad 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -1331,11 +1331,22 @@ static void write_pack_file(void) unsigned char hash[GIT_MAX_RAWSZ]; char *pack_tmp_name = NULL; - if (pack_to_stdout) - f = hashfd_throughput(the_repository->hash_algo, 1, - "", progress_state); - else + if (pack_to_stdout) { + /* + * Since we are expecting to report progress of the + * write into this hashfile, use a smaller buffer + * size so the progress indicators arrive at a more + * frequent rate. + */ + struct hashfd_options opts = { + .progress = progress_state, + .buffer_len = 8 * 1024, + }; + f = hashfd_ext(the_repository->hash_algo, 1, + "", &opts); + } else { f = create_tmp_packfile(the_repository, &pack_tmp_name); + } offset = write_pack_header(f, nr_remaining); diff --git a/csum-file.c b/csum-file.c index a50416247e..5dfaca5543 100644 --- a/csum-file.c +++ b/csum-file.c @@ -197,22 +197,6 @@ struct hashfile *hashfd(const struct git_hash_algo *algop, return hashfd_ext(algop, fd, name, &opts); } -struct hashfile *hashfd_throughput(const struct git_hash_algo *algop, - int fd, const char *name, struct progress *tp) -{ - /* - * Since we are expecting to report progress of the - * write into this hashfile, use a smaller buffer - * size so the progress indicators arrive at a more - * frequent rate. - */ - struct hashfd_options opts = { - .progress = tp, - .buffer_len = 8 * 1024, - }; - return hashfd_ext(algop, fd, name, &opts); -} - void hashfile_checkpoint_init(struct hashfile *f, struct hashfile_checkpoint *checkpoint) { diff --git a/csum-file.h b/csum-file.h index a03b60120d..01472555c8 100644 --- a/csum-file.h +++ b/csum-file.h @@ -63,8 +63,6 @@ struct hashfile *hashfd(const struct git_hash_algo *algop, int fd, const char *name); struct hashfile *hashfd_check(const struct git_hash_algo *algop, const char *name); -struct hashfile *hashfd_throughput(const struct git_hash_algo *algop, - int fd, const char *name, struct progress *tp); /* * Free the hashfile without flushing its contents to disk. This only From 6938f7b7f8cfe74a0e8af090717502068217e930 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Tue, 10 Mar 2026 14:25:06 +0100 Subject: [PATCH 10/10] builtin/pack-objects: reduce lock contention when writing packfile data When running `git pack-objects --stdout` we feed the data through `hashfd_ext()` with a progress meter and a smaller-than-usual buffer length of 8kB so that we can track throughput more granularly. But as packfiles tend to be on the larger side, this small buffer size may cause a ton of write(3p) syscalls. Originally, the buffer we used in `hashfd()` was 8kB for all use cases. This was changed though in 2ca245f8be (csum-file.h: increase hashfile buffer size, 2021-05-18) because we noticed that the number of writes can have an impact on performance. So the buffer size was increased to 128kB, which improved performance a bit for some use cases. But the commit didn't touch the buffer size for `hashd_throughput()`. The reasoning here was that callers expect the progress indicator to update frequently, and a larger buffer size would of course reduce the update frequency especially on slow networks. While that is of course true, there was (and still is, even though it's now a call to `hashfd_ext()`) only a single caller of this function in git-pack-objects(1). This command is responsible for writing packfiles, and those packfiles are often on the bigger side. So arguably: - The user won't care about increments of 8kB when packfiles tend to be megabytes or even gigabytes in size. - Reducing the number of syscalls would be even more valuable here than it would be for multi-pack indices, which was the benchmark done in the mentioned commit, as MIDXs are typically significantly smaller than packfiles. - Nowadays, many internet connections should be able to transfer data at a rate significantly higher than 8kB per second. Update the buffer to instead have a size of `LARGE_PACKET_DATA_MAX - 1`, which translates to ~64kB. This limit was chosen because `git pack-objects --stdout` is most often used when sending packfiles via git-upload-pack(1), where packfile data is chunked into pktlines when using the sideband. Furthermore, most internet connections should have a bandwidth signifcantly higher than 64kB/s, so we'd still be able to observe progress updates at a rate of at least once per second. This change significantly reduces the number of write(3p) syscalls from 355,000 to 44,000 when packing the Linux repository. While this results in a small performance improvement on an otherwise-unused system, this improvement is mostly negligible. More importantly though, it will reduce lock contention in the kernel on an extremely busy system where we have many processes writing data at once. Suggested-by: Jeff King Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- builtin/pack-objects.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index ba150a80ad..7301ed8c68 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -41,6 +41,7 @@ #include "promisor-remote.h" #include "pack-mtimes.h" #include "parse-options.h" +#include "pkt-line.h" #include "blob.h" #include "tree.h" #include "path-walk.h" @@ -1333,14 +1334,17 @@ static void write_pack_file(void) if (pack_to_stdout) { /* - * Since we are expecting to report progress of the - * write into this hashfile, use a smaller buffer - * size so the progress indicators arrive at a more - * frequent rate. + * This command is most often invoked via + * git-upload-pack(1), which will typically chunk data + * into pktlines. As such, we use the maximum data + * length of them as buffer length. + * + * Note that we need to subtract one though to + * accomodate for the sideband byte. */ struct hashfd_options opts = { .progress = progress_state, - .buffer_len = 8 * 1024, + .buffer_len = LARGE_PACKET_DATA_MAX - 1, }; f = hashfd_ext(the_repository->hash_algo, 1, "", &opts);