From ac61d3023ebddd6e401fc8f4080c165609c50c28 Mon Sep 17 00:00:00 2001
From: Patrick Steinhardt <ps@pks.im>
Date: Tue, 10 Mar 2026 14:24:57 +0100
Subject: [PATCH 01/10] upload-pack: fix debug statement when flushing packfile
 data

When git-upload-pack(1) writes packfile data to the client we have some
logic in place that buffers some partial lines. When that buffer still
contains data after git-pack-objects(1) has finished we flush the buffer
so that all remaining bytes are sent out.

Curiously, when we do so we also print the string "flushed." to stderr.
This statement has been introduced in b1c71b7281 (upload-pack: avoid
sending an incomplete pack upon failure, 2006-06-20), so quite a while
ago. What's interesting though is that stderr is typically spliced
through to the client-side, and consequently the client would see this
message. Munging the way we do the caching indeed confirms this:

  $ git clone file:///home/pks/Development/linux/
  Cloning into bare repository 'linux.git'...
  remote: Enumerating objects: 12980346, done.
  remote: Counting objects: 100% (131820/131820), done.
  remote: Compressing objects: 100% (50290/50290), done.
  remote: Total 12980346 (delta 96319), reused 104500 (delta 81217), pack-reused 12848526 (from 1)
  Receiving objects: 100% (12980346/12980346), 3.23 GiB | 57.44 MiB/s, done.
  flushed.
  Resolving deltas: 100% (10676718/10676718), done.

It's quite clear that this string shouldn't ever be visible to the
client, so it rather feels like this is a left-over debug statement. The
mentioned commit doesn't mention this line, either.

Remove the debug output to prepare for a change in how we do the
buffering in the next commit.

Signed-off-by: Patrick Steinhardt <ps@pks.im>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 upload-pack.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/upload-pack.c b/upload-pack.c
index 2d2b70cbf2..c2643c0295 100644
--- a/upload-pack.c
+++ b/upload-pack.c
@@ -457,11 +457,9 @@ static void create_pack_file(struct upload_pack_data *pack_data,
 	}
 
 	/* flush the data */
-	if (output_state->used > 0) {
+	if (output_state->used > 0)
 		send_client_data(1, output_state->buffer, output_state->used,
 				 pack_data->use_sideband);
-		fprintf(stderr, "flushed.\n");
-	}
 	free(output_state);
 	if (pack_data->use_sideband)
 		packet_flush(1);

From 515cedfe1de1b91d71f1b01f6878cdf930e76db8 Mon Sep 17 00:00:00 2001
From: Patrick Steinhardt <ps@pks.im>
Date: Tue, 10 Mar 2026 14:24:58 +0100
Subject: [PATCH 02/10] upload-pack: adapt keepalives based on buffering

The function `create_pack_file()` is responsible for sending the
packfile data to the client of git-upload-pack(1). As generating the
bytes may take significant computing resources we also have a mechanism
in place that optionally sends keepalive pktlines in case we haven't
sent out any data.

The keepalive logic is purely based on poll(3p): we pass a timeout to that
syscall, and if the call times out we send out the keepalive pktline.
While reasonable, this logic isn't entirely sufficient: even if the call
to poll(3p) ends because we have received data on any of the file
descriptors we may not necessarily send data to the client.

The most important edge case here happens in `relay_pack_data()`. When
we haven't seen the initial "PACK" signature from git-pack-objects(1)
yet we buffer incoming data. So in the worst case, if each of the bytes
of that signature arrive shortly before the configured keepalive
timeout, then we may not send out any data for a time period that is
(almost) four times as long as the configured timeout.

This edge case is rather unlikely to matter in practice. But in a
subsequent commit we're going to adapt our buffering mechanism to become
more aggressive, which makes it more likely that we don't send any data
for an extended amount of time.

Adapt the logic so that instead of using a fixed timeout on every call
to poll(3p), we instead figure out how much time has passed since the
last-sent data.

Signed-off-by: Patrick Steinhardt <ps@pks.im>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 upload-pack.c | 49 ++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 40 insertions(+), 9 deletions(-)

diff --git a/upload-pack.c b/upload-pack.c
index c2643c0295..04521e57c9 100644
--- a/upload-pack.c
+++ b/upload-pack.c
@@ -29,6 +29,7 @@
 #include "commit-graph.h"
 #include "commit-reach.h"
 #include "shallow.h"
+#include "trace.h"
 #include "write-or-die.h"
 #include "json-writer.h"
 #include "strmap.h"
@@ -218,7 +219,8 @@ struct output_state {
 };
 
 static int relay_pack_data(int pack_objects_out, struct output_state *os,
-			   int use_sideband, int write_packfile_line)
+			   int use_sideband, int write_packfile_line,
+			   bool *did_send_data)
 {
 	/*
 	 * We keep the last byte to ourselves
@@ -232,6 +234,8 @@ static int relay_pack_data(int pack_objects_out, struct output_state *os,
 	 */
 	ssize_t readsz;
 
+	*did_send_data = false;
+
 	readsz = xread(pack_objects_out, os->buffer + os->used,
 		       sizeof(os->buffer) - os->used);
 	if (readsz < 0) {
@@ -247,6 +251,7 @@ static int relay_pack_data(int pack_objects_out, struct output_state *os,
 				if (os->packfile_uris_started)
 					packet_delim(1);
 				packet_write_fmt(1, "\1packfile\n");
+				*did_send_data = true;
 			}
 			break;
 		}
@@ -259,6 +264,7 @@ static int relay_pack_data(int pack_objects_out, struct output_state *os,
 			}
 			*p = '\0';
 			packet_write_fmt(1, "\1%s\n", os->buffer);
+			*did_send_data = true;
 
 			os->used -= p - os->buffer + 1;
 			memmove(os->buffer, p + 1, os->used);
@@ -279,6 +285,7 @@ static int relay_pack_data(int pack_objects_out, struct output_state *os,
 		os->used = 0;
 	}
 
+	*did_send_data = true;
 	return readsz;
 }
 
@@ -290,6 +297,7 @@ static void create_pack_file(struct upload_pack_data *pack_data,
 	char progress[128];
 	char abort_msg[] = "aborting due to possible repository "
 		"corruption on the remote side.";
+	uint64_t last_sent_ms = 0;
 	ssize_t sz;
 	int i;
 	FILE *pipe_fd;
@@ -365,10 +373,14 @@ static void create_pack_file(struct upload_pack_data *pack_data,
 	 */
 
 	while (1) {
+		uint64_t now_ms = getnanotime() / 1000000;
 		struct pollfd pfd[2];
-		int pe, pu, pollsize, polltimeout;
+		int pe, pu, pollsize, polltimeout_ms;
 		int ret;
 
+		if (!last_sent_ms)
+			last_sent_ms = now_ms;
+
 		reset_timeout(pack_data->timeout);
 
 		pollsize = 0;
@@ -390,11 +402,21 @@ static void create_pack_file(struct upload_pack_data *pack_data,
 		if (!pollsize)
 			break;
 
-		polltimeout = pack_data->keepalive < 0
-			? -1
-			: 1000 * pack_data->keepalive;
+		if (pack_data->keepalive < 0) {
+			polltimeout_ms = -1;
+		} else {
+			/*
+			 * The polling timeout needs to be adjusted based on
+			 * the time we have sent our last package. The longer
+			 * it's been in the past, the shorter the timeout
+			 * becomes until we eventually don't block at all.
+			 */
+			polltimeout_ms = 1000 * pack_data->keepalive - (now_ms - last_sent_ms);
+			if (polltimeout_ms < 0)
+				polltimeout_ms = 0;
+		}
 
-		ret = poll(pfd, pollsize, polltimeout);
+		ret = poll(pfd, pollsize, polltimeout_ms);
 
 		if (ret < 0) {
 			if (errno != EINTR) {
@@ -403,16 +425,18 @@ static void create_pack_file(struct upload_pack_data *pack_data,
 			}
 			continue;
 		}
+
 		if (0 <= pe && (pfd[pe].revents & (POLLIN|POLLHUP))) {
 			/* Status ready; we ship that in the side-band
 			 * or dump to the standard error.
 			 */
 			sz = xread(pack_objects.err, progress,
 				  sizeof(progress));
-			if (0 < sz)
+			if (0 < sz) {
 				send_client_data(2, progress, sz,
 						 pack_data->use_sideband);
-			else if (sz == 0) {
+				last_sent_ms = now_ms;
+			} else if (sz == 0) {
 				close(pack_objects.err);
 				pack_objects.err = -1;
 			}
@@ -421,11 +445,14 @@ static void create_pack_file(struct upload_pack_data *pack_data,
 			/* give priority to status messages */
 			continue;
 		}
+
 		if (0 <= pu && (pfd[pu].revents & (POLLIN|POLLHUP))) {
+			bool did_send_data;
 			int result = relay_pack_data(pack_objects.out,
 						     output_state,
 						     pack_data->use_sideband,
-						     !!uri_protocols);
+						     !!uri_protocols,
+						     &did_send_data);
 
 			if (result == 0) {
 				close(pack_objects.out);
@@ -433,6 +460,9 @@ static void create_pack_file(struct upload_pack_data *pack_data,
 			} else if (result < 0) {
 				goto fail;
 			}
+
+			if (did_send_data)
+				last_sent_ms = now_ms;
 		}
 
 		/*
@@ -448,6 +478,7 @@ static void create_pack_file(struct upload_pack_data *pack_data,
 		if (!ret && pack_data->use_sideband) {
 			static const char buf[] = "0005\1";
 			write_or_die(1, buf, 5);
+			last_sent_ms = now_ms;
 		}
 	}
 

From 3a6379aa77c76030aa4d6d88943cd263950db508 Mon Sep 17 00:00:00 2001
From: Patrick Steinhardt <ps@pks.im>
Date: Tue, 10 Mar 2026 14:24:59 +0100
Subject: [PATCH 03/10] upload-pack: prefer flushing data over sending
 keepalive

When using the sideband in git-upload-pack(1) we know to send out
keepalive packets in case generating the pack takes too long. These
keepalives take the form of a simple empty pktline.

In a subsequent commit we are going to adapt git-upload-pack(1) to buffer
data more aggressively before sending it to the client. This creates an
obvious optimization opportunity: when we hit the keepalive timeout
while we still hold on to some buffered data, then it makes more sense
to flush out the data instead of sending the empty keepalive packet.

This is overall not going to be a significant win. Most keepalives will
come before the pack data starts, and once pack-objects starts producing
data, it tends to do so pretty consistently. And of course we can't send
data before we see the PACK header, because the whole point is to buffer
the early bit waiting for packfile URIs. But the optimization is easy
enough to realize.

Do so and flush out data instead of sending an empty pktline. While at
it, drop the useless

Suggested-by: Jeff King <peff@peff.net>
Signed-off-by: Patrick Steinhardt <ps@pks.im>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 upload-pack.c | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/upload-pack.c b/upload-pack.c
index 04521e57c9..ef8f8189c1 100644
--- a/upload-pack.c
+++ b/upload-pack.c
@@ -466,18 +466,27 @@ static void create_pack_file(struct upload_pack_data *pack_data,
 		}
 
 		/*
-		 * We hit the keepalive timeout without saying anything; send
-		 * an empty message on the data sideband just to let the other
-		 * side know we're still working on it, but don't have any data
-		 * yet.
+		 * We hit the keepalive timeout without saying anything. If we
+		 * have pending data we flush it out to the caller now.
+		 * Otherwise, we send an empty message on the data sideband
+		 * just to let the other side know we're still working on it,
+		 * but don't have any data yet.
 		 *
 		 * If we don't have a sideband channel, there's no room in the
 		 * protocol to say anything, so those clients are just out of
 		 * luck.
 		 */
 		if (!ret && pack_data->use_sideband) {
-			static const char buf[] = "0005\1";
-			write_or_die(1, buf, 5);
+			if (output_state->packfile_started && output_state->used > 1) {
+				send_client_data(1, output_state->buffer, output_state->used - 1,
+						 pack_data->use_sideband);
+				output_state->buffer[0] = output_state->buffer[output_state->used - 1];
+				output_state->used = 1;
+			} else {
+				static const char buf[] = "0005\1";
+				write_or_die(1, buf, 5);
+			}
+
 			last_sent_ms = now_ms;
 		}
 	}

From fb1af225033a21e91e12860a85d53c28d2d4c62c Mon Sep 17 00:00:00 2001
From: Patrick Steinhardt <ps@pks.im>
Date: Tue, 10 Mar 2026 14:25:00 +0100
Subject: [PATCH 04/10] upload-pack: reduce lock contention when writing
 packfile data
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In our production systems we have recently observed write contention in
git-upload-pack(1). The system in question was consistently streaming
packfiles at a rate of dozens of gigabits per second, but curiously the
system was neither bottlenecked on CPU, memory or IOPS.

We eventually discovered that Git was spending 80% of its time in
`pipe_write()`, out of which almost all of the time was spent in the
`ep_poll_callback` function in the kernel. Quoting the reporter:

  This infrastructure is part of an event notification queue designed to
  allow for multiple producers to emit events, but that concurrency
  safety is guarded by 3 layers of locking. The layer we're hitting
  contention in uses a simple reader/writer lock mode (a.k.a. shared
  versus exclusive mode), where producers need shared-mode (read mode),
  and various other actions use exclusive (write) mode.

The system in question generates workloads where we have hundreds of
git-upload-pack(1) processes active at the same point in time. These
processes end up contending around those locks, and the consequence is
that the Git processes stall.

Now git-upload-pack(1) already has the infrastructure in place to buffer
some of the data it reads from git-pack-objects(1) before actually
sending it out. We only use this infrastructure in very limited ways
though, so we generally end up matching one read(3p) call with one
write(3p) call. Even worse, when the sideband is enabled we end up
matching one read with _two_ writes: one for the pkt-line length, and
one for the packfile data.

Extend our use of the buffering infrastructure so that we soak up bytes
until the buffer is filled up to at least 2/3rds of its capacity. The
change is relatively simple to implement as we already know to flush the
buffer in `create_pack_file()` after git-pack-objects(1) has finished.

This significantly reduces the number of write(3p) syscalls we need to
do. Before this change, cloning the Linux repository resulted in around
400,000 write(3p) syscalls. With the buffering in place we only do
around 130,000 syscalls.

Now we could of course go even further and make sure that we always fill
up the whole buffer. But this might cause an increase in read(3p)
syscalls, and some tests show that this only reduces the number of
write(3p) syscalls from 130,000 to 100,000. So overall this doesn't seem
worth it.

Note that the issue could also be fixed by adapting the write buffer
that we use in the downstream git-pack-objects(1) command, and such a
change would have roughly the same result. But the command that
generates the packfile data may not always be git-pack-objects(1) as it
can be changed via "uploadpack.packObjectsHook", so such a fix would
only help in _some_ cases. Regardless of that, we'll also adapt the
write buffer size of git-pack-objects(1) in a subsequent commit.

Helped-by: Matt Smiley <msmiley@gitlab.com>
Signed-off-by: Patrick Steinhardt <ps@pks.im>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 upload-pack.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/upload-pack.c b/upload-pack.c
index ef8f8189c1..92e1ff3ba2 100644
--- a/upload-pack.c
+++ b/upload-pack.c
@@ -276,6 +276,13 @@ static int relay_pack_data(int pack_objects_out, struct output_state *os,
 		}
 	}
 
+	/*
+	 * Make sure that we buffer some data before sending it to the client.
+	 * This significantly reduces the number of write(3p) syscalls.
+	 */
+	if (readsz && os->used < (sizeof(os->buffer) * 2 / 3))
+		return readsz;
+
 	if (os->used > 1) {
 		send_client_data(1, os->buffer, os->used - 1, use_sideband);
 		os->buffer[0] = os->buffer[os->used - 1];

From 104a6e9788373aebd5e3d42b8d42f557d6de91a3 Mon Sep 17 00:00:00 2001
From: Patrick Steinhardt <ps@pks.im>
Date: Tue, 10 Mar 2026 14:25:01 +0100
Subject: [PATCH 05/10] compat/posix: introduce writev(3p) wrapper

In a subsequent commit we're going to add the first caller to
writev(3p). Introduce a compatibility wrapper for this syscall that we
can use on systems that don't have this syscall.

The syscall exists on modern Unixes like Linux and macOS, and seemingly
even for NonStop according to [1]. It doesn't seem to exist on Windows
though, as noted in [2].

[1]: http://nonstoptools.com/manuals/OSS-SystemCalls.pdf
[2]: https://www.gnu.org/software/gnulib/manual/html_node/writev.html

Signed-off-by: Patrick Steinhardt <ps@pks.im>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 Makefile         |  4 ++++
 compat/posix.h   | 14 ++++++++++++++
 compat/writev.c  | 44 ++++++++++++++++++++++++++++++++++++++++++++
 config.mak.uname |  2 ++
 meson.build      |  1 +
 5 files changed, 65 insertions(+)
 create mode 100644 compat/writev.c

diff --git a/Makefile b/Makefile
index 8aa489f3b6..61c7dff942 100644
--- a/Makefile
+++ b/Makefile
@@ -2015,6 +2015,10 @@ ifdef NO_PREAD
 	COMPAT_CFLAGS += -DNO_PREAD
 	COMPAT_OBJS += compat/pread.o
 endif
+ifdef NO_WRITEV
+	COMPAT_CFLAGS += -DNO_WRITEV
+	COMPAT_OBJS += compat/writev.o
+endif
 ifdef NO_FAST_WORKING_DIRECTORY
 	BASIC_CFLAGS += -DNO_FAST_WORKING_DIRECTORY
 endif
diff --git a/compat/posix.h b/compat/posix.h
index 245386fa4a..3c611d2736 100644
--- a/compat/posix.h
+++ b/compat/posix.h
@@ -137,6 +137,9 @@
 #include <sys/socket.h>
 #include <sys/ioctl.h>
 #include <sys/statvfs.h>
+#ifndef NO_WRITEV
+#include <sys/uio.h>
+#endif
 #include <termios.h>
 #ifndef NO_SYS_SELECT_H
 #include <sys/select.h>
@@ -323,6 +326,17 @@ int git_lstat(const char *, struct stat *);
 ssize_t git_pread(int fd, void *buf, size_t count, off_t offset);
 #endif
 
+#ifdef NO_WRITEV
+#define writev git_writev
+#define iovec git_iovec
+struct git_iovec {
+	void *iov_base;
+	size_t iov_len;
+};
+
+ssize_t git_writev(int fd, const struct iovec *iov, int iovcnt);
+#endif
+
 #ifdef NO_SETENV
 #define setenv gitsetenv
 int gitsetenv(const char *, const char *, int);
diff --git a/compat/writev.c b/compat/writev.c
new file mode 100644
index 0000000000..3a94870a2f
--- /dev/null
+++ b/compat/writev.c
@@ -0,0 +1,44 @@
+#include "../git-compat-util.h"
+#include "../wrapper.h"
+
+ssize_t git_writev(int fd, const struct iovec *iov, int iovcnt)
+{
+	size_t total_written = 0;
+	size_t sum = 0;
+
+	/*
+	 * According to writev(3p), the syscall shall error with EINVAL in case
+	 * the sum of `iov_len` overflows `ssize_t`.
+	 */
+	 for (int i = 0; i < iovcnt; i++) {
+		if (iov[i].iov_len > maximum_signed_value_of_type(ssize_t) ||
+		    iov[i].iov_len + sum > maximum_signed_value_of_type(ssize_t)) {
+			errno = EINVAL;
+			return -1;
+		}
+
+		sum += iov[i].iov_len;
+	}
+
+	for (int i = 0; i < iovcnt; i++) {
+		const char *bytes = iov[i].iov_base;
+		size_t iovec_written = 0;
+
+		while (iovec_written < iov[i].iov_len) {
+			ssize_t bytes_written = xwrite(fd, bytes + iovec_written,
+						       iov[i].iov_len - iovec_written);
+			if (bytes_written < 0) {
+				if (total_written)
+					goto out;
+				return bytes_written;
+			}
+			if (!bytes_written)
+				goto out;
+			iovec_written += bytes_written;
+			total_written += bytes_written;
+		}
+	}
+
+out:
+	return (ssize_t) total_written;
+}
diff --git a/config.mak.uname b/config.mak.uname
index 3c35ae33a3..8e66a1cc0a 100644
--- a/config.mak.uname
+++ b/config.mak.uname
@@ -457,6 +457,7 @@ ifeq ($(uname_S),Windows)
 	SANE_TOOL_PATH ?= $(msvc_bin_dir_msys)
 	HAVE_ALLOCA_H = YesPlease
 	NO_PREAD = YesPlease
+	NO_WRITEV = YesPlease
 	NEEDS_CRYPTO_WITH_SSL = YesPlease
 	NO_LIBGEN_H = YesPlease
 	NO_POLL = YesPlease
@@ -672,6 +673,7 @@ ifeq ($(uname_S),MINGW)
 	pathsep = ;
 	HAVE_ALLOCA_H = YesPlease
 	NO_PREAD = YesPlease
+	NO_WRITEV = YesPlease
 	NEEDS_CRYPTO_WITH_SSL = YesPlease
 	NO_LIBGEN_H = YesPlease
 	NO_POLL = YesPlease
diff --git a/meson.build b/meson.build
index dd52efd1c8..f1cd89efc7 100644
--- a/meson.build
+++ b/meson.build
@@ -1405,6 +1405,7 @@ checkfuncs = {
   'initgroups' : [],
   'strtoumax' : ['strtoumax.c', 'strtoimax.c'],
   'pread' : ['pread.c'],
+  'writev' : ['writev.c'],
 }
 
 if host_machine.system() == 'windows'

From 6c8a62bc17cc4ae119ec6829225e897b873b59d6 Mon Sep 17 00:00:00 2001
From: Patrick Steinhardt <ps@pks.im>
Date: Tue, 10 Mar 2026 14:25:02 +0100
Subject: [PATCH 06/10] wrapper: introduce writev(3p) wrappers

In the preceding commit we have added a compatibility wrapper for the
writev(3p) syscall. Introduce some generic wrappers for this function
that we nowadays take for granted in the Git codebase.

Signed-off-by: Patrick Steinhardt <ps@pks.im>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 wrapper.c      | 41 +++++++++++++++++++++++++++++++++++++++++
 wrapper.h      |  9 +++++++++
 write-or-die.c |  8 ++++++++
 write-or-die.h |  1 +
 4 files changed, 59 insertions(+)

diff --git a/wrapper.c b/wrapper.c
index b794fb20e7..fa40f399b3 100644
--- a/wrapper.c
+++ b/wrapper.c
@@ -323,6 +323,47 @@ ssize_t write_in_full(int fd, const void *buf, size_t count)
 	return total;
 }
 
+ssize_t writev_in_full(int fd, struct iovec *iov, int iovcnt)
+{
+	ssize_t total_written = 0;
+
+	while (iovcnt) {
+		ssize_t bytes_written = writev(fd, iov, iovcnt);
+		if (bytes_written < 0) {
+			if (errno == EINTR || errno == EAGAIN)
+				continue;
+			return -1;
+		}
+		if (!bytes_written) {
+			errno = ENOSPC;
+			return -1;
+		}
+
+		total_written += bytes_written;
+
+		/*
+		 * We first need to discard any iovec entities that have been
+		 * fully written.
+		 */
+		while (iovcnt && (size_t)bytes_written >= iov->iov_len) {
+			bytes_written -= iov->iov_len;
+			iov++;
+			iovcnt--;
+		}
+
+		/*
+		 * Finally, we need to adjust the last iovec in case we have
+		 * performed a partial write.
+		 */
+		if (iovcnt && bytes_written) {
+			iov->iov_base = (char *) iov->iov_base + bytes_written;
+			iov->iov_len -= bytes_written;
+		}
+	}
+
+	return total_written;
+}
+
 ssize_t pread_in_full(int fd, void *buf, size_t count, off_t offset)
 {
 	char *p = buf;
diff --git a/wrapper.h b/wrapper.h
index 15ac3bab6e..27519b32d1 100644
--- a/wrapper.h
+++ b/wrapper.h
@@ -47,6 +47,15 @@ ssize_t read_in_full(int fd, void *buf, size_t count);
 ssize_t write_in_full(int fd, const void *buf, size_t count);
 ssize_t pread_in_full(int fd, void *buf, size_t count, off_t offset);
 
+/*
+ * Try to write all iovecs. Returns -1 in case an error occurred with a proper
+ * errno set, the number of bytes written otherwise.
+ *
+ * Note that the iovec will be modified as a result of this call to adjust for
+ * partial writes!
+ */
+ssize_t writev_in_full(int fd, struct iovec *iov, int iovcnt);
+
 static inline ssize_t write_str_in_full(int fd, const char *str)
 {
 	return write_in_full(fd, str, strlen(str));
diff --git a/write-or-die.c b/write-or-die.c
index 01a9a51fa2..5f522fb728 100644
--- a/write-or-die.c
+++ b/write-or-die.c
@@ -96,6 +96,14 @@ void write_or_die(int fd, const void *buf, size_t count)
 	}
 }
 
+void writev_or_die(int fd, struct iovec *iov, int iovlen)
+{
+	if (writev_in_full(fd, iov, iovlen) < 0) {
+		check_pipe(errno);
+		die_errno("writev error");
+	}
+}
+
 void fwrite_or_die(FILE *f, const void *buf, size_t count)
 {
 	if (fwrite(buf, 1, count, f) != count)
diff --git a/write-or-die.h b/write-or-die.h
index 65a5c42a47..ae3d7d88b8 100644
--- a/write-or-die.h
+++ b/write-or-die.h
@@ -7,6 +7,7 @@ void fprintf_or_die(FILE *, const char *fmt, ...);
 void fwrite_or_die(FILE *f, const void *buf, size_t count);
 void fflush_or_die(FILE *f);
 void write_or_die(int fd, const void *buf, size_t count);
+void writev_or_die(int fd, struct iovec *iov, int iovlen);
 
 /*
  * These values are used to help identify parts of a repository to fsync.

From 6b9c9a191232716f51dafbbdfba9d164ed9e1397 Mon Sep 17 00:00:00 2001
From: Patrick Steinhardt <ps@pks.im>
Date: Tue, 10 Mar 2026 14:25:03 +0100
Subject: [PATCH 07/10] sideband: use writev(3p) to send pktlines

Every pktline that we send out via `send_sideband()` currently requires
two syscalls: one to write the pktline's length, and one to send its
data. This typically isn't all that much of a problem, but under extreme
load the syscalls may cause contention in the kernel.

Refactor the code to instead use the newly introduced writev(3p) infra
so that we can send out the data with a single syscall. This reduces the
number of syscalls from around 133,000 calls to write(3p) to around
67,000 calls to writev(3p).

Suggested-by: Jeff King <peff@peff.net>
Signed-off-by: Patrick Steinhardt <ps@pks.im>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 sideband.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/sideband.c b/sideband.c
index ea7c25211e..1ed6614eaf 100644
--- a/sideband.c
+++ b/sideband.c
@@ -264,6 +264,7 @@ void send_sideband(int fd, int band, const char *data, ssize_t sz, int packet_ma
 	const char *p = data;
 
 	while (sz) {
+		struct iovec iov[2];
 		unsigned n;
 		char hdr[5];
 
@@ -273,12 +274,19 @@ void send_sideband(int fd, int band, const char *data, ssize_t sz, int packet_ma
 		if (0 <= band) {
 			xsnprintf(hdr, sizeof(hdr), "%04x", n + 5);
 			hdr[4] = band;
-			write_or_die(fd, hdr, 5);
+			iov[0].iov_base = hdr;
+			iov[0].iov_len = 5;
 		} else {
 			xsnprintf(hdr, sizeof(hdr), "%04x", n + 4);
-			write_or_die(fd, hdr, 4);
+			iov[0].iov_base = hdr;
+			iov[0].iov_len = 4;
 		}
-		write_or_die(fd, p, n);
+
+		iov[1].iov_base = (void *) p;
+		iov[1].iov_len = n;
+
+		writev_or_die(fd, iov, ARRAY_SIZE(iov));
+
 		p += n;
 		sz -= n;
 	}

From 25d41000f541dc6c5f3e14dc03e2e825f023f467 Mon Sep 17 00:00:00 2001
From: Patrick Steinhardt <ps@pks.im>
Date: Tue, 10 Mar 2026 14:25:04 +0100
Subject: [PATCH 08/10] csum-file: introduce `hashfd_ext()`

Introduce a new `hashfd_ext()` function that takes an options structure.
This function will replace `hashfd_throughput()` in the next commit.

Signed-off-by: Patrick Steinhardt <ps@pks.im>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 csum-file.c | 22 +++++++++++++---------
 csum-file.h | 14 ++++++++++++++
 2 files changed, 27 insertions(+), 9 deletions(-)

diff --git a/csum-file.c b/csum-file.c
index 6e21e3cac8..a50416247e 100644
--- a/csum-file.c
+++ b/csum-file.c
@@ -161,17 +161,16 @@ struct hashfile *hashfd_check(const struct git_hash_algo *algop,
 	return f;
 }
 
-static struct hashfile *hashfd_internal(const struct git_hash_algo *algop,
-					int fd, const char *name,
-					struct progress *tp,
-					size_t buffer_len)
+struct hashfile *hashfd_ext(const struct git_hash_algo *algop,
+			    int fd, const char *name,
+			    const struct hashfd_options *opts)
 {
 	struct hashfile *f = xmalloc(sizeof(*f));
 	f->fd = fd;
 	f->check_fd = -1;
 	f->offset = 0;
 	f->total = 0;
-	f->tp = tp;
+	f->tp = opts->progress;
 	f->name = name;
 	f->do_crc = 0;
 	f->skip_hash = 0;
@@ -179,8 +178,8 @@ static struct hashfile *hashfd_internal(const struct git_hash_algo *algop,
 	f->algop = unsafe_hash_algo(algop);
 	f->algop->init_fn(&f->ctx);
 
-	f->buffer_len = buffer_len;
-	f->buffer = xmalloc(buffer_len);
+	f->buffer_len = opts->buffer_len ? opts->buffer_len : 128 * 1024;
+	f->buffer = xmalloc(f->buffer_len);
 	f->check_buffer = NULL;
 
 	return f;
@@ -194,7 +193,8 @@ struct hashfile *hashfd(const struct git_hash_algo *algop,
 	 * measure the rate of data passing through this hashfile,
 	 * use a larger buffer size to reduce fsync() calls.
 	 */
-	return hashfd_internal(algop, fd, name, NULL, 128 * 1024);
+	struct hashfd_options opts = { 0 };
+	return hashfd_ext(algop, fd, name, &opts);
 }
 
 struct hashfile *hashfd_throughput(const struct git_hash_algo *algop,
@@ -206,7 +206,11 @@ struct hashfile *hashfd_throughput(const struct git_hash_algo *algop,
 	 * size so the progress indicators arrive at a more
 	 * frequent rate.
 	 */
-	return hashfd_internal(algop, fd, name, tp, 8 * 1024);
+	struct hashfd_options opts = {
+		.progress = tp,
+		.buffer_len = 8 * 1024,
+	};
+	return hashfd_ext(algop, fd, name, &opts);
 }
 
 void hashfile_checkpoint_init(struct hashfile *f,
diff --git a/csum-file.h b/csum-file.h
index 07ae11024a..a03b60120d 100644
--- a/csum-file.h
+++ b/csum-file.h
@@ -45,6 +45,20 @@ int hashfile_truncate(struct hashfile *, struct hashfile_checkpoint *);
 #define CSUM_FSYNC		2
 #define CSUM_HASH_IN_STREAM	4
 
+struct hashfd_options {
+	/*
+	 * Throughput progress that counts the number of bytes that have been
+	 * hashed.
+	 */
+	struct progress *progress;
+
+	/* The length of the buffer that shall be used read read data. */
+	size_t buffer_len;
+};
+
+struct hashfile *hashfd_ext(const struct git_hash_algo *algop,
+			    int fd, const char *name,
+			    const struct hashfd_options *opts);
 struct hashfile *hashfd(const struct git_hash_algo *algop,
 			int fd, const char *name);
 struct hashfile *hashfd_check(const struct git_hash_algo *algop,

From 1d58d66c0e8d815e1638e567f3f59424dab91728 Mon Sep 17 00:00:00 2001
From: Patrick Steinhardt <ps@pks.im>
Date: Tue, 10 Mar 2026 14:25:05 +0100
Subject: [PATCH 09/10] csum-file: drop `hashfd_throughput()`

The `hashfd_throughput()` function is used by a single callsite in
git-pack-objects(1). In contrast to `hashfd()`, this function uses a
progress meter to measure throughput and a smaller buffer length so that
the progress meter can provide more granular metrics.

We're going to change that caller in the next commit to be a bit more
specific to packing objects. As such, `hashfd_throughput()` will be a
somewhat unfitting mechanism for any potential new callers.

Drop the function and replace it with a call to `hashfd_ext()`.

Signed-off-by: Patrick Steinhardt <ps@pks.im>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 builtin/pack-objects.c | 19 +++++++++++++++----
 csum-file.c            | 16 ----------------
 csum-file.h            |  2 --
 3 files changed, 15 insertions(+), 22 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 5846b6a293..ba150a80ad 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -1331,11 +1331,22 @@ static void write_pack_file(void)
 		unsigned char hash[GIT_MAX_RAWSZ];
 		char *pack_tmp_name = NULL;
 
-		if (pack_to_stdout)
-			f = hashfd_throughput(the_repository->hash_algo, 1,
-					      "<stdout>", progress_state);
-		else
+		if (pack_to_stdout) {
+			/*
+			 * Since we are expecting to report progress of the
+			 * write into this hashfile, use a smaller buffer
+			 * size so the progress indicators arrive at a more
+			 * frequent rate.
+			 */
+			struct hashfd_options opts = {
+				.progress = progress_state,
+				.buffer_len = 8 * 1024,
+			};
+			f = hashfd_ext(the_repository->hash_algo, 1,
+				       "<stdout>", &opts);
+		} else {
 			f = create_tmp_packfile(the_repository, &pack_tmp_name);
+		}
 
 		offset = write_pack_header(f, nr_remaining);
 
diff --git a/csum-file.c b/csum-file.c
index a50416247e..5dfaca5543 100644
--- a/csum-file.c
+++ b/csum-file.c
@@ -197,22 +197,6 @@ struct hashfile *hashfd(const struct git_hash_algo *algop,
 	return hashfd_ext(algop, fd, name, &opts);
 }
 
-struct hashfile *hashfd_throughput(const struct git_hash_algo *algop,
-				   int fd, const char *name, struct progress *tp)
-{
-	/*
-	 * Since we are expecting to report progress of the
-	 * write into this hashfile, use a smaller buffer
-	 * size so the progress indicators arrive at a more
-	 * frequent rate.
-	 */
-	struct hashfd_options opts = {
-		.progress = tp,
-		.buffer_len = 8 * 1024,
-	};
-	return hashfd_ext(algop, fd, name, &opts);
-}
-
 void hashfile_checkpoint_init(struct hashfile *f,
 			      struct hashfile_checkpoint *checkpoint)
 {
diff --git a/csum-file.h b/csum-file.h
index a03b60120d..01472555c8 100644
--- a/csum-file.h
+++ b/csum-file.h
@@ -63,8 +63,6 @@ struct hashfile *hashfd(const struct git_hash_algo *algop,
 			int fd, const char *name);
 struct hashfile *hashfd_check(const struct git_hash_algo *algop,
 			      const char *name);
-struct hashfile *hashfd_throughput(const struct git_hash_algo *algop,
-				   int fd, const char *name, struct progress *tp);
 
 /*
  * Free the hashfile without flushing its contents to disk. This only

From 6938f7b7f8cfe74a0e8af090717502068217e930 Mon Sep 17 00:00:00 2001
From: Patrick Steinhardt <ps@pks.im>
Date: Tue, 10 Mar 2026 14:25:06 +0100
Subject: [PATCH 10/10] builtin/pack-objects: reduce lock contention when
 writing packfile data

When running `git pack-objects --stdout` we feed the data through
`hashfd_ext()` with a progress meter and a smaller-than-usual buffer
length of 8kB so that we can track throughput more granularly. But as
packfiles tend to be on the larger side, this small buffer size may
cause a ton of write(3p) syscalls.

Originally, the buffer we used in `hashfd()` was 8kB for all use cases.
This was changed though in 2ca245f8be (csum-file.h: increase hashfile
buffer size, 2021-05-18) because we noticed that the number of writes
can have an impact on performance. So the buffer size was increased to
128kB, which improved performance a bit for some use cases.

But the commit didn't touch the buffer size for `hashfd_throughput()`.
The reasoning here was that callers expect the progress indicator to
update frequently, and a larger buffer size would of course reduce the
update frequency especially on slow networks.

While that is of course true, there was (and still is, even though it's
now a call to `hashfd_ext()`) only a single caller of this function in
git-pack-objects(1). This command is responsible for writing packfiles,
and those packfiles are often on the bigger side. So arguably:

  - The user won't care about increments of 8kB when packfiles tend to
    be megabytes or even gigabytes in size.

  - Reducing the number of syscalls would be even more valuable here
    than it would be for multi-pack indices, which was the benchmark
    done in the mentioned commit, as MIDXs are typically significantly
    smaller than packfiles.

  - Nowadays, many internet connections should be able to transfer data
    at a rate significantly higher than 8kB per second.

Update the buffer to instead have a size of `LARGE_PACKET_DATA_MAX - 1`,
which translates to ~64kB. This limit was chosen because `git
pack-objects --stdout` is most often used when sending packfiles via
git-upload-pack(1), where packfile data is chunked into pktlines when
using the sideband. Furthermore, most internet connections should have a
bandwidth significantly higher than 64kB/s, so we'd still be able to
observe progress updates at a rate of at least once per second.

This change significantly reduces the number of write(3p) syscalls from
355,000 to 44,000 when packing the Linux repository. While this results
in a small performance improvement on an otherwise-unused system, this
improvement is mostly negligible. More importantly though, it will
reduce lock contention in the kernel on an extremely busy system where
we have many processes writing data at once.

Suggested-by: Jeff King <peff@peff.net>
Signed-off-by: Patrick Steinhardt <ps@pks.im>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 builtin/pack-objects.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index ba150a80ad..7301ed8c68 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -41,6 +41,7 @@
 #include "promisor-remote.h"
 #include "pack-mtimes.h"
 #include "parse-options.h"
+#include "pkt-line.h"
 #include "blob.h"
 #include "tree.h"
 #include "path-walk.h"
@@ -1333,14 +1334,17 @@ static void write_pack_file(void)
 
 		if (pack_to_stdout) {
 			/*
-			 * Since we are expecting to report progress of the
-			 * write into this hashfile, use a smaller buffer
-			 * size so the progress indicators arrive at a more
-			 * frequent rate.
+			 * This command is most often invoked via
+			 * git-upload-pack(1), which will typically chunk data
+			 * into pktlines. As such, we use the maximum data
+			 * length of them as buffer length.
+			 *
+			 * Note that we need to subtract one though to
+			 * accommodate for the sideband byte.
 			 */
 			struct hashfd_options opts = {
 				.progress = progress_state,
-				.buffer_len = 8 * 1024,
+				.buffer_len = LARGE_PACKET_DATA_MAX - 1,
 			};
 			f = hashfd_ext(the_repository->hash_algo, 1,
 				       "<stdout>", &opts);