From 650134a47894244b3804b5d175439b362453fa4a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Scharfe?= Date: Wed, 15 Jun 2022 18:58:38 +0200 Subject: [PATCH 1/6] archive: update format documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mention all formats in the --format section, use backtick quoting for literal values throughout, clarify the description of the configuration option. Helped-by: Junio C Hamano Signed-off-by: René Scharfe Signed-off-by: Junio C Hamano --- Documentation/git-archive.txt | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/Documentation/git-archive.txt b/Documentation/git-archive.txt index 56989a2f34..ff3f7b0344 100644 --- a/Documentation/git-archive.txt +++ b/Documentation/git-archive.txt @@ -34,10 +34,12 @@ OPTIONS ------- --format=:: - Format of the resulting archive: 'tar' or 'zip'. If this option + Format of the resulting archive. Possible values are `tar`, + `zip`, `tar.gz`, `tgz`, and any format defined using the + configuration option `tar..command`. If `--format` is not given, and the output file is specified, the format is - inferred from the filename if possible (e.g. writing to "foo.zip" - makes the output to be in the zip format). Otherwise the output + inferred from the filename if possible (e.g. writing to `foo.zip` + makes the output to be in the `zip` format). Otherwise the output format is `tar`. -l:: @@ -143,17 +145,15 @@ tar..command:: is executed using the shell with the generated tar file on its standard input, and should produce the final output on its standard output. Any compression-level options will be passed - to the command (e.g., "-9"). An output file with the same - extension as `` will be use this format if no other - format is given. + to the command (e.g., `-9`). + -The "tar.gz" and "tgz" formats are defined automatically and default to -`gzip -cn`. You may override them with custom commands. +The `tar.gz` and `tgz` formats are defined automatically and use the +command `gzip -cn` by default. tar..remote:: - If true, enable `` for use by remote clients via + If true, enable the format for use by remote clients via linkgit:git-upload-archive[1]. Defaults to false for - user-defined formats, but true for the "tar.gz" and "tgz" + user-defined formats, but true for the `tar.gz` and `tgz` formats. [[ATTRIBUTES]] From 96b9e5151bf0cad110d2f0bf16f413b9fc9f606c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Scharfe?= Date: Wed, 15 Jun 2022 18:59:57 +0200 Subject: [PATCH 2/6] archive: rename archiver data field to filter_command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The void pointer "data" in struct archiver is only used to store filter commands to pass tar archives to, like gzip. Rename it accordingly and also turn it into a char pointer to document the fact that it's a string reference. Signed-off-by: René Scharfe Signed-off-by: Junio C Hamano --- archive-tar.c | 10 +++++----- archive.h | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/archive-tar.c b/archive-tar.c index 042feb66d2..2717e34a1d 100644 --- a/archive-tar.c +++ b/archive-tar.c @@ -383,8 +383,8 @@ static int tar_filter_config(const char *var, const char *value, void *data) if (!strcmp(type, "command")) { if (!value) return config_error_nonbool(var); - free(ar->data); - ar->data = xstrdup(value); + free(ar->filter_command); + ar->filter_command = xstrdup(value); return 0; } if (!strcmp(type, "remote")) { @@ -432,10 +432,10 @@ static int write_tar_filter_archive(const struct archiver *ar, struct child_process filter = CHILD_PROCESS_INIT; int r; - if (!ar->data) + if (!ar->filter_command) BUG("tar-filter archiver called with no filter defined"); - strbuf_addstr(&cmd, ar->data); + strbuf_addstr(&cmd, ar->filter_command); if (args->compression_level >= 0) strbuf_addf(&cmd, " -%d", args->compression_level); @@ -478,7 +478,7 @@ void init_tar_archiver(void) git_config(git_tar_config, NULL); for (i = 0; i < nr_tar_filters; i++) { /* omit any filters that never had a command configured */ - if (tar_filters[i]->data) + if (tar_filters[i]->filter_command) register_archiver(tar_filters[i]); } } diff --git a/archive.h b/archive.h index 49fab71aaf..08bed3ed3a 100644 --- a/archive.h +++ b/archive.h @@ -43,7 +43,7 @@ struct archiver { const char *name; int (*write_archive)(const struct archiver *, struct archiver_args *); unsigned flags; - void *data; + char *filter_command; }; void register_archiver(struct archiver *); From dfce1186c6034d6f4ea283f5178fd25cbd8f4fc0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Scharfe?= Date: Wed, 15 Jun 2022 19:01:14 +0200 Subject: [PATCH 3/6] archive-tar: factor out write_block() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All tar archive writes have the same size and are done to the same file descriptor. Move them to a common function, write_block(), to reduce code duplication and make it easy to change the destination. Original-patch-by: Rohit Ashiwal Signed-off-by: René Scharfe Signed-off-by: Junio C Hamano --- archive-tar.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/archive-tar.c b/archive-tar.c index 2717e34a1d..4e6a3deb80 100644 --- a/archive-tar.c +++ b/archive-tar.c @@ -38,11 +38,16 @@ static int write_tar_filter_archive(const struct archiver *ar, #define USTAR_MAX_MTIME 077777777777ULL #endif +static void write_block(const void *buf) +{ + write_or_die(1, buf, BLOCKSIZE); +} + /* writes out the whole block, but only if it is full */ static void write_if_needed(void) { if (offset == BLOCKSIZE) { - write_or_die(1, block, BLOCKSIZE); + write_block(block); offset = 0; } } @@ -66,7 +71,7 @@ static void do_write_blocked(const void *data, unsigned long size) write_if_needed(); } while (size >= BLOCKSIZE) { - write_or_die(1, buf, BLOCKSIZE); + write_block(buf); size -= BLOCKSIZE; buf += BLOCKSIZE; } @@ -101,10 +106,10 @@ static void write_trailer(void) { int tail = BLOCKSIZE - offset; memset(block + offset, 0, tail); - write_or_die(1, block, BLOCKSIZE); + write_block(block); if (tail < 2 * RECORDSIZE) { memset(block, 0, offset); - write_or_die(1, block, BLOCKSIZE); + write_block(block); } } From 76d7602631a9d0cb67cc1b848d580b862dc5de8b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Scharfe?= Date: Wed, 15 Jun 2022 19:02:33 +0200 Subject: [PATCH 4/6] archive-tar: add internal gzip implementation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Git uses zlib for its own object store, but calls gzip when creating tgz archives. Add an option to perform the gzip compression for the latter using zlib, without depending on the external gzip binary. Plug it in by making write_block a function pointer and switching to a compressing variant if the filter command has the magic value "git archive gzip". Does that indirection slow down tar creation? Not really, at least not in this test: $ hyperfine -w3 -L rev HEAD,origin/main -p 'git checkout {rev} && make' \ './git -C ../linux archive --format=tar HEAD # {rev}' Benchmark #1: ./git -C ../linux archive --format=tar HEAD # HEAD Time (mean ± σ): 4.044 s ± 0.007 s [User: 3.901 s, System: 0.137 s] Range (min … max): 4.038 s … 4.059 s 10 runs Benchmark #2: ./git -C ../linux archive --format=tar HEAD # origin/main Time (mean ± σ): 4.047 s ± 0.009 s [User: 3.903 s, System: 0.138 s] Range (min … max): 4.038 s … 4.066 s 10 runs How does tgz creation perform? $ hyperfine -w3 -L command 'gzip -cn','git archive gzip' \ './git -c tar.tgz.command="{command}" -C ../linux archive --format=tgz HEAD' Benchmark #1: ./git -c tar.tgz.command="gzip -cn" -C ../linux archive --format=tgz HEAD Time (mean ± σ): 20.404 s ± 0.006 s [User: 23.943 s, System: 0.401 s] Range (min … max): 20.395 s … 20.414 s 10 runs Benchmark #2: ./git -c tar.tgz.command="git archive gzip" -C ../linux archive --format=tgz HEAD Time (mean ± σ): 23.807 s ± 0.023 s [User: 23.655 s, System: 0.145 s] Range (min … max): 23.782 s … 23.857 s 10 runs Summary './git -c tar.tgz.command="gzip -cn" -C ../linux archive --format=tgz HEAD' ran 1.17 ± 0.00 times faster than './git -c tar.tgz.command="git archive gzip" -C ../linux archive --format=tgz HEAD' So the internal implementation takes 17% longer on the Linux repo, but uses 2% less CPU time. That's because the external gzip can run in parallel on its own processor, while the internal one works sequentially and avoids the inter-process communication overhead. What are the benefits? Only an internal sequential implementation can offer this eco mode, and it allows avoiding the gzip(1) requirement. This implementation uses the helper functions from our zlib.c instead of the convenient gz* functions from zlib, because the latter doesn't give the control over the generated gzip header that the next patch requires. Original-patch-by: Rohit Ashiwal Signed-off-by: René Scharfe Signed-off-by: Junio C Hamano --- Documentation/git-archive.txt | 3 ++- archive-tar.c | 45 ++++++++++++++++++++++++++++++++++- t/t5000-tar-tree.sh | 16 +++++++++++++ 3 files changed, 62 insertions(+), 2 deletions(-) diff --git a/Documentation/git-archive.txt b/Documentation/git-archive.txt index ff3f7b0344..b2d1b63d31 100644 --- a/Documentation/git-archive.txt +++ b/Documentation/git-archive.txt @@ -148,7 +148,8 @@ tar..command:: to the command (e.g., `-9`). + The `tar.gz` and `tgz` formats are defined automatically and use the -command `gzip -cn` by default. +command `gzip -cn` by default. An internal gzip implementation can be +used by specifying the value `git archive gzip`. tar..remote:: If true, enable the format for use by remote clients via diff --git a/archive-tar.c b/archive-tar.c index 4e6a3deb80..53d0ef685c 100644 --- a/archive-tar.c +++ b/archive-tar.c @@ -38,11 +38,13 @@ static int write_tar_filter_archive(const struct archiver *ar, #define USTAR_MAX_MTIME 077777777777ULL #endif -static void write_block(const void *buf) +static void tar_write_block(const void *buf) { write_or_die(1, buf, BLOCKSIZE); } +static void (*write_block)(const void *) = tar_write_block; + /* writes out the whole block, but only if it is full */ static void write_if_needed(void) { @@ -430,6 +432,34 @@ static int write_tar_archive(const struct archiver *ar, return err; } +static git_zstream gzstream; +static unsigned char outbuf[16384]; + +static void tgz_deflate(int flush) +{ + while (gzstream.avail_in || flush == Z_FINISH) { + int status = git_deflate(&gzstream, flush); + if (!gzstream.avail_out || status == Z_STREAM_END) { + write_or_die(1, outbuf, gzstream.next_out - outbuf); + gzstream.next_out = outbuf; + gzstream.avail_out = sizeof(outbuf); + if (status == Z_STREAM_END) + break; + } + if (status != Z_OK && status != Z_BUF_ERROR) + die(_("deflate error (%d)"), status); + } +} + +static void tgz_write_block(const void *data) +{ + gzstream.next_in = (void *)data; + gzstream.avail_in = BLOCKSIZE; + tgz_deflate(Z_NO_FLUSH); +} + +static const char internal_gzip_command[] = "git archive gzip"; + static int write_tar_filter_archive(const struct archiver *ar, struct archiver_args *args) { @@ -440,6 +470,19 @@ static int write_tar_filter_archive(const struct archiver *ar, if (!ar->filter_command) BUG("tar-filter archiver called with no filter defined"); + if (!strcmp(ar->filter_command, internal_gzip_command)) { + write_block = tgz_write_block; + git_deflate_init_gzip(&gzstream, args->compression_level); + gzstream.next_out = outbuf; + gzstream.avail_out = sizeof(outbuf); + + r = write_tar_archive(ar, args); + + tgz_deflate(Z_FINISH); + git_deflate_end(&gzstream); + return r; + } + strbuf_addstr(&cmd, ar->filter_command); if (args->compression_level >= 0) strbuf_addf(&cmd, " -%d", args->compression_level); diff --git a/t/t5000-tar-tree.sh b/t/t5000-tar-tree.sh index 7f8d2ab0a7..9ac0ec67fe 100755 --- a/t/t5000-tar-tree.sh +++ b/t/t5000-tar-tree.sh @@ -374,6 +374,22 @@ test_expect_success GZIP 'remote tar.gz can be disabled' ' >remote.tar.gz ' +test_expect_success 'git archive --format=tgz (internal gzip)' ' + test_config tar.tgz.command "git archive gzip" && + git archive --format=tgz HEAD >internal_gzip.tgz +' + +test_expect_success 'git archive --format=tar.gz (internal gzip)' ' + test_config tar.tar.gz.command "git archive gzip" && + git archive --format=tar.gz HEAD >internal_gzip.tar.gz && + test_cmp_bin internal_gzip.tgz internal_gzip.tar.gz +' + +test_expect_success GZIP 'extract tgz file (internal gzip)' ' + gzip -d -c internal_gzip.tar && + test_cmp_bin b.tar internal_gzip.tar +' + test_expect_success 'archive and :(glob)' ' git archive -v HEAD -- ":(glob)**/sh" >/dev/null 2>actual && cat >expect < Date: Wed, 15 Jun 2022 19:04:09 +0200 Subject: [PATCH 5/6] archive-tar: use OS_CODE 3 (Unix) for internal gzip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit gzip(1) encodes the OS it runs on in the 10th byte of its output. It uses the following OS_CODE values according to its tailor.h [1]: 0 - MS-DOS 3 - UNIX 5 - Atari ST 6 - OS/2 10 - TOPS-20 11 - Windows NT The gzip.exe that comes with Git for Windows uses OS_CODE 3 for some reason, so this value is used on practically all supported platforms when generating tgz archives using gzip(1). Zlib uses a bigger set of values according to its zutil.h [2], aligned with section 4.4.2 of the ZIP specification, APPNOTE.txt [3]: 0 - MS-DOS 1 - Amiga 3 - UNIX 4 - VM/CMS 5 - Atari ST 6 - OS/2 7 - Macintosh 8 - Z-System 10 - Windows NT 11 - MVS (OS/390 - Z/OS) 13 - Acorn Risc 16 - BeOS 18 - OS/400 19 - OS X (Darwin) Thus the internal gzip implementation in archive-tar.c sets different OS_CODE header values on major platforms Windows and macOS. Git for Windows uses its own zlib-based variant since v2.20.1 by default and thus embeds OS_CODE 10 in tgz archives. The tar archive for a commit is generated consistently on all systems (by the same Git version). The OS_CODE in the gzip header does not influence extraction. Avoid leaking OS information and make tgz archives constistent and reproducable (with the same Git and libz versions) by using OS_CODE 3 everywhere. At least on macOS 12.4 this produces the same output as gzip(1) for the examples I tried: # before $ git -c tar.tgz.command='git archive gzip' archive --format=tgz v2.36.0 | shasum 3abbffb40b7c63cf9b7d91afc682f11682f80759 - # with this patch $ git -c tar.tgz.command='git archive gzip' archive --format=tgz v2.36.0 | shasum dc6dc6ba9636d522799085d0d77ab6a110bcc141 - $ git archive --format=tar v2.36.0 | gzip -cn | shasum dc6dc6ba9636d522799085d0d77ab6a110bcc141 - [1] https://git.savannah.gnu.org/cgit/gzip.git/tree/tailor.h [2] https://github.com/madler/zlib/blob/master/zutil.h [3] https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT Helped-by: Johannes Schindelin Signed-off-by: René Scharfe Signed-off-by: Junio C Hamano --- archive-tar.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/archive-tar.c b/archive-tar.c index 53d0ef685c..efba78118b 100644 --- a/archive-tar.c +++ b/archive-tar.c @@ -463,6 +463,9 @@ static const char internal_gzip_command[] = "git archive gzip"; static int write_tar_filter_archive(const struct archiver *ar, struct archiver_args *args) { +#if ZLIB_VERNUM >= 0x1221 + struct gz_header_s gzhead = { .os = 3 }; /* Unix, for reproducibility */ +#endif struct strbuf cmd = STRBUF_INIT; struct child_process filter = CHILD_PROCESS_INIT; int r; @@ -473,6 +476,10 @@ static int write_tar_filter_archive(const struct archiver *ar, if (!strcmp(ar->filter_command, internal_gzip_command)) { write_block = tgz_write_block; git_deflate_init_gzip(&gzstream, args->compression_level); +#if ZLIB_VERNUM >= 0x1221 + if (deflateSetHeader(&gzstream.z, &gzhead) != Z_OK) + BUG("deflateSetHeader() called too late"); +#endif gzstream.next_out = outbuf; gzstream.avail_out = sizeof(outbuf); From 4f4be00d302bc52d0d9d5a3d4738bb525066c710 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Scharfe?= Date: Wed, 15 Jun 2022 19:05:03 +0200 Subject: [PATCH 6/6] archive-tar: use internal gzip by default MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drop the dependency on gzip(1) and use our internal implementation to create tar.gz and tgz files. Signed-off-by: René Scharfe Signed-off-by: Junio C Hamano --- Documentation/git-archive.txt | 4 ++-- archive-tar.c | 4 ++-- t/t5000-tar-tree.sh | 32 ++++++++++++++++---------------- 3 files changed, 20 insertions(+), 20 deletions(-) diff --git a/Documentation/git-archive.txt b/Documentation/git-archive.txt index b2d1b63d31..60c040988b 100644 --- a/Documentation/git-archive.txt +++ b/Documentation/git-archive.txt @@ -148,8 +148,8 @@ tar..command:: to the command (e.g., `-9`). + The `tar.gz` and `tgz` formats are defined automatically and use the -command `gzip -cn` by default. An internal gzip implementation can be -used by specifying the value `git archive gzip`. +magic command `git archive gzip` by default, which invokes an internal +implementation of gzip. tar..remote:: If true, enable the format for use by remote clients via diff --git a/archive-tar.c b/archive-tar.c index efba78118b..3d77e0f750 100644 --- a/archive-tar.c +++ b/archive-tar.c @@ -526,9 +526,9 @@ void init_tar_archiver(void) int i; register_archiver(&tar_archiver); - tar_filter_config("tar.tgz.command", "gzip -cn", NULL); + tar_filter_config("tar.tgz.command", internal_gzip_command, NULL); tar_filter_config("tar.tgz.remote", "true", NULL); - tar_filter_config("tar.tar.gz.command", "gzip -cn", NULL); + tar_filter_config("tar.tar.gz.command", internal_gzip_command, NULL); tar_filter_config("tar.tar.gz.remote", "true", NULL); git_config(git_tar_config, NULL); for (i = 0; i < nr_tar_filters; i++) { diff --git a/t/t5000-tar-tree.sh b/t/t5000-tar-tree.sh index 9ac0ec67fe..1a68e89a55 100755 --- a/t/t5000-tar-tree.sh +++ b/t/t5000-tar-tree.sh @@ -339,21 +339,21 @@ test_expect_success 'only enabled filters are available remotely' ' test_cmp_bin remote.bar config.bar ' -test_expect_success GZIP 'git archive --format=tgz' ' +test_expect_success 'git archive --format=tgz' ' git archive --format=tgz HEAD >j.tgz ' -test_expect_success GZIP 'git archive --format=tar.gz' ' +test_expect_success 'git archive --format=tar.gz' ' git archive --format=tar.gz HEAD >j1.tar.gz && test_cmp_bin j.tgz j1.tar.gz ' -test_expect_success GZIP 'infer tgz from .tgz filename' ' +test_expect_success 'infer tgz from .tgz filename' ' git archive --output=j2.tgz HEAD && test_cmp_bin j.tgz j2.tgz ' -test_expect_success GZIP 'infer tgz from .tar.gz filename' ' +test_expect_success 'infer tgz from .tar.gz filename' ' git archive --output=j3.tar.gz HEAD && test_cmp_bin j.tgz j3.tar.gz ' @@ -363,31 +363,31 @@ test_expect_success GZIP 'extract tgz file' ' test_cmp_bin b.tar j.tar ' -test_expect_success GZIP 'remote tar.gz is allowed by default' ' +test_expect_success 'remote tar.gz is allowed by default' ' git archive --remote=. --format=tar.gz HEAD >remote.tar.gz && test_cmp_bin j.tgz remote.tar.gz ' -test_expect_success GZIP 'remote tar.gz can be disabled' ' +test_expect_success 'remote tar.gz can be disabled' ' git config tar.tar.gz.remote false && test_must_fail git archive --remote=. --format=tar.gz HEAD \ >remote.tar.gz ' -test_expect_success 'git archive --format=tgz (internal gzip)' ' - test_config tar.tgz.command "git archive gzip" && - git archive --format=tgz HEAD >internal_gzip.tgz +test_expect_success GZIP 'git archive --format=tgz (external gzip)' ' + test_config tar.tgz.command "gzip -cn" && + git archive --format=tgz HEAD >external_gzip.tgz ' -test_expect_success 'git archive --format=tar.gz (internal gzip)' ' - test_config tar.tar.gz.command "git archive gzip" && - git archive --format=tar.gz HEAD >internal_gzip.tar.gz && - test_cmp_bin internal_gzip.tgz internal_gzip.tar.gz +test_expect_success GZIP 'git archive --format=tar.gz (external gzip)' ' + test_config tar.tar.gz.command "gzip -cn" && + git archive --format=tar.gz HEAD >external_gzip.tar.gz && + test_cmp_bin external_gzip.tgz external_gzip.tar.gz ' -test_expect_success GZIP 'extract tgz file (internal gzip)' ' - gzip -d -c internal_gzip.tar && - test_cmp_bin b.tar internal_gzip.tar +test_expect_success GZIP 'extract tgz file (external gzip)' ' + gzip -d -c external_gzip.tar && + test_cmp_bin b.tar external_gzip.tar ' test_expect_success 'archive and :(glob)' '