From 4b46e22d48271d1a220133a925dc5009048eb577 Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Sat, 30 Dec 2006 02:18:24 -0800 Subject: [PATCH 1/7] commit re-encoding: fix confusion between no and default conversion. Telling the git-log family not to do any character conversion is done with --encoding=none, which sets log_output_encoding to an empty string. However, logmsg_reencode() confused this with log_output_encoding and commit_encoding set to NULL. The latter means we should use the default encoding (i.e. utf-8). Signed-off-by: Junio C Hamano --- commit.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/commit.c b/commit.c index eb06afbbe0..e13b9cb6a3 100644 --- a/commit.c +++ b/commit.c @@ -633,6 +633,8 @@ static char *logmsg_reencode(const struct commit *commit) : git_commit_encoding); if (!output_encoding) + output_encoding = "utf-8"; + else if (!*output_encoding) return NULL; encoding = get_header(commit, "encoding"); if (!encoding || !strcmp(encoding, output_encoding)) { From 000792830b8ca96e6d973545192cba8d5df48279 Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Sat, 30 Dec 2006 02:35:14 -0800 Subject: [PATCH 2/7] t3900: test log --encoding=none Signed-off-by: Junio C Hamano --- t/t3900-i18n-commit.sh | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/t/t3900-i18n-commit.sh b/t/t3900-i18n-commit.sh index 46fd47cb0f..6714b0dd6e 100755 --- a/t/t3900-i18n-commit.sh +++ b/t/t3900-i18n-commit.sh @@ -8,7 +8,7 @@ test_description='commit and log output encodings' . 
./test-lib.sh compare_with () { - git-show -s "$1" | sed -e '1,/^$/d' -e 's/^ //' -e '$d' >current && + git-show -s $1 | sed -e '1,/^$/d' -e 's/^ //' -e '$d' >current && diff -u current "$2" } @@ -112,4 +112,11 @@ do done done +for H in ISO-8859-1 EUCJP ISO-2022-JP +do + test_expect_success "No conversion with $H" ' + compare_with "--encoding=none '$H'" ../t3900/'$H'.txt + ' +done + test_done From 5dc7bcc2453ce854dc1192cfffcc8aee1cc3b69d Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Sat, 30 Dec 2006 02:22:38 -0800 Subject: [PATCH 3/7] Documentation: i18n commit log message notes. Signed-off-by: Junio C Hamano --- Documentation/git-commit-tree.txt | 5 +++ Documentation/git-commit.txt | 5 +++ Documentation/git-log.txt | 6 ++++ Documentation/git-show.txt | 5 +++ Documentation/i18n.txt | 57 +++++++++++++++++++++++++++++++ 5 files changed, 78 insertions(+) create mode 100644 Documentation/i18n.txt diff --git a/Documentation/git-commit-tree.txt b/Documentation/git-commit-tree.txt index 41d1a1c4b3..77ba96ed8a 100644 --- a/Documentation/git-commit-tree.txt +++ b/Documentation/git-commit-tree.txt @@ -81,6 +81,11 @@ Your parents must have hated you!:: Your sysadmin must hate you!:: The password(5) name field is longer than a giant static buffer. +Discussion +---------- + +include::i18n.txt[] + See Also -------- gitlink:git-write-tree[1] diff --git a/Documentation/git-commit.txt b/Documentation/git-commit.txt index 0b74cd708e..a7adf24fa5 100644 --- a/Documentation/git-commit.txt +++ b/Documentation/git-commit.txt @@ -223,6 +223,11 @@ should be recorded as a single commit. In fact, the command refuses to run when given pathnames (but see `-i` option). 
+DISCUSSION +---------- + +include::i18n.txt[] + ENVIRONMENT VARIABLES --------------------- The command specified by either the VISUAL or EDITOR environment diff --git a/Documentation/git-log.txt b/Documentation/git-log.txt index 79643ac928..c87133542a 100644 --- a/Documentation/git-log.txt +++ b/Documentation/git-log.txt @@ -63,6 +63,12 @@ git log -r --name-status release..test:: in the "release" branch, along with the list of paths each commit modifies. +Discussion +---------- + +include::i18n.txt[] + + Author ------ Written by Linus Torvalds diff --git a/Documentation/git-show.txt b/Documentation/git-show.txt index 98dea6125d..160abb5b24 100644 --- a/Documentation/git-show.txt +++ b/Documentation/git-show.txt @@ -54,6 +54,11 @@ git show master:Makefile master:t/Makefile Concatenates the contents of said Makefiles in the head of the branch `master`. +Discussion +---------- + +include::i18n.txt[] + Author ------ Written by Linus Torvalds and diff --git a/Documentation/i18n.txt b/Documentation/i18n.txt new file mode 100644 index 0000000000..b4cbb3830e --- /dev/null +++ b/Documentation/i18n.txt @@ -0,0 +1,57 @@ +At the core level, git is character encoding agnostic. + + - The pathnames recorded in the index and in the tree objects + are treated as uninterpreted sequences of non-NUL bytes. + What readdir(2) returns are what are recorded and compared + with the data git keeps track of, which in turn are expected + to be what lstat(2) and creat(2) accept. There is no such + thing as pathname encoding translation. + + - The contents of the blob objects are uninterpreted sequences + of bytes. There is no encoding translation at the core + level. + + - The commit log messages are uninterpreted sequences of non-NUL + bytes. + +Although we encourage that the commit log messages are encoded +in UTF-8, both the core and git Porcelain are designed not to +force UTF-8 on projects.
If all participants of a particular +project find it more convenient to use legacy encodings, git +does not forbid it. However, there are a few things to keep in +mind. + +. `git-commit-tree` (hence, `git-commit` which uses it) issues + a warning if the commit log message given to it does not look + like a valid UTF-8 string, unless you explicitly say your + project uses a legacy encoding. The way to say this is to + have core.commitencoding in `.git/config` file, like this: ++ +------------ +[core] + commitencoding = ISO-8859-1 +------------ ++ +Commit objects created with the above setting record the value +of `core.commitencoding` in their `encoding` header. This is to +help other people who look at them later. Lack of this header +implies that the commit log message is encoded in UTF-8. + +. `git-log`, `git-show` and friends look at the `encoding` + header of a commit object, and try to re-code the log + message into UTF-8 unless otherwise specified. You can + specify the desired output encoding with + `core.logoutputencoding` in `.git/config` file, like this: ++ +------------ +[core] + logoutputencoding = ISO-8859-1 +------------ ++ +If you do not have this configuration variable, the value of +`core.commitencoding` is used instead. + +Note that we deliberately chose not to re-code the commit log +message when a commit is made to force UTF-8 at the commit +object level, because re-coding to UTF-8 is not necessarily a +reversible operation. From 99e09cce8d0eace48209483b07e2a9d99542bd04 Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Sat, 30 Dec 2006 02:21:48 -0800 Subject: [PATCH 4/7] Documentation: minor rewording for git-log and git-show pages.
Signed-off-by: Junio C Hamano --- Documentation/git-log.txt | 4 +++- Documentation/git-show.txt | 10 ++++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/Documentation/git-log.txt b/Documentation/git-log.txt index c87133542a..e9f746bbd4 100644 --- a/Documentation/git-log.txt +++ b/Documentation/git-log.txt @@ -31,7 +31,9 @@ include::pretty-formats.txt[] Limits the number of commits to show. <since>..<until>:: - Show only commits between the named two commits. + Show only commits between the named two commits. When + either <since> or <until> is omitted, it defaults to + `HEAD`, i.e. the tip of the current branch. -p:: Show the change the commit introduces in a patch form. diff --git a/Documentation/git-show.txt b/Documentation/git-show.txt index 160abb5b24..c210b9af6b 100644 --- a/Documentation/git-show.txt +++ b/Documentation/git-show.txt @@ -30,8 +30,8 @@ This manual page describes only the most frequently used options. OPTIONS ------- -<commitid>:: - ID of the commit to show. +<object>:: + The name of the object to show. include::pretty-formats.txt[] @@ -40,7 +40,8 @@ EXAMPLES -------- git show v1.0.0:: - Shows the tag `v1.0.0`. + Shows the tag `v1.0.0`, along with the object the tag + points at. git show v1.0.0^{tree}:: Shows the tree pointed to by the tag `v1.0.0`. @@ -62,7 +63,8 @@ include::i18n.txt[] Author ------ Written by Linus Torvalds and -Junio C Hamano +Junio C Hamano . Significantly enhanced by +Johannes Schindelin . Documentation From 7cbcf4d5579bdc5d9f8a4bf3f37d3390fc6e2572 Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Sat, 30 Dec 2006 11:59:08 -0800 Subject: [PATCH 5/7] Move commit reencoding parameter parsing to revision.c This way, git-rev-list and git-diff-tree with --pretty can use it.
Signed-off-by: Junio C Hamano --- Documentation/git-rev-list.txt | 1 + Documentation/pretty-formats.txt | 7 +++++++ revision.c | 8 ++++++++ 3 files changed, 16 insertions(+) diff --git a/Documentation/git-rev-list.txt b/Documentation/git-rev-list.txt index 9e0dcf8d3f..86c94e7dfd 100644 --- a/Documentation/git-rev-list.txt +++ b/Documentation/git-rev-list.txt @@ -21,6 +21,7 @@ SYNOPSIS [ \--stdin ] [ \--topo-order ] [ \--parents ] + [ \--encoding[=<encoding>] ] [ \--(author|committer|grep)=<pattern> ] [ [\--objects | \--objects-edge] [ \--unpacked ] ] [ \--pretty | \--header ] diff --git a/Documentation/pretty-formats.txt b/Documentation/pretty-formats.txt index 996f628903..fb0b0b9582 100644 --- a/Documentation/pretty-formats.txt +++ b/Documentation/pretty-formats.txt @@ -76,3 +76,10 @@ displayed in full, regardless of whether --abbrev or --no-abbrev are used, and 'parents' information show the true parent commits, without taking grafts nor history simplification into account. + +--encoding[=<encoding>]:: + The commit objects record the encoding used for the log message + in their encoding header; this option can be used to tell the + command to re-code the commit log message in the encoding + preferred by the user. For non-plumbing commands this + defaults to UTF-8. diff --git a/revision.c b/revision.c index af9f87418c..6e4ec46302 100644 --- a/revision.c +++ b/revision.c @@ -1039,6 +1039,14 @@ int setup_revisions(int argc, const char **argv, struct rev_info *revs, const ch all_match = 1; continue; } + if (!strncmp(arg, "--encoding=", 11)) { + arg += 11; + if (strcmp(arg, "none")) + git_log_output_encoding = strdup(arg); + else + git_log_output_encoding = ""; + continue; + } opts = diff_opt_parse(&revs->diffopt, argv+i, argc-i); if (opts > 0) { From 677cfed56ac530878b746ee4cca3ada8af384a81 Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Sat, 30 Dec 2006 12:20:43 -0800 Subject: [PATCH 6/7] commit-tree: cope with different ways "utf-8" can be spelled.
People can spell config.commitencoding differently from what we internally have ("utf-8") to mean UTF-8. Try to accept them and treat them equally. Signed-off-by: Junio C Hamano --- builtin-commit-tree.c | 3 +-- utf8.c | 9 +++++++++ utf8.h | 2 ++ 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/builtin-commit-tree.c b/builtin-commit-tree.c index 146aaffd28..0651e5927e 100644 --- a/builtin-commit-tree.c +++ b/builtin-commit-tree.c @@ -119,8 +119,7 @@ int cmd_commit_tree(int argc, const char **argv, const char *prefix) } /* Not having i18n.commitencoding is the same as having utf-8 */ - encoding_is_utf8 = (!git_commit_encoding || - !strcmp(git_commit_encoding, "utf-8")); + encoding_is_utf8 = is_encoding_utf8(git_commit_encoding); init_buffer(&buffer, &size); add_buffer(&buffer, &size, "tree %s\n", sha1_to_hex(tree_sha1)); diff --git a/utf8.c b/utf8.c index 1eedd8b61a..7c80eeccb4 100644 --- a/utf8.c +++ b/utf8.c @@ -277,6 +277,15 @@ void print_wrapped_text(const char *text, int indent, int indent2, int width) } } +int is_encoding_utf8(const char *name) +{ + if (!name) + return 1; + if (!strcasecmp(name, "utf-8") || !strcasecmp(name, "utf8")) + return 1; + return 0; +} + /* * Given a buffer and its encoding, return it re-encoded * with iconv. If the conversion fails, returns NULL. diff --git a/utf8.h b/utf8.h index cae2a8e665..a07c5a88af 100644 --- a/utf8.h +++ b/utf8.h @@ -3,6 +3,8 @@ int utf8_width(const char **start); int is_utf8(const char *text); +int is_encoding_utf8(const char *name); + void print_wrapped_text(const char *text, int indent, int indent2, int len); #ifndef NO_ICONV From 53af9816bcb1d441fef76c3adaf0c4cb858768ac Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Sat, 30 Dec 2006 15:49:32 -0800 Subject: [PATCH 7/7] i18n: drop "encoding" header in the output after re-coding. 
After re-coding the commit message into the encoding the user specified (either with core.logoutputencoding or --encoding option), this drops the "encoding" header altogether. The output is already re-coded as the user asked (either with the config or --encoding= option), and the extra header becomes redundant information. Signed-off-by: Junio C Hamano --- commit.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/commit.c b/commit.c index e13b9cb6a3..afdf27eece 100644 --- a/commit.c +++ b/commit.c @@ -624,6 +624,48 @@ static char *get_header(const struct commit *commit, const char *key) } } +static char *replace_encoding_header(char *buf, char *encoding) +{ + char *encoding_header = strstr(buf, "\nencoding "); + char *end_of_encoding_header; + int encoding_header_pos; + int encoding_header_len; + int new_len; + int need_len; + int buflen = strlen(buf) + 1; + + if (!encoding_header) + return buf; /* should not happen but be defensive */ + encoding_header++; + end_of_encoding_header = strchr(encoding_header, '\n'); + if (!end_of_encoding_header) + return buf; /* should not happen but be defensive */ + end_of_encoding_header++; + + encoding_header_len = end_of_encoding_header - encoding_header; + encoding_header_pos = encoding_header - buf; + + if (is_encoding_utf8(encoding)) { + /* we have re-coded to UTF-8; drop the header */ + memmove(encoding_header, end_of_encoding_header, + buflen - (encoding_header_pos + encoding_header_len)); + return buf; + } + new_len = strlen(encoding); + need_len = new_len + strlen("encoding \n"); + if (encoding_header_len < need_len) { + buf = xrealloc(buf, buflen + (need_len - encoding_header_len)); + encoding_header = buf + encoding_header_pos; + end_of_encoding_header = encoding_header + encoding_header_len; + } + memmove(end_of_encoding_header + (need_len - encoding_header_len), + end_of_encoding_header, + buflen - (encoding_header_pos + encoding_header_len)); + memcpy(encoding_header + 9,
encoding, strlen(encoding)); + encoding_header[9 + new_len] = '\n'; + return buf; +} + static char *logmsg_reencode(const struct commit *commit) { char *encoding; @@ -642,6 +684,9 @@ static char *logmsg_reencode(const struct commit *commit) return NULL; } out = reencode_string(commit->buffer, output_encoding, encoding); + if (out) + out = replace_encoding_header(out, output_encoding); + free(encoding); if (!out) return NULL;