From 99b2ba35f5a67dd6308b265a8e4c3a92e2e67221 Mon Sep 17 00:00:00 2001 From: Doan Tran Cong Danh Date: Thu, 7 Nov 2019 09:56:12 +0700 Subject: [PATCH 1/8] t0028: eliminate non-standard usage of printf MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit man 1p printf: In addition to the escape sequences shown in the Base Definitions volume of POSIX.1‐2008, Chapter 5, File Format Notation ('\\', '\a', '\b', '\f', '\n', '\r', '\t', '\v'), "\ddd", where ddd is a one, two, or three-digit octal number, shall be written as a byte with the numeric value specified by the octal number. printf '\xfe\xff' is an extension found in some shells. Dash, a popular yet simple shell, does not implement this extension. This wasn't caught by most people running the tests, even though common shells like dash don't handle hex escapes, because their systems don't trigger the NO_UTF16_BOM prereq. But systems with musl libc do; when combined with dash, the test fails. Correct it. Signed-off-by: Doan Tran Cong Danh Signed-off-by: Junio C Hamano --- t/t0028-working-tree-encoding.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/t/t0028-working-tree-encoding.sh b/t/t0028-working-tree-encoding.sh index 7aa0945d8d..bfc4fb9af5 100755 --- a/t/t0028-working-tree-encoding.sh +++ b/t/t0028-working-tree-encoding.sh @@ -17,7 +17,7 @@ test_lazy_prereq NO_UTF32_BOM ' write_utf16 () { if test_have_prereq NO_UTF16_BOM then - printf '\xfe\xff' + printf '\376\377' fi && iconv -f UTF-8 -t UTF-16 } @@ -25,7 +25,7 @@ write_utf16 () { write_utf32 () { if test_have_prereq NO_UTF32_BOM then - printf '\x00\x00\xfe\xff' + printf '\0\0\376\377' fi && iconv -f UTF-8 -t UTF-32 } From 1ba6e7aecdd2c72df1b0da7a89502e82485f7819 Mon Sep 17 00:00:00 2001 From: Doan Tran Cong Danh Date: Fri, 8 Nov 2019 16:43:44 +0700 Subject: [PATCH 2/8] configure.ac: define ICONV_OMITS_BOM if necessary From commit 79444c9294, ("utf8: handle systems that don't write BOM for UTF-16", 
2019-02-12), we're supporting those systems with iconv that omits BOM with: make ICONV_OMITS_BOM=Yes However, configure script wasn't taught to detect those systems. Teach configure to do so. Signed-off-by: Doan Tran Cong Danh Signed-off-by: Junio C Hamano --- configure.ac | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/configure.ac b/configure.ac index a43b476402..ecba7e6e51 100644 --- a/configure.ac +++ b/configure.ac @@ -844,12 +844,61 @@ AC_MSG_CHECKING([for old iconv()]) AC_COMPILE_IFELSE([OLDICONVTEST_SRC], [AC_MSG_RESULT([no])], [AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_OLD_ICONV, 1) OLD_ICONV=UnfortunatelyYes]) GIT_UNSTASH_FLAGS($ICONVDIR) GIT_CONF_SUBST([OLD_ICONV]) +# +# Define ICONV_OMITS_BOM if you are on a system which +# iconv omits bom for utf-{16,32} +if test -z "$NO_ICONV"; then +AC_CACHE_CHECK([whether iconv omits bom for utf-16 and utf-32], + [ac_cv_iconv_omits_bom], +[ +old_LIBS="$LIBS" +if test -n "$NEEDS_LIBICONV"; then + LIBS="$LIBS -liconv" +fi + +AC_RUN_IFELSE( + [AC_LANG_PROGRAM([AC_INCLUDES_DEFAULT + #include + #ifdef HAVE_OLD_ICONV + typedef const char *iconv_ibp; + #else + typedef char *iconv_ibp; + #endif + ], + [[ + int v; + iconv_t conv; + char in[] = "a"; iconv_ibp pin = in; + char out[20] = ""; char *pout = out; + size_t isz = sizeof in; + size_t osz = sizeof out; + + conv = iconv_open("UTF-16", "UTF-8"); + iconv(conv, &pin, &isz, &pout, &osz); + iconv_close(conv); + v = (unsigned char)(out[0]) + (unsigned char)(out[1]); + return v != 0xfe + 0xff; + ]])], + [ac_cv_iconv_omits_bom=no], + [ac_cv_iconv_omits_bom=yes]) + +LIBS="$old_LIBS" +]) +if test "x$ac_cv_iconv_omits_bom" = xyes; then + ICONV_OMITS_BOM=Yes +else + ICONV_OMITS_BOM= +fi +GIT_CONF_SUBST([ICONV_OMITS_BOM]) +fi + ## Checks for typedefs, structures, and compiler characteristics. 
AC_MSG_NOTICE([CHECKS for typedefs, structures, and compiler characteristics]) # From e4b95b3b5fb36bfa3db4408dd469a123c6efc51f Mon Sep 17 00:00:00 2001 From: Doan Tran Cong Danh Date: Fri, 8 Nov 2019 16:43:45 +0700 Subject: [PATCH 3/8] t3900: demonstrate git-rebase problem with multi encoding We use the fixup!/squash! subject prefixes to mark a commit that is meant to be fixed up or squashed into a previous commit. However, if i18n.commitencoding is changed after the original commit is made but before the fixup commit is made, we cannot find the original commit to fix up or squash into. Add a test to demonstrate that problem. Helped-by: Jeff King Signed-off-by: Doan Tran Cong Danh Signed-off-by: Junio C Hamano --- t/t3900-i18n-commit.sh | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/t/t3900-i18n-commit.sh b/t/t3900-i18n-commit.sh index b92ff95977..dd56384b93 100755 --- a/t/t3900-i18n-commit.sh +++ b/t/t3900-i18n-commit.sh @@ -204,4 +204,33 @@ test_commit_autosquash_flags eucJP fixup test_commit_autosquash_flags ISO-2022-JP squash +test_commit_autosquash_multi_encoding () { + flag=$1 + old=$2 + new=$3 + msg=$4 + test_expect_failure "commit --$flag into $old from $new" ' + git checkout -b $flag-$old-$new C0 && + git config i18n.commitencoding $old && + echo $old >>F && + git commit -a -F "$TEST_DIRECTORY"/t3900/$msg && + test_tick && + echo intermediate stuff >>G && + git add G && + git commit -a -m "intermediate commit" && + test_tick && + git config i18n.commitencoding $new && + echo $new-$flag >>F && + git commit -a --$flag HEAD^ && + git rebase --autosquash -i HEAD^^^ && + git rev-list HEAD >actual && + test_line_count = 3 actual + ' +} + +test_commit_autosquash_multi_encoding fixup UTF-8 ISO-8859-1 1-UTF-8.txt +test_commit_autosquash_multi_encoding squash ISO-8859-1 UTF-8 ISO8859-1.txt +test_commit_autosquash_multi_encoding squash eucJP ISO-2022-JP eucJP.txt +test_commit_autosquash_multi_encoding fixup ISO-2022-JP UTF-8 ISO-2022-JP.txt + 
test_done From 0798d16fe38e45453b626c699d92de6f9d71f5ac Mon Sep 17 00:00:00 2001 From: Doan Tran Cong Danh Date: Fri, 8 Nov 2019 16:43:46 +0700 Subject: [PATCH 4/8] sequencer: reencode to utf-8 before arrange rebase's todo list On musl libc, the ISO-2022-JP encoder is too eager to switch back to the 1-byte encoding: musl's iconv always switches back after every combining character. Compare glibc's and musl's output for this command: $ sed q t/t3900/ISO-2022-JP.txt| iconv -f ISO-2022-JP -t utf-8 | iconv -f utf-8 -t ISO-2022-JP | xxd glibc: 00000000: 1b24 4224 4f24 6c24 5224 5b24 551b 2842 .$B$O$l$R$[$U.(B 00000010: 0a . musl: 00000000: 1b24 4224 4f1b 2842 1b24 4224 6c1b 2842 .$B$O.(B.$B$l.(B 00000010: 1b24 4224 521b 2842 1b24 4224 5b1b 2842 .$B$R.(B.$B$[.(B 00000020: 1b24 4224 551b 2842 0a .$B$U.(B. Although musl iconv's output isn't optimal, it's still correct. Since commit 7d509878b8 ("pretty.c: format string with truncate respects logOutputEncoding", 2014-05-21), we encode the message to utf-8 first, then format it and convert the message to the actual output encoding on git commit --squash. Thus, t3900::test_commit_autosquash_flags is failing on musl libc. Reencode to utf-8 before arranging rebase's todo list. By doing this, we also remove a breakage noticed by a test added in the previous commit. 
Signed-off-by: Doan Tran Cong Danh Signed-off-by: Junio C Hamano --- sequencer.c | 2 +- t/t3900-i18n-commit.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sequencer.c b/sequencer.c index 9d5964fd81..69430fe23f 100644 --- a/sequencer.c +++ b/sequencer.c @@ -5169,7 +5169,7 @@ int todo_list_rearrange_squash(struct todo_list *todo_list) *commit_todo_item_at(&commit_todo, item->commit) = item; parse_commit(item->commit); - commit_buffer = get_commit_buffer(item->commit, NULL); + commit_buffer = logmsg_reencode(item->commit, NULL, "UTF-8"); find_commit_subject(commit_buffer, &subject); format_subject(&buf, subject, " "); subject = subjects[i] = strbuf_detach(&buf, &subject_len); diff --git a/t/t3900-i18n-commit.sh b/t/t3900-i18n-commit.sh index dd56384b93..a518281b04 100755 --- a/t/t3900-i18n-commit.sh +++ b/t/t3900-i18n-commit.sh @@ -209,7 +209,7 @@ test_commit_autosquash_multi_encoding () { old=$2 new=$3 msg=$4 - test_expect_failure "commit --$flag into $old from $new" ' + test_expect_success "commit --$flag into $old from $new" ' git checkout -b $flag-$old-$new C0 && git config i18n.commitencoding $old && echo $old >>F && From 019a9d836230c8851aa8b0d4dc2e0dea42662a90 Mon Sep 17 00:00:00 2001 From: Doan Tran Cong Danh Date: Fri, 8 Nov 2019 16:43:47 +0700 Subject: [PATCH 5/8] sequencer: reencode revert/cherry-pick's todo list Keep revert/cherry-pick's todo list in line with rebase todo list. Signed-off-by: Doan Tran Cong Danh Signed-off-by: Junio C Hamano --- sequencer.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sequencer.c b/sequencer.c index 69430fe23f..a19954f2bf 100644 --- a/sequencer.c +++ b/sequencer.c @@ -2564,14 +2564,17 @@ static int walk_revs_populate_todo(struct todo_list *todo_list, enum todo_command command = opts->action == REPLAY_PICK ? 
TODO_PICK : TODO_REVERT; const char *command_string = todo_command_info[command].str; + const char *encoding; struct commit *commit; if (prepare_revs(opts)) return -1; + encoding = get_log_output_encoding(); + while ((commit = get_revision(opts->revs))) { struct todo_item *item = append_new_todo(todo_list); - const char *commit_buffer = get_commit_buffer(commit, NULL); + const char *commit_buffer = logmsg_reencode(commit, NULL, encoding); const char *subject; int subject_len; From b375744274113889c85bee69445375ce51e96648 Mon Sep 17 00:00:00 2001 From: Doan Tran Cong Danh Date: Fri, 8 Nov 2019 16:43:48 +0700 Subject: [PATCH 6/8] sequencer: reencode squashing commit's message When fixing up or squashing during a rebase, git creates the new commit in i18n.commitencoding; reencode the commit message to that encoding. Signed-off-by: Doan Tran Cong Danh Signed-off-by: Junio C Hamano --- sequencer.c | 8 +++++--- t/t3900-i18n-commit.sh | 10 +++++++++- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/sequencer.c b/sequencer.c index a19954f2bf..833a928929 100644 --- a/sequencer.c +++ b/sequencer.c @@ -1576,6 +1576,7 @@ static int update_squash_messages(struct repository *r, struct strbuf buf = STRBUF_INIT; int res; const char *message, *body; + const char *encoding = get_commit_output_encoding(); if (opts->current_fixup_count > 0) { struct strbuf header = STRBUF_INIT; @@ -1602,7 +1603,7 @@ static int update_squash_messages(struct repository *r, return error(_("need a HEAD to fixup")); if (!(head_commit = lookup_commit_reference(r, &head))) return error(_("could not read HEAD")); - if (!(head_message = get_commit_buffer(head_commit, NULL))) + if (!(head_message = logmsg_reencode(head_commit, NULL, encoding))) return error(_("could not read HEAD's commit message")); find_commit_subject(head_message, &body); @@ -1623,7 +1624,7 @@ static int update_squash_messages(struct repository *r, unuse_commit_buffer(head_commit, head_message); } - if (!(message = get_commit_buffer(commit, 
NULL))) + if (!(message = logmsg_reencode(commit, NULL, encoding))) return error(_("could not read commit message of %s"), oid_to_hex(&commit->object.oid)); find_commit_subject(message, &body); @@ -4154,9 +4155,10 @@ static int commit_staged_changes(struct repository *r, */ struct commit *commit; const char *path = rebase_path_squash_msg(); + const char *encoding = get_commit_output_encoding(); if (parse_head(r, &commit) || - !(p = get_commit_buffer(commit, NULL)) || + !(p = logmsg_reencode(commit, NULL, encoding)) || write_message(p, strlen(p), path, 0)) { unuse_commit_buffer(commit, p); return error(_("could not write file: " diff --git a/t/t3900-i18n-commit.sh b/t/t3900-i18n-commit.sh index a518281b04..d277a9f4b7 100755 --- a/t/t3900-i18n-commit.sh +++ b/t/t3900-i18n-commit.sh @@ -224,7 +224,15 @@ test_commit_autosquash_multi_encoding () { git commit -a --$flag HEAD^ && git rebase --autosquash -i HEAD^^^ && git rev-list HEAD >actual && - test_line_count = 3 actual + test_line_count = 3 actual && + iconv -f $old -t UTF-8 "$TEST_DIRECTORY"/t3900/$msg >expect && + if test $flag = squash; then + subject="$(head -1 expect)" && + printf "\nsquash! %s\n" "$subject" >>expect + fi && + git cat-file commit HEAD^ >raw && + (sed "1,/^$/d" raw | iconv -f $new -t utf-8) >actual && + test_cmp expect actual ' } From 5772b0c745ea7f57b94880f377e84a79e2675f38 Mon Sep 17 00:00:00 2001 From: Doan Tran Cong Danh Date: Mon, 11 Nov 2019 13:03:40 +0700 Subject: [PATCH 7/8] sequencer: reencode old merge-commit message During a rebase, the old merge's message (encoded in the old encoding) is used as the message for the new merge commit (created by the rebase). If the value of i18n.commitencoding has been changed since the old merge was made, we end up with an unusable message for the new merge. Correct it. This change also exposes a breakage in git-rebase's label system. 
Signed-off-by: Doan Tran Cong Danh Signed-off-by: Junio C Hamano --- sequencer.c | 3 ++- t/t3434-rebase-i18n.sh | 57 ++++++++++++++++++++++++++++++++++++++++++ t/t3434/eucJP.txt | 4 +++ 3 files changed, 63 insertions(+), 1 deletion(-) create mode 100755 t/t3434-rebase-i18n.sh create mode 100644 t/t3434/eucJP.txt diff --git a/sequencer.c b/sequencer.c index 833a928929..d735d09f98 100644 --- a/sequencer.c +++ b/sequencer.c @@ -3374,7 +3374,8 @@ static int do_merge(struct repository *r, } if (commit) { - const char *message = get_commit_buffer(commit, NULL); + const char *encoding = get_commit_output_encoding(); + const char *message = logmsg_reencode(commit, NULL, encoding); const char *body; int len; diff --git a/t/t3434-rebase-i18n.sh b/t/t3434-rebase-i18n.sh new file mode 100755 index 0000000000..c6c16373eb --- /dev/null +++ b/t/t3434-rebase-i18n.sh @@ -0,0 +1,57 @@ +#!/bin/sh +# +# Copyright (c) 2019 Doan Tran Cong Danh +# + +test_description='rebase with changing encoding + +Initial setup: + +1 - 2 master + \ + 3 - 4 first + \ + 5 - 6 second +' + +. 
./test-lib.sh + +compare_msg () { + iconv -f "$2" -t "$3" "$TEST_DIRECTORY/t3434/$1" >expect && + git cat-file commit HEAD >raw && + sed "1,/^$/d" raw >actual && + test_cmp expect actual +} + +test_expect_success setup ' + test_commit one && + git branch first && + test_commit two && + git switch first && + test_commit three && + git branch second && + test_commit four && + git switch second && + test_commit five && + test_commit six +' + +test_expect_success 'rebase --rebase-merges update encoding eucJP to UTF-8' ' + git switch -c merge-eucJP-UTF-8 first && + git config i18n.commitencoding eucJP && + git merge -F "$TEST_DIRECTORY/t3434/eucJP.txt" second && + git config i18n.commitencoding UTF-8 && + git rebase --rebase-merges master && + compare_msg eucJP.txt eucJP UTF-8 +' + +test_expect_failure 'rebase --rebase-merges update encoding eucJP to ISO-2022-JP' ' + git switch -c merge-eucJP-ISO-2022-JP first && + git config i18n.commitencoding eucJP && + git merge -F "$TEST_DIRECTORY/t3434/eucJP.txt" second && + git config i18n.commitencoding ISO-2022-JP && + git rebase --rebase-merges master && + compare_msg eucJP.txt eucJP ISO-2022-JP +' + +test_done diff --git a/t/t3434/eucJP.txt b/t/t3434/eucJP.txt new file mode 100644 index 0000000000..546f2aac01 --- /dev/null +++ b/t/t3434/eucJP.txt @@ -0,0 +1,4 @@ +ϤҤۤ + +ƤΤΤǡ +ͤۤפݤޤӤء From 52f52e5ae4937de2bc798828c47c49f469b2cc85 Mon Sep 17 00:00:00 2001 From: Doan Tran Cong Danh Date: Mon, 11 Nov 2019 13:03:41 +0700 Subject: [PATCH 8/8] sequencer: reencode commit message for am/rebase --show-current-patch The message file will be used as commit message for the git-{am,rebase} --continue. 
Signed-off-by: Doan Tran Cong Danh Signed-off-by: Junio C Hamano --- sequencer.c | 3 ++- t/t3434-rebase-i18n.sh | 27 +++++++++++++++++++++++++++ t/t3434/ISO8859-1.txt | 3 +++ 3 files changed, 32 insertions(+), 1 deletion(-) create mode 100644 t/t3434/ISO8859-1.txt diff --git a/sequencer.c b/sequencer.c index d735d09f98..4d12ad3cc6 100644 --- a/sequencer.c +++ b/sequencer.c @@ -2972,7 +2972,8 @@ static int make_patch(struct repository *r, strbuf_addf(&buf, "%s/message", get_dir(opts)); if (!file_exists(buf.buf)) { - const char *commit_buffer = get_commit_buffer(commit, NULL); + const char *encoding = get_commit_output_encoding(); + const char *commit_buffer = logmsg_reencode(commit, NULL, encoding); find_commit_subject(commit_buffer, &subject); res |= write_message(subject, strlen(subject), buf.buf, 1); unuse_commit_buffer(commit, commit_buffer); diff --git a/t/t3434-rebase-i18n.sh b/t/t3434-rebase-i18n.sh index c6c16373eb..4b5b128cd6 100755 --- a/t/t3434-rebase-i18n.sh +++ b/t/t3434-rebase-i18n.sh @@ -54,4 +54,31 @@ test_expect_failure 'rebase --rebase-merges update encoding eucJP to ISO-2022-JP compare_msg eucJP.txt eucJP ISO-2022-JP ' +test_rebase_continue_update_encode () { + old=$1 + new=$2 + msgfile=$3 + test_expect_success "rebase --continue update from $old to $new" ' + (git rebase --abort || : abort current git-rebase failure) && + git switch -c conflict-$old-$new one && + echo for-conflict >two.t && + git add two.t && + git config i18n.commitencoding $old && + git commit -F "$TEST_DIRECTORY/t3434/$msgfile" && + git config i18n.commitencoding $new && + test_must_fail git rebase -m master && + test -f .git/rebase-merge/message && + git stripspace <.git/rebase-merge/message >two.t && + git add two.t && + git rebase --continue && + compare_msg $msgfile $old $new && + : git-commit assume invalid utf-8 is latin1 && + test_cmp expect two.t + ' +} + +test_rebase_continue_update_encode ISO-8859-1 UTF-8 ISO8859-1.txt +test_rebase_continue_update_encode eucJP UTF-8 
eucJP.txt +test_rebase_continue_update_encode eucJP ISO-2022-JP eucJP.txt + test_done diff --git a/t/t3434/ISO8859-1.txt b/t/t3434/ISO8859-1.txt new file mode 100644 index 0000000000..7cbef0ee6f --- /dev/null +++ b/t/t3434/ISO8859-1.txt @@ -0,0 +1,3 @@ + + +bdfg