From dd5e4d39765f44492031f2d9319c24f0868bbe1a Mon Sep 17 00:00:00 2001 From: Jeff King Date: Wed, 15 Feb 2023 10:16:14 -0500 Subject: [PATCH 1/3] shorten_unambiguous_ref(): avoid integer truncation We parse the shortened name "foo" out of the full refname "refs/heads/foo", and then assign the result of strlen(short_name) to an int, which may truncate or wrap to negative. In practice, this should never happen, as it requires a 2GB refname. And even somebody trying to do something malicious should at worst end up with a confused answer (we use the size only to feed back as a placeholder length to strbuf_addf() to see if there are any collisions in the lookup rules). And it may even be impossible to trigger this, as we parse the string with sscanf(), and stdio formatting functions are not known for handling large strings well. I didn't test, but I wouldn't be surprised if sscanf() on many platforms simply reports no match here. But even if it is not a problem in practice so far, it is worth fixing for two reasons: 1. We'll shortly be replacing the sscanf() call with a real parser which will handle arbitrary-sized strings. 2. Assigning strlen() to an int is an anti-pattern that requires people to look twice when auditing for real overflow problems. So we'll make this a size_t. Unfortunately we still have to cast to int eventually for the strbuf_addf() call, but at least we can localize the cast there, and check that it will be valid. I used our new cast helper here, which will just bail completely. That should be OK, as anybody with a 2GB refname is up to no good, but if we really wanted to, we could detect it manually and just refuse to shorten the refname. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- refs.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/refs.c b/refs.c index 2c7e88b190..40d5edfce6 100644 --- a/refs.c +++ b/refs.c @@ -1356,7 +1356,7 @@ char *refs_shorten_unambiguous_ref(struct ref_store *refs, for (i = nr_rules - 1; i > 0 ; --i) { int j; int rules_to_fail = i; - int short_name_len; + size_t short_name_len; if (1 != sscanf(refname, scanf_fmts[i], short_name)) continue; @@ -1388,7 +1388,8 @@ char *refs_shorten_unambiguous_ref(struct ref_store *refs, */ strbuf_reset(&resolved_buf); strbuf_addf(&resolved_buf, rule, - short_name_len, short_name); + cast_size_t_to_int(short_name_len), + short_name); if (refs_ref_exists(refs, resolved_buf.buf)) break; } From 8f416f65c9b1a42d7e6bf747c7ac5cf6a49250f8 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Wed, 15 Feb 2023 10:16:18 -0500 Subject: [PATCH 2/3] shorten_unambiguous_ref(): use NUM_REV_PARSE_RULES constant The ref_rev_parse_rules[] array is terminated with a NULL entry, and we count it and store the result in the local nr_rules variable. But we don't need to do so; since the array is a constant, we can compute its size directly. The original code probably didn't do that because it was written as part of for-each-ref, and saw the array only as a pointer. It was migrated in 7c2b3029df (make get_short_ref a public function, 2009-04-07) and could have been updated then, but that subtlety was not noticed. We even have a constant that represents this value already, courtesy of 60650a48c0 (remote: make refspec follow the same disambiguation rule as local refs, 2018-08-01), though again, nobody noticed at the time that it could be used here, too. The current count-up isn't a big deal, as we need to preprocess that array anyway. But it will become more cumbersome as we refactor the shortening code. So let's get rid of it and just use the constant everywhere. Note that there are two things here that aren't just simple text replacements: 1. We also use nr_rules to see if a previous call has initialized the static pre-processing variables. We can just use the scanf_fmts pointer to do the same thing, as it is non-NULL only after we've done that initialization. 2. If nr_rules is zero after we've counted it up, we bail from the function. This code is unreachable, though, as the set of rules is hard-coded and non-empty. And that becomes even more apparent now that we are using the constant. So we can drop this conditional completely (and ironically, the code would have the same output if it _did_ trigger, as we'd simply skip the loop entirely and return the whole refname). Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- refs.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/refs.c b/refs.c index 40d5edfce6..75fd74b06c 100644 --- a/refs.c +++ b/refs.c @@ -1315,11 +1315,10 @@ char *refs_shorten_unambiguous_ref(struct ref_store *refs, { int i; static char **scanf_fmts; - static int nr_rules; char *short_name; struct strbuf resolved_buf = STRBUF_INIT; - if (!nr_rules) { + if (!scanf_fmts) { /* * Pre-generate scanf formats from ref_rev_parse_rules[]. * Generate a format suitable for scanf from a @@ -1329,31 +1328,26 @@ char *refs_shorten_unambiguous_ref(struct ref_store *refs, size_t total_len = 0; size_t offset = 0; - /* the rule list is NULL terminated, count them first */ - for (nr_rules = 0; ref_rev_parse_rules[nr_rules]; nr_rules++) + for (i = 0; i < NUM_REV_PARSE_RULES; i++) /* -2 for strlen("%.*s") - strlen("%s"); +1 for NUL */ - total_len += strlen(ref_rev_parse_rules[nr_rules]) - 2 + 1; + total_len += strlen(ref_rev_parse_rules[i]) - 2 + 1; - scanf_fmts = xmalloc(st_add(st_mult(sizeof(char *), nr_rules), total_len)); + scanf_fmts = xmalloc(st_add(st_mult(sizeof(char *), NUM_REV_PARSE_RULES), total_len)); offset = 0; - for (i = 0; i < nr_rules; i++) { + for (i = 0; i < NUM_REV_PARSE_RULES; i++) { assert(offset < total_len); - scanf_fmts[i] = (char *)&scanf_fmts[nr_rules] + offset; + scanf_fmts[i] = (char *)&scanf_fmts[NUM_REV_PARSE_RULES] + offset; offset += xsnprintf(scanf_fmts[i], total_len - offset, ref_rev_parse_rules[i], 2, "%s") + 1; } } - /* bail out if there are no rules */ - if (!nr_rules) - return xstrdup(refname); - /* buffer for scanf result, at most refname must fit */ short_name = xstrdup(refname); /* skip first rule, it will always match */ - for (i = nr_rules - 1; i > 0 ; --i) { + for (i = NUM_REV_PARSE_RULES - 1; i > 0 ; --i) { int j; int rules_to_fail = i; size_t short_name_len; @@ -1368,7 +1362,7 @@ char *refs_shorten_unambiguous_ref(struct ref_store *refs, * must fail to resolve to a valid non-ambiguous ref */ if (strict) - rules_to_fail = nr_rules; + rules_to_fail = NUM_REV_PARSE_RULES; /* * check if the short name resolves to a valid ref, From 613bef56b820cf7f24dc4b3ae65fc91826368185 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Wed, 15 Feb 2023 10:16:21 -0500 Subject: [PATCH 3/3] shorten_unambiguous_ref(): avoid sscanf() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To shorten a fully qualified ref (e.g., taking "refs/heads/foo" to just "foo"), we munge the usual lookup rules ("refs/heads/%.*s", etc) to drop the ".*" modifier (so "refs/heads/%s"), and then use sscanf() to match that against the refname, pulling the "%s" content into a separate buffer. This has a few downsides: - sscanf("%s") reportedly misbehaves on macOS with some input and locale combinations, returning a partial or garbled string. See this thread: https://lore.kernel.org/git/CAGF3oAcCi+fG12j-1U0hcrWwkF5K_9WhOi6ZPHBzUUzfkrZDxA@mail.gmail.com/ - scanf's matching of "%s" is greedy. So the "refs/remotes/%s/HEAD" rule would never pull "origin" out of "refs/remotes/origin/HEAD". Instead it always produced "origin/HEAD", which is redundant with the "refs/remotes/%s" rule. - scanf in general is an error-prone interface. For example, scanning for "%s" will copy bytes into a destination string, which must have been correctly sized ahead of time to avoid a buffer overflow. In this case, the code is OK (the buffer is pessimistically sized to match the original string, which should give us a maximum). But in general, we do not want to encourage people to use scanf at all. So instead, let's note that our lookup rules are not arbitrary format strings, but all contain exactly one "%.*s" placeholder. We already rely on this, both for lookup (we feed the lookup format along with exactly one int/ptr combo to snprintf, etc) and for shortening (we munge "%.*s" to "%s", and then insist that sscanf() finds exactly one result). We can parse this manually by just matching the bytes that occur before and after the "%.*s" placeholder. While we have a few extra lines of parsing code, the result is arguably simpler, as can skip the preprocessing step and its tricky memory management entirely. The in-code comments should explain the parsing strategy, but there's one subtle change here. The original code allocated a single buffer, and then overwrote it in each loop iteration, since that's the only option sscanf() gives us. But our parser can actually return a ptr/len combo for the matched string, which is all we need (since we just feed it back to the lookup rules with "%.*s"), and then copy it only when returning to the caller. There are a few new tests here, all using symbolic-ref (the code can be triggered in many ways, but symrefs are convenient in that we don't need to create a real ref, which avoids any complications from the filesystem munging the name): - the first covers the real-world case which misbehaved on macOS. Setting LC_ALL is required to trigger the problem there (since otherwise our tests use LC_ALL=C), and hopefully is at worst simply ignored on other systems (and doesn't cause libc to complain, etc, on systems without that locale). - the second covers the "origin/HEAD" case as discussed above, which is now fixed - the remainder are for "weird" cases that work both before and after this patch, but would be easy to get wrong with off-by-one problems in the parsing (and came out of discussions and earlier iterations of the patch that did get them wrong). - absent here are tests of boring, expected-to-work cases like "refs/heads/foo", etc. Those are covered all over the test suite both explicitly (for-each-ref's refname:short) and implicitly (in the output of git-status, etc). Reported-by: 孟子易 Helped-by: Eric Sunshine Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- refs.c | 78 +++++++++++++++++++++++------------------ t/t1401-symbolic-ref.sh | 34 ++++++++++++++++++ 2 files changed, 77 insertions(+), 35 deletions(-) diff --git a/refs.c b/refs.c index 75fd74b06c..a5e546244d 100644 --- a/refs.c +++ b/refs.c @@ -1310,53 +1310,62 @@ int update_ref(const char *msg, const char *refname, old_oid, flags, onerr); } +/* + * Check that the string refname matches a rule of the form + * "{prefix}%.*s{suffix}". So "foo/bar/baz" would match the rule + * "foo/%.*s/baz", and return the string "bar". + */ +static const char *match_parse_rule(const char *refname, const char *rule, + size_t *len) +{ + /* + * Check that rule matches refname up to the first percent in the rule. + * We can bail immediately if not, but otherwise we leave "rule" at the + * %-placeholder, and "refname" at the start of the potential matched + * name. + */ + while (*rule != '%') { + if (!*rule) + BUG("rev-parse rule did not have percent"); + if (*refname++ != *rule++) + return NULL; + } + + /* + * Check that our "%" is the expected placeholder. This assumes there + * are no other percents (placeholder or quoted) in the string, but + * that is sufficient for our rev-parse rules. + */ + if (!skip_prefix(rule, "%.*s", &rule)) + return NULL; + + /* + * And now check that our suffix (if any) matches. + */ + if (!strip_suffix(refname, rule, len)) + return NULL; + + return refname; /* len set by strip_suffix() */ +} + char *refs_shorten_unambiguous_ref(struct ref_store *refs, const char *refname, int strict) { int i; - static char **scanf_fmts; - char *short_name; struct strbuf resolved_buf = STRBUF_INIT; - if (!scanf_fmts) { - /* - * Pre-generate scanf formats from ref_rev_parse_rules[]. - * Generate a format suitable for scanf from a - * ref_rev_parse_rules rule by interpolating "%s" at the - * location of the "%.*s". - */ - size_t total_len = 0; - size_t offset = 0; - - for (i = 0; i < NUM_REV_PARSE_RULES; i++) - /* -2 for strlen("%.*s") - strlen("%s"); +1 for NUL */ - total_len += strlen(ref_rev_parse_rules[i]) - 2 + 1; - - scanf_fmts = xmalloc(st_add(st_mult(sizeof(char *), NUM_REV_PARSE_RULES), total_len)); - - offset = 0; - for (i = 0; i < NUM_REV_PARSE_RULES; i++) { - assert(offset < total_len); - scanf_fmts[i] = (char *)&scanf_fmts[NUM_REV_PARSE_RULES] + offset; - offset += xsnprintf(scanf_fmts[i], total_len - offset, - ref_rev_parse_rules[i], 2, "%s") + 1; - } - } - - /* buffer for scanf result, at most refname must fit */ - short_name = xstrdup(refname); - /* skip first rule, it will always match */ for (i = NUM_REV_PARSE_RULES - 1; i > 0 ; --i) { int j; int rules_to_fail = i; + const char *short_name; size_t short_name_len; - if (1 != sscanf(refname, scanf_fmts[i], short_name)) + short_name = match_parse_rule(refname, ref_rev_parse_rules[i], + &short_name_len); + if (!short_name) continue; - short_name_len = strlen(short_name); - /* * in strict mode, all (except the matched one) rules * must fail to resolve to a valid non-ambiguous ref @@ -1394,12 +1403,11 @@ char *refs_shorten_unambiguous_ref(struct ref_store *refs, */ if (j == rules_to_fail) { strbuf_release(&resolved_buf); - return short_name; + return xmemdupz(short_name, short_name_len); } } strbuf_release(&resolved_buf); - free(short_name); return xstrdup(refname); } diff --git a/t/t1401-symbolic-ref.sh b/t/t1401-symbolic-ref.sh index d708acdb81..be23be30c7 100755 --- a/t/t1401-symbolic-ref.sh +++ b/t/t1401-symbolic-ref.sh @@ -189,4 +189,38 @@ test_expect_success 'symbolic-ref pointing at another' ' test_cmp expect actual ' +test_expect_success 'symbolic-ref --short handles complex utf8 case' ' + name="测试-加-增加-加-增加" && + git symbolic-ref TEST_SYMREF "refs/heads/$name" && + # In the real world, we saw problems with this case only + # when the locale includes UTF-8. Set it here to try to make things as + # hard as possible for us to pass, but in practice we should do the + # right thing regardless (and of course some platforms may not even + # have this locale). + LC_ALL=en_US.UTF-8 git symbolic-ref --short TEST_SYMREF >actual && + echo "$name" >expect && + test_cmp expect actual +' + +test_expect_success 'symbolic-ref --short handles name with suffix' ' + git symbolic-ref TEST_SYMREF "refs/remotes/origin/HEAD" && + git symbolic-ref --short TEST_SYMREF >actual && + echo "origin" >expect && + test_cmp expect actual +' + +test_expect_success 'symbolic-ref --short handles almost-matching name' ' + git symbolic-ref TEST_SYMREF "refs/headsXfoo" && + git symbolic-ref --short TEST_SYMREF >actual && + echo "headsXfoo" >expect && + test_cmp expect actual +' + +test_expect_success 'symbolic-ref --short handles name with percent' ' + git symbolic-ref TEST_SYMREF "refs/heads/%foo" && + git symbolic-ref --short TEST_SYMREF >actual && + echo "%foo" >expect && + test_cmp expect actual +' + test_done