From a4fea08b6ebc2782891abdf6fd4bb9feeb21ff4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=86var=20Arnfj=C3=B6r=C3=B0=20Bjarmason?= Date: Sun, 24 Jan 2021 18:28:12 +0100 Subject: [PATCH 1/2] grep/pcre2 tests: don't rely on invalid UTF-8 data test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As noted in [1] when I originally added this test in [2] the test was completely broken as it lacked a redirect[3]. I now think this whole thing is overly fragile. Let's only test if we have a segfault here. Before this the first test's "test_cmp" was pretty meaningless. We were only testing if PCREv2 was so broken that it would spew out something completely unrelated on stdout, which isn't very plausible. In the second test we're relying on PCREv2 forever holding to the current behavior of the PCRE_UTF8 flag, as opposed to learning some optimistic graceful fallback to PCRE2_MATCH_INVALID_UTF in the future. If that happens having this test broken under bisecting would suck. A follow-up commit will actually test this case in a meaningful way under the PCRE2_MATCH_INVALID_UTF flag. Let's run this one unconditionally, and just make sure we don't segfault. 1. e714b898c6 (t7812: expect failure for grep -i with invalid UTF-8 data, 2019-11-29) 2. 8a5999838e (grep: stess test PCRE v2 on invalid UTF-8 data, 2019-07-26) 3. c74b3cbb83 (t7812: add missing redirects, 2019-11-26) Signed-off-by: Ævar Arnfjörð Bjarmason Signed-off-by: Junio C Hamano --- t/t7812-grep-icase-non-ascii.sh | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/t/t7812-grep-icase-non-ascii.sh b/t/t7812-grep-icase-non-ascii.sh index 03dba6685a..38457c2e4f 100755 --- a/t/t7812-grep-icase-non-ascii.sh +++ b/t/t7812-grep-icase-non-ascii.sh @@ -76,12 +76,7 @@ test_expect_success GETTEXT_LOCALE,LIBPCRE2 'PCRE v2: grep non-ASCII from invali test_expect_success GETTEXT_LOCALE,LIBPCRE2 'PCRE v2: grep non-ASCII from invalid UTF-8 data with -i' ' test_might_fail git grep -hi "Æ" invalid-0x80 >actual && - if test -s actual - then - test_cmp expected actual - fi && - test_must_fail git grep -hi "(*NO_JIT)Æ" invalid-0x80 >actual && - ! test_cmp expected actual + test_might_fail git grep -hi "(*NO_JIT)Æ" invalid-0x80 >actual ' test_done From 95ca1f987edd23389e3079d0a7fe6d0f89927b68 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=86var=20Arnfj=C3=B6r=C3=B0=20Bjarmason?= Date: Sun, 24 Jan 2021 18:28:13 +0100 Subject: [PATCH 2/2] grep/pcre2: better support invalid UTF-8 haystacks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Improve the support for invalid UTF-8 haystacks given a non-ASCII needle when using the PCREv2 backend. This is a more complete fix for a bug I started to fix in 870eea8166 (grep: do not enter PCRE2_UTF mode on fixed matching, 2019-07-26), now that PCREv2 has the PCRE2_MATCH_INVALID_UTF mode we can make use of it. This fixes the sort of case described in 8a5999838e (grep: stess test PCRE v2 on invalid UTF-8 data, 2019-07-26), i.e.: - The subject string is non-ASCII (e.g. "ævar") - We're under a is_utf8_locale(), e.g. "en_US.UTF-8", not "C" - We are using --ignore-case, or we're a non-fixed pattern If those conditions were satisfied and we matched found non-valid UTF-8 data PCREv2 might bark on it, in practice this only happened under the JIT backend (turned on by default on most platforms). Ultimately this fixes a "regression" in b65abcafc7 ("grep: use PCRE v2 for optimized fixed-string search", 2019-07-01), I'm putting that in scare-quotes because before then we wouldn't properly support these complex case-folding, locale etc. cases either, it just broke in different ways. There was a bug related to this the PCRE2_NO_START_OPTIMIZE flag fixed in PCREv2 10.36. It can be worked around by setting the PCRE2_NO_START_OPTIMIZE flag. Let's do that in those cases, and add tests for the bug. Signed-off-by: Ævar Arnfjörð Bjarmason Signed-off-by: Junio C Hamano --- Makefile | 1 + grep.c | 18 ++++++++++++- grep.h | 4 +++ t/helper/test-pcre2-config.c | 12 +++++++++ t/helper/test-tool.c | 1 + t/helper/test-tool.h | 1 + t/t7812-grep-icase-non-ascii.sh | 46 ++++++++++++++++++++++++++++++++- 7 files changed, 81 insertions(+), 2 deletions(-) create mode 100644 t/helper/test-pcre2-config.c diff --git a/Makefile b/Makefile index 4edfda3e00..42a7ed96e2 100644 --- a/Makefile +++ b/Makefile @@ -722,6 +722,7 @@ TEST_BUILTINS_OBJS += test-online-cpus.o TEST_BUILTINS_OBJS += test-parse-options.o TEST_BUILTINS_OBJS += test-parse-pathspec-file.o TEST_BUILTINS_OBJS += test-path-utils.o +TEST_BUILTINS_OBJS += test-pcre2-config.o TEST_BUILTINS_OBJS += test-pkt-line.o TEST_BUILTINS_OBJS += test-prio-queue.o TEST_BUILTINS_OBJS += test-proc-receive.o diff --git a/grep.c b/grep.c index efeb6dc58d..ad0af66a26 100644 --- a/grep.c +++ b/grep.c @@ -492,7 +492,23 @@ static void compile_pcre2_pattern(struct grep_pat *p, const struct grep_opt *opt } if (!opt->ignore_locale && is_utf8_locale() && has_non_ascii(p->pattern) && !(!opt->ignore_case && (p->fixed || p->is_fixed))) - options |= PCRE2_UTF; + options |= (PCRE2_UTF | PCRE2_MATCH_INVALID_UTF); + + /* Work around https://bugs.exim.org/show_bug.cgi?id=2642 fixed in 10.36 */ + if (PCRE2_MATCH_INVALID_UTF && options & (PCRE2_UTF | PCRE2_CASELESS)) { + struct strbuf buf; + int len; + int err; + + if ((len = pcre2_config(PCRE2_CONFIG_VERSION, NULL)) < 0) + BUG("pcre2_config(..., NULL) failed: %d", len); + strbuf_init(&buf, len + 1); + if ((err = pcre2_config(PCRE2_CONFIG_VERSION, buf.buf)) < 0) + BUG("pcre2_config(..., buf.buf) failed: %d", err); + if (versioncmp(buf.buf, "10.36") < 0) + options |= PCRE2_NO_START_OPTIMIZE; + strbuf_release(&buf); + } p->pcre2_pattern = pcre2_compile((PCRE2_SPTR)p->pattern, p->patternlen, options, &error, &erroffset, diff --git a/grep.h b/grep.h index b5c4e223a8..ade21c8812 100644 --- a/grep.h +++ b/grep.h @@ -18,6 +18,10 @@ typedef int pcre2_code; typedef int pcre2_match_data; typedef int pcre2_compile_context; #endif +#ifndef PCRE2_MATCH_INVALID_UTF +/* PCRE2_MATCH_* dummy also with !USE_LIBPCRE2, for test-pcre2-config.c */ +#define PCRE2_MATCH_INVALID_UTF 0 +#endif #include "thread-utils.h" #include "userdiff.h" diff --git a/t/helper/test-pcre2-config.c b/t/helper/test-pcre2-config.c new file mode 100644 index 0000000000..5258fdddba --- /dev/null +++ b/t/helper/test-pcre2-config.c @@ -0,0 +1,12 @@ +#include "test-tool.h" +#include "cache.h" +#include "grep.h" + +int cmd__pcre2_config(int argc, const char **argv) +{ + if (argc == 2 && !strcmp(argv[1], "has-PCRE2_MATCH_INVALID_UTF")) { + int value = PCRE2_MATCH_INVALID_UTF; + return !value; + } + return 1; +} diff --git a/t/helper/test-tool.c b/t/helper/test-tool.c index 9d6d14d929..f97cd9f48a 100644 --- a/t/helper/test-tool.c +++ b/t/helper/test-tool.c @@ -46,6 +46,7 @@ static struct test_cmd cmds[] = { { "parse-options", cmd__parse_options }, { "parse-pathspec-file", cmd__parse_pathspec_file }, { "path-utils", cmd__path_utils }, + { "pcre2-config", cmd__pcre2_config }, { "pkt-line", cmd__pkt_line }, { "prio-queue", cmd__prio_queue }, { "proc-receive", cmd__proc_receive}, diff --git a/t/helper/test-tool.h b/t/helper/test-tool.h index a6470ff62c..28072c0ad5 100644 --- a/t/helper/test-tool.h +++ b/t/helper/test-tool.h @@ -35,6 +35,7 @@ int cmd__online_cpus(int argc, const char **argv); int cmd__parse_options(int argc, const char **argv); int cmd__parse_pathspec_file(int argc, const char** argv); int cmd__path_utils(int argc, const char **argv); +int cmd__pcre2_config(int argc, const char **argv); int cmd__pkt_line(int argc, const char **argv); int cmd__prio_queue(int argc, const char **argv); int cmd__proc_receive(int argc, const char **argv); diff --git a/t/t7812-grep-icase-non-ascii.sh b/t/t7812-grep-icase-non-ascii.sh index 38457c2e4f..e5d1e4ea68 100755 --- a/t/t7812-grep-icase-non-ascii.sh +++ b/t/t7812-grep-icase-non-ascii.sh @@ -57,7 +57,12 @@ test_expect_success GETTEXT_LOCALE,LIBPCRE2 'PCRE v2: setup invalid UTF-8 data' printf "\\200\\n" >invalid-0x80 && echo "ævar" >expected && cat expected >>invalid-0x80 && - git add invalid-0x80 + git add invalid-0x80 && + + # Test for PCRE2_MATCH_INVALID_UTF bug + # https://bugs.exim.org/show_bug.cgi?id=2642 + printf "\\345Aæ\\n" >invalid-0xe5 && + git add invalid-0xe5 ' test_expect_success GETTEXT_LOCALE,LIBPCRE2 'PCRE v2: grep ASCII from invalid UTF-8 data' ' @@ -67,6 +72,13 @@ test_expect_success GETTEXT_LOCALE,LIBPCRE2 'PCRE v2: grep ASCII from invalid UT test_cmp expected actual ' +test_expect_success GETTEXT_LOCALE,LIBPCRE2 'PCRE v2: grep ASCII from invalid UTF-8 data (PCRE2 bug #2642)' ' + git grep -h "Aæ" invalid-0xe5 >actual && + test_cmp invalid-0xe5 actual && + git grep -h "(*NO_JIT)Aæ" invalid-0xe5 >actual && + test_cmp invalid-0xe5 actual +' + test_expect_success GETTEXT_LOCALE,LIBPCRE2 'PCRE v2: grep non-ASCII from invalid UTF-8 data' ' git grep -h "æ" invalid-0x80 >actual && test_cmp expected actual && @@ -74,9 +86,41 @@ test_expect_success GETTEXT_LOCALE,LIBPCRE2 'PCRE v2: grep non-ASCII from invali test_cmp expected actual ' +test_expect_success GETTEXT_LOCALE,LIBPCRE2 'PCRE v2: grep non-ASCII from invalid UTF-8 data (PCRE2 bug #2642)' ' + git grep -h "Aæ" invalid-0xe5 >actual && + test_cmp invalid-0xe5 actual && + git grep -h "(*NO_JIT)Aæ" invalid-0xe5 >actual && + test_cmp invalid-0xe5 actual +' + +test_lazy_prereq PCRE2_MATCH_INVALID_UTF ' + test-tool pcre2-config has-PCRE2_MATCH_INVALID_UTF +' + test_expect_success GETTEXT_LOCALE,LIBPCRE2 'PCRE v2: grep non-ASCII from invalid UTF-8 data with -i' ' test_might_fail git grep -hi "Æ" invalid-0x80 >actual && test_might_fail git grep -hi "(*NO_JIT)Æ" invalid-0x80 >actual ' +test_expect_success GETTEXT_LOCALE,LIBPCRE2,PCRE2_MATCH_INVALID_UTF 'PCRE v2: grep non-ASCII from invalid UTF-8 data with -i' ' + git grep -hi "Æ" invalid-0x80 >actual && + test_cmp expected actual && + git grep -hi "(*NO_JIT)Æ" invalid-0x80 >actual && + test_cmp expected actual +' + +test_expect_success GETTEXT_LOCALE,LIBPCRE2,PCRE2_MATCH_INVALID_UTF 'PCRE v2: grep non-ASCII from invalid UTF-8 data with -i (PCRE2 bug #2642)' ' + git grep -hi "Æ" invalid-0xe5 >actual && + test_cmp invalid-0xe5 actual && + git grep -hi "(*NO_JIT)Æ" invalid-0xe5 >actual && + test_cmp invalid-0xe5 actual && + + # Only the case of grepping the ASCII part in a way that + # relies on -i fails + git grep -hi "aÆ" invalid-0xe5 >actual && + test_cmp invalid-0xe5 actual && + git grep -hi "(*NO_JIT)aÆ" invalid-0xe5 >actual && + test_cmp invalid-0xe5 actual +' + test_done