From 02a188baf63969b6e6f14c49d305c79b61a2cb06 Mon Sep 17 00:00:00 2001
From: Shane Synan <digitalcircuit36939@gmail.com>
Date: Mon, 7 Jun 2021 23:06:13 -0400
Subject: [PATCH] common: Make regular expressions Unicode-aware
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Without the UseUnicodePropertiesOption flag, Qt's regular expressions do
not include extra-ASCII codepoints when matching character classes. This
results in \W (used by the highlight logic to prevent matching against
substrings) matching against letters with diacritics, meaning the words
'VÃ¤stra' and 'TÃV' would both count as a highlight for the nick V.

Add unit tests to verify this functionality is correct in Quassel and
any projects that implement ExpressionMatch.

Co-authored-by: V <v@anomalous.eu>
---
 src/common/expressionmatch.cpp       |  12 ++--
 tests/common/expressionmatchtest.cpp | 103 +++++++++++++++++++++++++++
 2 files changed, 111 insertions(+), 4 deletions(-)

diff --git a/src/common/expressionmatch.cpp b/src/common/expressionmatch.cpp
index 97b7228d..f9c834ae 100644
--- a/src/common/expressionmatch.cpp
+++ b/src/common/expressionmatch.cpp
@@ -384,10 +384,14 @@ void ExpressionMatch::cacheRegEx()
 
 QRegularExpression ExpressionMatch::regExFactory(const QString& regExString, bool caseSensitive)
 {
-    // Construct the regular expression object, setting case sensitivity as appropriate
-    QRegularExpression newRegEx = QRegularExpression(regExString,
-                                                     caseSensitive ? QRegularExpression::PatternOption::NoPatternOption
-                                                                   : QRegularExpression::PatternOption::CaseInsensitiveOption);
+    // This is required, else extra-ASCII codepoints get treated as word boundaries
+    QRegularExpression::PatternOptions options = QRegularExpression::UseUnicodePropertiesOption;
+
+    if (!caseSensitive) {
+        options |= QRegularExpression::CaseInsensitiveOption;
+    }
+
+    QRegularExpression newRegEx = QRegularExpression(regExString, options);
 
     // Check if rule is valid
     if (!newRegEx.isValid()) {
diff --git a/tests/common/expressionmatchtest.cpp b/tests/common/expressionmatchtest.cpp
index 157ae5fe..ba62f7f1 100644
--- a/tests/common/expressionmatchtest.cpp
+++ b/tests/common/expressionmatchtest.cpp
@@ -453,3 +453,106 @@ TEST(ExpressionMatchTest, testInvalidRegEx)
     // Assert literal match fails
     EXPECT_FALSE(invalidRegExMatch.match("*network"));
 }
+
+
+TEST(ExpressionMatchTest, matchPhraseUnicode)
+{
+    // Escape Unicode color emoji as a workaround for bug with libXft, otherwise, color emoji may
+    // crash "git gui" and "gitk"
+    // This has no impact on Quassel or the tests themselves
+    //
+    // See https://unix.stackexchange.com/questions/629281/gitk-crashes-when-viewing-commit-containing-emoji-x-error-of-failed-request-ba
+    const QString UnicodeEmojiFire("\xf0\x9f\x94\xa5");
+    const QString UnicodeEmojiFox("\xf0\x9f\xa6\x8a");
+
+    // Simple phrase, case-insensitive, ASCII
+    ExpressionMatch simpleMatchASCII =
+            ExpressionMatch("V", ExpressionMatch::MatchMode::MatchPhrase, false);
+    // Simple phrase, case-insensitive, ASCII Unicode mix
+    ExpressionMatch simpleMatchUniASCII =
+            ExpressionMatch("rÃ¤v", ExpressionMatch::MatchMode::MatchPhrase, false);
+    // Simple phrase, case-insensitive, full Unicode
+    ExpressionMatch simpleMatchUnicode =
+            ExpressionMatch("ç", ExpressionMatch::MatchMode::MatchPhrase, false);
+    // Simple phrase, case-insensitive, emoji
+    ExpressionMatch simpleMatchEmoji =
+            ExpressionMatch(UnicodeEmojiFox, ExpressionMatch::MatchMode::MatchPhrase, false);
+
+    // Assert valid and not empty
+    ASSERT_TRUE(simpleMatchASCII.isValid());
+    EXPECT_FALSE(simpleMatchASCII.isEmpty());
+    ASSERT_TRUE(simpleMatchUniASCII.isValid());
+    EXPECT_FALSE(simpleMatchUniASCII.isEmpty());
+    ASSERT_TRUE(simpleMatchUnicode.isValid());
+    EXPECT_FALSE(simpleMatchUnicode.isEmpty());
+    ASSERT_TRUE(simpleMatchEmoji.isValid());
+    EXPECT_FALSE(simpleMatchEmoji.isEmpty());
+
+    // Assert basic match succeeds
+    EXPECT_TRUE(simpleMatchASCII.match("V"));
+    EXPECT_TRUE(simpleMatchUniASCII.match("rÃ¤v"));
+    EXPECT_TRUE(simpleMatchUnicode.match("ç"));
+    EXPECT_TRUE(simpleMatchEmoji.match(UnicodeEmojiFox));
+
+    // Assert classic word boundaries succeed
+    EXPECT_TRUE(simpleMatchASCII.match("V: hello"));
+    EXPECT_TRUE(simpleMatchUniASCII.match("\"rÃ¤v\""));
+    EXPECT_TRUE(simpleMatchUnicode.match("ç."));
+    EXPECT_TRUE(simpleMatchEmoji.match("(" + UnicodeEmojiFox  + ")"));
+
+    // Assert non-word-boundary Unicode is NOT treated as a word boundary
+    // > ASCII nickname (most common case with spam)
+    EXPECT_FALSE(simpleMatchASCII.match("TÃV VÃ¤stra"));
+    // > ASCII/Unicode mix
+    EXPECT_FALSE(simpleMatchUniASCII.match("rÃ¤vÄ«oli"));
+    // > Full unicode
+    EXPECT_FALSE(simpleMatchUnicode.match("ä¹å°¾ã®ç"));
+
+    // Assert emoji are treated as word boundaries
+    EXPECT_TRUE(simpleMatchASCII.match(UnicodeEmojiFire + "V" + UnicodeEmojiFire));
+    EXPECT_TRUE(simpleMatchEmoji.match(UnicodeEmojiFire + UnicodeEmojiFox));
+
+    // Assert Unicode case folding does NOT happen (Ã¤ -> a)
+    EXPECT_FALSE(simpleMatchUniASCII.match("rav"));
+}
+
+
+TEST(ExpressionMatchTest, matchRegExUnicode)
+{
+    // Word character (letter and digit) regex, case-insensitive
+    ExpressionMatch simpleMatchSixWordChar =
+            ExpressionMatch(R"(\w{6})",
+                            ExpressionMatch::MatchMode::MatchRegEx, false);
+    // Digit regex, case-insensitive
+    ExpressionMatch simpleMatchThreeDigit =
+            ExpressionMatch(R"(\d{3})",
+                            ExpressionMatch::MatchMode::MatchRegEx, false);
+    ExpressionMatch simpleMatchAnyDigit =
+            ExpressionMatch(R"(\d+)",
+                            ExpressionMatch::MatchMode::MatchRegEx, false);
+
+    // Assert valid and not empty
+    ASSERT_TRUE(simpleMatchSixWordChar.isValid());
+    EXPECT_FALSE(simpleMatchSixWordChar.isEmpty());
+    ASSERT_TRUE(simpleMatchThreeDigit.isValid());
+    EXPECT_FALSE(simpleMatchThreeDigit.isEmpty());
+    ASSERT_TRUE(simpleMatchAnyDigit.isValid());
+    EXPECT_FALSE(simpleMatchAnyDigit.isEmpty());
+
+    // Assert ASCII matches
+    EXPECT_TRUE(simpleMatchSixWordChar.match("abc123"));
+    EXPECT_TRUE(simpleMatchThreeDigit.match("123"));
+    EXPECT_TRUE(simpleMatchAnyDigit.match("1"));
+
+    // Assert Unicode matches
+    EXPECT_TRUE(simpleMatchSixWordChar.match("Ã¡wÃ¡à¹2à¹"));
+    //
+    // "Unicode Characters in the 'Number, Decimal Digit' Category"
+    // Thai digits
+    // See https://www.fileformat.info/info/unicode/category/Nd/list.htm
+    EXPECT_TRUE(simpleMatchThreeDigit.match("à¹2à¹"));
+    EXPECT_TRUE(simpleMatchAnyDigit.match("à¹"));
+
+    // Assert wrong content doesn't match
+    EXPECT_FALSE(simpleMatchAnyDigit.match("Ã¡wÃ¡"));
+}
-- 
2.20.1