1 /***************************************************************************
2 * Copyright (C) 2005-2018 by the Quassel Project *
3 * devel@quassel-irc.org *
5 * This program is free software; you can redistribute it and/or modify *
6 * it under the terms of the GNU General Public License as published by *
7 * the Free Software Foundation; either version 2 of the License, or *
8 * (at your option) version 3. *
10 * This program is distributed in the hope that it will be useful, *
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
13 * GNU General Public License for more details. *
15 * You should have received a copy of the GNU General Public License *
16 * along with this program; if not, write to the *
17 * Free Software Foundation, Inc., *
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
19 ***************************************************************************/
21 #include "expressionmatch.h"
26 #include <QStringList>
28 #if QT_VERSION >= 0x050000
29 #include <QRegularExpression>
34 #include "logmessage.h"
// Constructs an expression matcher from a source expression, a match mode and a case-sensitivity
// flag. The visible statements keep the expression and the case-sensitivity flag in members so
// they can be referenced later.
// NOTE(review): the statement storing "mode" and the call that builds the cached regex are not
// visible in this view - presumably "_sourceMode = mode;" and "cacheRegEx();"; confirm.
36 ExpressionMatch::ExpressionMatch(const QString &expression, MatchMode mode, bool caseSensitive)
38 // Store the original parameters for later reference
39 _sourceExpression = expression;
41 _sourceCaseSensitive = caseSensitive;
43 // Calculate the internal regex
45 // Do this now instead of on-demand to provide immediate feedback on errors when editing
46 // highlight and ignore rules.
// Tests "string" against the cached match rules.
//
// Order of evaluation, as described by the comments below:
//   1. Empty source expression: the outcome is governed by "matchEmpty".
//   2. Invalid rule: cannot match.
//   3. Active, valid inverted rule: if it matches, the rule as a whole cannot match.
//   4. Active, valid normal rule: the result is whether it (partially) matches "string".
//   5. Only inverted rules exist and none matched: counted as a match (implicit wildcard).
//
// string:     text to test
// matchEmpty: whether an empty source expression should count as a match
51 bool ExpressionMatch::match(const QString &string, bool matchEmpty) const
53 // Handle empty expression strings
54 if (_sourceExpressionEmpty) {
55 // Match found if matching empty is allowed, otherwise no match found
60 // Can't match on an invalid rule
64 // We have "_matchRegEx", "_matchInvertRegEx", or both due to isValid() check above
66 // If specified, first check inverted rules
67 if (_matchInvertRegExActive && _matchInvertRegEx.isValid()) {
68 // Check inverted match rule
70 // See _matchRegEx section below for explanations of QRegExp vs. QRegularExpression
72 #if QT_VERSION >= 0x050000
73 _matchInvertRegEx.match(string).hasMatch()
75 _matchInvertRegEx.indexIn(string) != -1
78 // Inverted rule matched, the rest of the rule cannot match
83 if (_matchRegExActive && _matchRegEx.isValid()) {
84 // Check regular match rule
85 #if QT_VERSION >= 0x050000
86 // QRegularExpression does partial matching by default (exact matching requires anchoring
87 // expressions to be added)
88 // See https://doc.qt.io/qt-5/qregularexpression.html#porting-from-qregexp-exactmatch
89 return _matchRegEx.match(string).hasMatch();
91 // QRegExp partial matching is done via indexIn
92 // See https://doc.qt.io/qt-5/qregexp.html#indexIn
93 return (_matchRegEx.indexIn(string) != -1);
96 // If no valid regular rules exist, due to the isValid() check there must be valid inverted
97 // rules that did not match. Count this as properly matching (implicit wildcard).
// Normalises the whitespace of a multi-wildcard rule without converting it to a regex.
//
// The rule is walked character by character, tracking runs of consecutive "\" so that escaped
// separators ("\;") are kept while unescaped ";" and newlines split the rule into components.
// Each component is whitespace-trimmed, empty components are dropped, and the components are
// re-emitted followed by the separator that ended them ("; " or a newline). Any trailing "; "
// and trailing whitespace are removed from the final result.
//
// originalRule: the multi-wildcard rule as entered
// Returns the rule with per-component whitespace trimmed.
103 QString ExpressionMatch::trimMultiWildcardWhitespace(const QString &originalRule)
105 // This gets handled in two steps:
107 // 1. Break apart ";"-separated list into components
108 // 2. Combine whitespace-trimmed components into wildcard expression
110 // Let's start by making the list...
112 // Convert a ";"-separated list into an actual list, splitting on newlines and unescaping
113 // escaped characters
115 // Escaped list rules (where "[\n]" represents newline):
121 // \\; | Split (keep as "\\")
122 // \\\ | Keep as "\\" + "\", set consecutive slashes to 1
124 // \[\n] | Split (keep as "\")
125 // \\[\n] | Split (keep as "\\")
126 // ... | Keep as "..."
127 // \... | Keep as "\..."
128 // \\... | Keep as "\\..."
130 // Strings are forced to end with "\n", always applying "\..." and "\\..." rules
131 // "..." also includes another "\" character
133 // All whitespace is trimmed from each component
135 // "\\" and "\" are not downconverted to allow for other escape codes to be detected in
136 // ExpressionMatch::wildcardToRegEx
141 // norm; norm-space ; newline-space [\n] ;escape \; sep ; slash-end-split\\; quad\\\\norm;
142 // newline-split-slash\\[\n] slash-at-end\\ [line does not continue]
149 // slash-end-split\\ [line does not continue]
151 // newline-split-slash\\ [line does not continue]
152 // slash-at-end\\ [line does not continue]
154 // > Trimmed wildcard rule
155 // norm; norm-space; newline-space[\n]escape \; sep; slash-end-split\\; quad\\\\norm;
156 // newline-split-slash\\[\n]slash-at-end\\ [line does not continue]
158 // (Newlines are encoded as "[\n]". Ignore linebreaks for the sake of comment wrapping.)
160 // Note: R"(\\)" results in the literal of "\\", two backslash characters. Anything inside the
161 // brackets is treated as a literal. Outside the brackets but inside the quotes is still
164 // See https://en.cppreference.com/w/cpp/language/string_literal
168 QString rule(originalRule);
170 // Force a termination at the end of the string to trigger a split
171 // Don't check for ";" splits as they may be escaped
172 if (!rule.endsWith("\n")) {
181 QString curString = {};
183 int sourceLength = rule.length();
184 // Consecutive "\" characters
185 int consecutiveSlashes = 0;
187 // Reserve the source length as an initial capacity estimate for the result.
// NOTE(review): the result is not guaranteed to be the same length or smaller - every ";" split
// is re-emitted as "; ", so e.g. "a;b" grows to "a; b". reserve() is only a capacity hint, so
// this affects allocation behaviour, not correctness.
188 result.reserve(sourceLength);
190 // For every character...
191 for (int i = 0; i < sourceLength; i++) {
193 curChar = rule.at(i);
194 // Check if it's on the list of special list characters, converting to Unicode for use
195 // in the switch statement
197 // See https://doc.qt.io/qt-5/qchar.html#unicode
198 switch (curChar.unicode()) {
201 switch (consecutiveSlashes) {
206 // "\\;" -> Split (keep as "\\")
207 // Not escaped separator, split into a new item
209 // Apply the additional "\\" if needed
210 if (consecutiveSlashes == 2) {
211 // "\\;" -> Split (keep as "\\")
212 curString.append(R"(\\)");
215 // Remove any whitespace, e.g. "item1; item2" -> " item2" -> "item2"
216 curString = curString.trimmed();
219 if (!curString.isEmpty()) {
220 // Add to list with the same separator used
221 result.append(curString + "; ");
223 // Reset the current list item
227 // "\;" -> Keep as "\;"
228 curString.append(R"(\;)");
231 // This shouldn't ever happen (even with invalid wildcard rules), log a warning
232 qWarning() << Q_FUNC_INFO << "Wildcard rule"
233 << rule << "resulted in rule component"
234 << curString << "with unexpected count of consecutive '\\' ("
235 << consecutiveSlashes << "), ignoring" << curChar << "character!";
238 consecutiveSlashes = 0;
242 // Increase consecutive slash count
243 consecutiveSlashes++;
244 // Check if we've reached "\\\"...
245 if (consecutiveSlashes == 3) {
246 // "\\\" -> Keep as "\\" + "\"
247 curString.append(R"(\\)");
248 // Set consecutive slashes to 1, recognizing the trailing "\"
249 consecutiveSlashes = 1;
251 else if (consecutiveSlashes > 3) {
252 // This shouldn't ever happen (even with invalid wildcard rules), log a warning
253 qWarning() << Q_FUNC_INFO << "Wildcard rule"
254 << rule << "resulted in rule component"
255 << curString << "with unexpected count of consecutive '\\' ("
256 << consecutiveSlashes << "), ignoring" << curChar << "character!";
262 // Preserve the characters as they are now
265 // "\[\n]" -> Split (keep as "\")
266 // "\\[\n]" -> Split (keep as "\\")
268 switch (consecutiveSlashes) {
274 // Apply the additional "\" or "\\"
275 curString.append(QString(R"(\)").repeated(consecutiveSlashes));
278 // This shouldn't ever happen (even with invalid wildcard rules), log a warning
279 qWarning() << Q_FUNC_INFO << "Wildcard rule"
280 << rule << "resulted in rule component"
281 << curString << "with unexpected count of consecutive '\\' ("
282 << consecutiveSlashes << "), applying newline split anyways!";
286 // Remove any whitespace, e.g. "item1; item2" -> " item2" -> "item2"
287 curString = curString.trimmed();
290 if (!curString.isEmpty()) {
291 // Add to list with the same separator used
292 result.append(curString + "\n");
294 // Reset the current list item
296 consecutiveSlashes = 0;
299 // Preserve the characters as they are now
300 switch (consecutiveSlashes) {
302 // "..." -> Keep as "..."
303 curString.append(curChar);
307 // "\..." -> Keep as "\..."
308 // "\\..." -> Keep as "\\..."
309 curString.append(QString("\\").repeated(consecutiveSlashes) + curChar);
312 // This shouldn't ever happen (even with invalid wildcard rules), log a warning
313 qWarning() << Q_FUNC_INFO << "Wildcard rule"
314 << rule << "resulted in rule component"
315 << curString << "with unexpected count of consecutive '\\' ("
316 << consecutiveSlashes << "), ignoring " << curChar << "char escape!";
319 consecutiveSlashes = 0;
324 // Remove any trailing separators
325 if (result.endsWith("; ")) {
329 // Remove any trailing whitespace
330 return result.trimmed();
// Rebuilds the cached regular expressions from the stored source parameters.
//
// Both "active" flags are cleared first. An empty source expression needs no regex. Otherwise
// the expression is converted according to "_sourceMode":
//   MatchPhrase        - escaped phrase wrapped in "(?:^|\W)" ... "(?:\W|$)" boundaries
//   MatchMultiPhrase   - delegated to convertFromMultiPhrase()
//   MatchWildcard      - wildcardToRegEx() result anchored with "^" ... "$"; a leading "!"
//                        selects the inverted regex, a leading "\!" is a literal "!"
//   MatchMultiWildcard - delegated to generateFromMultiWildcard(), which sets the members itself
//   MatchRegEx         - expression used as-is; leading "!" / "\!" handled as for MatchWildcard
// If the resulting rule is not valid, an info-level message is logged.
334 void ExpressionMatch::cacheRegEx()
336 _matchRegExActive = false;
337 _matchInvertRegExActive = false;
339 _sourceExpressionEmpty = _sourceExpression.isEmpty();
340 if (_sourceExpressionEmpty) {
341 // No need to calculate anything for empty strings
345 // Convert the given expression to a regular expression based on the mode
346 switch (_sourceMode) {
347 case MatchMode::MatchPhrase:
348 // Match entire phrase, noninverted
349 // Don't trim whitespace for phrase matching as someone might want to match on " word ", a
350 // more-specific request than "word".
351 _matchRegEx = regExFactory("(?:^|\\W)" + regExEscape(_sourceExpression) + "(?:\\W|$)",
352 _sourceCaseSensitive);
353 _matchRegExActive = true;
355 case MatchMode::MatchMultiPhrase:
356 // Match multiple entire phrases, noninverted
357 // Convert from multiple-phrase rules
358 _matchRegEx = regExFactory(convertFromMultiPhrase(_sourceExpression), _sourceCaseSensitive);
359 _matchRegExActive = true;
361 case MatchMode::MatchWildcard:
362 // Match as wildcard expression
363 // Convert from wildcard rules for a single wildcard
364 if (_sourceExpression.startsWith("!")) {
365 // Inverted rule: take the remainder of the string
366 // "^" + invertComponents.at(0) + "$"
// NOTE(review): "invertComponents" is not a variable in this function; the line above appears
// to be a leftover describing the anchoring pattern applied below.
367 _matchInvertRegEx = regExFactory("^" + wildcardToRegEx(_sourceExpression.mid(1)) + "$",
368 _sourceCaseSensitive);
369 _matchInvertRegExActive = true;
372 // Normal rule: take the whole string
373 // Account for any escaped "!" (i.e. "\!") by skipping past the "\", but don't skip past
374 // escaped "\" (i.e. "\\!")
376 regExFactory("^" + wildcardToRegEx(_sourceExpression.startsWith("\\!")
377 ? _sourceExpression.mid(1)
378 : _sourceExpression) + "$",
379 _sourceCaseSensitive);
380 _matchRegExActive = true;
383 case MatchMode::MatchMultiWildcard:
384 // Match as multiple wildcard expressions
385 // Convert from wildcard rules for multiple wildcards
386 // (The generator function handles setting matchRegEx/matchInvertRegEx)
387 generateFromMultiWildcard(_sourceExpression, _sourceCaseSensitive);
389 case MatchMode::MatchRegEx:
390 // Match as regular expression
391 if (_sourceExpression.startsWith("!")) {
392 // Inverted rule: take the remainder of the string
393 _matchInvertRegEx = regExFactory(_sourceExpression.mid(1), _sourceCaseSensitive);
394 _matchInvertRegExActive = true;
397 // Normal rule: take the whole string
398 // Account for any escaped "!" (i.e. "\!") by skipping past the "\", but don't skip past
399 // escaped "\" (i.e. "\\!")
401 regExFactory(_sourceExpression.startsWith("\\!") ? _sourceExpression.mid(1)
403 _sourceCaseSensitive);
404 _matchRegExActive = true;
408 // This should never happen if you keep the above consistent
409 qWarning() << Q_FUNC_INFO << "Unknown MatchMode" << (int)_sourceMode << "!";
413 if (!_sourceExpressionEmpty && !isValid()) {
414 // This can happen with invalid regex, so make it a bit more user-friendly. Set it to Info
415 // level as ideally someone's not just going to leave a broken match rule around. For
416 // MatchRegEx, they probably need to fix their regex rule. For the other modes, there's
417 // probably a bug in the parsing routines (which should also be fixed).
418 quInfo() << "Could not parse expression match rule"
419 << _sourceExpression << "(match mode:" << (int)_sourceMode
420 << "), this rule will be ignored";
// Builds the regular expression object used for matching: QRegularExpression when
// QT_VERSION >= 0x050000, QRegExp otherwise. Case sensitivity is applied through the pattern
// option / Qt::CaseSensitivity argument. An invalid pattern is reported at debug level.
//
// caseSensitive: true for case-sensitive matching, false for case-insensitive matching
//
// NOTE(review): the pattern parameter is spelled "®ExString" in the two signature lines below
// but "regExString" everywhere in the body. This looks like a mis-decoded "&regExString"
// ("&reg" turned into the registered-trademark sign) - confirm and repair in the real file.
425 #if QT_VERSION >= 0x050000
426 QRegularExpression ExpressionMatch::regExFactory(const QString ®ExString,
429 QRegExp ExpressionMatch::regExFactory(const QString ®ExString, bool caseSensitive)
432 // Construct the regular expression object, setting case sensitivity as appropriate
433 #if QT_VERSION >= 0x050000
434 QRegularExpression newRegEx =
435 QRegularExpression(regExString, caseSensitive ?
436 QRegularExpression::PatternOption::NoPatternOption
437 : QRegularExpression::PatternOption::CaseInsensitiveOption );
439 QRegExp newRegEx = QRegExp(regExString, caseSensitive ?
440 Qt::CaseSensitivity::CaseSensitive
441 : Qt::CaseSensitivity::CaseInsensitive);
444 // Check if rule is valid
445 if (!newRegEx.isValid()) {
446 // This can happen with invalid regex, so make it a bit more user-friendly. Keep this
447 // distinct from the main info-level message for easier debugging in case a regex component
448 // in Wildcard or Phrase mode breaks.
449 qDebug() << "Internal regular expression component" << regExString
450 << "is invalid and will be ignored";
452 // Qt 5.4 (QT_VERSION >= 0x050400) offers explicit control over when QRegularExpression objects
453 // get optimized. By default, patterns are only optimized after some number of uses as defined
454 // within Qt internals.
456 // In the context of ExpressionMatch, some regular expressions might go unused, e.g. a highlight
457 // rule might never match a channel pattern, resulting in the contents pattern being untouched.
458 // It should be safe to let Qt handle optimization, taking a non-deterministic, one-off
459 // performance penalty on optimization for the sake of saving memory usage on patterns that
462 // If profiling shows expressions are generally used and/or the automatic optimization
463 // incurs too high of a penalty (unlikely given we've created regular expression
464 // objects willy-nilly before now), this can be revisited to explicitly call...
467 // // Optimize regex now
468 // #if QT_VERSION >= 0x050400
469 // newRegEx.optimize();
474 // NOTE: This should only be called if the expression is valid! Apply within an "else" of the
475 // inverted isValid() check above.
477 // See https://doc.qt.io/qt-5/qregularexpression.html#optimize
// Returns "phrase" with regular-expression special characters escaped, delegating to the
// escape() function of whichever regex class is in use for the current Qt version
// (QRegularExpression when QT_VERSION >= 0x050000, QRegExp otherwise).
483 QString ExpressionMatch::regExEscape(const QString &phrase)
485 // Escape the given phrase of any special regular expression characters
486 #if QT_VERSION >= 0x050000
487 return QRegularExpression::escape(phrase);
489 return QRegExp::escape(phrase);
// Converts a newline-separated list of phrases into a single regular expression string.
//
// Each non-empty line is regex-escaped (whitespace is deliberately not trimmed) and the escaped
// phrases are combined as alternatives between "(?:^|\W)" and "(?:\W|$)" boundaries.
//
// originalRule: phrases separated by "\n"
// Returns the regular expression pattern as a string.
//
// NOTE(review): if "originalRule" holds only newlines, "components" stays empty and the pattern
// built from the join() below contains an empty alternation group - confirm whether callers
// can pass such a rule and whether that is the intended behaviour.
494 QString ExpressionMatch::convertFromMultiPhrase(const QString &originalRule)
496 // Convert the multi-phrase rule into regular expression format
497 // Split apart the original rule into components
498 // Use QStringList instead of std::vector<QString> to make use of Qt's built-in .join() method
499 QStringList components = {};
// NOTE(review): newer Qt 5 releases provide Qt::SkipEmptyParts as the replacement for
// QString::SkipEmptyParts - verify against the minimum supported Qt version before changing.
501 for (auto &&component : originalRule.split("\n", QString::SkipEmptyParts)) {
502 // Don't trim whitespace to maintain consistency with single phrase matching
503 // As trimming is not performed, empty components will already be skipped. This means " "
504 // is considered a valid matching phrase.
506 // Take the whole string, escaping any regex
507 components.append(regExEscape(component));
510 // Create full regular expression by...
511 // > Enclosing within a non-capturing group to avoid overhead of text extraction, "(?:...)"
512 // > Flattening the phrases using the regex OR character "...|..."
514 // Before: [foo, bar, baz]
515 // After: (?:^|\W)(?:foo|bar|baz)(?:\W|$)
517 if (components.count() == 1) {
518 // Single item, skip the noncapturing group
519 return "(?:^|\\W)" + components.at(0) + "(?:\\W|$)";
522 return "(?:^|\\W)(?:" + components.join("|") + ")(?:\\W|$)";
// Builds the normal and inverted match regexes from a multi-wildcard rule.
//
// The rule is walked character by character, tracking runs of consecutive "\" so that escaped
// separators and escaped "!" are distinguished from real ones. Unescaped ";" and newlines split
// the rule into components; each component is whitespace-trimmed, skipped if empty, converted
// with wildcardToRegEx() and sorted into the normal or inverted list. Duplicates are removed,
// and each non-empty list is joined with "|" and anchored with "^" ... "$" before being passed
// to regExFactory(). The matching "active" flag is set for each regex that gets created.
//
// originalRule:  the multi-wildcard rule as entered
// caseSensitive: forwarded to regExFactory() for both regexes
527 void ExpressionMatch::generateFromMultiWildcard(const QString &originalRule, bool caseSensitive)
529 // Convert the wildcard rule into regular expression format
530 // First, reset the existing match expressions
532 _matchInvertRegEx = {};
533 _matchRegExActive = false;
534 _matchInvertRegExActive = false;
536 // This gets handled in three steps:
538 // 1. Break apart ";"-separated list into components
539 // 2. Convert components from wildcard format into regular expression format
540 // 3. Combine normal/invert components into normal/invert regular expressions
542 // Let's start by making the list...
544 // Convert a ";"-separated list into an actual list, splitting on newlines and unescaping
545 // escaped characters
547 // Escaped list rules (where "[\n]" represents newline):
552 // \; | Replace with ";"
553 // \\; | Split (keep as "\\")
554 // ! | At start: mark as inverted
555 // \! | At start: replace with "!"
556 // \\! | At start: keep as "\\!" (replaced with "\!" in wildcard conversion)
557 // ! | Elsewhere: keep as "!"
558 // \! | Elsewhere: keep as "\!"
559 // \\! | Elsewhere: keep as "\\!"
560 // \\\ | Keep as "\\" + "\", set consecutive slashes to 1
562 // \[\n] | Split (keep as "\")
563 // \\[\n] | Split (keep as "\\")
564 // ... | Keep as "..."
565 // \... | Keep as "\..."
566 // \\... | Keep as "\\..."
568 // Strings are forced to end with "\n", always applying "\..." and "\\..." rules
569 // "..." also includes another "\" character
571 // All whitespace is trimmed from each component
573 // "\\" and "\" are not downconverted to allow for other escape codes to be detected in
574 // ExpressionMatch::wildcardToRegEx
580 // norm;!invert; norm-space ; !invert-space ;;!;\!norm-escaped;\\!slash-invert;\\\\double;
581 // escape\;sep;slash-end-split\\;quad\\\\!noninvert;newline-split[\n]newline-split-slash\\[\n]
582 // slash-at-end\\ [line does not continue]
584 // (Newlines are encoded as "[\n]". Ignore linebreaks for the sake of comment wrapping.)
587 // > Normal components without wildcard conversion
594 // slash-end-split\\ [line does not continue]
595 // quad\\\\!noninvert
597 // newline-split-slash\\ [line does not continue]
598 // slash-at-end\\ [line does not continue]
600 // > Inverted components without wildcard conversion
605 // > Normal components with wildcard conversion
612 // slash\-end\-split\\ [line does not continue]
613 // quad\\\\\!noninvert
615 // newline\-split\-slash\\ [line does not continue]
616 // slash\-at\-end\\ [line does not continue]
618 // > Inverted components with wildcard conversion
623 // > Normal wildcard-converted regex
624 // ^(?:norm|norm\-space|\!norm\-escaped|\\\!slash\-invert|\\\\double|escape\;sep|
625 // slash\-end\-split\\|quad\\\\\!noninvert|newline\-split|newline\-split\-slash\\|
626 // slash\-at\-end\\)$
628 // > Inverted wildcard-converted regex
629 // ^(?:invert|invert\-space)$
631 // Note: R"(\\)" results in the literal of "\\", two backslash characters. Anything inside the
632 // brackets is treated as a literal. Outside the brackets but inside the quotes is still
635 // See https://en.cppreference.com/w/cpp/language/string_literal
639 QString rule(originalRule);
641 // Force a termination at the end of the string to trigger a split
642 // Don't check for ";" splits as they may be escaped
643 if (!rule.endsWith("\n")) {
647 // Result, sorted into normal and inverted rules
648 // Use QStringList instead of std::vector<QString> to make use of Qt's built-in .join() method
649 QStringList normalComponents = {}, invertComponents = {};
654 QString curString = {};
656 int sourceLength = rule.length();
657 // Consecutive "\" characters
658 int consecutiveSlashes = 0;
659 // Whether or not this marks an inverted rule
660 bool isInverted = false;
661 // Whether or not we're at the beginning of the rule (for detecting "!" and "\!")
662 bool isRuleStart = true;
664 // Reserve ";"-count items for both lists as a rough capacity estimate. Newline splits add
// components as well, so the final count may exceed this; reserve() is only a hint.
665 // Without parsing it's not easily possible to tell which are escaped or not, and among the
666 // non-escaped entries, which are inverted or not. These get destroyed once out of scope of
667 // this function, so balancing towards performance over memory usage should be okay, hopefully.
668 int separatorCount = rule.count(";");
669 normalComponents.reserve(separatorCount);
670 invertComponents.reserve(separatorCount);
672 // For every character...
673 for (int i = 0; i < sourceLength; i++) {
675 curChar = rule.at(i);
676 // Check if it's on the list of special list characters, converting to Unicode for use
677 // in the switch statement
679 // See https://doc.qt.io/qt-5/qchar.html#unicode
680 switch (curChar.unicode()) {
683 switch (consecutiveSlashes) {
688 // "\\;" -> Split (keep as "\\")
689 // Not escaped separator, split into a new item
691 // Apply the additional "\\" if needed
692 if (consecutiveSlashes == 2) {
693 // "\\;" -> Split (keep as "\\")
694 curString.append(R"(\\)");
697 // Remove any whitespace, e.g. "item1; item2" -> " item2" -> "item2"
698 curString = curString.trimmed();
701 if (!curString.isEmpty()) {
702 // Add to inverted/normal list
704 invertComponents.append(wildcardToRegEx(curString));
707 normalComponents.append(wildcardToRegEx(curString));
710 // Reset the current list item
716 // "\;" -> Replace with ";"
717 curString.append(";");
721 // This shouldn't ever happen (even with invalid wildcard rules), log a warning
722 qWarning() << Q_FUNC_INFO << "Wildcard rule"
723 << rule << "resulted in rule component"
724 << curString << "with unexpected count of consecutive '\\' ("
725 << consecutiveSlashes << "), ignoring" << curChar << "character!";
729 consecutiveSlashes = 0;
732 // Rule inverter found
734 // Apply the inverting logic
735 switch (consecutiveSlashes) {
737 // "!" -> At start: mark as inverted
739 // Don't include the "!" character
742 // "\!" -> At start: replace with "!"
743 curString.append("!");
746 // "\\!" -> At start: keep as "\\!" (replaced with "\!" in wildcard conversion)
747 curString.append(R"(\\!)");
750 // This shouldn't ever happen (even with invalid wildcard rules), log a warning
751 qWarning() << Q_FUNC_INFO << "Wildcard rule"
752 << rule << "resulted in rule component"
753 << curString << "with unexpected count of consecutive '\\' ("
754 << consecutiveSlashes << "), ignoring" << curChar << "character!";
759 // Preserve the characters as they are now
760 switch (consecutiveSlashes) {
762 // "!" -> Elsewhere: keep as "!"
763 curString.append("!");
767 // "\!" -> Elsewhere: keep as "\!"
768 // "\\!" -> Elsewhere: keep as "\\!"
769 curString.append(QString(R"(\)").repeated(consecutiveSlashes) + "!");
772 // This shouldn't ever happen (even with invalid wildcard rules), log a warning
773 qWarning() << Q_FUNC_INFO << "Wildcard rule"
774 << rule << "resulted in rule component"
775 << curString << "with unexpected count of consecutive '\\' ("
776 << consecutiveSlashes << "), ignoring" << curChar << "character!";
781 consecutiveSlashes = 0;
785 // Increase consecutive slash count
786 consecutiveSlashes++;
787 // Check if we've reached "\\\"...
788 if (consecutiveSlashes == 3) {
789 // "\\\" -> Keep as "\\" + "\"
790 curString.append(R"(\\)");
791 // No longer at the rule start
793 // Set consecutive slashes to 1, recognizing the trailing "\"
794 consecutiveSlashes = 1;
796 else if (consecutiveSlashes > 3) {
797 // This shouldn't ever happen (even with invalid wildcard rules), log a warning
798 qWarning() << Q_FUNC_INFO << "Wildcard rule"
799 << rule << "resulted in rule component"
800 << curString << "with unexpected count of consecutive '\\' ("
801 << consecutiveSlashes << "), ignoring" << curChar << "character!";
804 // Don't set "isRuleStart" here as "\" is used in escape sequences
808 // Preserve the characters as they are now
811 // "\[\n]" -> Split (keep as "\")
812 // "\\[\n]" -> Split (keep as "\\")
814 switch (consecutiveSlashes) {
820 // Apply the additional "\" or "\\"
821 curString.append(QString(R"(\)").repeated(consecutiveSlashes));
824 // This shouldn't ever happen (even with invalid wildcard rules), log a warning
825 qWarning() << Q_FUNC_INFO << "Wildcard rule"
826 << rule << "resulted in rule component"
827 << curString << "with unexpected count of consecutive '\\' ("
828 << consecutiveSlashes << "), applying newline split anyways!";
832 // Remove any whitespace, e.g. "item1; item2" -> " item2" -> "item2"
833 curString = curString.trimmed();
836 if (!curString.isEmpty()) {
837 // Add to inverted/normal list
839 invertComponents.append(wildcardToRegEx(curString));
842 normalComponents.append(wildcardToRegEx(curString));
845 // Reset the current list item
849 consecutiveSlashes = 0;
852 // Preserve the characters as they are now
853 switch (consecutiveSlashes) {
855 // "..." -> Keep as "..."
856 curString.append(curChar);
860 // "\..." -> Keep as "\..."
861 // "\\..." -> Keep as "\\..."
862 curString.append(QString("\\").repeated(consecutiveSlashes) + curChar);
865 // This shouldn't ever happen (even with invalid wildcard rules), log a warning
866 qWarning() << Q_FUNC_INFO << "Wildcard rule"
867 << rule << "resulted in rule component"
868 << curString << "with unexpected count of consecutive '\\' ("
869 << consecutiveSlashes << "), ignoring " << curChar << "char escape!";
872 // Don't mark as past rule start for whitespace (whitespace gets trimmed)
873 if (!curChar.isSpace()) {
876 consecutiveSlashes = 0;
881 // Clean up any duplicates
882 normalComponents.removeDuplicates();
883 invertComponents.removeDuplicates();
885 // Create full regular expressions by...
886 // > Anchoring to start and end of string to mimic QRegExp's .exactMatch() handling, "^...$"
887 // > Enclosing within a non-capturing group to avoid overhead of text extraction, "(?:...)"
888 // > Flattening normal and inverted rules using the regex OR character "...|..."
890 // Before: [foo, bar, baz]
891 // After: ^(?:foo|bar|baz)$
893 // See https://doc.qt.io/qt-5/qregularexpression.html#porting-from-qregexp-exactmatch
894 // And https://regex101.com/
896 // Any empty/invalid regex are handled within ExpressionMatch::match()
897 if (!normalComponents.isEmpty()) {
898 // Create normal match regex
899 if (normalComponents.count() == 1) {
900 // Single item, skip the noncapturing group
901 _matchRegEx = regExFactory("^" + normalComponents.at(0) + "$", caseSensitive);
904 _matchRegEx = regExFactory("^(?:" + normalComponents.join("|") + ")$", caseSensitive);
906 _matchRegExActive = true;
908 if (!invertComponents.isEmpty()) {
909 // Create invert match regex
910 if (invertComponents.count() == 1) {
911 // Single item, skip the noncapturing group
912 _matchInvertRegEx = regExFactory("^" + invertComponents.at(0) + "$", caseSensitive);
916 regExFactory("^(?:" + invertComponents.join("|") + ")$", caseSensitive);
918 _matchInvertRegExActive = true;
// Converts a single wildcard expression into an (unanchored) regular expression string.
//
// The expression is first regex-escaped via regExEscape(); the escaped text is then walked
// character by character, counting consecutive "\" to decide whether a "*" or "?" was a
// wildcard (becoming ".*" / ".") or was escaped by the user (staying a literal), and to
// collapse escaped backslashes. See the tables below for the exact mapping.
//
// expression: the wildcard expression, without any leading "!" inversion marker handling
// Returns the regular expression pattern; callers add the "^" ... "$" anchors.
923 QString ExpressionMatch::wildcardToRegEx(const QString &expression)
925 // Convert the wildcard expression into regular expression format
927 // We're taking a little bit different of a route...
929 // Original QRegExp::Wildcard rules:
930 // --------------------------
931 // Wildcard | Regex | Outcome
932 // ---------|-------|--------
933 // * | .* | zero or more of any character
934 // ? | . | any single character
936 // NOTE 1: This is QRegExp::Wildcard, not QRegExp::WildcardUnix
938 // NOTE 2: We are ignoring the "[...]" character-class matching functionality of
939 // QRegExp::Wildcard as that feature's a bit more complex and can be handled with full-featured
942 // See https://doc.qt.io/qt-5/qregexp.html#wildcard-matching
944 // Quassel originally did not use QRegExp::WildcardUnix, which prevented escaping "*" and "?" in
945 // messages. Unfortunately, spam messages might decide to use both, so offering a way to escape
948 // On the flip-side, that means to match "\" requires escaping as "\\", breaking backwards
951 // Quassel's Wildcard rules
952 // ------------------------------------------
953 // Wildcard | Regex escaped | Regex | Outcome
954 // ---------|---------------|-------|--------
955 // * | \* | .* | zero or more of any character
956 // ? | \? | . | any single character
957 // \* | \\\* | \* | literal "*"
958 // \? | \\\? | \? | literal "?"
959 // \[...] | \\[...] | [...] | invalid escape, ignore it
960 // \\ | \\\\ | \\ | literal "\"
962 // In essence, "*" and "?" need to be changed only if not escaped, "\\" collapses into "\", "\"
963 // gets ignored; other characters escape normally.
968 // never?gonna*give\*you\?up\\test|y\yeah\\1\\\\2\\\1inval
970 // ("\\\\" represents "\\", "\\" represents "\", and "\\\" is valid+invalid, "\")
972 // > Regex escaped wildcard rule
973 // never\?gonna\*give\\\*you\\\?up\\\\test\|y\\yeah\\\\1\\\\\\\\2\\\\\\1inval
975 // > Expected correct regex
976 // never.gonna.*give\*you\?up\\test\|yyeah\\1\\\\2\\1inval
978 // > Undoing regex escaping of "\" as "\\" (i.e. simple replace, with special escapes intact)
979 // never.gonna.*give\*you\?up\test\|yyeah\1\\2\1inval
981 // Escape string according to regex
982 QString regExEscaped(regExEscape(expression));
986 // NOTE: In theory, regular expression lookbehind could solve this. Unfortunately, QRegExp does
987 // not support lookbehind, and it's theoretically inefficient, anyways. Just use an approach
988 // similar to that taken by QRegExp's official wildcard mode.
990 // Lookbehind example (that we can't use):
991 // (?<!abc)test Negative lookbehind - don't match if "test" is preceded by "abc"
993 // See https://code.qt.io/cgit/qt/qtbase.git/tree/src/corelib/tools/qregexp.cpp
995 // NOTE: We don't copy QRegExp's mode as QRegularExpression has more special characters. We
996 // can't use the same escaping code, hence calling the appropriate QReg[...]::escape() above.
1001 QString result = {};
1002 // Current character
1005 int sourceLength = regExEscaped.length();
1006 // Consecutive "\" characters
1007 int consecutiveSlashes = 0;
1009 // We know it's going to be the same length or smaller, so reserve the same size as the string
1010 result.reserve(sourceLength);
1012 // For every character...
1013 for (int i = 0; i < sourceLength; i++) {
1014 // Get the character
1015 curChar = regExEscaped.at(i);
1016 // Check if it's on the list of special wildcard characters, converting to Unicode for use
1017 // in the switch statement
1019 // See https://doc.qt.io/qt-5/qchar.html#unicode
1020 switch (curChar.unicode()) {
1023 switch (consecutiveSlashes) {
1025 // "?" -> "\?" -> "."
1026 // Convert from regex escaped "?" to regular expression
1030 // "\?" -> "\\\?" -> "\?"
1031 // Convert from regex escaped "\?" to literal string
1032 result.append(R"(\?)");
1035 // This shouldn't ever happen (even with invalid wildcard rules), log a warning
1036 qWarning() << Q_FUNC_INFO << "Wildcard rule"
1037 << expression << "resulted in escaped regular expression string"
1038 << regExEscaped << " with unexpected count of consecutive '\\' ("
1039 << consecutiveSlashes << "), ignoring" << curChar << "character!";
1042 consecutiveSlashes = 0;
1046 switch (consecutiveSlashes) {
1048 // "*" -> "\*" -> ".*"
1049 // Convert from regex escaped "*" to regular expression
1050 result.append(".*");
1053 // "\*" -> "\\\*" -> "\*"
1054 // Convert from regex escaped "\*" to literal string
1055 result.append(R"(\*)");
1058 // This shouldn't ever happen (even with invalid wildcard rules), log a warning
1059 qWarning() << Q_FUNC_INFO << "Wildcard rule"
1060 << expression << "resulted in escaped regular expression string"
1061 << regExEscaped << " with unexpected count of consecutive '\\' ("
1062 << consecutiveSlashes << "), ignoring" << curChar << "character!";
1065 consecutiveSlashes = 0;
1069 // Increase consecutive slash count
1070 consecutiveSlashes++;
1071 // Check if we've hit an escape sequence
1072 if (consecutiveSlashes == 4) {
1073 // "\\" -> "\\\\" -> "\\"
1074 // Convert from regex escaped "\\" to literal string
1075 result.append(R"(\\)");
1076 // Reset slash count
1077 consecutiveSlashes = 0;
1081 // Any other character
1082 switch (consecutiveSlashes) {
1085 // "[...]" -> "[...]" -> "[...]"
1087 // "\[...]" -> "\\[...]" -> "[...]"
1088 // Either just print the character itself, or convert from regex-escaped invalid
1089 // wildcard escape sequence to the character itself
1091 // Both mean doing nothing, the actual character [...] gets appended below
1094 // "[...]" -> "\[...]" -> "\[...]"
1095 // Keep regex-escaped special character "[...]" as literal string
1096 // (Where "[...]" represents any non-wildcard regex special character)
1097 result.append(R"(\)");
1098 // The actual character [...] gets appended below
1101 // This shouldn't ever happen (even with invalid wildcard rules), log a warning
1102 qWarning() << Q_FUNC_INFO << "Wildcard rule"
1103 << expression << "resulted in escaped regular expression string"
1104 << regExEscaped << " with unexpected count of consecutive '\\' ("
1105 << consecutiveSlashes << "), ignoring" << curChar << "char escape!";
1108 consecutiveSlashes = 0;
1109 // Add the character itself
1110 result.append(curChar);
1115 // Anchoring to simulate QRegExp::exactMatch() is handled by the callers,
1116 // ExpressionMatch::cacheRegEx() and ExpressionMatch::generateFromMultiWildcard()