1 /***************************************************************************
2 * Copyright (C) 2005-2022 by the Quassel Project *
3 * devel@quassel-irc.org *
5 * This program is free software; you can redistribute it and/or modify *
6 * it under the terms of the GNU General Public License as published by *
7 * the Free Software Foundation; either version 2 of the License, or *
8 * (at your option) version 3. *
10 * This program is distributed in the hope that it will be useful, *
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
13 * GNU General Public License for more details. *
15 * You should have received a copy of the GNU General Public License *
16 * along with this program; if not, write to the *
17 * Free Software Foundation, Inc., *
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
19 ***************************************************************************/
21 #include "expressionmatch.h"
26 #include <QStringList>
// Builds a matcher for "expression", interpreted according to "mode" and "caseSensitive".
// NOTE(review): only the stores of the expression and the case-sensitivity flag are visible in
// this listing; storing "mode" and triggering the regex calculation are presumably done on
// lines that are not shown - confirm against the full file.
28 ExpressionMatch::ExpressionMatch(const QString& expression, MatchMode mode, bool caseSensitive)
30 // Store the original parameters for later reference
31 _sourceExpression = expression;
33 _sourceCaseSensitive = caseSensitive;
35 // Calculate the internal regex
37 // Do this now instead of on-demand to provide immediate feedback on errors when editing
38 // highlight and ignore rules.
// Tests "string" against the cached rule.
//
// Order of evaluation, as far as visible here:
//   1. An empty source expression is decided by "matchEmpty".
//   2. An invalid rule cannot match.
//   3. An active, valid inverted regex is checked first; if it matches, the rule as a whole
//      cannot match.
//   4. An active, valid normal regex then decides the result.
//   5. With only inverted rules present (and none matching), the rule counts as matching.
// NOTE(review): the return statements for steps 1, 2, 3 and 5 are not visible in this listing;
// the summary for those steps follows the existing comments - confirm.
42 bool ExpressionMatch::match(const QString& string, bool matchEmpty) const
44 // Handle empty expression strings
45 if (_sourceExpressionEmpty) {
46 // Match found if matching empty is allowed, otherwise no match found
51 // Can't match on an invalid rule
55 // We have "_matchRegEx", "_matchInvertRegEx", or both due to isValid() check above
57 // If specified, first check inverted rules
58 if (_matchInvertRegExActive && _matchInvertRegEx.isValid()) {
59 // Check inverted match rule
60 if (_matchInvertRegEx.match(string).hasMatch()) {
61 // Inverted rule matched, the rest of the rule cannot match
66 if (_matchRegExActive && _matchRegEx.isValid()) {
67 // Check regular match rule
68 return _matchRegEx.match(string).hasMatch();
71 // If no valid regular rules exist, due to the isValid() check there must be valid inverted
72 // rules that did not match. Count this as properly matching (implicit wildcard).
// Returns a copy of the multi-wildcard rule "originalRule" with whitespace trimmed from every
// component, leaving escape sequences untouched.
//
// The rule is scanned one character at a time while counting consecutive "\" characters, so
// that escaped separators are told apart from real ones. Each finished, non-empty component is
// trimmed and re-appended to the result followed by the separator that ended it ("; " or
// newline). The final result is trimmed before being returned.
// NOTE(review): the "case" labels and "break" statements of the switch statements are not
// visible in this listing; which character selects which branch is taken from the comments.
77 QString ExpressionMatch::trimMultiWildcardWhitespace(const QString& originalRule)
79 // This gets handled in two steps:
81 // 1. Break apart ";"-separated list into components
82 // 2. Combine whitespace-trimmed components into wildcard expression
84 // Let's start by making the list...
86 // Convert a ";"-separated list into an actual list, splitting on newlines and unescaping
89 // Escaped list rules (where "[\n]" represents newline):
95 // \\; | Split (keep as "\\")
96 // \\\ | Keep as "\\" + "\", set consecutive slashes to 1
98 // \[\n] | Split (keep as "\")
99 // \\[\n] | Split (keep as "\\")
100 // ... | Keep as "..."
101 // \... | Keep as "\..."
102 // \\... | Keep as "\\..."
104 // Strings are forced to end with "\n", always applying "\..." and "\\..." rules
105 // "..." also includes another "\" character
107 // All whitespace is trimmed from each component
109 // "\\" and "\" are not downconverted to allow for other escape codes to be detected in
110 // ExpressionMatch::wildcardToRegex
115 // norm; norm-space ; newline-space [\n] ;escape \; sep ; slash-end-split\\; quad\\\\norm;
116 // newline-split-slash\\[\n] slash-at-end\\ [line does not continue]
123 // slash-end-split\\ [line does not continue]
125 // newline-split-slash\\ [line does not continue]
126 // slash-at-end\\ [line does not continue]
128 // > Trimmed wildcard rule
129 // norm; norm-space; newline-space[\n]escape \; sep; slash-end-split\\; quad\\\\norm;
130 // newline-split-slash\\[\n]slash-at-end\\ [line does not continue]
132 // (Newlines are encoded as "[\n]". Ignore linebreaks for the sake of comment wrapping.)
134 // Note: R"(\\)" results in the literal of "\\", two backslash characters. Anything inside the
135 // brackets is treated as a literal. Outside the brackets but inside the quotes is still
138 // See https://en.cppreference.com/w/cpp/language/string_literal
// Work on a copy so the caller's rule is left untouched
142 QString rule(originalRule);
144 // Force a termination at the end of the string to trigger a split
145 // Don't check for ";" splits as they may be escaped
146 if (!rule.endsWith("\n")) {
// Component currently being accumulated
155 QString curString = {};
157 int sourceLength = rule.length();
158 // Consecutive "\" characters
159 int consecutiveSlashes = 0;
161 // We know it's going to be the same length or smaller, so reserve the same size as the string
162 result.reserve(sourceLength);
164 // For every character...
165 for (int i = 0; i < sourceLength; i++) {
167 curChar = rule.at(i);
168 // Check if it's on the list of special list characters, converting to Unicode for use
169 // in the switch statement
171 // See https://doc.qt.io/qt-5/qchar.html#unicode
172 switch (curChar.unicode()) {
175 switch (consecutiveSlashes) {
180 // "\\;" -> Split (keep as "\\")
181 // Not escaped separator, split into a new item
183 // Apply the additional "\\" if needed
184 if (consecutiveSlashes == 2) {
185 // "\\;" -> Split (keep as "\\")
186 curString.append(R"(\\)");
189 // Remove any whitespace, e.g. "item1; item2" -> " item2" -> "item2"
190 curString = curString.trimmed();
193 if (!curString.isEmpty()) {
194 // Add to list with the same separator used
195 result.append(curString + "; ");
197 // Reset the current list item
201 // "\;" -> Keep as "\;"
202 curString.append(R"(\;)");
205 // This shouldn't ever happen (even with invalid wildcard rules), log a warning
206 qWarning() << Q_FUNC_INFO << "Wildcard rule" << rule << "resulted in rule component" << curString
207 << "with unexpected count of consecutive '\\' (" << consecutiveSlashes << "), ignoring" << curChar
211 consecutiveSlashes = 0;
215 // Increase consecutive slash count
216 consecutiveSlashes++;
217 // Check if we've reached "\\\"...
218 if (consecutiveSlashes == 3) {
219 // "\\\" -> Keep as "\\" + "\"
220 curString.append(R"(\\)");
221 // Set consecutive slashes to 1, recognizing the trailing "\"
222 consecutiveSlashes = 1;
224 else if (consecutiveSlashes > 3) {
225 // This shouldn't ever happen (even with invalid wildcard rules), log a warning
226 qWarning() << Q_FUNC_INFO << "Wildcard rule" << rule << "resulted in rule component" << curString
227 << "with unexpected count of consecutive '\\' (" << consecutiveSlashes << "), ignoring" << curChar
234 // Preserve the characters as they are now
237 // "\[\n]" -> Split (keep as "\")
238 // "\\[\n]" -> Split (keep as "\\")
240 switch (consecutiveSlashes) {
246 // Apply the additional "\" or "\\"
247 curString.append(QString(R"(\)").repeated(consecutiveSlashes));
250 // This shouldn't ever happen (even with invalid wildcard rules), log a warning
251 qWarning() << Q_FUNC_INFO << "Wildcard rule" << rule << "resulted in rule component" << curString
252 << "with unexpected count of consecutive '\\' (" << consecutiveSlashes << "), applying newline split anyways!";
256 // Remove any whitespace, e.g. "item1; item2" -> " item2" -> "item2"
257 curString = curString.trimmed();
260 if (!curString.isEmpty()) {
261 // Add to list with the same separator used
262 result.append(curString + "\n");
264 // Reset the current list item
266 consecutiveSlashes = 0;
269 // Preserve the characters as they are now
270 switch (consecutiveSlashes) {
272 // "..." -> Keep as "..."
273 curString.append(curChar);
277 // "\..." -> Keep as "\..."
278 // "\\..." -> Keep as "\\..."
279 curString.append(QString("\\").repeated(consecutiveSlashes) + curChar);
282 // This shouldn't ever happen (even with invalid wildcard rules), log a warning
283 qWarning() << Q_FUNC_INFO << "Wildcard rule" << rule << "resulted in rule component" << curString
284 << "with unexpected count of consecutive '\\' (" << consecutiveSlashes << "), ignoring " << curChar
288 consecutiveSlashes = 0;
293 // Remove any trailing separators
294 if (result.endsWith("; ")) {
298 // Remove any trailing whitespace
299 return result.trimmed();
// Rebuilds the cached regular expression(s) from the stored source expression, match mode and
// case-sensitivity flag.
//
// Both "active" flags are cleared first. An empty source expression needs no regex. Otherwise
// the expression is converted according to "_sourceMode":
//   - MatchPhrase / MatchMultiPhrase: a normal regex only
//   - MatchWildcard / MatchRegEx: an inverted regex when the expression starts with "!",
//     otherwise a normal regex (with a leading "\!" unescaped by dropping the "\")
//   - MatchMultiWildcard: delegated to generateFromMultiWildcard()
// If the result is not valid, an info-level message is logged.
302 void ExpressionMatch::cacheRegEx()
304 _matchRegExActive = false;
305 _matchInvertRegExActive = false;
307 _sourceExpressionEmpty = _sourceExpression.isEmpty();
308 if (_sourceExpressionEmpty) {
309 // No need to calculate anything for empty strings
313 // Convert the given expression to a regular expression based on the mode
314 switch (_sourceMode) {
315 case MatchMode::MatchPhrase:
316 // Match entire phrase, noninverted
317 // Don't trim whitespace for phrase matching as someone might want to match on " word ", a
318 // more-specific request than "word".
319 _matchRegEx = regExFactory("(?:^|\\W)" + regExEscape(_sourceExpression) + "(?:\\W|$)", _sourceCaseSensitive);
320 _matchRegExActive = true;
322 case MatchMode::MatchMultiPhrase:
323 // Match multiple entire phrases, noninverted
324 // Convert from multiple-phrase rules
325 _matchRegEx = regExFactory(convertFromMultiPhrase(_sourceExpression), _sourceCaseSensitive);
326 _matchRegExActive = true;
328 case MatchMode::MatchWildcard:
329 // Match as wildcard expression
330 // Convert from wildcard rules for a single wildcard
331 if (_sourceExpression.startsWith("!")) {
332 // Inverted rule: take the remainder of the string
333 // "^" + invertComponents.at(0) + "$"
334 _matchInvertRegEx = regExFactory("^" + wildcardToRegEx(_sourceExpression.mid(1)) + "$", _sourceCaseSensitive);
335 _matchInvertRegExActive = true;
338 // Normal rule: take the whole string
339 // Account for any escaped "!" (i.e. "\!") by skipping past the "\", but don't skip past
340 // escaped "\" (i.e. "\\!")
341 _matchRegEx = regExFactory("^" + wildcardToRegEx(_sourceExpression.startsWith("\\!") ? _sourceExpression.mid(1) : _sourceExpression)
343 _sourceCaseSensitive);
344 _matchRegExActive = true;
347 case MatchMode::MatchMultiWildcard:
348 // Match as multiple wildcard expressions
349 // Convert from wildcard rules for multiple wildcards
350 // (The generator function handles setting matchRegEx/matchInvertRegEx)
351 generateFromMultiWildcard(_sourceExpression, _sourceCaseSensitive);
353 case MatchMode::MatchRegEx:
354 // Match as regular expression
355 if (_sourceExpression.startsWith("!")) {
356 // Inverted rule: take the remainder of the string
357 _matchInvertRegEx = regExFactory(_sourceExpression.mid(1), _sourceCaseSensitive);
358 _matchInvertRegExActive = true;
361 // Normal rule: take the whole string
362 // Account for any escaped "!" (i.e. "\!") by skipping past the "\", but don't skip past
363 // escaped "\" (i.e. "\\!")
364 _matchRegEx = regExFactory(_sourceExpression.startsWith("\\!") ? _sourceExpression.mid(1) : _sourceExpression,
365 _sourceCaseSensitive);
366 _matchRegExActive = true;
370 // This should never happen if you keep the above consistent
371 qWarning() << Q_FUNC_INFO << "Unknown MatchMode" << (int)_sourceMode << "!";
375 if (!_sourceExpressionEmpty && !isValid()) {
376 // This can happen with invalid regex, so make it a bit more user-friendly. Set it to Info
377 // level as ideally someone's not just going to leave a broken match rule around. For
378 // MatchRegEx, they probably need to fix their regex rule. For the other modes, there's
379 // probably a bug in the parsing routines (which should also be fixed).
380 qInfo() << "Could not parse expression match rule" << _sourceExpression << "(match mode:" << (int)_sourceMode
381 << "), this rule will be ignored";
// Creates a QRegularExpression from the pattern "regExString".
//
// UseUnicodePropertiesOption is always set; CaseInsensitiveOption is added when
// "caseSensitive" is false. An invalid pattern is reported at debug level.
// NOTE(review): the return statement is not visible in this listing; presumably "newRegEx" is
// returned whether or not it is valid - confirm.
385 QRegularExpression ExpressionMatch::regExFactory(const QString& regExString, bool caseSensitive)
387 // This is required, else extra-ASCII codepoints get treated as word boundaries
388 QRegularExpression::PatternOptions options = QRegularExpression::UseUnicodePropertiesOption;
390 if (!caseSensitive) {
391 options |= QRegularExpression::CaseInsensitiveOption;
394 QRegularExpression newRegEx = QRegularExpression(regExString, options);
396 // Check if rule is valid
397 if (!newRegEx.isValid()) {
398 // This can happen with invalid regex, so make it a bit more user-friendly. Keep this
399 // distinct from the main info-level message for easier debugging in case a regex component
400 // in Wildcard or Phrase mode breaks.
401 qDebug() << "Internal regular expression component" << regExString << "is invalid and will be ignored";
403 // Qt offers explicit control over when QRegularExpression objects get optimized.
404 // By default, patterns are only optimized after some number of uses as defined
405 // within Qt internals.
407 // In the context of ExpressionMatch, some regular expressions might go unused, e.g. a highlight
408 // rule might never match a channel pattern, resulting in the contents pattern being untouched.
409 // It should be safe to let Qt handle optimization, taking a non-deterministic, one-off
410 // performance penalty on optimization for the sake of saving memory usage on patterns that
413 // If profiling shows expressions are generally used and/or the automatic optimization
414 // incurs too high of a penalty (unlikely given we've created regular expression
415 // objects willy-nilly before now), this can be revisited to explicitly call...
418 // // Optimize regex now
419 // newRegEx.optimize();
422 // NOTE: This should only be called if the expression is valid! Apply within an "else" of the
423 // inverted isValid() check above.
425 // See https://doc.qt.io/qt-5/qregularexpression.html#optimize
// Returns "phrase" with regular expression special characters escaped, so it can be embedded
// in a pattern as literal text. Thin wrapper around QRegularExpression::escape().
430 QString ExpressionMatch::regExEscape(const QString& phrase)
432 // Escape the given phrase of any special regular expression characters
433 return QRegularExpression::escape(phrase);
// Converts a newline-separated list of phrases into one regular expression pattern string.
//
// Each non-empty line is regex-escaped (without trimming) and the phrases are OR-ed together,
// wrapped so a phrase only matches when bounded by a non-word character or the start/end of
// the string.
// NOTE(review): when the rule contains no non-empty lines (e.g. only newlines), "components"
// is empty and the joined pattern appears to end up with an empty group - confirm that this is
// the intended behavior.
436 QString ExpressionMatch::convertFromMultiPhrase(const QString& originalRule)
438 // Convert the multi-phrase rule into regular expression format
439 // Split apart the original rule into components
440 // Use QStringList instead of std::vector<QString> to make use of Qt's built-in .join() method
441 QStringList components = {};
443 for (auto&& component : originalRule.split("\n", QString::SkipEmptyParts)) {
444 // Don't trim whitespace to maintain consistency with single phrase matching
445 // As trimming is not performed, empty components will already be skipped. This means " "
446 // is considered a valid matching phrase.
448 // Take the whole string, escaping any regex
449 components.append(regExEscape(component));
452 // Create full regular expression by...
453 // > Enclosing within a non-capturing group to avoid overhead of text extraction, "(?:...)"
454 // > Flattening the phrases using the regex OR character "...|..."
456 // Before: [foo, bar, baz]
457 // After: (?:^|\W)(?:foo|bar|baz)(?:\W|$)
459 if (components.count() == 1) {
460 // Single item, skip the noncapturing group
461 return "(?:^|\\W)" + components.at(0) + "(?:\\W|$)";
464 return "(?:^|\\W)(?:" + components.join("|") + ")(?:\\W|$)";
// Builds "_matchRegEx" and/or "_matchInvertRegEx" from a multi-wildcard rule, i.e. a list of
// wildcard expressions separated by ";" or newlines, where a leading "!" marks a component as
// inverted.
//
// The rule is scanned one character at a time while counting consecutive "\" characters so that
// escaped separators and escaped "!" are handled. Each finished, non-empty component is trimmed,
// converted with wildcardToRegEx() and stored in the normal or inverted list. Duplicates are
// removed, then each non-empty list is combined into a single anchored regex ("^(?:a|b)$", or
// "^a$" for a single component) created with "caseSensitive", and the matching "active" flag
// is set.
// NOTE(review): the "case" labels, "break" statements and the updates of "isInverted" /
// "isRuleStart" are not visible in this listing; the description of those parts follows the
// existing comments.
468 void ExpressionMatch::generateFromMultiWildcard(const QString& originalRule, bool caseSensitive)
470 // Convert the wildcard rule into regular expression format
471 // First, reset the existing match expressions
473 _matchInvertRegEx = {};
474 _matchRegExActive = false;
475 _matchInvertRegExActive = false;
477 // This gets handled in three steps:
479 // 1. Break apart ";"-separated list into components
480 // 2. Convert components from wildcard format into regular expression format
481 // 3. Combine normal/invert components into normal/invert regular expressions
483 // Let's start by making the list...
485 // Convert a ";"-separated list into an actual list, splitting on newlines and unescaping
486 // escaped characters
488 // Escaped list rules (where "[\n]" represents newline):
493 // \; | Replace with ";"
494 // \\; | Split (keep as "\\")
495 // ! | At start: mark as inverted
496 // \! | At start: replace with "!"
497 // \\! | At start: keep as "\\!" (replaced with "\!" in wildcard conversion)
498 // ! | Elsewhere: keep as "!"
499 // \! | Elsewhere: keep as "\!"
500 // \\! | Elsewhere: keep as "\\!"
501 // \\\ | Keep as "\\" + "\", set consecutive slashes to 1
503 // \[\n] | Split (keep as "\")
504 // \\[\n] | Split (keep as "\\")
505 // ... | Keep as "..."
506 // \... | Keep as "\..."
507 // \\... | Keep as "\\..."
509 // Strings are forced to end with "\n", always applying "\..." and "\\..." rules
510 // "..." also includes another "\" character
512 // All whitespace is trimmed from each component
514 // "\\" and "\" are not downconverted to allow for other escape codes to be detected in
515 // ExpressionMatch::wildcardToRegex
520 // norm;!invert; norm-space ; !invert-space ;;!;\!norm-escaped;\\!slash-invert;\\\\double;
521 // escape\;sep;slash-end-split\\;quad\\\\!noninvert;newline-split[\n]newline-split-slash\\[\n]
522 // slash-at-end\\ [line does not continue]
524 // (Newlines are encoded as "[\n]". Ignore linebreaks for the sake of comment wrapping.)
527 // > Normal components without wildcard conversion
534 // slash-end-split\\ [line does not continue]
535 // quad\\\\!noninvert
537 // newline-split-slash\\ [line does not continue]
538 // slash-at-end\\ [line does not continue]
540 // > Inverted components without wildcard conversion
545 // > Normal components with wildcard conversion
552 // slash\-end\-split\\ [line does not continue]
553 // quad\\\\\!noninvert
555 // newline\-split\-slash\\ [line does not continue]
556 // slash\-at\-end\\ [line does not continue]
558 // > Inverted components with wildcard conversion
563 // > Normal wildcard-converted regex
564 // ^(?:norm|norm\-space|\!norm\-escaped|\\\!slash\-invert|\\\\double|escape\;sep|
565 // slash\-end\-split\\|quad\\\\\!noninvert|newline\-split|newline\-split\-slash\\|
566 // slash\-at\-end\\)$
568 // > Inverted wildcard-converted regex
569 // ^(?:invert|invert\-space)$
571 // Note: R"(\\)" results in the literal of "\\", two backslash characters. Anything inside the
572 // brackets is treated as a literal. Outside the brackets but inside the quotes is still
575 // See https://en.cppreference.com/w/cpp/language/string_literal
// Work on a copy so the caller's rule is left untouched
579 QString rule(originalRule);
581 // Force a termination at the end of the string to trigger a split
582 // Don't check for ";" splits as they may be escaped
583 if (!rule.endsWith("\n")) {
587 // Result, sorted into normal and inverted rules
588 // Use QStringList instead of std::vector<QString> to make use of Qt's built-in .join() method
589 QStringList normalComponents = {}, invertComponents = {};
// Component currently being accumulated
594 QString curString = {};
596 int sourceLength = rule.length();
597 // Consecutive "\" characters
598 int consecutiveSlashes = 0;
599 // Whether or not this marks an inverted rule
600 bool isInverted = false;
601 // Whether or not we're at the beginning of the rule (for detecting "!" and "\!")
602 bool isRuleStart = true;
604 // We know it's going to have ";"-count items or less, so reserve ";"-count items for both.
605 // Without parsing it's not easily possible to tell which are escaped or not, and among the
606 // non-escaped entries, which are inverted or not. These get destroyed once out of scope of
607 // this function, so balancing towards performance over memory usage should be okay, hopefully.
// NOTE(review): components are also split on newlines (including the forced trailing one), so
// the real item count can exceed the ";" count; the reservation is only a sizing hint.
608 int separatorCount = rule.count(";");
609 normalComponents.reserve(separatorCount);
610 invertComponents.reserve(separatorCount);
612 // For every character...
613 for (int i = 0; i < sourceLength; i++) {
615 curChar = rule.at(i);
616 // Check if it's on the list of special list characters, converting to Unicode for use
617 // in the switch statement
619 // See https://doc.qt.io/qt-5/qchar.html#unicode
620 switch (curChar.unicode()) {
623 switch (consecutiveSlashes) {
628 // "\\;" -> Split (keep as "\\")
629 // Not escaped separator, split into a new item
631 // Apply the additional "\\" if needed
632 if (consecutiveSlashes == 2) {
633 // "\\;" -> Split (keep as "\\")
634 curString.append(R"(\\)");
637 // Remove any whitespace, e.g. "item1; item2" -> " item2" -> "item2"
638 curString = curString.trimmed();
641 if (!curString.isEmpty()) {
642 // Add to inverted/normal list
644 invertComponents.append(wildcardToRegEx(curString));
647 normalComponents.append(wildcardToRegEx(curString));
650 // Reset the current list item
656 // "\;" -> Replace with ";"
657 curString.append(";");
661 // This shouldn't ever happen (even with invalid wildcard rules), log a warning
662 qWarning() << Q_FUNC_INFO << "Wildcard rule" << rule << "resulted in rule component" << curString
663 << "with unexpected count of consecutive '\\' (" << consecutiveSlashes << "), ignoring" << curChar
668 consecutiveSlashes = 0;
671 // Rule inverter found
673 // Apply the inverting logic
674 switch (consecutiveSlashes) {
676 // "!" -> At start: mark as inverted
678 // Don't include the "!" character
681 // "\!" -> At start: replace with "!"
682 curString.append("!");
685 // "\\!" -> At start: keep as "\\!" (replaced with "\!" in wildcard conversion)
686 curString.append(R"(\\!)");
689 // This shouldn't ever happen (even with invalid wildcard rules), log a warning
690 qWarning() << Q_FUNC_INFO << "Wildcard rule" << rule << "resulted in rule component" << curString
691 << "with unexpected count of consecutive '\\' (" << consecutiveSlashes << "), ignoring" << curChar
697 // Preserve the characters as they are now
698 switch (consecutiveSlashes) {
700 // "!" -> Elsewhere: keep as "!"
701 curString.append("!");
705 // "\!" -> Elsewhere: keep as "\!"
706 // "\\!" -> Elsewhere: keep as "\\!"
707 curString.append(QString(R"(\)").repeated(consecutiveSlashes) + "!");
710 // This shouldn't ever happen (even with invalid wildcard rules), log a warning
711 qWarning() << Q_FUNC_INFO << "Wildcard rule" << rule << "resulted in rule component" << curString
712 << "with unexpected count of consecutive '\\' (" << consecutiveSlashes << "), ignoring" << curChar
718 consecutiveSlashes = 0;
722 // Increase consecutive slash count
723 consecutiveSlashes++;
724 // Check if we've reached "\\\"...
725 if (consecutiveSlashes == 3) {
726 // "\\\" -> Keep as "\\" + "\"
727 curString.append(R"(\\)");
728 // No longer at the rule start
730 // Set consecutive slashes to 1, recognizing the trailing "\"
731 consecutiveSlashes = 1;
733 else if (consecutiveSlashes > 3) {
734 // This shouldn't ever happen (even with invalid wildcard rules), log a warning
735 qWarning() << Q_FUNC_INFO << "Wildcard rule" << rule << "resulted in rule component" << curString
736 << "with unexpected count of consecutive '\\' (" << consecutiveSlashes << "), ignoring" << curChar
740 // Don't set "isRuleStart" here as "\" is used in escape sequences
744 // Preserve the characters as they are now
747 // "\[\n]" -> Split (keep as "\")
748 // "\\[\n]" -> Split (keep as "\\")
750 switch (consecutiveSlashes) {
756 // Apply the additional "\" or "\\"
757 curString.append(QString(R"(\)").repeated(consecutiveSlashes));
760 // This shouldn't ever happen (even with invalid wildcard rules), log a warning
761 qWarning() << Q_FUNC_INFO << "Wildcard rule" << rule << "resulted in rule component" << curString
762 << "with unexpected count of consecutive '\\' (" << consecutiveSlashes << "), applying newline split anyways!";
766 // Remove any whitespace, e.g. "item1; item2" -> " item2" -> "item2"
767 curString = curString.trimmed();
770 if (!curString.isEmpty()) {
771 // Add to inverted/normal list
773 invertComponents.append(wildcardToRegEx(curString));
776 normalComponents.append(wildcardToRegEx(curString));
779 // Reset the current list item
783 consecutiveSlashes = 0;
786 // Preserve the characters as they are now
787 switch (consecutiveSlashes) {
789 // "..." -> Keep as "..."
790 curString.append(curChar);
794 // "\..." -> Keep as "\..."
795 // "\\..." -> Keep as "\\..."
796 curString.append(QString("\\").repeated(consecutiveSlashes) + curChar);
799 // This shouldn't ever happen (even with invalid wildcard rules), log a warning
800 qWarning() << Q_FUNC_INFO << "Wildcard rule" << rule << "resulted in rule component" << curString
801 << "with unexpected count of consecutive '\\' (" << consecutiveSlashes << "), ignoring " << curChar
805 // Don't mark as past rule start for whitespace (whitespace gets trimmed)
806 if (!curChar.isSpace()) {
809 consecutiveSlashes = 0;
814 // Clean up any duplicates
815 normalComponents.removeDuplicates();
816 invertComponents.removeDuplicates();
818 // Create full regular expressions by...
819 // > Anchoring to start and end of string to mimic QRegExp's .exactMatch() handling, "^...$"
820 // > Enclosing within a non-capturing group to avoid overhead of text extraction, "(?:...)"
821 // > Flattening normal and inverted rules using the regex OR character "...|..."
823 // Before: [foo, bar, baz]
824 // After: ^(?:foo|bar|baz)$
826 // See https://doc.qt.io/qt-5/qregularexpression.html#porting-from-qregexp-exactmatch
827 // And https://regex101.com/
829 // Any empty/invalid regex are handled within ExpressionMatch::match()
830 if (!normalComponents.isEmpty()) {
831 // Create normal match regex
832 if (normalComponents.count() == 1) {
833 // Single item, skip the noncapturing group
834 _matchRegEx = regExFactory("^" + normalComponents.at(0) + "$", caseSensitive);
837 _matchRegEx = regExFactory("^(?:" + normalComponents.join("|") + ")$", caseSensitive);
839 _matchRegExActive = true;
841 if (!invertComponents.isEmpty()) {
842 // Create invert match regex
843 if (invertComponents.count() == 1) {
844 // Single item, skip the noncapturing group
845 _matchInvertRegEx = regExFactory("^" + invertComponents.at(0) + "$", caseSensitive);
848 _matchInvertRegEx = regExFactory("^(?:" + invertComponents.join("|") + ")$", caseSensitive);
850 _matchInvertRegExActive = true;
// Converts a single wildcard expression into an (unanchored) regular expression pattern string.
//
// The expression is first regex-escaped as a whole, then the escaped string is scanned one
// character at a time while counting consecutive "\" characters. Unescaped "*" and "?" are
// turned into their regex wildcard forms, escaped "\*" and "\?" stay literal, "\\" stays a
// literal backslash, and every other character is copied through (keeping the regex escape of
// special characters).
// NOTE(review): the "case" labels, "break" statements, the appends for the unescaped "*" / "?"
// cases and the return statement are not visible in this listing; those parts of the summary
// follow the existing comments and the conversion table below.
854 QString ExpressionMatch::wildcardToRegEx(const QString& expression)
856 // Convert the wildcard expression into regular expression format
858 // We're taking a little bit different of a route...
860 // Original QRegExp::Wildcard rules:
861 // --------------------------
862 // Wildcard | Regex | Outcome
863 // ---------|-------|--------
864 // * | .* | zero or more of any character
865 // ? | . | any single character
867 // NOTE 1: This is QRegExp::Wildcard, not QRegExp::WildcardUnix
869 // NOTE 2: We are ignoring the "[...]" character-class matching functionality of
870 // QRegExp::Wildcard as that feature's a bit more complex and can be handled with full-featured
873 // See https://doc.qt.io/qt-5/qregexp.html#wildcard-matching
875 // Quassel originally did not use QRegExp::WildcardUnix, which prevented escaping "*" and "?" in
876 // messages. Unfortunately, spam messages might decide to use both, so offering a way to escape
879 // On the flip-side, that means to match "\" requires escaping as "\\", breaking backwards
882 // Quassel's Wildcard rules
883 // ------------------------------------------
884 // Wildcard | Regex escaped | Regex | Outcome
885 // ---------|---------------|-------|--------
886 // * | \* | .* | zero or more of any character
887 // ? | \? | . | any single character
888 // \* | \\\* | \* | literal "*"
889 // \? | \\\? | \? | literal "?"
890 // \[...] | \\[...] | [...] | invalid escape, ignore it
891 // \\ | \\\\ | \\ | literal "\"
893 // In essence, "*" and "?" need changed only if not escaped, "\\" collapses into "\", "\" gets
894 // ignored; other characters escape normally.
899 // never?gonna*give\*you\?up\\test|y\yeah\\1\\\\2\\\1inval
901 // ("\\\\" represents "\\", "\\" represents "\", and "\\\" is valid+invalid, "\")
903 // > Regex escaped wildcard rule
904 // never\?gonna\*give\\\*you\\\?up\\\\test\|y\\yeah\\\\1\\\\\\\\2\\\\\\1inval
906 // > Expected correct regex
907 // never.gonna.*give\*you\?up\\test\|yyeah\\1\\\\2\\1inval
909 // > Undoing regex escaping of "\" as "\\" (i.e. simple replace, with special escapes intact)
910 // never.gonna.*give\*you\?up\test\|yyeah\1\\2\1inval
912 // Escape string according to regex
913 QString regExEscaped(regExEscape(expression));
917 // NOTE: In theory, regular expression lookbehind could solve this. Unfortunately, QRegExp does
918 // not support lookbehind, and it's theoretically inefficient, anyways. Just use an approach
919 // similar to that taken by QRegExp's official wildcard mode.
921 // Lookbehind example (that we can't use):
922 // (?<!abc)test Negative lookbehind - don't match if "test" is preceded by "abc"
924 // See https://code.qt.io/cgit/qt/qtbase.git/tree/src/corelib/tools/qregexp.cpp
926 // NOTE: We don't copy QRegExp's mode as QRegularExpression has more special characters. We
927 // can't use the same escaping code, hence calling the appropriate QReg[...]::escape() above.
936 int sourceLength = regExEscaped.length();
937 // Consecutive "\" characters
938 int consecutiveSlashes = 0;
940 // We know it's going to be the same length or smaller, so reserve the same size as the string
941 result.reserve(sourceLength);
943 // For every character...
944 for (int i = 0; i < sourceLength; i++) {
946 curChar = regExEscaped.at(i);
947 // Check if it's on the list of special wildcard characters, converting to Unicode for use
948 // in the switch statement
950 // See https://doc.qt.io/qt-5/qchar.html#unicode
951 switch (curChar.unicode()) {
954 switch (consecutiveSlashes) {
956 // "?" -> "\?" -> "."
957 // Convert from regex escaped "?" to regular expression
961 // "\?" -> "\\\?" -> "\?"
962 // Convert from regex escaped "\?" to literal string
963 result.append(R"(\?)");
966 // This shouldn't ever happen (even with invalid wildcard rules), log a warning
967 qWarning() << Q_FUNC_INFO << "Wildcard rule" << expression << "resulted in escaped regular expression string"
968 << regExEscaped << " with unexpected count of consecutive '\\' (" << consecutiveSlashes << "), ignoring"
969 << curChar << "character!";
972 consecutiveSlashes = 0;
976 switch (consecutiveSlashes) {
978 // "*" -> "\*" -> ".*"
979 // Convert from regex escaped "*" to regular expression
983 // "\*" -> "\\\*" -> "\*"
984 // Convert from regex escaped "\*" to literal string
985 result.append(R"(\*)");
988 // This shouldn't ever happen (even with invalid wildcard rules), log a warning
989 qWarning() << Q_FUNC_INFO << "Wildcard rule" << expression << "resulted in escaped regular expression string"
990 << regExEscaped << " with unexpected count of consecutive '\\' (" << consecutiveSlashes << "), ignoring"
991 << curChar << "character!";
994 consecutiveSlashes = 0;
998 // Increase consecutive slash count
999 consecutiveSlashes++;
1000 // Check if we've hit an escape sequence
1001 if (consecutiveSlashes == 4) {
1002 // "\\" -> "\\\\" -> "\\"
1003 // Convert from regex escaped "\\" to literal string
1004 result.append(R"(\\)");
1005 // Reset slash count
1006 consecutiveSlashes = 0;
1010 // Any other character
1011 switch (consecutiveSlashes) {
1014 // "[...]" -> "[...]" -> "[...]"
1016 // "\[...]" -> "\\[...]" -> "[...]"
1017 // Either just print the character itself, or convert from regex-escaped invalid
1018 // wildcard escape sequence to the character itself
1020 // Both mean doing nothing, the actual character [...] gets appended below
1023 // "[...]" -> "\[...]" -> "\"
1024 // Keep regex-escaped special character "[...]" as literal string
1025 // (Where "[...]" represents any non-wildcard regex special character)
1026 result.append(R"(\)");
1027 // The actual character [...] gets appended below
1030 // This shouldn't ever happen (even with invalid wildcard rules), log a warning
1031 qWarning() << Q_FUNC_INFO << "Wildcard rule" << expression << "resulted in escaped regular expression string"
1032 << regExEscaped << " with unexpected count of consecutive '\\' (" << consecutiveSlashes << "), ignoring"
1033 << curChar << "char escape!";
1036 consecutiveSlashes = 0;
1037 // Add the character itself
1038 result.append(curChar);
1043 // Anchoring to simulate QRegExp::exactMatch() is handled by the callers,
1044 // ExpressionMatch::cacheRegEx() and ExpressionMatch::generateFromMultiWildcard()