1 /***************************************************************************
2 * Copyright (C) 2005-2018 by the Quassel Project *
3 * devel@quassel-irc.org *
5 * This program is free software; you can redistribute it and/or modify *
6 * it under the terms of the GNU General Public License as published by *
7 * the Free Software Foundation; either version 2 of the License, or *
8 * (at your option) version 3. *
10 * This program is distributed in the hope that it will be useful, *
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
13 * GNU General Public License for more details. *
15 * You should have received a copy of the GNU General Public License *
16 * along with this program; if not, write to the *
17 * Free Software Foundation, Inc., *
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
19 ***************************************************************************/
21 #include "expressionmatch.h"
26 #include <QStringList>
28 #include "logmessage.h"
// Creates an expression matcher from a rule string.
//
// expression:    rule text, kept in _sourceExpression
// mode:          MatchMode describing how the rule text is interpreted (presumably kept as
//                _sourceMode, which cacheRegEx() switches on - confirm)
// caseSensitive: case-sensitivity flag, kept in _sourceCaseSensitive
30 ExpressionMatch::ExpressionMatch(const QString& expression, MatchMode mode, bool caseSensitive)
32 // Store the original parameters for later reference
33 _sourceExpression = expression;
35 _sourceCaseSensitive = caseSensitive;
37 // Calculate the internal regex
39 // Do this now instead of on-demand to provide immediate feedback on errors when editing
40 // highlight and ignore rules.
// Checks whether "string" satisfies this expression match rule.
//
// string:     text to test against the cached regular expressions
// matchEmpty: decides the outcome when the source expression is empty (match found only if
//             matching empty is allowed)
//
// Evaluation order: the empty-expression shortcut, the validity check, the inverted rule (a hit
// there means the rule as a whole cannot match), then the normal rule.
44 bool ExpressionMatch::match(const QString& string, bool matchEmpty) const
46 // Handle empty expression strings
47 if (_sourceExpressionEmpty) {
48 // Match found if matching empty is allowed, otherwise no match found
53 // Can't match on an invalid rule
57 // We have "_matchRegEx", "_matchInvertRegEx", or both due to isValid() check above
59 // If specified, first check inverted rules
60 if (_matchInvertRegExActive && _matchInvertRegEx.isValid()) {
61 // Check inverted match rule
62 if (_matchInvertRegEx.match(string).hasMatch()) {
63 // Inverted rule matched, the rest of the rule cannot match
68 if (_matchRegExActive && _matchRegEx.isValid()) {
69 // Check regular match rule
70 return _matchRegEx.match(string).hasMatch();
73 // If no valid regular rules exist, due to the isValid() check there must be valid inverted
74 // rules that did not match. Count this as properly matching (implicit wildcard).
// Normalizes a multi-wildcard rule by trimming the whitespace around each of its components.
//
// originalRule: list of wildcard expressions separated by ";" or by newlines
//
// Each component is whitespace-trimmed and empty components are dropped. Components that were
// split on ";" are re-emitted followed by "; ", components that were split on a newline are
// re-emitted followed by "\n". A trailing "; " separator and trailing whitespace are removed
// from the final string. Escape sequences such as "\;" and "\\" are kept as written so that
// the later wildcard conversion can still recognize them.
79 QString ExpressionMatch::trimMultiWildcardWhitespace(const QString& originalRule)
81 // This gets handled in two steps:
83 // 1. Break apart ";"-separated list into components
84 // 2. Combine whitespace-trimmed components into wildcard expression
86 // Let's start by making the list...
88 // Convert a ";"-separated list into an actual list, splitting on newlines and unescaping
91 // Escaped list rules (where "[\n]" represents newline):
97 // \\; | Split (keep as "\\")
98 // \\\ | Keep as "\\" + "\", set consecutive slashes to 1
100 // \[\n] | Split (keep as "\")
101 // \\[\n] | Split (keep as "\\")
102 // ... | Keep as "..."
103 // \... | Keep as "\..."
104 // \\... | Keep as "\\..."
106 // Strings are forced to end with "\n", always applying "\..." and "\\..." rules
107 // "..." also includes another "\" character
109 // All whitespace is trimmed from each component
111 // "\\" and "\" are not downconverted to allow for other escape codes to be detected in
112 // ExpressionMatch::wildcardToRegex
117 // norm; norm-space ; newline-space [\n] ;escape \; sep ; slash-end-split\\; quad\\\\norm;
118 // newline-split-slash\\[\n] slash-at-end\\ [line does not continue]
125 // slash-end-split\\ [line does not continue]
127 // newline-split-slash\\ [line does not continue]
128 // slash-at-end\\ [line does not continue]
130 // > Trimmed wildcard rule
131 // norm; norm-space; newline-space[\n]escape \; sep; slash-end-split\\; quad\\\\norm;
132 // newline-split-slash\\[\n]slash-at-end\\ [line does not continue]
134 // (Newlines are encoded as "[\n]". Ignore linebreaks for the sake of comment wrapping.)
136 // Note: R"(\\)" results in the literal of "\\", two backslash characters. Anything inside the
137 // brackets is treated as a literal. Outside the brackets but inside the quotes is still
140 // See https://en.cppreference.com/w/cpp/language/string_literal
144 QString rule(originalRule);
146 // Force a termination at the end of the string to trigger a split
147 // Don't check for ";" splits as they may be escaped
148 if (!rule.endsWith("\n")) {
157 QString curString = {};
159 int sourceLength = rule.length();
160 // Consecutive "\" characters
161 int consecutiveSlashes = 0;
163 // We know it's going to be the same length or smaller, so reserve the same size as the string
164 result.reserve(sourceLength);
166 // For every character...
167 for (int i = 0; i < sourceLength; i++) {
169 curChar = rule.at(i);
170 // Check if it's on the list of special list characters, converting to Unicode for use
171 // in the switch statement
173 // See https://doc.qt.io/qt-5/qchar.html#unicode
174 switch (curChar.unicode()) {
177 switch (consecutiveSlashes) {
182 // "\\;" -> Split (keep as "\\")
183 // Not escaped separator, split into a new item
185 // Apply the additional "\\" if needed
186 if (consecutiveSlashes == 2) {
187 // "\\;" -> Split (keep as "\\")
188 curString.append(R"(\\)");
191 // Remove any whitespace, e.g. "item1; item2" -> " item2" -> "item2"
192 curString = curString.trimmed();
// Components that are empty after trimming are dropped entirely
195 if (!curString.isEmpty()) {
196 // Add to list with the same separator used
197 result.append(curString + "; ");
199 // Reset the current list item
203 // "\;" -> Keep as "\;"
204 curString.append(R"(\;)");
207 // This shouldn't ever happen (even with invalid wildcard rules), log a warning
208 qWarning() << Q_FUNC_INFO << "Wildcard rule" << rule << "resulted in rule component" << curString
209 << "with unexpected count of consecutive '\\' (" << consecutiveSlashes << "), ignoring" << curChar
213 consecutiveSlashes = 0;
217 // Increase consecutive slash count
218 consecutiveSlashes++;
219 // Check if we've reached "\\\"...
220 if (consecutiveSlashes == 3) {
221 // "\\\" -> Keep as "\\" + "\"
222 curString.append(R"(\\)");
223 // Set consecutive slashes to 1, recognizing the trailing "\"
224 consecutiveSlashes = 1;
226 else if (consecutiveSlashes > 3) {
227 // This shouldn't ever happen (even with invalid wildcard rules), log a warning
228 qWarning() << Q_FUNC_INFO << "Wildcard rule" << rule << "resulted in rule component" << curString
229 << "with unexpected count of consecutive '\\' (" << consecutiveSlashes << "), ignoring" << curChar
236 // Preserve the characters as they are now
239 // "\[\n]" -> Split (keep as "\")
240 // "\\[\n]" -> Split (keep as "\\")
242 switch (consecutiveSlashes) {
248 // Apply the additional "\" or "\\"
249 curString.append(QString(R"(\)").repeated(consecutiveSlashes));
252 // This shouldn't ever happen (even with invalid wildcard rules), log a warning
253 qWarning() << Q_FUNC_INFO << "Wildcard rule" << rule << "resulted in rule component" << curString
254 << "with unexpected count of consecutive '\\' (" << consecutiveSlashes << "), applying newline split anyways!";
258 // Remove any whitespace, e.g. "item1; item2" -> " item2" -> "item2"
259 curString = curString.trimmed();
// Components that are empty after trimming are dropped entirely
262 if (!curString.isEmpty()) {
263 // Add to list with the same separator used
264 result.append(curString + "\n");
266 // Reset the current list item
268 consecutiveSlashes = 0;
271 // Preserve the characters as they are now
272 switch (consecutiveSlashes) {
274 // "..." -> Keep as "..."
275 curString.append(curChar);
279 // "\..." -> Keep as "\..."
280 // "\\..." -> Keep as "\\..."
281 curString.append(QString("\\").repeated(consecutiveSlashes) + curChar);
284 // This shouldn't ever happen (even with invalid wildcard rules), log a warning
285 qWarning() << Q_FUNC_INFO << "Wildcard rule" << rule << "resulted in rule component" << curString
286 << "with unexpected count of consecutive '\\' (" << consecutiveSlashes << "), ignoring " << curChar
290 consecutiveSlashes = 0;
295 // Remove any trailing separators
296 if (result.endsWith("; ")) {
300 // Remove any trailing whitespace
301 return result.trimmed();
// Rebuilds the cached regular expression(s) from the stored source expression, match mode and
// case sensitivity.
//
// Both "active" flags are cleared first and _sourceExpressionEmpty is refreshed. Depending on
// _sourceMode, _matchRegEx and/or _matchInvertRegEx are then created through regExFactory().
// In MatchWildcard and MatchRegEx modes a leading "!" marks the rule as inverted, while a
// leading "\!" is an escaped, literal "!" (the "\" is skipped). A non-empty expression that
// does not end up valid is reported through quInfo().
304 void ExpressionMatch::cacheRegEx()
306 _matchRegExActive = false;
307 _matchInvertRegExActive = false;
309 _sourceExpressionEmpty = _sourceExpression.isEmpty();
310 if (_sourceExpressionEmpty) {
311 // No need to calculate anything for empty strings
315 // Convert the given expression to a regular expression based on the mode
316 switch (_sourceMode) {
317 case MatchMode::MatchPhrase:
318 // Match entire phrase, noninverted
319 // Don't trim whitespace for phrase matching as someone might want to match on " word ", a
320 // more-specific request than "word".
321 _matchRegEx = regExFactory("(?:^|\\W)" + regExEscape(_sourceExpression) + "(?:\\W|$)", _sourceCaseSensitive);
322 _matchRegExActive = true;
324 case MatchMode::MatchMultiPhrase:
325 // Match multiple entire phrases, noninverted
326 // Convert from multiple-phrase rules
327 _matchRegEx = regExFactory(convertFromMultiPhrase(_sourceExpression), _sourceCaseSensitive);
328 _matchRegExActive = true;
330 case MatchMode::MatchWildcard:
331 // Match as wildcard expression
332 // Convert from wildcard rules for a single wildcard
333 if (_sourceExpression.startsWith("!")) {
334 // Inverted rule: take the remainder of the string
335 // "^" + invertComponents.at(0) + "$"
336 _matchInvertRegEx = regExFactory("^" + wildcardToRegEx(_sourceExpression.mid(1)) + "$", _sourceCaseSensitive);
337 _matchInvertRegExActive = true;
340 // Normal rule: take the whole string
341 // Account for any escaped "!" (i.e. "\!") by skipping past the "\", but don't skip past
342 // escaped "\" (i.e. "\\!")
343 _matchRegEx = regExFactory("^" + wildcardToRegEx(_sourceExpression.startsWith("\\!") ? _sourceExpression.mid(1) : _sourceExpression)
345 _sourceCaseSensitive);
346 _matchRegExActive = true;
349 case MatchMode::MatchMultiWildcard:
350 // Match as multiple wildcard expressions
351 // Convert from wildcard rules for multiple wildcards
352 // (The generator function handles setting matchRegEx/matchInvertRegEx)
353 generateFromMultiWildcard(_sourceExpression, _sourceCaseSensitive);
355 case MatchMode::MatchRegEx:
356 // Match as regular expression
357 if (_sourceExpression.startsWith("!")) {
358 // Inverted rule: take the remainder of the string
359 _matchInvertRegEx = regExFactory(_sourceExpression.mid(1), _sourceCaseSensitive);
360 _matchInvertRegExActive = true;
363 // Normal rule: take the whole string
364 // Account for any escaped "!" (i.e. "\!") by skipping past the "\", but don't skip past
365 // escaped "\" (i.e. "\\!")
366 _matchRegEx = regExFactory(_sourceExpression.startsWith("\\!") ? _sourceExpression.mid(1) : _sourceExpression,
367 _sourceCaseSensitive);
368 _matchRegExActive = true;
372 // This should never happen if you keep the above consistent
373 qWarning() << Q_FUNC_INFO << "Unknown MatchMode" << (int)_sourceMode << "!";
377 if (!_sourceExpressionEmpty && !isValid()) {
378 // This can happen with invalid regex, so make it a bit more user-friendly. Set it to Info
379 // level as ideally someone's not just going to leave a broken match rule around. For
380 // MatchRegEx, they probably need to fix their regex rule. For the other modes, there's
381 // probably a bug in the parsing routines (which should also be fixed).
382 quInfo() << "Could not parse expression match rule" << _sourceExpression << "(match mode:" << (int)_sourceMode
383 << "), this rule will be ignored";
// Creates a QRegularExpression object for the given pattern string.
//
// regExString:   regular expression pattern
// caseSensitive: when false, QRegularExpression::CaseInsensitiveOption is applied; when true,
//                no pattern option is set
//
// An invalid pattern is only logged at debug level here; the expression is deliberately not
// optimized up front (see the notes below).
387 QRegularExpression ExpressionMatch::regExFactory(const QString& regExString, bool caseSensitive)
389 // Construct the regular expression object, setting case sensitivity as appropriate
390 QRegularExpression newRegEx = QRegularExpression(regExString,
391 caseSensitive ? QRegularExpression::PatternOption::NoPatternOption
392 : QRegularExpression::PatternOption::CaseInsensitiveOption);
394 // Check if rule is valid
395 if (!newRegEx.isValid()) {
396 // This can happen with invalid regex, so make it a bit more user-friendly. Keep this
397 // distinct from the main info-level message for easier debugging in case a regex component
398 // in Wildcard or Phrase mode breaks.
399 qDebug() << "Internal regular expression component" << regExString << "is invalid and will be ignored";
401 // Qt offers explicit control over when QRegularExpression objects get optimized.
402 // By default, patterns are only optimized after some number of uses as defined
403 // within Qt internals.
405 // In the context of ExpressionMatch, some regular expressions might go unused, e.g. a highlight
406 // rule might never match a channel pattern, resulting in the contents pattern being untouched.
407 // It should be safe to let Qt handle optimization, taking a non-deterministic, one-off
408 // performance penalty on optimization for the sake of saving memory usage on patterns that
411 // If profiling shows expressions are generally used and/or the automatic optimization
412 // incurs too high of a penalty (unlikely given we've created regular expression
413 // objects willy-nilly before now), this can be revisited to explicitly call...
416 // // Optimize regex now
417 // newRegEx.optimize();
420 // NOTE: This should only be called if the expression is valid! Apply within an "else" of the
421 // inverted isValid() check above.
423 // See https://doc.qt.io/qt-5/qregularexpression.html#optimize
428 QString ExpressionMatch::regExEscape(const QString& phrase)
430 // Escape the given phrase of any special regular expression characters
431 return QRegularExpression::escape(phrase);
434 QString ExpressionMatch::convertFromMultiPhrase(const QString& originalRule)
436 // Convert the multi-phrase rule into regular expression format
437 // Split apart the original rule into components
438 // Use QStringList instead of std::vector<QString> to make use of Qt's built-in .join() method
439 QStringList components = {};
441 for (auto&& component : originalRule.split("\n", QString::SkipEmptyParts)) {
442 // Don't trim whitespace to maintain consistency with single phrase matching
443 // As trimming is not performed, empty components will already be skipped. This means " "
444 // is considered a valid matching phrase.
446 // Take the whole string, escaping any regex
447 components.append(regExEscape(component));
450 // Create full regular expression by...
451 // > Enclosing within a non-capturing group to avoid overhead of text extraction, "(?:...)"
452 // > Flattening normal and inverted rules using the regex OR character "...|..."
454 // Before: [foo, bar, baz]
455 // After: (?:^|\W)(?:foo|bar|baz)(?:\W|$)
457 if (components.count() == 1) {
458 // Single item, skip the noncapturing group
459 return "(?:^|\\W)" + components.at(0) + "(?:\\W|$)";
462 return "(?:^|\\W)(?:" + components.join("|") + ")(?:\\W|$)";
// Builds _matchRegEx and _matchInvertRegEx from a rule holding multiple wildcard expressions.
//
// originalRule:  wildcard expressions separated by ";" or by newlines; a component that starts
//                with "!" is an inverted rule
// caseSensitive: passed through to regExFactory() for both generated expressions
//
// Every non-empty, whitespace-trimmed component is converted with wildcardToRegEx() and sorted
// into the normal or the inverted list. Duplicates are removed, then each non-empty list is
// joined into one anchored expression such as "^(?:a|b)$" and its "...Active" flag is set.
466 void ExpressionMatch::generateFromMultiWildcard(const QString& originalRule, bool caseSensitive)
468 // Convert the wildcard rule into regular expression format
469 // First, reset the existing match expressions
471 _matchInvertRegEx = {};
472 _matchRegExActive = false;
473 _matchInvertRegExActive = false;
475 // This gets handled in three steps:
477 // 1. Break apart ";"-separated list into components
478 // 2. Convert components from wildcard format into regular expression format
479 // 3. Combine normal/invert components into normal/invert regular expressions
481 // Let's start by making the list...
483 // Convert a ";"-separated list into an actual list, splitting on newlines and unescaping
484 // escaped characters
486 // Escaped list rules (where "[\n]" represents newline):
491 // \; | Replace with ";"
492 // \\; | Split (keep as "\\")
493 // ! | At start: mark as inverted
494 // \! | At start: replace with "!"
495 // \\! | At start: keep as "\\!" (replaced with "\!" in wildcard conversion)
496 // ! | Elsewhere: keep as "!"
497 // \! | Elsewhere: keep as "\!"
498 // \\! | Elsewhere: keep as "\\!"
499 // \\\ | Keep as "\\" + "\", set consecutive slashes to 1
501 // \[\n] | Split (keep as "\")
502 // \\[\n] | Split (keep as "\\")
503 // ... | Keep as "..."
504 // \... | Keep as "\..."
505 // \\... | Keep as "\\..."
507 // Strings are forced to end with "\n", always applying "\..." and "\\..." rules
508 // "..." also includes another "\" character
510 // All whitespace is trimmed from each component
512 // "\\" and "\" are not downconverted to allow for other escape codes to be detected in
513 // ExpressionMatch::wildcardToRegex
518 // norm;!invert; norm-space ; !invert-space ;;!;\!norm-escaped;\\!slash-invert;\\\\double;
519 // escape\;sep;slash-end-split\\;quad\\\\!noninvert;newline-split[\n]newline-split-slash\\[\n]
520 // slash-at-end\\ [line does not continue]
522 // (Newlines are encoded as "[\n]". Ignore linebreaks for the sake of comment wrapping.)
525 // > Normal components without wildcard conversion
532 // slash-end-split\\ [line does not continue]
533 // quad\\\\!noninvert
535 // newline-split-slash\\ [line does not continue]
536 // slash-at-end\\ [line does not continue]
538 // > Inverted components without wildcard conversion
543 // > Normal components with wildcard conversion
550 // slash\-end\-split\\ [line does not continue]
551 // quad\\\\\!noninvert
553 // newline\-split\-slash\\ [line does not continue]
554 // slash\-at\-end\\ [line does not continue]
556 // > Inverted components with wildcard conversion
561 // > Normal wildcard-converted regex
562 // ^(?:norm|norm\-space|\!norm\-escaped|\\\!slash\-invert|\\\\double|escape\;sep|
563 // slash\-end\-split\\|quad\\\\\!noninvert|newline\-split|newline\-split\-slash\\|
564 // slash\-at\-end\\)$
566 // > Inverted wildcard-converted regex
567 // ^(?:invert|invert\-space)$
569 // Note: R"(\\)" results in the literal of "\\", two backslash characters. Anything inside the
570 // brackets is treated as a literal. Outside the brackets but inside the quotes is still
573 // See https://en.cppreference.com/w/cpp/language/string_literal
577 QString rule(originalRule);
579 // Force a termination at the end of the string to trigger a split
580 // Don't check for ";" splits as they may be escaped
581 if (!rule.endsWith("\n")) {
585 // Result, sorted into normal and inverted rules
586 // Use QStringList instead of std::vector<QString> to make use of Qt's built-in .join() method
587 QStringList normalComponents = {}, invertComponents = {};
592 QString curString = {};
594 int sourceLength = rule.length();
595 // Consecutive "\" characters
596 int consecutiveSlashes = 0;
597 // Whether or not this marks an inverted rule
598 bool isInverted = false;
599 // Whether or not we're at the beginning of the rule (for detecting "!" and "\!")
600 bool isRuleStart = true;
602 // We know it's going to have ";"-count items or less, so reserve ";"-count items for both.
603 // Without parsing it's not easily possible to tell which are escaped or not, and among the
604 // non-escaped entries, which are inverted or not. These get destroyed once out of scope of
605 // this function, so balancing towards performance over memory usage should be okay, hopefully.
606 int separatorCount = rule.count(";");
607 normalComponents.reserve(separatorCount);
608 invertComponents.reserve(separatorCount);
610 // For every character...
611 for (int i = 0; i < sourceLength; i++) {
613 curChar = rule.at(i);
614 // Check if it's on the list of special list characters, converting to Unicode for use
615 // in the switch statement
617 // See https://doc.qt.io/qt-5/qchar.html#unicode
618 switch (curChar.unicode()) {
621 switch (consecutiveSlashes) {
626 // "\\;" -> Split (keep as "\\")
627 // Not escaped separator, split into a new item
629 // Apply the additional "\\" if needed
630 if (consecutiveSlashes == 2) {
631 // "\\;" -> Split (keep as "\\")
632 curString.append(R"(\\)");
635 // Remove any whitespace, e.g. "item1; item2" -> " item2" -> "item2"
636 curString = curString.trimmed();
// Components that are empty after trimming are dropped entirely
639 if (!curString.isEmpty()) {
640 // Add to inverted/normal list
642 invertComponents.append(wildcardToRegEx(curString));
645 normalComponents.append(wildcardToRegEx(curString));
648 // Reset the current list item
654 // "\;" -> Replace with ";"
655 curString.append(";");
659 // This shouldn't ever happen (even with invalid wildcard rules), log a warning
660 qWarning() << Q_FUNC_INFO << "Wildcard rule" << rule << "resulted in rule component" << curString
661 << "with unexpected count of consecutive '\\' (" << consecutiveSlashes << "), ignoring" << curChar
666 consecutiveSlashes = 0;
669 // Rule inverter found
671 // Apply the inverting logic
672 switch (consecutiveSlashes) {
674 // "!" -> At start: mark as inverted
676 // Don't include the "!" character
679 // "\!" -> At start: replace with "!"
680 curString.append("!");
683 // "\\!" -> At start: keep as "\\!" (replaced with "\!" in wildcard conversion)
684 curString.append(R"(\\!)");
687 // This shouldn't ever happen (even with invalid wildcard rules), log a warning
688 qWarning() << Q_FUNC_INFO << "Wildcard rule" << rule << "resulted in rule component" << curString
689 << "with unexpected count of consecutive '\\' (" << consecutiveSlashes << "), ignoring" << curChar
695 // Preserve the characters as they are now
696 switch (consecutiveSlashes) {
698 // "!" -> Elsewhere: keep as "!"
699 curString.append("!");
703 // "\!" -> Elsewhere: keep as "\!"
704 // "\\!" -> Elsewhere: keep as "\\!"
705 curString.append(QString(R"(\)").repeated(consecutiveSlashes) + "!");
708 // This shouldn't ever happen (even with invalid wildcard rules), log a warning
709 qWarning() << Q_FUNC_INFO << "Wildcard rule" << rule << "resulted in rule component" << curString
710 << "with unexpected count of consecutive '\\' (" << consecutiveSlashes << "), ignoring" << curChar
716 consecutiveSlashes = 0;
720 // Increase consecutive slash count
721 consecutiveSlashes++;
722 // Check if we've reached "\\\"...
723 if (consecutiveSlashes == 3) {
724 // "\\\" -> Keep as "\\" + "\"
725 curString.append(R"(\\)");
726 // No longer at the rule start
728 // Set consecutive slashes to 1, recognizing the trailing "\"
729 consecutiveSlashes = 1;
731 else if (consecutiveSlashes > 3) {
732 // This shouldn't ever happen (even with invalid wildcard rules), log a warning
733 qWarning() << Q_FUNC_INFO << "Wildcard rule" << rule << "resulted in rule component" << curString
734 << "with unexpected count of consecutive '\\' (" << consecutiveSlashes << "), ignoring" << curChar
738 // Don't set "isRuleStart" here as "\" is used in escape sequences
742 // Preserve the characters as they are now
745 // "\[\n]" -> Split (keep as "\")
746 // "\\[\n]" -> Split (keep as "\\")
748 switch (consecutiveSlashes) {
754 // Apply the additional "\" or "\\"
755 curString.append(QString(R"(\)").repeated(consecutiveSlashes));
758 // This shouldn't ever happen (even with invalid wildcard rules), log a warning
759 qWarning() << Q_FUNC_INFO << "Wildcard rule" << rule << "resulted in rule component" << curString
760 << "with unexpected count of consecutive '\\' (" << consecutiveSlashes << "), applying newline split anyways!";
764 // Remove any whitespace, e.g. "item1; item2" -> " item2" -> "item2"
765 curString = curString.trimmed();
// Components that are empty after trimming are dropped entirely
768 if (!curString.isEmpty()) {
769 // Add to inverted/normal list
771 invertComponents.append(wildcardToRegEx(curString));
774 normalComponents.append(wildcardToRegEx(curString));
777 // Reset the current list item
781 consecutiveSlashes = 0;
784 // Preserve the characters as they are now
785 switch (consecutiveSlashes) {
787 // "..." -> Keep as "..."
788 curString.append(curChar);
792 // "\..." -> Keep as "\..."
793 // "\\..." -> Keep as "\\..."
794 curString.append(QString("\\").repeated(consecutiveSlashes) + curChar);
797 // This shouldn't ever happen (even with invalid wildcard rules), log a warning
798 qWarning() << Q_FUNC_INFO << "Wildcard rule" << rule << "resulted in rule component" << curString
799 << "with unexpected count of consecutive '\\' (" << consecutiveSlashes << "), ignoring " << curChar
803 // Don't mark as past rule start for whitespace (whitespace gets trimmed)
804 if (!curChar.isSpace()) {
807 consecutiveSlashes = 0;
812 // Clean up any duplicates
813 normalComponents.removeDuplicates();
814 invertComponents.removeDuplicates();
816 // Create full regular expressions by...
817 // > Anchoring to start and end of string to mimic QRegExp's .exactMatch() handling, "^...$"
818 // > Enclosing within a non-capturing group to avoid overhead of text extraction, "(?:...)"
819 // > Flattening normal and inverted rules using the regex OR character "...|..."
821 // Before: [foo, bar, baz]
822 // After: ^(?:foo|bar|baz)$
824 // See https://doc.qt.io/qt-5/qregularexpression.html#porting-from-qregexp-exactmatch
825 // And https://regex101.com/
827 // Any empty/invalid regex are handled within ExpressionMatch::match()
828 if (!normalComponents.isEmpty()) {
829 // Create normal match regex
830 if (normalComponents.count() == 1) {
831 // Single item, skip the noncapturing group
832 _matchRegEx = regExFactory("^" + normalComponents.at(0) + "$", caseSensitive);
835 _matchRegEx = regExFactory("^(?:" + normalComponents.join("|") + ")$", caseSensitive);
837 _matchRegExActive = true;
839 if (!invertComponents.isEmpty()) {
840 // Create invert match regex
841 if (invertComponents.count() == 1) {
842 // Single item, skip the noncapturing group
843 _matchInvertRegEx = regExFactory("^" + invertComponents.at(0) + "$", caseSensitive);
846 _matchInvertRegEx = regExFactory("^(?:" + invertComponents.join("|") + ")$", caseSensitive);
848 _matchInvertRegExActive = true;
// Converts a single wildcard expression into regular expression syntax, without anchors.
//
// expression: wildcard rule in which "*" stands for zero or more of any character, "?" stands
//             for any single character, and "\*", "\?" and "\\" are literals
//
// The expression is first run through regExEscape(). The loop below then walks the escaped
// text and counts consecutive "\" characters to tell real wildcards apart from escaped
// literals, appending the converted text to the result string.
852 QString ExpressionMatch::wildcardToRegEx(const QString& expression)
854 // Convert the wildcard expression into regular expression format
856 // We're taking a little bit different of a route...
858 // Original QRegExp::Wildcard rules:
859 // --------------------------
860 // Wildcard | Regex | Outcome
861 // ---------|-------|--------
862 // * | .* | zero or more of any character
863 // ? | . | any single character
865 // NOTE 1: This is QRegExp::Wildcard, not QRegExp::WildcardUnix
867 // NOTE 2: We are ignoring the "[...]" character-class matching functionality of
868 // QRegExp::Wildcard as that feature's a bit more complex and can be handled with full-featured
871 // See https://doc.qt.io/qt-5/qregexp.html#wildcard-matching
873 // Quassel originally did not use QRegExp::WildcardUnix, which prevented escaping "*" and "?" in
874 // messages. Unfortunately, spam messages might decide to use both, so offering a way to escape
877 // On the flip-side, that means to match "\" requires escaping as "\\", breaking backwards
880 // Quassel's Wildcard rules
881 // ------------------------------------------
882 // Wildcard | Regex escaped | Regex | Outcome
883 // ---------|---------------|-------|--------
884 // * | \* | .* | zero or more of any character
885 // ? | \? | . | any single character
886 // \* | \\\* | \* | literal "*"
887 // \? | \\\? | \? | literal "?"
888 // \[...] | \\[...] | [...] | invalid escape, ignore it
889 // \\ | \\\\ | \\ | literal "\"
891 // In essence, "*" and "?" need changed only if not escaped, "\\" collapses into "\", "\" gets
892 // ignored; other characters escape normally.
897 // never?gonna*give\*you\?up\\test|y\yeah\\1\\\\2\\\1inval
899 // ("\\\\" represents "\\", "\\" represents "\", and "\\\" is valid+invalid, "\")
901 // > Regex escaped wildcard rule
902 // never\?gonna\*give\\\*you\\\?up\\\\test\|y\\yeah\\\\1\\\\\\\\2\\\\\\1inval
904 // > Expected correct regex
905 // never.gonna.*give\*you\?up\\test\|yyeah\\1\\\\2\\1inval
907 // > Undoing regex escaping of "\" as "\\" (i.e. simple replace, with special escapes intact)
908 // never.gonna.*give\*you\?up\test\|yyeah\1\\2\1inval
910 // Escape string according to regex
911 QString regExEscaped(regExEscape(expression));
915 // NOTE: In theory, regular expression lookbehind could solve this. Unfortunately, QRegExp does
916 // not support lookbehind, and it's theoretically inefficient, anyways. Just use an approach
917 // similar to that taken by QRegExp's official wildcard mode.
919 // Lookbehind example (that we can't use):
920 // (?<!abc)test Negative lookbehind - don't match if "test" is preceded by "abc"
922 // See https://code.qt.io/cgit/qt/qtbase.git/tree/src/corelib/tools/qregexp.cpp
924 // NOTE: We don't copy QRegExp's mode as QRegularExpression has more special characters. We
925 // can't use the same escaping code, hence calling the appropriate QReg[...]::escape() above.
934 int sourceLength = regExEscaped.length();
935 // Consecutive "\" characters
936 int consecutiveSlashes = 0;
938 // We know it's going to be the same length or smaller, so reserve the same size as the string
939 result.reserve(sourceLength);
941 // For every character...
942 for (int i = 0; i < sourceLength; i++) {
944 curChar = regExEscaped.at(i);
945 // Check if it's on the list of special wildcard characters, converting to Unicode for use
946 // in the switch statement
948 // See https://doc.qt.io/qt-5/qchar.html#unicode
949 switch (curChar.unicode()) {
952 switch (consecutiveSlashes) {
954 // "?" -> "\?" -> "."
955 // Convert from regex escaped "?" to regular expression
959 // "\?" -> "\\\?" -> "\?"
960 // Convert from regex escaped "\?" to literal string
961 result.append(R"(\?)");
964 // This shouldn't ever happen (even with invalid wildcard rules), log a warning
965 qWarning() << Q_FUNC_INFO << "Wildcard rule" << expression << "resulted in escaped regular expression string"
966 << regExEscaped << " with unexpected count of consecutive '\\' (" << consecutiveSlashes << "), ignoring"
967 << curChar << "character!";
970 consecutiveSlashes = 0;
974 switch (consecutiveSlashes) {
976 // "*" -> "\*" -> ".*"
977 // Convert from regex escaped "*" to regular expression
981 // "\*" -> "\\\*" -> "\*"
982 // Convert from regex escaped "\*" to literal string
983 result.append(R"(\*)");
986 // This shouldn't ever happen (even with invalid wildcard rules), log a warning
987 qWarning() << Q_FUNC_INFO << "Wildcard rule" << expression << "resulted in escaped regular expression string"
988 << regExEscaped << " with unexpected count of consecutive '\\' (" << consecutiveSlashes << "), ignoring"
989 << curChar << "character!";
992 consecutiveSlashes = 0;
996 // Increase consecutive slash count
997 consecutiveSlashes++;
998 // Check if we've hit an escape sequence
999 if (consecutiveSlashes == 4) {
1000 // "\\" -> "\\\\" -> "\\"
1001 // Convert from regex escaped "\\" to literal string
1002 result.append(R"(\\)");
1003 // Reset slash count
1004 consecutiveSlashes = 0;
1008 // Any other character
1009 switch (consecutiveSlashes) {
1012 // "[...]" -> "[...]" -> "[...]"
1014 // "\[...]" -> "\\[...]" -> "[...]"
1015 // Either just print the character itself, or convert from regex-escaped invalid
1016 // wildcard escape sequence to the character itself
1018 // Both mean doing nothing, the actual character [...] gets appended below
1021 // "[...]" -> "\[...]" -> "\[...]"
1022 // Keep regex-escaped special character "[...]" as literal string
1023 // (Where "[...]" represents any non-wildcard regex special character)
1024 result.append(R"(\)");
1025 // The actual character [...] gets appended below
1028 // This shouldn't ever happen (even with invalid wildcard rules), log a warning
1029 qWarning() << Q_FUNC_INFO << "Wildcard rule" << expression << "resulted in escaped regular expression string"
1030 << regExEscaped << " with unexpected count of consecutive '\\' (" << consecutiveSlashes << "), ignoring"
1031 << curChar << "char escape!";
1034 consecutiveSlashes = 0;
1035 // Add the character itself
1036 result.append(curChar);
1041 // Anchoring to simulate QRegExp::exactMatch() is handled by the callers, see
1042 // ExpressionMatch::cacheRegEx() and ExpressionMatch::generateFromMultiWildcard()