From 7e6d619516b45c40365d694af3eae28a8a839090 Mon Sep 17 00:00:00 2001 From: nowrep Date: Tue, 26 Feb 2013 15:48:47 +0100 Subject: [PATCH] AdBlock: Greatly improves performance of regexp rules. Regexp will be parsed into parts that can be used with QString::contains for quick match. If url contains all parts, real QRegExp matching is applied. This speedup affects rules that are internally converted into regexp, not AdBlock's regexp rules (but those rules are very rarely used in subscriptions, so there's no need to optimise them) --- CHANGELOG | 1 + src/lib/adblock/adblockrule.cpp | 59 +++++++++++++---- src/lib/adblock/adblockrule.h | 8 ++- src/lib/cookies/cookiejar.cpp | 13 +--- src/lib/tools/qztools.cpp | 18 ++++++ src/lib/tools/qztools.h | 2 + tests/autotests/adblocktest.cpp | 109 ++++++++++++++++++++++++++++++++ tests/autotests/adblocktest.h | 36 +++++++++++ tests/autotests/autotests.pro | 6 +- tests/autotests/main.cpp | 4 ++ 10 files changed, 227 insertions(+), 29 deletions(-) create mode 100644 tests/autotests/adblocktest.cpp create mode 100644 tests/autotests/adblocktest.h diff --git a/CHANGELOG b/CHANGELOG index 11e53d877..fb8964549 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -24,6 +24,7 @@ Version 1.4.0 * moved config directory into ~/.config/qupzilla * certificates bundle is now only used on windows * reduced memory usage of AdBlock (saves up to 30MB with just EasyList) + * greatly improved performance when matching regexp rules in AdBlock * GreaseMonkey: reload script if source file changed on disk * GreaseMonkey: fixed don't loading invalid scripts * fixed opening browser with url with ampersand (%26) as command line argument diff --git a/src/lib/adblock/adblockrule.cpp b/src/lib/adblock/adblockrule.cpp index 8fe00b439..169595489 100644 --- a/src/lib/adblock/adblockrule.cpp +++ b/src/lib/adblock/adblockrule.cpp @@ -46,6 +46,7 @@ #include "adblockrule.h" #include "adblocksubscription.h" +#include "qztools.h" #include #include "qzregexp.h" @@ -206,12 +207,16 
@@ bool AdBlockRule::networkMatch(const QNetworkRequest &request, const QString &do bool matched = false; if (m_useDomainMatch) { - matched = _matchDomain(domain, m_matchString); + matched = isMatchingDomain(domain, m_matchString); } else if (m_useEndsMatch) { matched = encodedUrl.endsWith(m_matchString, m_caseSensitivity); } else if (m_regExp) { + if (!isMatchingRegExpStrings(encodedUrl)) { + return false; + } + matched = (m_regExp->indexIn(encodedUrl) != -1); } else { @@ -277,14 +282,14 @@ bool AdBlockRule::matchDomain(const QString &domain) const if (m_blockedDomains.isEmpty()) { foreach(const QString & d, m_allowedDomains) { - if (_matchDomain(domain, d)) { + if (isMatchingDomain(domain, d)) { return true; } } } else if (m_allowedDomains.isEmpty()) { foreach(const QString & d, m_blockedDomains) { - if (_matchDomain(domain, d)) { + if (isMatchingDomain(domain, d)) { return false; } } @@ -292,13 +297,13 @@ bool AdBlockRule::matchDomain(const QString &domain) const } else { foreach(const QString & d, m_blockedDomains) { - if (_matchDomain(domain, d)) { + if (isMatchingDomain(domain, d)) { return false; } } foreach(const QString & d, m_allowedDomains) { - if (_matchDomain(domain, d)) { + if (isMatchingDomain(domain, d)) { return true; } } @@ -388,6 +393,7 @@ void AdBlockRule::parseFilter() m_cssSelector = parsedLine.mid(pos + 2); m_cssSelector.remove('\\'); + // CSS rule cannot have more options -> stop parsing return; } @@ -504,7 +510,9 @@ void AdBlockRule::parseFilter() // we must modify parsedLine to comply with QzRegExp if (parsedLine.contains(QLatin1Char('*')) || parsedLine.contains(QLatin1Char('^')) || parsedLine.contains(QLatin1Char('|'))) { - parsedLine.replace(QzRegExp(QLatin1String("\\*+")), QLatin1String("*")) // remove multiple wildcards + QString parsedRegExp = parsedLine; + + parsedRegExp.replace(QzRegExp(QLatin1String("\\*+")), QLatin1String("*")) // remove multiple wildcards .replace(QzRegExp(QLatin1String("\\^\\|$")), QLatin1String("^")) // remove 
anchors following separator placeholder .replace(QzRegExp(QLatin1String("^(\\*)")), QString()) // remove leading wildcards .replace(QzRegExp(QLatin1String("(\\*)$")), QString()) @@ -517,7 +525,8 @@ void AdBlockRule::parseFilter() .replace(QzRegExp(QLatin1String("\\\\\\|$")), QLatin1String("$")) // process anchor at expression end .replace(QzRegExp(QLatin1String("\\\\\\*")), QLatin1String(".*")); // replace wildcards by .* - m_regExp = new QzRegExp(parsedLine, m_caseSensitivity); + m_regExpStrings = parseRegExpFilter(parsedLine); + m_regExp = new QzRegExp(parsedRegExp, m_caseSensitivity); return; } @@ -544,19 +553,41 @@ void AdBlockRule::parseDomains(const QString &domains, const QChar &separator) m_domainRestricted = (!m_blockedDomains.isEmpty() || !m_allowedDomains.isEmpty()); } -bool AdBlockRule::_matchDomain(const QString &domain, const QString &filter) const +bool AdBlockRule::isMatchingDomain(const QString &domain, const QString &filter) const { - if (!domain.endsWith(filter)) { - return false; + return QzTools::matchDomain(filter, domain); +} + +bool AdBlockRule::isMatchingRegExpStrings(const QString &url) const +{ + foreach(const QString & string, m_regExpStrings) { + if (!url.contains(string, m_caseSensitivity)) { // honour the rule's case sensitivity, same as m_regExp + return false; + } } - int index = domain.indexOf(filter); + return true; +} - if (index == 0 || filter[0] == QLatin1Char('.')) { - return true; +// Split regexp filter into strings that can be used with QString::contains +// Parts that contain only 1 char and duplicated parts are not used +QStringList AdBlockRule::parseRegExpFilter(const QString &parsedFilter) const +{ + // Meta characters in AdBlock rules are | * ^ + QStringList list = parsedFilter.split(QzRegExp("[|\\*\\^]"), QString::SkipEmptyParts); + + list.removeDuplicates(); + + for (int i = 0; i < list.length(); ++i) { + const QString &part = list.at(i); + + if (part.length() < 2) { + list.removeAt(i); + i--; + } } - return domain[index - 1] == QLatin1Char('.'); + return list; } AdBlockRule::~AdBlockRule() 
diff --git a/src/lib/adblock/adblockrule.h b/src/lib/adblock/adblockrule.h index f64f7d4b0..ed9afc480 100644 --- a/src/lib/adblock/adblockrule.h +++ b/src/lib/adblock/adblockrule.h @@ -95,12 +95,15 @@ public: bool matchXmlHttpRequest(const QNetworkRequest &request) const; bool matchImage(const QString &encodedUrl) const; +protected: + bool isMatchingDomain(const QString &domain, const QString &filter) const; + bool isMatchingRegExpStrings(const QString &url) const; + QStringList parseRegExpFilter(const QString &parsedFilter) const; + private: void parseFilter(); void parseDomains(const QString &domains, const QChar &separator); - bool _matchDomain(const QString &domain, const QString &filter) const; - AdBlockSubscription* m_subscription; QString m_filter; @@ -112,6 +115,7 @@ private: bool m_domainRestricted; QzRegExp* m_regExp; + QStringList m_regExpStrings; bool m_useDomainMatch; bool m_useEndsMatch; diff --git a/src/lib/cookies/cookiejar.cpp b/src/lib/cookies/cookiejar.cpp index 4c20c9134..122d8ae0a 100644 --- a/src/lib/cookies/cookiejar.cpp +++ b/src/lib/cookies/cookiejar.cpp @@ -19,6 +19,7 @@ #include "qupzilla.h" #include "mainapplication.h" #include "settings.h" +#include "qztools.h" #include #include @@ -234,17 +235,7 @@ bool CookieJar::matchDomain(QString cookieDomain, QString siteDomain) siteDomain = siteDomain.mid(1); } - if (cookieDomain == siteDomain) { - return true; - } - - if (!siteDomain.endsWith(cookieDomain)) { - return false; - } - - int index = siteDomain.indexOf(cookieDomain); - - return index > 0 && siteDomain[index - 1] == QLatin1Char('.'); + return QzTools::matchDomain(cookieDomain, siteDomain); } bool CookieJar::listMatchesDomain(const QStringList &list, const QString &cookieDomain) diff --git a/src/lib/tools/qztools.cpp b/src/lib/tools/qztools.cpp index 0fd775cec..996303fbc 100644 --- a/src/lib/tools/qztools.cpp +++ b/src/lib/tools/qztools.cpp @@ -472,6 +472,24 @@ bool QzTools::isUtf8(const char* string) return true; } +// Matches domain 
(assumes both pattern and domain not starting with dot) +// pattern = domain to be matched (equal to the site domain or a parent domain of it) +// domain = site domain +bool QzTools::matchDomain(const QString &pattern, const QString &domain) +{ + if (pattern == domain) { + return true; + } + + if (pattern.isEmpty() || !domain.endsWith(pattern)) { + return false; + } + + int index = domain.length() - pattern.length(); // start of the matched suffix; indexOf() would give the first occurrence + + return index > 0 && domain[index - 1] == QLatin1Char('.'); +} + static inline bool isQuote(const QChar &c) { return (c == QLatin1Char('"') || c == QLatin1Char('\'')); diff --git a/src/lib/tools/qztools.h b/src/lib/tools/qztools.h index 0d1008bd3..7465831f8 100644 --- a/src/lib/tools/qztools.h +++ b/src/lib/tools/qztools.h @@ -63,6 +63,8 @@ bool QT_QUPZILLA_EXPORT startExternalProcess(const QString &executable, const QS QIcon QT_QUPZILLA_EXPORT iconFromFileName(const QString &fileName); bool QT_QUPZILLA_EXPORT isUtf8(const char* string); +bool QT_QUPZILLA_EXPORT matchDomain(const QString &pattern, const QString &domain); + QString QT_QUPZILLA_EXPORT operatingSystem(); // Qt5 migration help functions diff --git a/tests/autotests/adblocktest.cpp b/tests/autotests/adblocktest.cpp new file mode 100644 index 000000000..9bcac640b --- /dev/null +++ b/tests/autotests/adblocktest.cpp @@ -0,0 +1,109 @@ +/* ============================================================ +* QupZilla - WebKit based browser +* Copyright (C) 2013 David Rosca +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation, either version 3 of the License, or +* (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. 
+* ============================================================ */ +#include "adblocktest.h" +#include "adblockrule.h" + +#include <QtTest/QtTest> + +class AdBlockRule_Test : public AdBlockRule +{ +public: + QStringList parseRegExpFilter(const QString &parsedFilter) + { + return AdBlockRule::parseRegExpFilter(parsedFilter); + } + + bool isMatchingDomain(const QString &domain, const QString &filter) const + { + return AdBlockRule::isMatchingDomain(domain, filter); + } +}; + +void AdBlockTest::isMatchingCookieTest_data() +{ + // Test copied from CookiesTest + QTest::addColumn<QString>("filterDomain"); + QTest::addColumn<QString>("siteDomain"); + QTest::addColumn<bool>("result"); + + QTest::newRow("test1") << "example.com" << "www.example.com" << true; + QTest::newRow("test2") << "example.com" << "example.com" << true; + QTest::newRow("test3") << "example.com" << "anotherexample.com" << false; + QTest::newRow("test4") << "test.example.com" << "example.com" << false; + QTest::newRow("test5") << "www.example.com" << "example.com" << false; + QTest::newRow("test_empty") << "www.example.com" << "" << false; + QTest::newRow("test_empty2") << "" << "example.com" << false; +} + +void AdBlockTest::isMatchingCookieTest() +{ + AdBlockRule_Test rule_test; + + QFETCH(QString, filterDomain); + QFETCH(QString, siteDomain); + QFETCH(bool, result); + + QCOMPARE(rule_test.isMatchingDomain(siteDomain, filterDomain), result); +} + +void AdBlockTest::parseRegExpFilterTest_data() +{ + QTest::addColumn<QString>("parsedFilter"); + QTest::addColumn<QStringList>("result"); + + QTest::newRow("test1") << "||doubleclick.net/pfadx/tmg.telegraph." 
+ << (QStringList() << "doubleclick.net/pfadx/tmg.telegraph."); + QTest::newRow("test2") << "||doubleclick.net/pfadx/*.mtvi" + << (QStringList() << "doubleclick.net/pfadx/" << ".mtvi"); + QTest::newRow("test3") << "&prvtof=*&poru=" + << (QStringList() << "&prvtof=" << "&poru="); + QTest::newRow("test4") << "/addyn|*;adtech;" + << (QStringList() << "/addyn" << ";adtech;"); + QTest::newRow("test5") << "/eas_fif.html^" + << (QStringList() << "/eas_fif.html"); + QTest::newRow("test6") << "://findnsave.^.*/api/groupon.json?" + << (QStringList() << "://findnsave." << "/api/groupon.json?"); + QTest::newRow("test7") << "^fp=*&prvtof=" + << (QStringList() << "fp=" << "&prvtof="); + QTest::newRow("test8") << "|http://ax-d.*/jstag^" + << (QStringList() << "http://ax-d." << "/jstag"); + QTest::newRow("test9") << "||reuters.com^*/rcom-wt-mlt.js" + << (QStringList() << "reuters.com" <<"/rcom-wt-mlt.js"); + QTest::newRow("test10") << "||chip.de^*/tracking.js" + << (QStringList() << "chip.de" << "/tracking.js"); + QTest::newRow("ignore1char") << "/search.php?uid=*.*&src=" + << (QStringList() << "/search.php?uid=" << "&src="); + QTest::newRow("ignoreDuplicates") << "/search.*.dup.*.dup.*&src=" + << (QStringList() << "/search." << ".dup." 
<< "&src="); + QTest::newRow("empty") << QString() + << (QStringList()); + QTest::newRow("justspaces") << QString("    ") + << (QStringList() << "    "); + QTest::newRow("spacesWithMetachars") << QString("    *    ?") + << (QStringList() << "    " << "    ?"); +} + +void AdBlockTest::parseRegExpFilterTest() +{ + AdBlockRule_Test rule_test; + + QFETCH(QString, parsedFilter); + QFETCH(QStringList, result); + + QCOMPARE(rule_test.parseRegExpFilter(parsedFilter), result); +} diff --git a/tests/autotests/adblocktest.h b/tests/autotests/adblocktest.h new file mode 100644 index 000000000..447720f43 --- /dev/null +++ b/tests/autotests/adblocktest.h @@ -0,0 +1,36 @@ +/* ============================================================ +* QupZilla - WebKit based browser +* Copyright (C) 2013 David Rosca +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation, either version 3 of the License, or +* (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. 
+* ============================================================ */ +#ifndef ADBLOCKTEST_H +#define ADBLOCKTEST_H + +#include <QObject> + +class AdBlockTest : public QObject +{ + Q_OBJECT + +private slots: + void isMatchingCookieTest_data(); + void isMatchingCookieTest(); + + void parseRegExpFilterTest_data(); + void parseRegExpFilterTest(); + +}; + +#endif // ADBLOCKTEST_H diff --git a/tests/autotests/autotests.pro b/tests/autotests/autotests.pro index 4ea0bdd00..faf2b388f 100644 --- a/tests/autotests/autotests.pro +++ b/tests/autotests/autotests.pro @@ -49,11 +49,13 @@ HEADERS += \ qztoolstest.h \ formcompletertest.h \ cookiestest.h \ - downloadstest.h + downloadstest.h \ + adblocktest.h SOURCES += \ qztoolstest.cpp \ main.cpp \ formcompletertest.cpp \ cookiestest.cpp \ - downloadstest.cpp + downloadstest.cpp \ + adblocktest.cpp diff --git a/tests/autotests/main.cpp b/tests/autotests/main.cpp index 42829f56e..03f592db6 100644 --- a/tests/autotests/main.cpp +++ b/tests/autotests/main.cpp @@ -19,6 +19,7 @@ #include "formcompletertest.h" #include "cookiestest.h" #include "downloadstest.h" +#include "adblocktest.h" #include @@ -39,5 +40,8 @@ int main(int argc, char *argv[]) DownloadsTest downloadsTest; QTest::qExec(&downloadsTest, argc, argv); + AdBlockTest adblockTest; + QTest::qExec(&adblockTest, argc, argv); + return 0; }