1
mirror of https://invent.kde.org/network/falkon.git synced 2024-09-21 17:52:10 +02:00

AdBlock: Greatly improves performance of regexp rules.

Regexp will be parsed into parts that can be used with
QString::contains for quick match. If url contains all parts,
real QRegExp matching is applied.

This speedup affects rules that are internally converted into
regexp, not AdBlock's regexp rules (but those rules are very rarely used
in subscriptions, so there's no need to optimise them)
This commit is contained in:
nowrep 2013-02-26 15:48:47 +01:00
parent b52d150e7d
commit 7e6d619516
10 changed files with 227 additions and 29 deletions

View File

@ -24,6 +24,7 @@ Version 1.4.0
* moved config directory into ~/.config/qupzilla * moved config directory into ~/.config/qupzilla
* certificates bundle is now only used on windows * certificates bundle is now only used on windows
* reduced memory usage of AdBlock (saves up to 30MB with just EasyList) * reduced memory usage of AdBlock (saves up to 30MB with just EasyList)
* greatly improved performance when matching regexp rules in AdBlock
* GreaseMonkey: reload script if source file changed on disk * GreaseMonkey: reload script if source file changed on disk
* GreaseMonkey: fixed don't loading invalid scripts * GreaseMonkey: fixed don't loading invalid scripts
* fixed opening browser with url with ampersand (%26) as command line argument * fixed opening browser with url with ampersand (%26) as command line argument

View File

@ -46,6 +46,7 @@
#include "adblockrule.h" #include "adblockrule.h"
#include "adblocksubscription.h" #include "adblocksubscription.h"
#include "qztools.h"
#include <QDebug> #include <QDebug>
#include "qzregexp.h" #include "qzregexp.h"
@ -206,12 +207,16 @@ bool AdBlockRule::networkMatch(const QNetworkRequest &request, const QString &do
bool matched = false; bool matched = false;
if (m_useDomainMatch) { if (m_useDomainMatch) {
matched = _matchDomain(domain, m_matchString); matched = isMatchingDomain(domain, m_matchString);
} }
else if (m_useEndsMatch) { else if (m_useEndsMatch) {
matched = encodedUrl.endsWith(m_matchString, m_caseSensitivity); matched = encodedUrl.endsWith(m_matchString, m_caseSensitivity);
} }
else if (m_regExp) { else if (m_regExp) {
if (!isMatchingRegExpStrings(encodedUrl)) {
return false;
}
matched = (m_regExp->indexIn(encodedUrl) != -1); matched = (m_regExp->indexIn(encodedUrl) != -1);
} }
else { else {
@ -277,14 +282,14 @@ bool AdBlockRule::matchDomain(const QString &domain) const
if (m_blockedDomains.isEmpty()) { if (m_blockedDomains.isEmpty()) {
foreach(const QString & d, m_allowedDomains) { foreach(const QString & d, m_allowedDomains) {
if (_matchDomain(domain, d)) { if (isMatchingDomain(domain, d)) {
return true; return true;
} }
} }
} }
else if (m_allowedDomains.isEmpty()) { else if (m_allowedDomains.isEmpty()) {
foreach(const QString & d, m_blockedDomains) { foreach(const QString & d, m_blockedDomains) {
if (_matchDomain(domain, d)) { if (isMatchingDomain(domain, d)) {
return false; return false;
} }
} }
@ -292,13 +297,13 @@ bool AdBlockRule::matchDomain(const QString &domain) const
} }
else { else {
foreach(const QString & d, m_blockedDomains) { foreach(const QString & d, m_blockedDomains) {
if (_matchDomain(domain, d)) { if (isMatchingDomain(domain, d)) {
return false; return false;
} }
} }
foreach(const QString & d, m_allowedDomains) { foreach(const QString & d, m_allowedDomains) {
if (_matchDomain(domain, d)) { if (isMatchingDomain(domain, d)) {
return true; return true;
} }
} }
@ -388,6 +393,7 @@ void AdBlockRule::parseFilter()
m_cssSelector = parsedLine.mid(pos + 2); m_cssSelector = parsedLine.mid(pos + 2);
m_cssSelector.remove('\\'); m_cssSelector.remove('\\');
// CSS rule cannot have more options -> stop parsing // CSS rule cannot have more options -> stop parsing
return; return;
} }
@ -504,7 +510,9 @@ void AdBlockRule::parseFilter()
// we must modify parsedLine to comply with QzRegExp // we must modify parsedLine to comply with QzRegExp
if (parsedLine.contains(QLatin1Char('*')) || parsedLine.contains(QLatin1Char('^')) if (parsedLine.contains(QLatin1Char('*')) || parsedLine.contains(QLatin1Char('^'))
|| parsedLine.contains(QLatin1Char('|'))) { || parsedLine.contains(QLatin1Char('|'))) {
parsedLine.replace(QzRegExp(QLatin1String("\\*+")), QLatin1String("*")) // remove multiple wildcards QString parsedRegExp = parsedLine;
parsedRegExp.replace(QzRegExp(QLatin1String("\\*+")), QLatin1String("*")) // remove multiple wildcards
.replace(QzRegExp(QLatin1String("\\^\\|$")), QLatin1String("^")) // remove anchors following separator placeholder .replace(QzRegExp(QLatin1String("\\^\\|$")), QLatin1String("^")) // remove anchors following separator placeholder
.replace(QzRegExp(QLatin1String("^(\\*)")), QString()) // remove leading wildcards .replace(QzRegExp(QLatin1String("^(\\*)")), QString()) // remove leading wildcards
.replace(QzRegExp(QLatin1String("(\\*)$")), QString()) .replace(QzRegExp(QLatin1String("(\\*)$")), QString())
@ -517,7 +525,8 @@ void AdBlockRule::parseFilter()
.replace(QzRegExp(QLatin1String("\\\\\\|$")), QLatin1String("$")) // process anchor at expression end .replace(QzRegExp(QLatin1String("\\\\\\|$")), QLatin1String("$")) // process anchor at expression end
.replace(QzRegExp(QLatin1String("\\\\\\*")), QLatin1String(".*")); // replace wildcards by .* .replace(QzRegExp(QLatin1String("\\\\\\*")), QLatin1String(".*")); // replace wildcards by .*
m_regExp = new QzRegExp(parsedLine, m_caseSensitivity); m_regExpStrings = parseRegExpFilter(parsedLine);
m_regExp = new QzRegExp(parsedRegExp, m_caseSensitivity);
return; return;
} }
@ -544,19 +553,41 @@ void AdBlockRule::parseDomains(const QString &domains, const QChar &separator)
m_domainRestricted = (!m_blockedDomains.isEmpty() || !m_allowedDomains.isEmpty()); m_domainRestricted = (!m_blockedDomains.isEmpty() || !m_allowedDomains.isEmpty());
} }
bool AdBlockRule::_matchDomain(const QString &domain, const QString &filter) const bool AdBlockRule::isMatchingDomain(const QString &domain, const QString &filter) const
{ {
if (!domain.endsWith(filter)) { return QzTools::matchDomain(filter, domain);
}
bool AdBlockRule::isMatchingRegExpStrings(const QString &url) const
{
foreach(const QString & string, m_regExpStrings) {
if (!url.contains(string)) {
return false; return false;
} }
int index = domain.indexOf(filter);
if (index == 0 || filter[0] == QLatin1Char('.')) {
return true;
} }
return domain[index - 1] == QLatin1Char('.'); return true;
}
// Split regexp filter into strings that can be used with QString::contains
// Parts shorter than 2 characters are skipped and duplicated parts are kept only once
QStringList AdBlockRule::parseRegExpFilter(const QString &parsedFilter) const
{
// Meta characters in AdBlock rules are | * ^
QStringList list = parsedFilter.split(QzRegExp("[|\\*\\^]"), QString::SkipEmptyParts);
list.removeDuplicates();
for (int i = 0; i < list.length(); ++i) {
const QString &part = list.at(i);
if (part.length() < 2) {
list.removeAt(i);
i--;
}
}
return list;
} }
AdBlockRule::~AdBlockRule() AdBlockRule::~AdBlockRule()

View File

@ -95,12 +95,15 @@ public:
bool matchXmlHttpRequest(const QNetworkRequest &request) const; bool matchXmlHttpRequest(const QNetworkRequest &request) const;
bool matchImage(const QString &encodedUrl) const; bool matchImage(const QString &encodedUrl) const;
protected:
bool isMatchingDomain(const QString &domain, const QString &filter) const;
bool isMatchingRegExpStrings(const QString &url) const;
QStringList parseRegExpFilter(const QString &parsedFilter) const;
private: private:
void parseFilter(); void parseFilter();
void parseDomains(const QString &domains, const QChar &separator); void parseDomains(const QString &domains, const QChar &separator);
bool _matchDomain(const QString &domain, const QString &filter) const;
AdBlockSubscription* m_subscription; AdBlockSubscription* m_subscription;
QString m_filter; QString m_filter;
@ -112,6 +115,7 @@ private:
bool m_domainRestricted; bool m_domainRestricted;
QzRegExp* m_regExp; QzRegExp* m_regExp;
QStringList m_regExpStrings;
bool m_useDomainMatch; bool m_useDomainMatch;
bool m_useEndsMatch; bool m_useEndsMatch;

View File

@ -19,6 +19,7 @@
#include "qupzilla.h" #include "qupzilla.h"
#include "mainapplication.h" #include "mainapplication.h"
#include "settings.h" #include "settings.h"
#include "qztools.h"
#include <QNetworkCookie> #include <QNetworkCookie>
#include <QWebSettings> #include <QWebSettings>
@ -234,17 +235,7 @@ bool CookieJar::matchDomain(QString cookieDomain, QString siteDomain)
siteDomain = siteDomain.mid(1); siteDomain = siteDomain.mid(1);
} }
if (cookieDomain == siteDomain) { return QzTools::matchDomain(cookieDomain, siteDomain);
return true;
}
if (!siteDomain.endsWith(cookieDomain)) {
return false;
}
int index = siteDomain.indexOf(cookieDomain);
return index > 0 && siteDomain[index - 1] == QLatin1Char('.');
} }
bool CookieJar::listMatchesDomain(const QStringList &list, const QString &cookieDomain) bool CookieJar::listMatchesDomain(const QStringList &list, const QString &cookieDomain)

View File

@ -472,6 +472,24 @@ bool QzTools::isUtf8(const char* string)
return true; return true;
} }
// Matches domain (assumes both pattern and domain not starting with dot)
// pattern = domain to be matched
// domain = site domain
bool QzTools::matchDomain(const QString &pattern, const QString &domain)
{
if (pattern == domain) {
return true;
}
if (!domain.endsWith(pattern)) {
return false;
}
int index = domain.indexOf(pattern);
return index > 0 && domain[index - 1] == QLatin1Char('.');
}
static inline bool isQuote(const QChar &c) static inline bool isQuote(const QChar &c)
{ {
return (c == QLatin1Char('"') || c == QLatin1Char('\'')); return (c == QLatin1Char('"') || c == QLatin1Char('\''));

View File

@ -63,6 +63,8 @@ bool QT_QUPZILLA_EXPORT startExternalProcess(const QString &executable, const QS
QIcon QT_QUPZILLA_EXPORT iconFromFileName(const QString &fileName); QIcon QT_QUPZILLA_EXPORT iconFromFileName(const QString &fileName);
bool QT_QUPZILLA_EXPORT isUtf8(const char* string); bool QT_QUPZILLA_EXPORT isUtf8(const char* string);
bool QT_QUPZILLA_EXPORT matchDomain(const QString &pattern, const QString &domain);
QString QT_QUPZILLA_EXPORT operatingSystem(); QString QT_QUPZILLA_EXPORT operatingSystem();
// Qt5 migration help functions // Qt5 migration help functions

View File

@ -0,0 +1,109 @@
/* ============================================================
* QupZilla - WebKit based browser
* Copyright (C) 2013 David Rosca <nowrep@gmail.com>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
* ============================================================ */
#include "adblocktest.h"
#include "adblockrule.h"
#include <QtTest/QtTest>
// Test-only subclass of AdBlockRule that makes the protected helpers
// parseRegExpFilter() and isMatchingDomain() callable from the test slots.
class AdBlockRule_Test : public AdBlockRule
{
public:
    // Re-export the protected members with public access instead of
    // writing forwarding wrappers.
    using AdBlockRule::parseRegExpFilter;
    using AdBlockRule::isMatchingDomain;
};
// Registers the data rows consumed by isMatchingCookieTest().
// Columns: "filterDomain", "siteDomain" and the expected boolean "result".
void AdBlockTest::isMatchingCookieTest_data()
{
    // Test copied from CookiesTest

    QTest::addColumn<QString>("filterDomain");
    QTest::addColumn<QString>("siteDomain");
    QTest::addColumn<bool>("result");

    struct Row {
        const char* tag;
        const char* filterDomain;
        const char* siteDomain;
        bool expected;
    };

    static const Row rows[] = {
        { "test1",       "example.com",      "www.example.com",    true  },
        { "test2",       "example.com",      "example.com",        true  },
        { "test3",       "example.com",      "anotherexample.com", false },
        { "test4",       "test.example.com", "example.com",        false },
        { "test5",       "www.example.com",  "example.com",        false },
        { "test_empty",  "www.example.com",  "",                   false },
        { "test_empty2", "",                 "example.com",        false }
    };

    const int rowCount = sizeof(rows) / sizeof(rows[0]);

    for (int i = 0; i < rowCount; ++i) {
        const Row &row = rows[i];
        QTest::newRow(row.tag) << row.filterDomain << row.siteDomain << row.expected;
    }
}
// Runs once per data row: isMatchingDomain(siteDomain, filterDomain) on a
// default-constructed rule must produce the row's expected result.
void AdBlockTest::isMatchingCookieTest()
{
    // Fetch the current row first
    QFETCH(QString, filterDomain);
    QFETCH(QString, siteDomain);
    QFETCH(bool, result);

    AdBlockRule_Test rule_test;

    QCOMPARE(rule_test.isMatchingDomain(siteDomain, filterDomain), result);
}
// Registers the data rows consumed by parseRegExpFilterTest().
// Columns: "parsedFilter" (input filter string) and "result" (the list of
// parts parseRegExpFilter() is expected to return for it).
void AdBlockTest::parseRegExpFilterTest_data()
{
    QTest::addColumn<QString>("parsedFilter");
    QTest::addColumn<QStringList>("result");

    QTest::newRow("test1") << "||doubleclick.net/pfadx/tmg.telegraph."
                           << (QStringList() << "doubleclick.net/pfadx/tmg.telegraph.");
    QTest::newRow("test2") << "||doubleclick.net/pfadx/*.mtvi"
                           << (QStringList() << "doubleclick.net/pfadx/" << ".mtvi");
    QTest::newRow("test3") << "&prvtof=*&poru="
                           << (QStringList() << "&prvtof=" << "&poru=");
    QTest::newRow("test4") << "/addyn|*;adtech;"
                           << (QStringList() << "/addyn" << ";adtech;");
    QTest::newRow("test5") << "/eas_fif.html^"
                           << (QStringList() << "/eas_fif.html");
    QTest::newRow("test6") << "://findnsave.^.*/api/groupon.json?"
                           << (QStringList() << "://findnsave." << "/api/groupon.json?");
    QTest::newRow("test7") << "^fp=*&prvtof="
                           << (QStringList() << "fp=" << "&prvtof=");
    QTest::newRow("test8") << "|http://ax-d.*/jstag^"
                           << (QStringList() << "http://ax-d." << "/jstag");
    QTest::newRow("test9") << "||reuters.com^*/rcom-wt-mlt.js"
                           << (QStringList() << "reuters.com" << "/rcom-wt-mlt.js");
    QTest::newRow("test10") << "||chip.de^*/tracking.js"
                            << (QStringList() << "chip.de" << "/tracking.js");

    // The lone "." between the two wildcards is a 1-char part and is dropped
    QTest::newRow("ignore1char") << "/search.php?uid=*.*&src="
                                 << (QStringList() << "/search.php?uid=" << "&src=");
    // ".dup." occurs twice but must be reported only once
    QTest::newRow("ignoreDuplicates") << "/search.*.dup.*.dup.*&src="
                                      << (QStringList() << "/search." << ".dup." << "&src=");
    QTest::newRow("empty") << QString()
                           << (QStringList());

    // Whitespace is not a meta character, so it is kept verbatim. The inputs
    // use three spaces because parseRegExpFilter() drops every part shorter
    // than 2 characters; a single-space part would never appear in the result.
    QTest::newRow("justspaces") << QString("   ")
                                << (QStringList() << "   ");
    QTest::newRow("spacesWithMetachars") << QString("   *   ?")
                                         << (QStringList() << "   " << "   ?");
}
// Runs once per data row: parseRegExpFilter(parsedFilter) on a
// default-constructed rule must return exactly the row's expected list.
void AdBlockTest::parseRegExpFilterTest()
{
    // Fetch the current row first
    QFETCH(QString, parsedFilter);
    QFETCH(QStringList, result);

    AdBlockRule_Test rule_test;

    QCOMPARE(rule_test.parseRegExpFilter(parsedFilter), result);
}

View File

@ -0,0 +1,36 @@
/* ============================================================
* QupZilla - WebKit based browser
* Copyright (C) 2013 David Rosca <nowrep@gmail.com>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
* ============================================================ */
#ifndef ADBLOCKTEST_H
#define ADBLOCKTEST_H
#include <QObject>
// QtTest test object for AdBlockRule helpers.
// Each test slot has a matching *_data() slot that registers its data rows.
class AdBlockTest : public QObject
{
    Q_OBJECT

private Q_SLOTS:
    // AdBlockRule::isMatchingDomain
    void isMatchingCookieTest_data();
    void isMatchingCookieTest();

    // AdBlockRule::parseRegExpFilter
    void parseRegExpFilterTest_data();
    void parseRegExpFilterTest();
};
#endif // ADBLOCKTEST_H

View File

@ -49,11 +49,13 @@ HEADERS += \
qztoolstest.h \ qztoolstest.h \
formcompletertest.h \ formcompletertest.h \
cookiestest.h \ cookiestest.h \
downloadstest.h downloadstest.h \
adblocktest.h
SOURCES += \ SOURCES += \
qztoolstest.cpp \ qztoolstest.cpp \
main.cpp \ main.cpp \
formcompletertest.cpp \ formcompletertest.cpp \
cookiestest.cpp \ cookiestest.cpp \
downloadstest.cpp downloadstest.cpp \
adblocktest.cpp

View File

@ -19,6 +19,7 @@
#include "formcompletertest.h" #include "formcompletertest.h"
#include "cookiestest.h" #include "cookiestest.h"
#include "downloadstest.h" #include "downloadstest.h"
#include "adblocktest.h"
#include <QtTest/QtTest> #include <QtTest/QtTest>
@ -39,5 +40,8 @@ int main(int argc, char *argv[])
DownloadsTest downloadsTest; DownloadsTest downloadsTest;
QTest::qExec(&downloadsTest, argc, argv); QTest::qExec(&downloadsTest, argc, argv);
AdBlockTest adblockTest;
QTest::qExec(&adblockTest, argc, argv);
return 0; return 0;
} }