1
mirror of https://invent.kde.org/network/falkon.git synced 2024-09-21 09:42:10 +02:00

[AdBlock] Improved performance of loading rules

Don't use regexps for parsing rules.
Added benchmark for loading subscriptions

Before:

********* Start testing of AdBlockParseRule *********
Config: Using QTest library 4.8.6, Qt 4.8.6
PASS   : AdBlockParseRule::initTestCase()
RESULT : AdBlockParseRule::parseEasyList():
     596.3 msecs per iteration (total: 2,982, iterations: 5)
PASS   : AdBlockParseRule::parseEasyList()
PASS   : AdBlockParseRule::cleanupTestCase()
Totals: 3 passed, 0 failed, 0 skipped
********* Finished testing of AdBlockParseRule *********

After:

********* Start testing of AdBlockParseRule *********
Config: Using QTest library 4.8.6, Qt 4.8.6
PASS   : AdBlockParseRule::initTestCase()
RESULT : AdBlockParseRule::parseEasyList():
     481.8 msecs per iteration (total: 2,409, iterations: 5)
PASS   : AdBlockParseRule::parseEasyList()
PASS   : AdBlockParseRule::cleanupTestCase()
Totals: 3 passed, 0 failed, 0 skipped
********* Finished testing of AdBlockParseRule *********
This commit is contained in:
nowrep 2014-04-06 13:34:01 +02:00
parent 1b01e7469f
commit db664184d0
10 changed files with 42167 additions and 86 deletions

View File

@ -70,24 +70,24 @@ static QString toSecondLevelDomain(const QUrl &url)
QString domain = urlHost.left(urlHost.size() - topLevelDomain.size());
if (domain.count(QLatin1Char('.')) == 0) {
if (domain.count(QL1C('.')) == 0) {
return urlHost;
}
while (domain.count(QLatin1Char('.')) != 0) {
domain = domain.mid(domain.indexOf(QLatin1Char('.')) + 1);
while (domain.count(QL1C('.')) != 0) {
domain = domain.mid(domain.indexOf(QL1C('.')) + 1);
}
return domain + topLevelDomain;
#else
QString domain = url.host();
if (domain.count(QLatin1Char('.')) == 0) {
if (domain.count(QL1C('.')) == 0) {
return QString();
}
while (domain.count(QLatin1Char('.')) != 1) {
domain = domain.mid(domain.indexOf(QLatin1Char('.')) + 1);
while (domain.count(QL1C('.')) != 1) {
domain = domain.mid(domain.indexOf(QL1C('.')) + 1);
}
return domain;
@ -106,6 +106,11 @@ AdBlockRule::AdBlockRule(const QString &filter, AdBlockSubscription* subscriptio
setFilter(filter);
}
// Frees the RegExp helper object held in m_regExp. parseFilter() allocates it
// with new when a filter has to be turned into a regular expression.
// NOTE(review): relies on m_regExp being null for rules that never allocate
// one - confirm against the constructor's initialisation.
AdBlockRule::~AdBlockRule()
{
delete m_regExp;
}
AdBlockSubscription* AdBlockRule::subscription() const
{
return m_subscription;
@ -159,7 +164,7 @@ bool AdBlockRule::isException() const
bool AdBlockRule::isComment() const
{
return m_filter.startsWith(QLatin1Char('!'));
return m_filter.startsWith(QL1C('!'));
}
bool AdBlockRule::isEnabled() const
@ -315,7 +320,7 @@ bool AdBlockRule::matchThirdParty(const QNetworkRequest &request) const
bool AdBlockRule::matchObject(const QNetworkRequest &request) const
{
bool match = request.attribute(QNetworkRequest::Attribute(QNetworkRequest::User + 150)).toString() == QLatin1String("object");
bool match = request.attribute(QNetworkRequest::Attribute(QNetworkRequest::User + 150)).toString() == QL1S("object");
return hasException(ObjectOption) ? !match : match;
}
@ -346,10 +351,10 @@ bool AdBlockRule::matchXmlHttpRequest(const QNetworkRequest &request) const
bool AdBlockRule::matchImage(const QString &encodedUrl) const
{
bool match = encodedUrl.endsWith(QLatin1String(".png")) ||
encodedUrl.endsWith(QLatin1String(".jpg")) ||
encodedUrl.endsWith(QLatin1String(".gif")) ||
encodedUrl.endsWith(QLatin1String(".jpeg"));
bool match = encodedUrl.endsWith(QL1S(".png")) ||
encodedUrl.endsWith(QL1S(".jpg")) ||
encodedUrl.endsWith(QL1S(".gif")) ||
encodedUrl.endsWith(QL1S(".jpeg"));
return hasException(ImageOption) ? !match : match;
}
@ -359,7 +364,7 @@ void AdBlockRule::parseFilter()
QString parsedLine = m_filter;
// Empty rule or just comment
if (m_filter.trimmed().isEmpty() || m_filter.startsWith(QLatin1Char('!'))) {
if (m_filter.trimmed().isEmpty() || m_filter.startsWith(QL1C('!'))) {
// We want to differentiate rule disabled by user and rule disabled in subscription file
// m_isInternalDisabled is also used when rule is disabled due to all options not being supported
m_isEnabled = false;
@ -369,14 +374,14 @@ void AdBlockRule::parseFilter()
}
// CSS Element hiding rule
if (parsedLine.contains(QLatin1String("##"))) {
if (parsedLine.contains(QL1S("##"))) {
m_type = CssRule;
int pos = parsedLine.indexOf(QLatin1String("##"));
int pos = parsedLine.indexOf(QL1S("##"));
// Domain restricted rule
if (!parsedLine.startsWith(QLatin1String("##"))) {
if (!parsedLine.startsWith(QL1S("##"))) {
QString domains = parsedLine.left(pos);
parseDomains(domains, QLatin1Char(','));
parseDomains(domains, QL1C(','));
}
m_matchString = parsedLine.mid(pos + 2);
@ -386,60 +391,60 @@ void AdBlockRule::parseFilter()
}
// Exception always starts with @@
if (parsedLine.startsWith(QLatin1String("@@"))) {
if (parsedLine.startsWith(QL1S("@@"))) {
m_isException = true;
parsedLine = parsedLine.mid(2);
}
// Parse all options following $ char
int optionsIndex = parsedLine.indexOf(QLatin1Char('$'));
int optionsIndex = parsedLine.indexOf(QL1C('$'));
if (optionsIndex >= 0) {
QStringList options = parsedLine.mid(optionsIndex + 1).split(QLatin1Char(','), QString::SkipEmptyParts);
const QStringList options = parsedLine.mid(optionsIndex + 1).split(QL1C(','), QString::SkipEmptyParts);
int handledOptions = 0;
foreach (const QString &option, options) {
if (option.startsWith(QLatin1String("domain="))) {
parseDomains(option.mid(7), QLatin1Char('|'));
if (option.startsWith(QL1S("domain="))) {
parseDomains(option.mid(7), QL1C('|'));
++handledOptions;
}
else if (option == QLatin1String("match-case")) {
else if (option == QL1S("match-case")) {
m_caseSensitivity = Qt::CaseSensitive;
++handledOptions;
}
else if (option.endsWith(QLatin1String("third-party"))) {
else if (option.endsWith(QL1S("third-party"))) {
setOption(ThirdPartyOption);
setException(ThirdPartyOption, option.startsWith(QLatin1Char('~')));
setException(ThirdPartyOption, option.startsWith(QL1C('~')));
++handledOptions;
}
else if (option.endsWith(QLatin1String("object"))) {
else if (option.endsWith(QL1S("object"))) {
setOption(ObjectOption);
setException(ObjectOption, option.startsWith(QLatin1Char('~')));
setException(ObjectOption, option.startsWith(QL1C('~')));
++handledOptions;
}
else if (option.endsWith(QLatin1String("subdocument"))) {
else if (option.endsWith(QL1S("subdocument"))) {
setOption(SubdocumentOption);
setException(SubdocumentOption, option.startsWith(QLatin1Char('~')));
setException(SubdocumentOption, option.startsWith(QL1C('~')));
++handledOptions;
}
else if (option.endsWith(QLatin1String("xmlhttprequest"))) {
else if (option.endsWith(QL1S("xmlhttprequest"))) {
setOption(XMLHttpRequestOption);
setException(XMLHttpRequestOption, option.startsWith(QLatin1Char('~')));
setException(XMLHttpRequestOption, option.startsWith(QL1C('~')));
++handledOptions;
}
else if (option.endsWith(QLatin1String("image"))) {
else if (option.endsWith(QL1S("image"))) {
setOption(ImageOption);
setException(ImageOption, option.startsWith(QLatin1Char('~')));
setException(ImageOption, option.startsWith(QL1C('~')));
++handledOptions;
}
else if (option == QLatin1String("document") && m_isException) {
else if (option == QL1S("document") && m_isException) {
setOption(DocumentOption);
++handledOptions;
}
else if (option == QLatin1String("elemhide") && m_isException) {
else if (option == QL1S("elemhide") && m_isException) {
setOption(ElementHideOption);
++handledOptions;
}
else if (option == QLatin1String("collapse")) {
else if (option == QL1S("collapse")) {
// Hiding placeholders of blocked elements is enabled by default
++handledOptions;
}
@ -456,7 +461,7 @@ void AdBlockRule::parseFilter()
}
// Rule is classic regexp
if (parsedLine.startsWith(QLatin1Char('/')) && parsedLine.endsWith(QLatin1Char('/'))) {
if (parsedLine.startsWith(QL1C('/')) && parsedLine.endsWith(QL1C('/'))) {
parsedLine = parsedLine.mid(1);
parsedLine = parsedLine.left(parsedLine.size() - 1);
@ -468,19 +473,16 @@ void AdBlockRule::parseFilter()
}
// Remove starting and ending wildcards (*)
if (parsedLine.startsWith(QLatin1Char('*'))) {
if (parsedLine.startsWith(QL1C('*'))) {
parsedLine = parsedLine.mid(1);
}
if (parsedLine.endsWith(QLatin1Char('*'))) {
if (parsedLine.endsWith(QL1C('*'))) {
parsedLine = parsedLine.left(parsedLine.size() - 1);
}
// We can use fast string matching for domain here
if (parsedLine.startsWith(QLatin1String("||")) &&
parsedLine.endsWith(QLatin1Char('^')) &&
!parsedLine.contains(QzRegExp("[/:?=&\\*]"))
) {
if (filterIsOnlyDomain(parsedLine)) {
parsedLine = parsedLine.mid(2);
parsedLine = parsedLine.left(parsedLine.size() - 1);
@ -490,10 +492,7 @@ void AdBlockRule::parseFilter()
}
// If rule contains only | at end, we can also use string matching
if (parsedLine.endsWith(QLatin1Char('|')) &&
parsedLine.count(QLatin1Char('|')) == 1 &&
!parsedLine.contains(QzRegExp("[\\^\\*]"))
) {
if (filterIsOnlyEndsMatch(parsedLine)) {
parsedLine = parsedLine.left(parsedLine.size() - 1);
m_type = StringEndsMatchRule;
@ -503,28 +502,13 @@ void AdBlockRule::parseFilter()
// If we still find a wildcard (*) or separator (^) or (|)
// we must modify parsedLine to comply with QzRegExp
if (parsedLine.contains(QLatin1Char('*')) ||
parsedLine.contains(QLatin1Char('^')) ||
parsedLine.contains(QLatin1Char('|'))
if (parsedLine.contains(QL1C('*')) ||
parsedLine.contains(QL1C('^')) ||
parsedLine.contains(QL1C('|'))
) {
QString parsedRegExp = parsedLine;
parsedRegExp.replace(QzRegExp(QLatin1String("\\*+")), QLatin1String("*")) // remove multiple wildcards
.replace(QzRegExp(QLatin1String("\\^\\|$")), QLatin1String("^")) // remove anchors following separator placeholder
.replace(QzRegExp(QLatin1String("^(\\*)")), QString()) // remove leading wildcards
.replace(QzRegExp(QLatin1String("(\\*)$")), QString())
.replace(QzRegExp(QLatin1String("(\\W)")), QLatin1String("\\\\1")) // escape special symbols
.replace(QzRegExp(QLatin1String("^\\\\\\|\\\\\\|")),
QLatin1String("^[\\w\\-]+:\\/+(?!\\/)(?:[^\\/]+\\.)?")) // process extended anchor at expression start
.replace(QzRegExp(QLatin1String("\\\\\\^")),
QLatin1String("(?:[^\\w\\d\\-.%]|$)")) // process separator placeholders
.replace(QzRegExp(QLatin1String("^\\\\\\|")), QLatin1String("^")) // process anchor at expression start
.replace(QzRegExp(QLatin1String("\\\\\\|$")), QLatin1String("$")) // process anchor at expression end
.replace(QzRegExp(QLatin1String("\\\\\\*")), QLatin1String(".*")); // replace wildcards by .*
m_type = RegExpMatchRule;
m_regExp = new RegExp;
m_regExp->regExp = QzRegExp(parsedRegExp, m_caseSensitivity);
m_regExp->regExp = QzRegExp(createRegExpFromFilter(parsedLine), m_caseSensitivity);
m_regExp->regExpStrings = parseRegExpFilter(parsedLine);
return;
}
@ -542,7 +526,7 @@ void AdBlockRule::parseDomains(const QString &domains, const QChar &separator)
if (domain.isEmpty()) {
continue;
}
if (domain.startsWith(QLatin1Char('~'))) {
if (domain.startsWith(QL1C('~'))) {
m_blockedDomains.append(domain.mid(1));
}
else {
@ -555,6 +539,99 @@ void AdBlockRule::parseDomains(const QString &domains, const QChar &separator)
}
}
// Returns true when the filter has the plain "||...^" shape: it starts with
// "||", ends with '^' and contains none of the characters / : ? = & *
// parseFilter() strips the leading "||" and the trailing '^' when this holds.
bool AdBlockRule::filterIsOnlyDomain(const QString &filter) const
{
    if (!filter.endsWith(QL1C('^')) || !filter.startsWith(QL1S("||")))
        return false;

    for (int i = 0; i < filter.size(); ++i) {
        // toLatin1() does not depend on the C-string codec (unlike the
        // deprecated toAscii()) and yields 0 for characters above U+00FF,
        // so those can never hit one of the case labels below.
        switch (filter.at(i).toLatin1()) {
        case '/':
        case ':':
        case '?':
        case '=':
        case '&':
        case '*':
            return false;
        default:
            break;
        }
    }

    return true;
}
// Returns true when the filter contains no '^' or '*' and its one and only
// '|' is the last character. parseFilter() then drops that trailing '|' and
// uses the rule type StringEndsMatchRule instead of a regular expression.
// An empty filter, or one without any '|', yields false.
bool AdBlockRule::filterIsOnlyEndsMatch(const QString &filter) const
{
    for (int i = 0; i < filter.size(); ++i) {
        // toLatin1() replaces the deprecated, codec dependent toAscii();
        // characters above U+00FF map to 0 and fall into the default branch.
        switch (filter.at(i).toLatin1()) {
        case '^':
        case '*':
            return false;
        case '|':
            // The first '|' found decides: it has to be the final character
            return i == filter.size() - 1;
        default:
            break;
        }
    }

    return false;
}
// Tells whether c counts as a "word" character: an underscore, a mark, a
// letter or a number. createRegExpFromFilter() backslash-escapes every
// character for which this returns false.
static bool wordCharacter(const QChar &c)
{
    if (c == QL1C('_'))
        return true;

    return c.isMark() || c.isLetterOrNumber();
}
// Translates an AdBlock filter into a regular expression pattern string.
//
//   '^'                 separator placeholder -> (?:[^\w\d\-.%]|$)
//   '*'                 wildcard -> .*  (a run of wildcards emits a single .*)
//   "||" at the start   extended anchor -> ^[\w\-]+:\/+(?!\/)(?:[^\/]+\.)?
//   '|' at the start    -> ^
//   '|' at the end      -> $
//   anything else       copied verbatim when it is a word character,
//                       otherwise prefixed with a backslash
//
// A '|' that is neither the first nor the last character is treated like any
// other non-word character and is escaped.
QString AdBlockRule::createRegExpFromFilter(const QString &filter) const
{
    QString parsed;
    // The result is at least as long as the input in the common case
    parsed.reserve(filter.size());

    bool hadWildcard = false; // Filter multiple wildcards

    for (int i = 0; i < filter.size(); ++i) {
        const QChar c = filter.at(i);

        // toLatin1() is the non-deprecated, codec independent replacement for
        // toAscii(); characters above U+00FF become 0 and reach the default branch.
        switch (c.toLatin1()) {
        case '^':
            parsed.append(QL1S("(?:[^\\w\\d\\-.%]|$)"));
            break;

        case '*':
            if (!hadWildcard)
                parsed.append(QL1S(".*"));
            break;

        case '|':
            if (i == 0) {
                if (filter.size() > 1 && filter.at(1) == QL1C('|')) {
                    parsed.append(QL1S("^[\\w\\-]+:\\/+(?!\\/)(?:[^\\/]+\\.)?"));
                    // The second '|' of the extended anchor is consumed here
                    i++;
                }
                else {
                    parsed.append(QL1C('^'));
                }
                break;
            }
            else if (i == filter.size() - 1) {
                parsed.append(QL1C('$'));
                break;
            }
            // fallthrough

        default:
            // Append the escape and the character separately so that no
            // temporary string has to be built for every escaped character
            if (!wordCharacter(c))
                parsed.append(QL1C('\\'));
            parsed.append(c);
        }

        hadWildcard = c == QL1C('*');
    }

    return parsed;
}
bool AdBlockRule::isMatchingDomain(const QString &domain, const QString &filter) const
{
return QzTools::matchDomain(filter, domain);
@ -575,22 +652,28 @@ bool AdBlockRule::isMatchingRegExpStrings(const QString &url) const
// Split regexp filter into strings that can be used with QString::contains
// Don't use parts that contains only 1 char and duplicated parts
QStringList AdBlockRule::parseRegExpFilter(const QString &parsedFilter) const
QStringList AdBlockRule::parseRegExpFilter(const QString &filter) const
{
// Meta characters in AdBlock rules are | * ^
QStringList list = parsedFilter.split(QzRegExp("[|\\*\\^]"), QString::SkipEmptyParts);
QStringList list;
int startPos = -1;
list.removeDuplicates();
for (int i = 0; i < list.length(); ++i) {
const QString part = list.at(i);
if (part.length() < 2) {
list.removeAt(i);
i--;
for (int i = 0; i < filter.size(); ++i) {
const QChar c = filter.at(i);
// Meta characters in AdBlock rules are | * ^
if (c == QL1C('|') || c == QL1C('*') || c == QL1C('^')) {
const QString sub = filter.mid(startPos, i - startPos);
if (sub.size() > 1)
list.append(sub);
startPos = i + 1;
}
}
const QString sub = filter.mid(startPos);
if (sub.size() > 1)
list.append(sub);
list.removeDuplicates();
return list;
}
@ -615,8 +698,3 @@ void AdBlockRule::setException(const AdBlockRule::RuleOption &opt, bool on)
m_exceptions |= opt;
}
}
AdBlockRule::~AdBlockRule()
{
delete m_regExp;
}

View File

@ -98,7 +98,7 @@ public:
protected:
bool isMatchingDomain(const QString &domain, const QString &filter) const;
bool isMatchingRegExpStrings(const QString &url) const;
QStringList parseRegExpFilter(const QString &parsedFilter) const;
QStringList parseRegExpFilter(const QString &filter) const;
private:
enum RuleType {
@ -133,6 +133,9 @@ private:
void parseFilter();
void parseDomains(const QString &domains, const QChar &separator);
bool filterIsOnlyDomain(const QString &filter) const;
bool filterIsOnlyEndsMatch(const QString &filter) const;
QString createRegExpFromFilter(const QString &filter) const;
AdBlockSubscription* m_subscription;

View File

@ -48,6 +48,10 @@
#define QL1S(x) QLatin1String(x)
#endif
#ifndef QL1C
#define QL1C(x) QLatin1Char(x)
#endif
namespace Qz
{
// Version of session.dat file

View File

@ -1,3 +1,5 @@
include($$PWD/../../src/defines.pri)
isEqual(QT_MAJOR_VERSION, 5) {
QT += webkitwidgets network widgets printsupport sql script gui-private testlib
} else {
@ -12,8 +14,6 @@ TARGET = autotests
unix:contains(DEFINES, "NO_SYSTEM_DATAPATH"): QMAKE_LFLAGS+=$${QMAKE_LFLAGS_RPATH}\\$\$ORIGIN
include($$PWD/../../src/defines.pri)
# KWallet plugin
exists($$PWD/../../bin/plugins/libKWalletPasswords.so) {
LIBS += $$PWD/../../bin/plugins/libKWalletPasswords.so

Binary file not shown.

View File

@ -0,0 +1,42 @@
/* ============================================================
* QupZilla - WebKit based browser
* Copyright (C) 2013 David Rosca <nowrep@gmail.com>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
* ============================================================ */
#include "adblockrule.h"
#include "adblocksubscription.h"
#include <QtTest/QtTest>
// QtTest fixture holding the AdBlock rule loading benchmark.
class AdBlockParseRule : public QObject
{
Q_OBJECT
private slots:
// Benchmarks loading the subscription stored in "easylist.txt"
void parseEasyList();
};
// Measures one full load of the "easylist.txt" subscription per benchmark
// iteration. A fresh AdBlockSubscription is created for every iteration and
// handed `this` as its second constructor argument.
// NOTE(review): "easylist.txt" is a relative path, presumably resolved against
// the working directory the benchmark is started from - confirm.
void AdBlockParseRule::parseEasyList()
{
    QBENCHMARK {
        AdBlockSubscription* easyList = new AdBlockSubscription("EasyList", this);
        easyList->setFilePath("easylist.txt");
        easyList->loadSubscription(QStringList());
    }
}
QTEST_MAIN(AdBlockParseRule)
#include "adblockparserule.moc"

View File

@ -0,0 +1,4 @@
include(../benchmarks.pri)
TARGET = adblockparserule
SOURCES = adblockparserule.cpp

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,55 @@
include($$PWD/../../src/defines.pri)
isEqual(QT_MAJOR_VERSION, 5) {
QT += webkitwidgets network widgets printsupport sql script gui-private testlib
} else {
QT += core gui webkit sql network script
CONFIG += qtestlib
}
!unix|mac: LIBS += -L$$PWD/../../bin -lQupZilla
!mac:unix: LIBS += $$PWD/../../bin/libQupZilla.so
unix:contains(DEFINES, "NO_SYSTEM_DATAPATH"): QMAKE_LFLAGS+=$${QMAKE_LFLAGS_RPATH}$$PWD/../../bin
# KWallet plugin
exists($$PWD/../../bin/plugins/libKWalletPasswords.so) {
LIBS += $$PWD/../../bin/plugins/libKWalletPasswords.so
DEFINES += HAVE_KDE_PASSWORDS_PLUGIN
}
# GnomeKeyring plugin
exists($$PWD/../../bin/plugins/libGnomeKeyringPasswords.so) {
LIBS += $$PWD/../../bin/plugins/libGnomeKeyringPasswords.so
DEFINES += HAVE_GNOME_PASSWORDS_PLUGIN
}
DESTDIR =
OBJECTS_DIR = build
MOC_DIR = build
RCC_DIR = build
UI_DIR = build
INCLUDEPATH += $$PWD/../../src/lib/3rdparty \
$$PWD/../../src/lib/adblock \
$$PWD/../../src/lib/app \
$$PWD/../../src/lib/autofill \
$$PWD/../../src/lib/bookmarks \
$$PWD/../../src/lib/cookies \
$$PWD/../../src/lib/downloads \
$$PWD/../../src/lib/history \
$$PWD/../../src/lib/navigation \
$$PWD/../../src/lib/network \
$$PWD/../../src/lib/notifications \
$$PWD/../../src/lib/opensearch \
$$PWD/../../src/lib/other \
$$PWD/../../src/lib/plugins \
$$PWD/../../src/lib/popupwindow \
$$PWD/../../src/lib/preferences \
$$PWD/../../src/lib/rss \
$$PWD/../../src/lib/session \
$$PWD/../../src/lib/sidebar \
$$PWD/../../src/lib/tabwidget \
$$PWD/../../src/lib/tools \
$$PWD/../../src/lib/webkit \
$$PWD/../../src/lib/webtab \

View File

@ -0,0 +1,20 @@
TEMPLATE = subdirs
include(benchmarks.pri)
# addSubdir(dirs)
# For every directory passed in the first argument, looks at each entry inside
# it and appends the entry to SUBDIRS when it contains at least one .pro file.
defineTest(addSubdir) {
for(subdir, 1) {
entries = $$files($$subdir/*)
for(entry, entries) {
# Drop ';' characters and turn backslashes into forward slashes
fullPath = $$replace(entry, ;,"")
fullPath = $$replace(fullPath, \\\\, /)
# NOTE(review): 'name' is assigned but not read anywhere inside this function
name = $$replace(fullPath, $$re_escape("$$subdir/"), "")
# Paths are lower-cased on os2 and win32 only
os2|win32: fullPath = $$lower($$fullPath)
exists($$fullPath/*.pro): SUBDIRS += $$fullPath
}
}
# Make the modified SUBDIRS visible outside of the function scope
export (SUBDIRS)
}
addSubdir($$PWD)