mirror of
https://invent.kde.org/network/falkon.git
synced 2024-12-19 10:16:34 +01:00
AdBlock: Greatly improves performance of regexp rules.
Each regexp is split into literal parts that can be checked with QString::contains for a quick pre-match. Only if the url contains all of the parts is the real QRegExp matching applied. This speedup affects rules that are internally converted into a regexp, not AdBlock's own regexp rules (those are very rarely used in subscriptions, so there is no need to optimise them).
This commit is contained in:
parent
b52d150e7d
commit
7e6d619516
@ -24,6 +24,7 @@ Version 1.4.0
|
||||
* moved config directory into ~/.config/qupzilla
|
||||
* certificates bundle is now only used on windows
|
||||
* reduced memory usage of AdBlock (saves up to 30MB with just EasyList)
|
||||
* greatly improved performance when matching regexp rules in AdBlock
|
||||
* GreaseMonkey: reload script if source file changed on disk
|
||||
* GreaseMonkey: fixed don't loading invalid scripts
|
||||
* fixed opening browser with url with ampersand (%26) as command line argument
|
||||
|
@ -46,6 +46,7 @@
|
||||
|
||||
#include "adblockrule.h"
|
||||
#include "adblocksubscription.h"
|
||||
#include "qztools.h"
|
||||
|
||||
#include <QDebug>
|
||||
#include "qzregexp.h"
|
||||
@ -206,12 +207,16 @@ bool AdBlockRule::networkMatch(const QNetworkRequest &request, const QString &do
|
||||
bool matched = false;
|
||||
|
||||
if (m_useDomainMatch) {
|
||||
matched = _matchDomain(domain, m_matchString);
|
||||
matched = isMatchingDomain(domain, m_matchString);
|
||||
}
|
||||
else if (m_useEndsMatch) {
|
||||
matched = encodedUrl.endsWith(m_matchString, m_caseSensitivity);
|
||||
}
|
||||
else if (m_regExp) {
|
||||
if (!isMatchingRegExpStrings(encodedUrl)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
matched = (m_regExp->indexIn(encodedUrl) != -1);
|
||||
}
|
||||
else {
|
||||
@ -277,14 +282,14 @@ bool AdBlockRule::matchDomain(const QString &domain) const
|
||||
|
||||
if (m_blockedDomains.isEmpty()) {
|
||||
foreach(const QString & d, m_allowedDomains) {
|
||||
if (_matchDomain(domain, d)) {
|
||||
if (isMatchingDomain(domain, d)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (m_allowedDomains.isEmpty()) {
|
||||
foreach(const QString & d, m_blockedDomains) {
|
||||
if (_matchDomain(domain, d)) {
|
||||
if (isMatchingDomain(domain, d)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
@ -292,13 +297,13 @@ bool AdBlockRule::matchDomain(const QString &domain) const
|
||||
}
|
||||
else {
|
||||
foreach(const QString & d, m_blockedDomains) {
|
||||
if (_matchDomain(domain, d)) {
|
||||
if (isMatchingDomain(domain, d)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
foreach(const QString & d, m_allowedDomains) {
|
||||
if (_matchDomain(domain, d)) {
|
||||
if (isMatchingDomain(domain, d)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
@ -388,6 +393,7 @@ void AdBlockRule::parseFilter()
|
||||
|
||||
m_cssSelector = parsedLine.mid(pos + 2);
|
||||
m_cssSelector.remove('\\');
|
||||
|
||||
// CSS rule cannot have more options -> stop parsing
|
||||
return;
|
||||
}
|
||||
@ -504,7 +510,9 @@ void AdBlockRule::parseFilter()
|
||||
// we must modify parsedLine to comply with QzRegExp
|
||||
if (parsedLine.contains(QLatin1Char('*')) || parsedLine.contains(QLatin1Char('^'))
|
||||
|| parsedLine.contains(QLatin1Char('|'))) {
|
||||
parsedLine.replace(QzRegExp(QLatin1String("\\*+")), QLatin1String("*")) // remove multiple wildcards
|
||||
QString parsedRegExp = parsedLine;
|
||||
|
||||
parsedRegExp.replace(QzRegExp(QLatin1String("\\*+")), QLatin1String("*")) // remove multiple wildcards
|
||||
.replace(QzRegExp(QLatin1String("\\^\\|$")), QLatin1String("^")) // remove anchors following separator placeholder
|
||||
.replace(QzRegExp(QLatin1String("^(\\*)")), QString()) // remove leading wildcards
|
||||
.replace(QzRegExp(QLatin1String("(\\*)$")), QString())
|
||||
@ -517,7 +525,8 @@ void AdBlockRule::parseFilter()
|
||||
.replace(QzRegExp(QLatin1String("\\\\\\|$")), QLatin1String("$")) // process anchor at expression end
|
||||
.replace(QzRegExp(QLatin1String("\\\\\\*")), QLatin1String(".*")); // replace wildcards by .*
|
||||
|
||||
m_regExp = new QzRegExp(parsedLine, m_caseSensitivity);
|
||||
m_regExpStrings = parseRegExpFilter(parsedLine);
|
||||
m_regExp = new QzRegExp(parsedRegExp, m_caseSensitivity);
|
||||
return;
|
||||
}
|
||||
|
||||
@ -544,19 +553,41 @@ void AdBlockRule::parseDomains(const QString &domains, const QChar &separator)
|
||||
m_domainRestricted = (!m_blockedDomains.isEmpty() || !m_allowedDomains.isEmpty());
|
||||
}
|
||||
|
||||
bool AdBlockRule::_matchDomain(const QString &domain, const QString &filter) const
|
||||
bool AdBlockRule::isMatchingDomain(const QString &domain, const QString &filter) const
|
||||
{
|
||||
if (!domain.endsWith(filter)) {
|
||||
return false;
|
||||
return QzTools::matchDomain(filter, domain);
|
||||
}
|
||||
|
||||
bool AdBlockRule::isMatchingRegExpStrings(const QString &url) const
|
||||
{
|
||||
foreach(const QString & string, m_regExpStrings) {
|
||||
if (!url.contains(string)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
int index = domain.indexOf(filter);
|
||||
return true;
|
||||
}
|
||||
|
||||
if (index == 0 || filter[0] == QLatin1Char('.')) {
|
||||
return true;
|
||||
// Split regexp filter into strings that can be used with QString::contains
|
||||
// Don't use parts that contains only 1 char and duplicated parts
|
||||
QStringList AdBlockRule::parseRegExpFilter(const QString &parsedFilter) const
|
||||
{
|
||||
// Meta characters in AdBlock rules are | * ^
|
||||
QStringList list = parsedFilter.split(QzRegExp("[|\\*\\^]"), QString::SkipEmptyParts);
|
||||
|
||||
list.removeDuplicates();
|
||||
|
||||
for (int i = 0; i < list.length(); ++i) {
|
||||
const QString &part = list.at(i);
|
||||
|
||||
if (part.length() < 2) {
|
||||
list.removeAt(i);
|
||||
i--;
|
||||
}
|
||||
}
|
||||
|
||||
return domain[index - 1] == QLatin1Char('.');
|
||||
return list;
|
||||
}
|
||||
|
||||
AdBlockRule::~AdBlockRule()
|
||||
|
@ -95,12 +95,15 @@ public:
|
||||
bool matchXmlHttpRequest(const QNetworkRequest &request) const;
|
||||
bool matchImage(const QString &encodedUrl) const;
|
||||
|
||||
protected:
|
||||
bool isMatchingDomain(const QString &domain, const QString &filter) const;
|
||||
bool isMatchingRegExpStrings(const QString &url) const;
|
||||
QStringList parseRegExpFilter(const QString &parsedFilter) const;
|
||||
|
||||
private:
|
||||
void parseFilter();
|
||||
void parseDomains(const QString &domains, const QChar &separator);
|
||||
|
||||
bool _matchDomain(const QString &domain, const QString &filter) const;
|
||||
|
||||
AdBlockSubscription* m_subscription;
|
||||
QString m_filter;
|
||||
|
||||
@ -112,6 +115,7 @@ private:
|
||||
bool m_domainRestricted;
|
||||
|
||||
QzRegExp* m_regExp;
|
||||
QStringList m_regExpStrings;
|
||||
|
||||
bool m_useDomainMatch;
|
||||
bool m_useEndsMatch;
|
||||
|
@ -19,6 +19,7 @@
|
||||
#include "qupzilla.h"
|
||||
#include "mainapplication.h"
|
||||
#include "settings.h"
|
||||
#include "qztools.h"
|
||||
|
||||
#include <QNetworkCookie>
|
||||
#include <QWebSettings>
|
||||
@ -234,17 +235,7 @@ bool CookieJar::matchDomain(QString cookieDomain, QString siteDomain)
|
||||
siteDomain = siteDomain.mid(1);
|
||||
}
|
||||
|
||||
if (cookieDomain == siteDomain) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (!siteDomain.endsWith(cookieDomain)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
int index = siteDomain.indexOf(cookieDomain);
|
||||
|
||||
return index > 0 && siteDomain[index - 1] == QLatin1Char('.');
|
||||
return QzTools::matchDomain(cookieDomain, siteDomain);
|
||||
}
|
||||
|
||||
bool CookieJar::listMatchesDomain(const QStringList &list, const QString &cookieDomain)
|
||||
|
@ -472,6 +472,24 @@ bool QzTools::isUtf8(const char* string)
|
||||
return true;
|
||||
}
|
||||
|
||||
// Matches domain (assumes both pattern and domain not starting with dot)
|
||||
// pattern = domain to be matched
|
||||
// domain = site domain
|
||||
bool QzTools::matchDomain(const QString &pattern, const QString &domain)
|
||||
{
|
||||
if (pattern == domain) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (!domain.endsWith(pattern)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
int index = domain.indexOf(pattern);
|
||||
|
||||
return index > 0 && domain[index - 1] == QLatin1Char('.');
|
||||
}
|
||||
|
||||
static inline bool isQuote(const QChar &c)
|
||||
{
|
||||
return (c == QLatin1Char('"') || c == QLatin1Char('\''));
|
||||
|
@ -63,6 +63,8 @@ bool QT_QUPZILLA_EXPORT startExternalProcess(const QString &executable, const QS
|
||||
QIcon QT_QUPZILLA_EXPORT iconFromFileName(const QString &fileName);
|
||||
bool QT_QUPZILLA_EXPORT isUtf8(const char* string);
|
||||
|
||||
bool QT_QUPZILLA_EXPORT matchDomain(const QString &pattern, const QString &domain);
|
||||
|
||||
QString QT_QUPZILLA_EXPORT operatingSystem();
|
||||
|
||||
// Qt5 migration help functions
|
||||
|
109
tests/autotests/adblocktest.cpp
Normal file
109
tests/autotests/adblocktest.cpp
Normal file
@ -0,0 +1,109 @@
|
||||
/* ============================================================
|
||||
* QupZilla - WebKit based browser
|
||||
* Copyright (C) 2013 David Rosca <nowrep@gmail.com>
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
* ============================================================ */
|
||||
#include "adblocktest.h"
|
||||
#include "adblockrule.h"
|
||||
|
||||
#include <QtTest/QtTest>
|
||||
|
||||
class AdBlockRule_Test : public AdBlockRule
|
||||
{
|
||||
public:
|
||||
QStringList parseRegExpFilter(const QString &parsedFilter)
|
||||
{
|
||||
return AdBlockRule::parseRegExpFilter(parsedFilter);
|
||||
}
|
||||
|
||||
bool isMatchingDomain(const QString &domain, const QString &filter) const
|
||||
{
|
||||
return AdBlockRule::isMatchingDomain(domain, filter);
|
||||
}
|
||||
};
|
||||
|
||||
void AdBlockTest::isMatchingCookieTest_data()
|
||||
{
|
||||
// Test copied from CookiesTest
|
||||
QTest::addColumn<QString>("filterDomain");
|
||||
QTest::addColumn<QString>("siteDomain");
|
||||
QTest::addColumn<bool>("result");
|
||||
|
||||
QTest::newRow("test1") << "example.com" << "www.example.com" << true;
|
||||
QTest::newRow("test2") << "example.com" << "example.com" << true;
|
||||
QTest::newRow("test3") << "example.com" << "anotherexample.com" << false;
|
||||
QTest::newRow("test4") << "test.example.com" << "example.com" << false;
|
||||
QTest::newRow("test5") << "www.example.com" << "example.com" << false;
|
||||
QTest::newRow("test_empty") << "www.example.com" << "" << false;
|
||||
QTest::newRow("test_empty2") << "" << "example.com" << false;
|
||||
}
|
||||
|
||||
void AdBlockTest::isMatchingCookieTest()
|
||||
{
|
||||
AdBlockRule_Test rule_test;
|
||||
|
||||
QFETCH(QString, filterDomain);
|
||||
QFETCH(QString, siteDomain);
|
||||
QFETCH(bool, result);
|
||||
|
||||
QCOMPARE(rule_test.isMatchingDomain(siteDomain, filterDomain), result);
|
||||
}
|
||||
|
||||
void AdBlockTest::parseRegExpFilterTest_data()
|
||||
{
|
||||
QTest::addColumn<QString>("parsedFilter");
|
||||
QTest::addColumn<QStringList>("result");
|
||||
|
||||
QTest::newRow("test1") << "||doubleclick.net/pfadx/tmg.telegraph."
|
||||
<< (QStringList() << "doubleclick.net/pfadx/tmg.telegraph.");
|
||||
QTest::newRow("test2") << "||doubleclick.net/pfadx/*.mtvi"
|
||||
<< (QStringList() << "doubleclick.net/pfadx/" << ".mtvi");
|
||||
QTest::newRow("test3") << "&prvtof=*&poru="
|
||||
<< (QStringList() << "&prvtof=" << "&poru=");
|
||||
QTest::newRow("test4") << "/addyn|*;adtech;"
|
||||
<< (QStringList() << "/addyn" << ";adtech;");
|
||||
QTest::newRow("test5") << "/eas_fif.html^"
|
||||
<< (QStringList() << "/eas_fif.html");
|
||||
QTest::newRow("test6") << "://findnsave.^.*/api/groupon.json?"
|
||||
<< (QStringList() << "://findnsave." << "/api/groupon.json?");
|
||||
QTest::newRow("test7") << "^fp=*&prvtof="
|
||||
<< (QStringList() << "fp=" << "&prvtof=");
|
||||
QTest::newRow("test8") << "|http://ax-d.*/jstag^"
|
||||
<< (QStringList() << "http://ax-d." << "/jstag");
|
||||
QTest::newRow("test9") << "||reuters.com^*/rcom-wt-mlt.js"
|
||||
<< (QStringList() << "reuters.com" <<"/rcom-wt-mlt.js");
|
||||
QTest::newRow("test10") << "||chip.de^*/tracking.js"
|
||||
<< (QStringList() << "chip.de" << "/tracking.js");
|
||||
QTest::newRow("ignore1char") << "/search.php?uid=*.*&src="
|
||||
<< (QStringList() << "/search.php?uid=" << "&src=");
|
||||
QTest::newRow("ignoreDuplicates") << "/search.*.dup.*.dup.*&src="
|
||||
<< (QStringList() << "/search." << ".dup." << "&src=");
|
||||
QTest::newRow("empty") << QString()
|
||||
<< (QStringList());
|
||||
QTest::newRow("justspaces") << QString(" ")
|
||||
<< (QStringList() << " ");
|
||||
QTest::newRow("spacesWithMetachars") << QString(" * ?")
|
||||
<< (QStringList() << " " << " ?");
|
||||
}
|
||||
|
||||
void AdBlockTest::parseRegExpFilterTest()
|
||||
{
|
||||
AdBlockRule_Test rule_test;
|
||||
|
||||
QFETCH(QString, parsedFilter);
|
||||
QFETCH(QStringList, result);
|
||||
|
||||
QCOMPARE(rule_test.parseRegExpFilter(parsedFilter), result);
|
||||
}
|
36
tests/autotests/adblocktest.h
Normal file
36
tests/autotests/adblocktest.h
Normal file
@ -0,0 +1,36 @@
|
||||
/* ============================================================
|
||||
* QupZilla - WebKit based browser
|
||||
* Copyright (C) 2013 David Rosca <nowrep@gmail.com>
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
* ============================================================ */
|
||||
#ifndef ADBLOCKTEST_H
|
||||
#define ADBLOCKTEST_H
|
||||
|
||||
#include <QObject>
|
||||
|
||||
class AdBlockTest : public QObject
|
||||
{
|
||||
Q_OBJECT
|
||||
|
||||
private slots:
|
||||
void isMatchingCookieTest_data();
|
||||
void isMatchingCookieTest();
|
||||
|
||||
void parseRegExpFilterTest_data();
|
||||
void parseRegExpFilterTest();
|
||||
|
||||
};
|
||||
|
||||
#endif // ADBLOCKTEST_H
|
@ -49,11 +49,13 @@ HEADERS += \
|
||||
qztoolstest.h \
|
||||
formcompletertest.h \
|
||||
cookiestest.h \
|
||||
downloadstest.h
|
||||
downloadstest.h \
|
||||
adblocktest.h
|
||||
|
||||
SOURCES += \
|
||||
qztoolstest.cpp \
|
||||
main.cpp \
|
||||
formcompletertest.cpp \
|
||||
cookiestest.cpp \
|
||||
downloadstest.cpp
|
||||
downloadstest.cpp \
|
||||
adblocktest.cpp
|
||||
|
@ -19,6 +19,7 @@
|
||||
#include "formcompletertest.h"
|
||||
#include "cookiestest.h"
|
||||
#include "downloadstest.h"
|
||||
#include "adblocktest.h"
|
||||
|
||||
#include <QtTest/QtTest>
|
||||
|
||||
@ -39,5 +40,8 @@ int main(int argc, char *argv[])
|
||||
DownloadsTest downloadsTest;
|
||||
QTest::qExec(&downloadsTest, argc, argv);
|
||||
|
||||
AdBlockTest adblockTest;
|
||||
QTest::qExec(&adblockTest, argc, argv);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user