1
mirror of https://invent.kde.org/network/falkon.git synced 2024-09-22 18:22:10 +02:00
falkonOfficial/src/lib/adblock/adblockrule.cpp

482 lines
14 KiB
C++
Raw Normal View History

2011-03-27 21:59:40 +02:00
/* ============================================================
* QupZilla - WebKit based browser
* Copyright (C) 2010-2012 David Rosca <nowrep@gmail.com>
2011-03-27 21:59:40 +02:00
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
* ============================================================ */
/**
* Copyright (c) 2009, Zsombor Gegesy <gzsombor@gmail.com>
* Copyright (c) 2009, Benjamin C. Meyer <ben@meyerhome.net>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the Benjamin Meyer nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include "adblockrule.h"
#include "adblocksubscription.h"
#include <QDebug>
#include <QRegExp>
#include <QUrl>
#include <QString>
#include <QStringList>
#include <QNetworkRequest>
#include <QWebFrame>
#include <QWebPage>
2011-03-27 21:59:40 +02:00
// Reduces the host of url to the label in front of its top-level domain
// plus the top-level domain itself (e.g. "www.example.com" -> "example.com").
//
// Qt >= 4.8: relies on QUrl::topLevelDomain(); an empty string is returned
// when either the host or its top-level domain is empty.
// Qt < 4.8: keeps the last two labels of the host, so suffixes such as
// .co.uk are wrongly counted as the second-level domain; an empty string is
// returned for hosts that contain no dot at all.
QString toSecondLevelDomain(const QUrl &url)
{
#if QT_VERSION >= 0x040800
    const QString tld = url.topLevelDomain();
    const QString host = url.host();

    if (tld.isEmpty() || host.isEmpty()) {
        return QString();
    }

    // Part of the host that precedes the top-level domain
    const QString prefix = host.left(host.size() - tld.size());
    const int lastDot = prefix.lastIndexOf('.');

    if (lastDot < 0) {
        // Only a single label precedes the top-level domain
        return host;
    }

    // Drop every label except the one directly in front of the top-level domain
    return prefix.mid(lastDot + 1) + tld;
#else
    QString host = url.host();
    int dots = host.count('.');

    if (dots == 0) {
        return QString();
    }

    // Strip leading labels until exactly one dot is left
    for (; dots > 1; --dots) {
        host.remove(0, host.indexOf('.') + 1);
    }

    return host;
#endif
}
2011-03-27 21:59:40 +02:00
// Creates a rule that belongs to subscription and immediately parses filter.
// All option flags start out cleared, the rule is enabled and matching is
// case insensitive until parseFilter() decides otherwise.
AdBlockRule::AdBlockRule(const QString &filter, AdBlockSubscription* subscription)
    : m_subscription(subscription)
    , m_enabled(true)
    , m_cssRule(false)
    , m_exception(false)
    , m_internalDisabled(false)
    , m_domainRestricted(false)
    , m_useRegExp(false)
    , m_useDomainMatch(false)
    , m_useEndsMatch(false)
    , m_thirdParty(false)
    , m_thirdPartyException(false)
    , m_object(false)
    , m_objectException(false)
    , m_subdocument(false)
    , m_subdocumentException(false)
    , m_xmlhttprequest(false)
    , m_xmlhttprequestException(false)
    , m_caseSensitivity(Qt::CaseInsensitive)
{
    // Stores the text and runs the parser
    setFilter(filter);
}
// Returns the subscription pointer stored in this rule.
AdBlockSubscription* AdBlockRule::subscription() const
{
    return m_subscription;
}
// Replaces the stored subscription pointer; the filter is not re-parsed.
void AdBlockRule::setSubscription(AdBlockSubscription* subscription)
{
    m_subscription = subscription;
}
2011-03-27 21:59:40 +02:00
// Returns the filter text exactly as it was passed to setFilter().
QString AdBlockRule::filter() const
{
    return m_filter;
}
// Stores the raw filter text and parses it into the matching members.
void AdBlockRule::setFilter(const QString &filter)
{
    m_filter = filter;

    parseFilter();
}
bool AdBlockRule::isCssRule() const
2011-03-27 21:59:40 +02:00
{
return m_cssRule;
}
2011-03-27 21:59:40 +02:00
// Returns the selector part of an element hiding rule, i.e. the text that
// follows "##" in the filter.
QString AdBlockRule::cssSelector() const
{
    return m_cssSelector;
}
bool AdBlockRule::isDomainRestricted() const
2011-03-27 21:59:40 +02:00
{
return m_domainRestricted;
2011-03-27 21:59:40 +02:00
}
bool AdBlockRule::isException() const
2011-03-27 21:59:40 +02:00
{
return m_exception;
2011-03-27 21:59:40 +02:00
}
bool AdBlockRule::isComment() const
{
return m_filter.startsWith('!');
}
2011-03-27 21:59:40 +02:00
// Returns the enabled flag; it is cleared by parseFilter() for empty filters
// and comments, and can be changed through setEnabled().
bool AdBlockRule::isEnabled() const
{
    return m_enabled;
}
// Sets the enabled flag; disabled rules never match in networkMatch().
void AdBlockRule::setEnabled(bool enabled)
{
    m_enabled = enabled;
}
// A rule counts as slow when it is matched with a regular expression
// instead of plain string comparison.
bool AdBlockRule::isSlow() const
{
    return m_useRegExp;
}
bool AdBlockRule::isInternalDisabled() const
2011-03-27 21:59:40 +02:00
{
return m_internalDisabled;
}
bool AdBlockRule::networkMatch(const QNetworkRequest &request, const QString &domain, const QString &encodedUrl) const
{
if (m_cssRule || !m_enabled || m_internalDisabled) {
return false;
}
bool matched = false;
if (m_useRegExp) {
matched = (m_regExp.indexIn(encodedUrl) != -1);
}
else if (m_useDomainMatch) {
matched = domain.endsWith(m_matchString);
}
else if (m_useEndsMatch) {
matched = encodedUrl.endsWith(m_matchString, m_caseSensitivity);
}
else {
matched = encodedUrl.contains(m_matchString, m_caseSensitivity);
}
if (matched) {
// Check domain restrictions
if (m_domainRestricted && !matchDomain(domain)) {
return false;
}
// Check third-party restriction
if (m_thirdParty && !matchThirdParty(request)) {
return false;
}
// Check object restrictions
if (m_object && !matchObject(request)) {
return false;
}
// Check subdocument restriction
if (m_subdocument && !matchSubdocument(request)) {
return false;
}
// Check xmlhttprequest restriction
if (m_xmlhttprequest && !matchXmlHttpRequest(request)) {
return false;
}
}
return matched;
2011-03-27 21:59:40 +02:00
}
bool AdBlockRule::matchDomain(const QString &domain) const
{
if (!m_domainRestricted) {
return true;
}
if (m_blockedDomains.isEmpty()) {
foreach(const QString & d, m_allowedDomains) {
if (domain.endsWith(d)) {
return true;
}
}
}
else if (m_allowedDomains.isEmpty()) {
foreach(const QString & d, m_blockedDomains) {
if (domain.endsWith(d)) {
return false;
}
}
return true;
}
else {
foreach(const QString & d, m_blockedDomains) {
if (domain.endsWith(d)) {
return false;
}
}
foreach(const QString & d, m_allowedDomains) {
if (domain.endsWith(d)) {
return true;
}
}
}
return false;
2011-03-27 21:59:40 +02:00
}
bool AdBlockRule::matchThirdParty(const QNetworkRequest &request) const
{
const QString &referer = request.rawHeader("Referer");
if (referer.isEmpty()) {
return false;
}
// Third-party matching should be performed on second-level domains
const QString &refererHost = toSecondLevelDomain(QUrl(referer));
const QString &host = toSecondLevelDomain(request.url());
bool match = refererHost != host;
return m_thirdPartyException ? !match : match;
}
bool AdBlockRule::matchObject(const QNetworkRequest &request) const
{
bool match = request.attribute(QNetworkRequest::Attribute(QNetworkRequest::User + 150)).toString() == QString("object");
return m_objectException ? !match : match;
}
bool AdBlockRule::matchSubdocument(const QNetworkRequest &request) const
{
QWebFrame* originatingFrame = static_cast<QWebFrame*>(request.originatingObject());
if (!originatingFrame) {
return false;
}
QWebPage* page = originatingFrame->page();
if (!page) {
return false;
}
bool match = originatingFrame == page->mainFrame();
return m_subdocumentException ? !match : match;
}
bool AdBlockRule::matchXmlHttpRequest(const QNetworkRequest &request) const
{
bool match = request.rawHeader("X-Requested-With") == QByteArray("XMLHttpRequest");
return m_xmlhttprequestException ? !match : match;
}
void AdBlockRule::parseFilter()
2011-03-27 21:59:40 +02:00
{
QString parsedLine = m_filter;
// Empty rule or just comment
if (m_filter.trimmed().isEmpty() || m_filter.startsWith('!')) {
m_enabled = false;
return;
}
// CSS Element hiding rule
if (parsedLine.contains("##")) {
m_cssRule = true;
int pos = parsedLine.indexOf("##");
// Domain restricted rule
if (!parsedLine.startsWith("##")) {
QString domains = parsedLine.left(pos);
parseDomains(domains, ',');
}
m_cssSelector = parsedLine.mid(pos + 2);
// CSS rule cannot have more options -> stop parsing
return;
}
// Exception always starts with @@
if (parsedLine.startsWith("@@")) {
m_exception = true;
parsedLine = parsedLine.mid(2);
}
// Parse all options following $ char
int optionsIndex = parsedLine.indexOf('$');
if (optionsIndex >= 0) {
QStringList options = parsedLine.mid(optionsIndex + 1).split(',');
int handledOptions = 0;
foreach(const QString & option, options) {
if (option.startsWith("domain=")) {
parseDomains(option.mid(7), '|');
++handledOptions;
}
else if (option.endsWith("match-case")) {
m_caseSensitivity = Qt::CaseSensitive;
++handledOptions;
}
else if (option.endsWith("third-party")) {
m_thirdParty = true;
m_thirdPartyException = option.startsWith('~');
++handledOptions;
}
else if (option.endsWith("object")) {
m_object = true;
m_objectException = option.startsWith('~');
++handledOptions;
}
else if (option.endsWith("subdocument")) {
m_subdocument = true;
m_subdocumentException = option.startsWith('~');
++handledOptions;
}
else if (option.endsWith("xmlhttprequest")) {
m_xmlhttprequest = true;
m_xmlhttprequestException = option.startsWith('~');
++handledOptions;
}
}
// If we don't handle all options, it's safer to just disable this rule
if (handledOptions != options.count()) {
m_internalDisabled = true;
return;
}
parsedLine = parsedLine.left(optionsIndex);
}
// Rule is classic regexp
if (parsedLine.startsWith('/') && parsedLine.endsWith('/')) {
parsedLine = parsedLine.mid(1);
parsedLine = parsedLine.left(parsedLine.size() - 1);
m_useRegExp = true;
m_regExp = QRegExp(parsedLine, m_caseSensitivity, QRegExp::RegExp);
return;
}
// Remove starting and ending wildcards (*)
if (parsedLine.startsWith('*')) {
parsedLine = parsedLine.mid(1);
}
if (parsedLine.endsWith('*')) {
parsedLine = parsedLine.left(parsedLine.size() - 1);
}
// We can use fast string matching for domain here
if (parsedLine.startsWith("||") && parsedLine.endsWith('^') && !parsedLine.contains(QRegExp("[/:?=&\\*]"))) {
parsedLine = parsedLine.mid(2);
parsedLine = parsedLine.left(parsedLine.size() - 1);
m_useDomainMatch = true;
m_matchString = parsedLine;
return;
}
// If rule contains only | at end, we can also use string matching
if (parsedLine.endsWith('|') && !parsedLine.contains(QRegExp("[\\^\\*]")) && parsedLine.count('|') == 1) {
parsedLine = parsedLine.left(parsedLine.size() - 1);
m_useEndsMatch = true;
m_matchString = parsedLine;
return;
}
// If we still find a wildcard (*) or separator (^) or (|)
// we must modify parsedLine to comply with QRegExp
if (parsedLine.contains('*') || parsedLine.contains('^') || parsedLine.contains('|')) {
parsedLine.replace(QRegExp(QLatin1String("\\*+")), QLatin1String("*")) // remove multiple wildcards
.replace(QRegExp(QLatin1String("\\^\\|$")), QLatin1String("^")) // remove anchors following separator placeholder
.replace(QRegExp(QLatin1String("^(\\*)")), QLatin1String("")) // remove leading wildcards
.replace(QRegExp(QLatin1String("(\\*)$")), QLatin1String(""))
.replace(QRegExp(QLatin1String("(\\W)")), QLatin1String("\\\\1")) // escape special symbols
.replace(QRegExp(QLatin1String("^\\\\\\|\\\\\\|")),
QLatin1String("^[\\w\\-]+:\\/+(?!\\/)(?:[^\\/]+\\.)?")) // process extended anchor at expression start
.replace(QRegExp(QLatin1String("\\\\\\^")),
QLatin1String("(?:[^\\w\\d\\-.%]|$)")) // process separator placeholders
.replace(QRegExp(QLatin1String("^\\\\\\|")), QLatin1String("^")) // process anchor at expression start
.replace(QRegExp(QLatin1String("\\\\\\|$")), QLatin1String("$")) // process anchor at expression end
.replace(QRegExp(QLatin1String("\\\\\\*")), QLatin1String(".*")); // replace wildcards by .*
m_useRegExp = true;
m_regExp = QRegExp(parsedLine, m_caseSensitivity, QRegExp::RegExp);
return;
}
// We haven't found anything that needs use of regexp, yay!
m_useRegExp = false;
m_matchString = parsedLine;
2011-03-27 21:59:40 +02:00
}
void AdBlockRule::parseDomains(const QString &domains, const QChar &separator)
{
QStringList domainsList = domains.split(separator);
foreach(const QString domain, domainsList) {
if (domain.isEmpty()) {
continue;
}
if (domain.startsWith('~')) {
m_blockedDomains.append(domain.mid(1));
}
else {
m_allowedDomains.append(domain);
}
}
m_domainRestricted = (!m_blockedDomains.isEmpty() || !m_allowedDomains.isEmpty());
}