searx_ynh/sources/searx/https_rewrite.py

210 lines
6.4 KiB
Python

'''
searx is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
searx is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with searx. If not, see < http://www.gnu.org/licenses/ >.
(C) 2013- by Adam Tauber, <asciimoo@gmail.com>
'''
import re
from urlparse import urlparse
from lxml import etree
from os import listdir
from os.path import isfile, isdir, join
from searx import logger
logger = logger.getChild("https_rewrite")
# https://gitweb.torproject.org/\
# pde/https-everywhere.git/tree/4.0:/src/chrome/content/rules
# HTTPS rewrite rules: one (target_hosts, rules, exclusions) tuple per loaded
# ruleset, appended by load_https_rules() and read by https_url_rewrite()
https_rules = []
# load single ruleset from a xml file
def _js_groups_to_python(pattern):
    """Convert javascript style group references (``$1``) to python (``\\1``).

    TODO hack: every ``$`` is replaced by a backslash; only a trailing ``$``
    (end-of-string anchor) is restored afterwards.
    """
    converted = pattern.replace('$', '\\')
    if converted.endswith('\\'):
        converted = converted[:-1] + '$'
    return converted


def load_single_https_ruleset(filepath):
    """Load a single HTTPS rewrite ruleset from a xml file.

    filepath -- path of the xml file to parse

    Returns a tuple ``(target_hosts, rules, exclusions)``:

    * target_hosts -- compiled regex matching the hosts of all <target> tags
      (anchored at the start of the host only)
    * rules -- list of ``(compiled_from_regex, to_template)`` tuples
    * exclusions -- list of compiled exclusion regexes

    Returns an empty tuple ``()`` if the file can not be parsed, the root
    node is not a <ruleset>, the ruleset has a ``default_off`` or
    ``platform`` attribute, an exclusion pattern is invalid or no usable
    <target> is defined.
    """
    # load and parse xml-file
    try:
        tree = etree.parse(filepath, etree.XMLParser())
    except (IOError, OSError, etree.ParseError) as e:
        logger.warning("unable to parse ruleset '%s': %s", filepath, e)
        return ()

    # get root node
    root = tree.getroot()

    # check if root is a node with the name ruleset
    # TODO improve parsing
    if root.tag != 'ruleset':
        return ()

    # check if rule is deactivated by default
    if root.attrib.get('default_off'):
        return ()

    # check if rule does only work for specific platforms
    if root.attrib.get('platform'):
        return ()

    hosts = []
    rules = []
    exclusions = []

    # parse childs from ruleset
    for child in root:
        # this child define a target
        if child.tag == 'target':
            host = child.attrib.get('host')
            # check if required tags available
            if not host:
                continue
            # convert host-rule to valid regex
            hosts.append(host.replace('.', r'\.').replace('*', '.*'))

        # this child define a rule
        elif child.tag == 'rule':
            # check if required tags available
            if not child.attrib.get('from') or not child.attrib.get('to'):
                continue
            rule_from = _js_groups_to_python(child.attrib['from'])
            rule_to = _js_groups_to_python(child.attrib['to'])
            try:
                rules.append((re.compile(rule_from, re.I | re.U), rule_to))
            except re.error as e:
                # a broken rule only disables this rule, not the ruleset
                logger.debug("skipping invalid rule '%s' in '%s': %s",
                             rule_from, filepath, e)
                continue

        # this child define an exclusion
        elif child.tag == 'exclusion':
            pattern = child.attrib.get('pattern')
            # check if required tags available
            if not pattern:
                continue
            try:
                exclusions.append(re.compile(pattern))
            except re.error as e:
                # without the exclusion the rules would be applied to urls
                # they must not touch, so drop the whole ruleset instead
                logger.warning("invalid exclusion '%s' in '%s': %s",
                               pattern, filepath, e)
                return ()

    # an empty host list would compile to '^()', which matches every host
    if not hosts:
        logger.debug("ruleset '%s' defines no target host", filepath)
        return ()

    # convert list of possible hosts to a simple regex
    # TODO compress regex to improve performance
    try:
        target_hosts = re.compile('^(' + '|'.join(hosts) + ')', re.I | re.U)
    except re.error as e:
        logger.warning("invalid target host in '%s': %s", filepath, e)
        return ()

    # return ruleset
    return (target_hosts, rules, exclusions)


# load all https rewrite rules
def load_https_rules(rules_path):
    """Load all xml rulesets stored directly in the directory rules_path.

    Every ruleset accepted by load_single_https_ruleset() is appended to the
    module level list https_rules. An error is logged and nothing is loaded
    if rules_path is not a directory. Returns None.
    """
    # check if directory exists
    if not isdir(rules_path):
        logger.error("directory not found: '" + rules_path + "'")
        return

    # search all xml files which are stored in the https rule directory
    xml_files = []
    for name in listdir(rules_path):
        candidate = join(rules_path, name)
        if isfile(candidate) and name.endswith('.xml'):
            xml_files.append(candidate)

    # load xml-files, an empty (falsy) result means "no usable ruleset"
    for candidate in xml_files:
        parsed = load_single_https_ruleset(candidate)
        if parsed:
            https_rules.append(parsed)

    logger.info('{n} rules loaded'.format(n=len(https_rules)))


def _last_two_labels(hostname):
    """Return the last two dot separated labels of hostname, or None.

    TODO, does only work correct with TLD's like asdf.com,
    not for asdf.com.de
    TODO, using publicsuffix instead of this rewrite rule
    """
    if not hostname:
        return None
    return '.'.join(hostname.split('.')[-2:])


def https_url_rewrite(result):
    """Rewrite result['url'] with the first ruleset matching its host.

    result -- dict with the keys 'url' (string) and 'parsed_url' (object
              with the attributes netloc and hostname, as returned by
              urlparse)

    Only the first ruleset in https_rules whose target regex matches
    result['parsed_url'].netloc is considered. Nothing is rewritten if one
    of its exclusions matches the url. A rewritten url is only accepted if
    the last two labels of its hostname equal those of the original
    hostname. result['parsed_url'] is not updated.

    Returns the same (possibly modified) result dict.
    """
    # check if HTTPS rewrite is possible
    for target, rules, exclusions in https_rules:
        # check if target regex match with url
        if not target.match(result['parsed_url'].netloc):
            continue

        # skip https rewrite if an exclusion match with url
        if any(exclusion.match(result['url']) for exclusion in exclusions):
            break

        # result['parsed_url'] never changes below, so compute this once
        old_result_domainname = _last_two_labels(
            result['parsed_url'].hostname)

        # process rules
        for rule_from, rule_to in rules:
            try:
                new_result_url = rule_from.sub(rule_to, result['url'])
            except (re.error, IndexError):
                # broken replacement template (e.g. unknown group reference)
                break

            # continue if nothing was rewritten
            if result['url'] == new_result_url:
                continue

            new_result_domainname = _last_two_labels(
                urlparse(new_result_url).hostname)

            # check if rewritten hostname is the same,
            # to protect against wrong or malicious rewrite rules;
            # an url without hostname is never accepted
            if old_result_domainname is not None \
                    and old_result_domainname == new_result_domainname:
                # set new url
                result['url'] = new_result_url

        # target has matched, do not search over the other rules
        break

    return result