update sources

This commit is contained in:
parent 2357ffbf73
commit d265aca41c

@@ -33,3 +33,4 @@ generally made searx better:
 - Benjamin Sonntag
 - @opi
 - @dimqua
+- Giorgos Logiotatidis
@@ -0,0 +1,21 @@
FROM debian:stable

RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        python-dev python2.7-minimal python-virtualenv \
        python-pybabel python-pip zlib1g-dev \
        libxml2-dev libxslt1-dev build-essential \
        openssl

RUN useradd searx

WORKDIR /app
RUN pip install uwsgi
COPY requirements.txt /app/requirements.txt
RUN pip install -r requirements.txt

COPY . /app
RUN sed -i -e "s/ultrasecretkey/`openssl rand -hex 16`/g" searx/settings.yml

EXPOSE 5000
CMD ["/usr/local/bin/uwsgi", "--uid", "searx", "--gid", "searx", "--http", ":5000", "-w", "searx.webapp"]
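The sed line in the new Dockerfile replaces the ultrasecretkey placeholder in searx/settings.yml with 16 random hex-encoded bytes at image build time, so every built image gets its own session secret. A minimal Python sketch of the same substitution (illustrative only, not part of this commit; only the file path and placeholder name are taken from the sed command above):

# Illustrative sketch, not part of this commit: regenerate the searx secret
# key the same way the Dockerfile's sed command does.
import binascii
import os

def randomize_secret_key(settings_file='searx/settings.yml'):
    # 16 random bytes, hex encoded -- same effect as `openssl rand -hex 16`
    new_key = binascii.hexlify(os.urandom(16)).decode('ascii')
    with open(settings_file) as f:
        content = f.read()
    with open(settings_file, 'w') as f:
        f.write(content.replace('ultrasecretkey', new_key))
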
@@ -46,7 +46,9 @@ minimal: bin/buildout minimal.cfg setup.py
 
 styles:
 	@lessc -x searx/static/themes/default/less/style.less > searx/static/themes/default/css/style.css
+	@lessc -x searx/static/themes/default/less/style-rtl.less > searx/static/themes/default/css/style-rtl.css
 	@lessc -x searx/static/themes/courgette/less/style.less > searx/static/themes/courgette/css/style.css
+	@lessc -x searx/static/themes/courgette/less/style-rtl.less > searx/static/themes/courgette/css/style-rtl.css
 	@lessc -x searx/static/less/bootstrap/bootstrap.less > searx/static/css/bootstrap.min.css
 	@lessc -x searx/static/themes/oscar/less/oscar/oscar.less > searx/static/themes/oscar/css/oscar.min.css
 
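The extended styles target now also builds the right-to-left stylesheets: each theme's style-rtl.less is compiled next to its style.less. A rough Python equivalent of what the target does (illustrative only, not part of this commit; it simply shells out to the same lessc binary on the same paths listed above):

# Illustrative sketch, not part of this commit: run the same lessc
# invocations as the Makefile's `styles` target.
import subprocess

LESS_TO_CSS = [
    ('searx/static/themes/default/less/style.less', 'searx/static/themes/default/css/style.css'),
    ('searx/static/themes/default/less/style-rtl.less', 'searx/static/themes/default/css/style-rtl.css'),
    ('searx/static/themes/courgette/less/style.less', 'searx/static/themes/courgette/css/style.css'),
    ('searx/static/themes/courgette/less/style-rtl.less', 'searx/static/themes/courgette/css/style-rtl.css'),
    ('searx/static/less/bootstrap/bootstrap.less', 'searx/static/css/bootstrap.min.css'),
    ('searx/static/themes/oscar/less/oscar/oscar.less', 'searx/static/themes/oscar/css/oscar.min.css'),
]

def build_styles():
    for src, dst in LESS_TO_CSS:
        # lessc -x <src> > <dst>, exactly as in the Makefile recipe
        with open(dst, 'w') as out:
            subprocess.check_call(['lessc', '-x', src], stdout=out)
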
@@ -1,61 +0,0 @@
'''
searx is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

searx is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with searx. If not, see < http://www.gnu.org/licenses/ >.

(C) 2013- by Adam Tauber, <asciimoo@gmail.com>
'''

import logging
from os import environ
from os.path import realpath, dirname, join, abspath
try:
    from yaml import load
except:
    from sys import exit, stderr
    stderr.write('[E] install pyyaml\n')
    exit(2)

searx_dir = abspath(dirname(__file__))
engine_dir = dirname(realpath(__file__))

# if possible, set the path to the settings using the
# environment variable SEARX_SETTINGS_PATH
if 'SEARX_SETTINGS_PATH' in environ:
    settings_path = environ['SEARX_SETTINGS_PATH']
# otherwise use the default path
else:
    settings_path = join(searx_dir, 'settings.yml')

if 'SEARX_HTTPS_REWRITE_PATH' in environ:
    https_rewrite_path = environ['SEARX_HTTPS_REWRITE_PATH']
else:
    https_rewrite_path = join(searx_dir, 'https_rules')

# load settings
with open(settings_path) as settings_yaml:
    settings = load(settings_yaml)

if settings.get('server', {}).get('debug'):
    logging.basicConfig(level=logging.DEBUG)
else:
    logging.basicConfig(level=logging.WARNING)

logger = logging.getLogger('searx')

# load https rules only if https rewrite is enabled
if settings.get('server', {}).get('https_rewrite'):
    # load https rules
    from searx.https_rewrite import load_https_rules
    load_https_rules(https_rewrite_path)

logger.info('Initialisation done')
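The deleted package module resolved its configuration at import time: SEARX_SETTINGS_PATH (and SEARX_HTTPS_REWRITE_PATH) override the defaults that live next to the package. A small usage sketch (illustrative only, not part of this commit; the settings path shown is hypothetical):

# Illustrative sketch, not part of this commit: point searx at a custom
# settings file before the package is imported, since the path is read
# at import time.
import os

os.environ['SEARX_SETTINGS_PATH'] = '/etc/searx/settings.yml'  # hypothetical path

from searx import settings, logger  # triggers the settings.yml loading above

logger.info('loaded %d engine entries from custom settings',
            len(settings.get('engines', [])))
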
@@ -1,162 +0,0 @@
'''
searx is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

searx is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with searx. If not, see < http://www.gnu.org/licenses/ >.

(C) 2013- by Adam Tauber, <asciimoo@gmail.com>
'''


from lxml import etree
from json import loads
from urllib import urlencode
from searx.languages import language_codes
from searx.engines import (
    categories, engines, engine_shortcuts
)
from searx.poolrequests import get


def searx_bang(full_query):
    '''check if the search query contains a bang, and create fitting autocompleter results'''
    # check if there is a query which can be parsed
    if len(full_query.getSearchQuery()) == 0:
        return []

    results = []

    # check if the current query starts with !bang
    first_char = full_query.getSearchQuery()[0]
    if first_char == '!' or first_char == '?':
        if len(full_query.getSearchQuery()) == 1:
            # show some example queries
            # TODO, check if engine is not available
            results.append(first_char + "images")
            results.append(first_char + "wikipedia")
            results.append(first_char + "osm")
        else:
            engine_query = full_query.getSearchQuery()[1:]

            # check if query starts with a category name
            for categorie in categories:
                if categorie.startswith(engine_query):
                    results.append(first_char+'{categorie}'.format(categorie=categorie))

            # check if query starts with an engine name
            for engine in engines:
                if engine.startswith(engine_query.replace('_', ' ')):
                    results.append(first_char+'{engine}'.format(engine=engine.replace(' ', '_')))

            # check if query starts with an engine shortcut
            for engine_shortcut in engine_shortcuts:
                if engine_shortcut.startswith(engine_query):
                    results.append(first_char+'{engine_shortcut}'.format(engine_shortcut=engine_shortcut))

    # check if the current query starts with :bang
    elif first_char == ':':
        if len(full_query.getSearchQuery()) == 1:
            # show some example queries
            results.append(":en")
            results.append(":en_us")
            results.append(":english")
            results.append(":united_kingdom")
        else:
            engine_query = full_query.getSearchQuery()[1:]

            for lc in language_codes:
                lang_id, lang_name, country = map(str.lower, lc)

                # check if query starts with language-id
                if lang_id.startswith(engine_query):
                    if len(engine_query) <= 2:
                        results.append(':{lang_id}'.format(lang_id=lang_id.split('_')[0]))
                    else:
                        results.append(':{lang_id}'.format(lang_id=lang_id))

                # check if query starts with language name
                if lang_name.startswith(engine_query):
                    results.append(':{lang_name}'.format(lang_name=lang_name))

                # check if query starts with country
                if country.startswith(engine_query.replace('_', ' ')):
                    results.append(':{country}'.format(country=country.replace(' ', '_')))

    # remove duplicates
    result_set = set(results)

    # remove results which are already contained in the query
    for query_part in full_query.query_parts:
        if query_part in result_set:
            result_set.remove(query_part)

    # convert result_set back to list
    return list(result_set)
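searx_bang only needs two things from the query object it is given: a getSearchQuery() method returning the raw query string, and a query_parts attribute used to filter out suggestions that are already part of the query. A minimal stand-in illustrating that contract (illustrative only, not part of this commit; searx's real query class lives elsewhere in the code base):

# Illustrative sketch, not part of this commit: a minimal stand-in for the
# query object that searx_bang() expects.
class FakeQuery(object):
    def __init__(self, raw_query):
        self.raw = raw_query
        self.query_parts = raw_query.split()

    def getSearchQuery(self):
        return self.raw

# e.g. searx_bang(FakeQuery('!wiki')) suggests engine, category and shortcut
# bangs starting with "wiki", minus anything already typed.
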

def dbpedia(query):
    # dbpedia autocompleter
    autocomplete_url = 'http://lookup.dbpedia.org/api/search.asmx/KeywordSearch?'  # noqa

    response = get(autocomplete_url
                   + urlencode(dict(QueryString=query)))

    results = []

    if response.ok:
        dom = etree.fromstring(response.content)
        results = dom.xpath('//a:Result/a:Label//text()',
                            namespaces={'a': 'http://lookup.dbpedia.org/'})

    return results


def duckduckgo(query):
    # duckduckgo autocompleter
    url = 'https://ac.duckduckgo.com/ac/?{0}&type=list'

    resp = loads(get(url.format(urlencode(dict(q=query)))).text)
    if len(resp) > 1:
        return resp[1]
    return []


def google(query):
    # google autocompleter
    autocomplete_url = 'http://suggestqueries.google.com/complete/search?client=toolbar&'  # noqa

    response = get(autocomplete_url
                   + urlencode(dict(q=query)))

    results = []

    if response.ok:
        dom = etree.fromstring(response.text)
        results = dom.xpath('//suggestion/@data')

    return results


def wikipedia(query):
    # wikipedia autocompleter
    url = 'https://en.wikipedia.org/w/api.php?action=opensearch&{0}&limit=10&namespace=0&format=json'  # noqa

    resp = loads(get(url.format(urlencode(dict(search=query)))).text)
    if len(resp) > 1:
        return resp[1]
    return []


backends = {'dbpedia': dbpedia,
            'duckduckgo': duckduckgo,
            'google': google,
            'wikipedia': wikipedia
            }
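The backends dict is the dispatch table for the completers above: callers look up the configured backend name and call it with the partial query, and each function returns a plain list of suggestion strings. A small usage sketch (illustrative only, not part of this commit; the empty-list fallback is an assumption, not something this file mandates):

# Illustrative sketch, not part of this commit: dispatch to a configured
# autocomplete backend by name, falling back to no suggestions.
def autocomplete(backend_name, query):
    completer = backends.get(backend_name)
    if completer is None:
        return []
    return completer(query)

# e.g. autocomplete('duckduckgo', 'pyth') returns whatever suggestion strings
# the DuckDuckGo autocomplete API sends back for "pyth".
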
@@ -1,210 +0,0 @@
'''
searx is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

searx is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with searx. If not, see < http://www.gnu.org/licenses/ >.

(C) 2013- by Adam Tauber, <asciimoo@gmail.com>
'''

from os.path import realpath, dirname, splitext, join
import sys
from imp import load_source
from flask.ext.babel import gettext
from operator import itemgetter
from searx import settings
from searx import logger


logger = logger.getChild('engines')

engine_dir = dirname(realpath(__file__))

engines = {}

categories = {'general': []}

engine_shortcuts = {}


def load_module(filename):
    modname = splitext(filename)[0]
    if modname in sys.modules:
        del sys.modules[modname]
    filepath = join(engine_dir, filename)
    module = load_source(modname, filepath)
    module.name = modname
    return module


def load_engine(engine_data):
    engine_name = engine_data['engine']
    engine = load_module(engine_name + '.py')

    for param_name in engine_data:
        if param_name == 'engine':
            continue
        if param_name == 'categories':
            if engine_data['categories'] == 'none':
                engine.categories = []
            else:
                engine.categories = map(
                    str.strip, engine_data['categories'].split(','))
            continue
        setattr(engine, param_name, engine_data[param_name])

    if not hasattr(engine, 'paging'):
        engine.paging = False

    if not hasattr(engine, 'categories'):
        engine.categories = ['general']

    if not hasattr(engine, 'language_support'):
        engine.language_support = True

    if not hasattr(engine, 'timeout'):
        engine.timeout = settings['server']['request_timeout']

    if not hasattr(engine, 'shortcut'):
        engine.shortcut = ''

    if not hasattr(engine, 'disabled'):
        engine.disabled = False

    # check required variables
    for engine_attr in dir(engine):
        if engine_attr.startswith('_'):
            continue
        if getattr(engine, engine_attr) is None:
            logger.error('Missing engine config attribute: "{0}.{1}"'
                         .format(engine.name, engine_attr))
            sys.exit(1)

    engine.stats = {
        'result_count': 0,
        'search_count': 0,
        'page_load_time': 0,
        'score_count': 0,
        'errors': 0
    }

    if hasattr(engine, 'categories'):
        for category_name in engine.categories:
            categories.setdefault(category_name, []).append(engine)
    else:
        categories['general'].append(engine)

    if engine.shortcut:
        if engine.shortcut in engine_shortcuts:
            logger.error('Engine config error: ambiguous shortcut: {0}'
                         .format(engine.shortcut))
            sys.exit(1)
        engine_shortcuts[engine.shortcut] = engine.name
    return engine
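load_engine turns one entry of the engines list in settings.yml into a live module: every key except engine becomes a module attribute, categories is split on commas, and missing optional attributes fall back to the defaults set above (paging=False, categories=['general'], language_support=True, the server request timeout, an empty shortcut, disabled=False). A hedged sketch of the dict shape such an entry produces (the field values below are illustrative examples, not taken from this commit's settings.yml):

# Illustrative sketch, not part of this commit: the dict shape that
# load_engine() consumes (one item of settings['engines']).
example_engine_data = {
    'name': 'bing',           # becomes engine.name via setattr
    'engine': 'bing',         # module file searx/engines/bing.py to load
    'shortcut': 'bi',         # registered in engine_shortcuts
    'categories': 'general',  # split on ',' into a list
}
# load_engine(example_engine_data) would then fill in paging, timeout,
# language_support, disabled, ... with the defaults listed above.
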

def get_engines_stats():
    # TODO refactor
    pageloads = []
    results = []
    scores = []
    errors = []
    scores_per_result = []

    max_pageload = max_results = max_score = max_errors = max_score_per_result = 0  # noqa
    for engine in engines.values():
        if engine.stats['search_count'] == 0:
            continue
        results_num = \
            engine.stats['result_count'] / float(engine.stats['search_count'])
        load_times = engine.stats['page_load_time'] / float(engine.stats['search_count'])  # noqa
        if results_num:
            score = engine.stats['score_count'] / float(engine.stats['search_count'])  # noqa
            score_per_result = score / results_num
        else:
            score = score_per_result = 0.0
        max_results = max(results_num, max_results)
        max_pageload = max(load_times, max_pageload)
        max_score = max(score, max_score)
        max_score_per_result = max(score_per_result, max_score_per_result)
        max_errors = max(max_errors, engine.stats['errors'])
        pageloads.append({'avg': load_times, 'name': engine.name})
        results.append({'avg': results_num, 'name': engine.name})
        scores.append({'avg': score, 'name': engine.name})
        errors.append({'avg': engine.stats['errors'], 'name': engine.name})
        scores_per_result.append({
            'avg': score_per_result,
            'name': engine.name
        })

    for engine in pageloads:
        if max_pageload:
            engine['percentage'] = int(engine['avg'] / max_pageload * 100)
        else:
            engine['percentage'] = 0

    for engine in results:
        if max_results:
            engine['percentage'] = int(engine['avg'] / max_results * 100)
        else:
            engine['percentage'] = 0

    for engine in scores:
        if max_score:
            engine['percentage'] = int(engine['avg'] / max_score * 100)
        else:
            engine['percentage'] = 0

    for engine in scores_per_result:
        if max_score_per_result:
            engine['percentage'] = int(engine['avg']
                                       / max_score_per_result * 100)
        else:
            engine['percentage'] = 0

    for engine in errors:
        if max_errors:
            engine['percentage'] = int(float(engine['avg']) / max_errors * 100)
        else:
            engine['percentage'] = 0

    return [
        (
            gettext('Page loads (sec)'),
            sorted(pageloads, key=itemgetter('avg'))
        ),
        (
            gettext('Number of results'),
            sorted(results, key=itemgetter('avg'), reverse=True)
        ),
        (
            gettext('Scores'),
            sorted(scores, key=itemgetter('avg'), reverse=True)
        ),
        (
            gettext('Scores per result'),
            sorted(scores_per_result, key=itemgetter('avg'), reverse=True)
        ),
        (
            gettext('Errors'),
            sorted(errors, key=itemgetter('avg'), reverse=True)
        ),
    ]


if 'engines' not in settings or not settings['engines']:
    logger.error('No engines found. Edit your settings.yml')
    exit(2)

for engine_data in settings['engines']:
    engine = load_engine(engine_data)
    engines[engine.name] = engine
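Because the loop above runs at module level, simply importing this module populates engines, categories and engine_shortcuts from settings.yml (or exits if no engines are configured). A small inspection sketch (illustrative only, not part of this commit; it assumes a valid settings.yml is in place):

# Illustrative sketch, not part of this commit: inspect what the
# import-time loop above registered.
from searx.engines import engines, categories, engine_shortcuts

for shortcut, name in sorted(engine_shortcuts.items()):
    print('!%s -> %s (categories: %s)' % (shortcut, name,
                                          ', '.join(engines[name].categories)))

print('%d engines in %d categories' % (len(engines), len(categories)))
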
@ -1,84 +0,0 @@
|
||||||
## Bing (Web)
|
|
||||||
#
|
|
||||||
# @website https://www.bing.com
|
|
||||||
# @provide-api yes (http://datamarket.azure.com/dataset/bing/search),
|
|
||||||
# max. 5000 query/month
|
|
||||||
#
|
|
||||||
# @using-api no (because of query limit)
|
|
||||||
# @results HTML (using search portal)
|
|
||||||
# @stable no (HTML can change)
|
|
||||||
# @parse url, title, content
|
|
||||||
#
|
|
||||||
# @todo publishedDate
|
|
||||||
|
|
||||||
from urllib import urlencode
|
|
||||||
from cgi import escape
|
|
||||||
from lxml import html
|
|
||||||
from searx.engines.xpath import extract_text
|
|
||||||
|
|
||||||
# engine dependent config
|
|
||||||
categories = ['general']
|
|
||||||
paging = True
|
|
||||||
language_support = True
|
|
||||||
|
|
||||||
# search-url
|
|
||||||
base_url = 'https://www.bing.com/'
|
|
||||||
search_string = 'search?{query}&first={offset}'
|
|
||||||
|
|
||||||
|
|
||||||
# do search-request
|
|
||||||
def request(query, params):
|
|
||||||
offset = (params['pageno'] - 1) * 10 + 1
|
|
||||||
|
|
||||||
if params['language'] == 'all':
|
|
||||||
language = 'en-US'
|
|
||||||
else:
|
|
||||||
language = params['language'].replace('_', '-')
|
|
||||||
|
|
||||||
search_path = search_string.format(
|
|
||||||
query=urlencode({'q': query, 'setmkt': language}),
|
|
||||||
offset=offset)
|
|
||||||
|
|
||||||
params['cookies']['SRCHHPGUSR'] = \
|
|
||||||
'NEWWND=0&NRSLT=-1&SRCHLANG=' + language.split('-')[0]
|
|
||||||
|
|
||||||
params['url'] = base_url + search_path
|
|
||||||
return params
|
|
||||||
|
|
||||||
|
|
||||||
# get response from search-request
|
|
||||||
def response(resp):
|
|
||||||
results = []
|
|
||||||
|
|
||||||
dom = html.fromstring(resp.content)
|
|
||||||
|
|
||||||
# parse results
|
|
||||||
for result in dom.xpath('//div[@class="sa_cc"]'):
|
|
||||||
link = result.xpath('.//h3/a')[0]
|
|
||||||
url = link.attrib.get('href')
|
|
||||||
title = extract_text(link)
|
|
||||||
content = escape(extract_text(result.xpath('.//p')))
|
|
||||||
|
|
||||||
# append result
|
|
||||||
results.append({'url': url,
|
|
||||||
'title': title,
|
|
||||||
'content': content})
|
|
||||||
|
|
||||||
# return results if something is found
|
|
||||||
if results:
|
|
||||||
return results
|
|
||||||
|
|
||||||
# parse results again if nothing is found yet
|
|
||||||
for result in dom.xpath('//li[@class="b_algo"]'):
|
|
||||||
link = result.xpath('.//h2/a')[0]
|
|
||||||
url = link.attrib.get('href')
|
|
||||||
title = extract_text(link)
|
|
||||||
content = escape(extract_text(result.xpath('.//p')))
|
|
||||||
|
|
||||||
# append result
|
|
||||||
results.append({'url': url,
|
|
||||||
'title': title,
|
|
||||||
'content': content})
|
|
||||||
|
|
||||||
# return results
|
|
||||||
return results
|
|
|
@ -1,96 +0,0 @@
|
||||||
## Bing (Images)
|
|
||||||
#
|
|
||||||
# @website https://www.bing.com/images
|
|
||||||
# @provide-api yes (http://datamarket.azure.com/dataset/bing/search),
|
|
||||||
# max. 5000 query/month
|
|
||||||
#
|
|
||||||
# @using-api no (because of query limit)
|
|
||||||
# @results HTML (using search portal)
|
|
||||||
# @stable no (HTML can change)
|
|
||||||
# @parse url, title, img_src
|
|
||||||
#
|
|
||||||
# @todo currently there are up to 35 images receive per page,
|
|
||||||
# because bing does not parse count=10.
|
|
||||||
# limited response to 10 images
|
|
||||||
|
|
||||||
from urllib import urlencode
|
|
||||||
from lxml import html
|
|
||||||
from yaml import load
|
|
||||||
import re
|
|
||||||
|
|
||||||
# engine dependent config
|
|
||||||
categories = ['images']
|
|
||||||
paging = True
|
|
||||||
safesearch = True
|
|
||||||
|
|
||||||
# search-url
|
|
||||||
base_url = 'https://www.bing.com/'
|
|
||||||
search_string = 'images/search?{query}&count=10&first={offset}'
|
|
||||||
thumb_url = "http://ts1.mm.bing.net/th?id={ihk}"
|
|
||||||
|
|
||||||
# safesearch definitions
|
|
||||||
safesearch_types = {2: 'STRICT',
|
|
||||||
1: 'DEMOTE',
|
|
||||||
0: 'OFF'}
|
|
||||||
|
|
||||||
|
|
||||||
# do search-request
|
|
||||||
def request(query, params):
|
|
||||||
offset = (params['pageno'] - 1) * 10 + 1
|
|
||||||
|
|
||||||
# required for cookie
|
|
||||||
if params['language'] == 'all':
|
|
||||||
language = 'en-US'
|
|
||||||
else:
|
|
||||||
language = params['language'].replace('_', '-')
|
|
||||||
|
|
||||||
search_path = search_string.format(
|
|
||||||
query=urlencode({'q': query}),
|
|
||||||
offset=offset)
|
|
||||||
|
|
||||||
params['cookies']['SRCHHPGUSR'] = \
|
|
||||||
'NEWWND=0&NRSLT=-1&SRCHLANG=' + language.split('-')[0] +\
|
|
||||||
'&ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE')
|
|
||||||
|
|
||||||
params['url'] = base_url + search_path
|
|
||||||
|
|
||||||
return params
|
|
||||||
|
|
||||||
|
|
||||||
# get response from search-request
|
|
||||||
def response(resp):
|
|
||||||
results = []
|
|
||||||
|
|
||||||
dom = html.fromstring(resp.content)
|
|
||||||
|
|
||||||
# init regex for yaml-parsing
|
|
||||||
p = re.compile('({|,)([a-z]+):(")')
|
|
||||||
|
|
||||||
# parse results
|
|
||||||
for result in dom.xpath('//div[@class="dg_u"]'):
|
|
||||||
link = result.xpath('./a')[0]
|
|
||||||
|
|
||||||
# parse yaml-data (it is required to add a space, to make it parsable)
|
|
||||||
yaml_data = load(p.sub(r'\1\2: \3', link.attrib.get('m')))
|
|
||||||
|
|
||||||
title = link.attrib.get('t1')
|
|
||||||
ihk = link.attrib.get('ihk')
|
|
||||||
|
|
||||||
#url = 'http://' + link.attrib.get('t3')
|
|
||||||
url = yaml_data.get('surl')
|
|
||||||
img_src = yaml_data.get('imgurl')
|
|
||||||
|
|
||||||
# append result
|
|
||||||
results.append({'template': 'images.html',
|
|
||||||
'url': url,
|
|
||||||
'title': title,
|
|
||||||
'content': '',
|
|
||||||
'thumbnail_src': thumb_url.format(ihk=ihk),
|
|
||||||
'img_src': img_src})
|
|
||||||
|
|
||||||
# TODO stop parsing if 10 images are found
|
|
||||||
if len(results) >= 10:
|
|
||||||
break
|
|
||||||
|
|
||||||
# return results
|
|
||||||
return results
|
|
|
@ -1,98 +0,0 @@
|
||||||
## Bing (News)
|
|
||||||
#
|
|
||||||
# @website https://www.bing.com/news
|
|
||||||
# @provide-api yes (http://datamarket.azure.com/dataset/bing/search),
|
|
||||||
# max. 5000 query/month
|
|
||||||
#
|
|
||||||
# @using-api no (because of query limit)
|
|
||||||
# @results HTML (using search portal)
|
|
||||||
# @stable no (HTML can change)
|
|
||||||
# @parse url, title, content, publishedDate
|
|
||||||
|
|
||||||
from urllib import urlencode
|
|
||||||
from cgi import escape
|
|
||||||
from lxml import html
|
|
||||||
from datetime import datetime, timedelta
|
|
||||||
from dateutil import parser
|
|
||||||
import re
|
|
||||||
from searx.engines.xpath import extract_text
|
|
||||||
|
|
||||||
# engine dependent config
|
|
||||||
categories = ['news']
|
|
||||||
paging = True
|
|
||||||
language_support = True
|
|
||||||
|
|
||||||
# search-url
|
|
||||||
base_url = 'https://www.bing.com/'
|
|
||||||
search_string = 'news/search?{query}&first={offset}'
|
|
||||||
|
|
||||||
|
|
||||||
# do search-request
|
|
||||||
def request(query, params):
|
|
||||||
offset = (params['pageno'] - 1) * 10 + 1
|
|
||||||
|
|
||||||
if params['language'] == 'all':
|
|
||||||
language = 'en-US'
|
|
||||||
else:
|
|
||||||
language = params['language'].replace('_', '-')
|
|
||||||
|
|
||||||
search_path = search_string.format(
|
|
||||||
query=urlencode({'q': query, 'setmkt': language}),
|
|
||||||
offset=offset)
|
|
||||||
|
|
||||||
params['cookies']['_FP'] = "ui=en-US"
|
|
||||||
|
|
||||||
params['url'] = base_url + search_path
|
|
||||||
|
|
||||||
return params
|
|
||||||
|
|
||||||
|
|
||||||
# get response from search-request
|
|
||||||
def response(resp):
|
|
||||||
results = []
|
|
||||||
|
|
||||||
dom = html.fromstring(resp.content)
|
|
||||||
|
|
||||||
# parse results
|
|
||||||
for result in dom.xpath('//div[@class="sn_r"]'):
|
|
||||||
link = result.xpath('.//div[@class="newstitle"]/a')[0]
|
|
||||||
url = link.attrib.get('href')
|
|
||||||
title = extract_text(link)
|
|
||||||
contentXPath = result.xpath('.//div[@class="sn_txt"]/div//span[@class="sn_snip"]')
|
|
||||||
content = escape(extract_text(contentXPath))
|
|
||||||
|
|
||||||
# parse publishedDate
|
|
||||||
publishedDateXPath = result.xpath('.//div[@class="sn_txt"]/div'
|
|
||||||
'//span[contains(@class,"sn_ST")]'
|
|
||||||
'//span[contains(@class,"sn_tm")]')
|
|
||||||
|
|
||||||
publishedDate = escape(extract_text(publishedDateXPath))
|
|
||||||
|
|
||||||
if re.match("^[0-9]+ minute(s|) ago$", publishedDate):
|
|
||||||
timeNumbers = re.findall(r'\d+', publishedDate)
|
|
||||||
publishedDate = datetime.now() - timedelta(minutes=int(timeNumbers[0]))
|
|
||||||
elif re.match("^[0-9]+ hour(s|) ago$", publishedDate):
|
|
||||||
timeNumbers = re.findall(r'\d+', publishedDate)
|
|
||||||
publishedDate = datetime.now() - timedelta(hours=int(timeNumbers[0]))
|
|
||||||
elif re.match("^[0-9]+ hour(s|), [0-9]+ minute(s|) ago$", publishedDate):
|
|
||||||
timeNumbers = re.findall(r'\d+', publishedDate)
|
|
||||||
publishedDate = datetime.now()\
|
|
||||||
- timedelta(hours=int(timeNumbers[0]))\
|
|
||||||
- timedelta(minutes=int(timeNumbers[1]))
|
|
||||||
elif re.match("^[0-9]+ day(s|) ago$", publishedDate):
|
|
||||||
timeNumbers = re.findall(r'\d+', publishedDate)
|
|
||||||
publishedDate = datetime.now() - timedelta(days=int(timeNumbers[0]))
|
|
||||||
else:
|
|
||||||
try:
|
|
||||||
publishedDate = parser.parse(publishedDate, dayfirst=False)
|
|
||||||
except TypeError:
|
|
||||||
publishedDate = datetime.now()
|
|
||||||
|
|
||||||
# append result
|
|
||||||
results.append({'url': url,
|
|
||||||
'title': title,
|
|
||||||
'publishedDate': publishedDate,
|
|
||||||
'content': content})
|
|
||||||
|
|
||||||
# return results
|
|
||||||
return results
|
|
|
@ -1,104 +0,0 @@
|
||||||
## BTDigg (Videos, Music, Files)
|
|
||||||
#
|
|
||||||
# @website https://btdigg.org
|
|
||||||
# @provide-api yes (on demand)
|
|
||||||
#
|
|
||||||
# @using-api no
|
|
||||||
# @results HTML (using search portal)
|
|
||||||
# @stable no (HTML can change)
|
|
||||||
# @parse url, title, content, seed, leech, magnetlink
|
|
||||||
|
|
||||||
from urlparse import urljoin
|
|
||||||
from cgi import escape
|
|
||||||
from urllib import quote
|
|
||||||
from lxml import html
|
|
||||||
from operator import itemgetter
|
|
||||||
from searx.engines.xpath import extract_text
|
|
||||||
|
|
||||||
# engine dependent config
|
|
||||||
categories = ['videos', 'music', 'files']
|
|
||||||
paging = True
|
|
||||||
|
|
||||||
# search-url
|
|
||||||
url = 'https://btdigg.org'
|
|
||||||
search_url = url + '/search?q={search_term}&p={pageno}'
|
|
||||||
|
|
||||||
|
|
||||||
# do search-request
|
|
||||||
def request(query, params):
|
|
||||||
params['url'] = search_url.format(search_term=quote(query),
|
|
||||||
pageno=params['pageno']-1)
|
|
||||||
|
|
||||||
return params
|
|
||||||
|
|
||||||
|
|
||||||
# get response from search-request
|
|
||||||
def response(resp):
|
|
||||||
results = []
|
|
||||||
|
|
||||||
dom = html.fromstring(resp.text)
|
|
||||||
|
|
||||||
search_res = dom.xpath('//div[@id="search_res"]/table/tr')
|
|
||||||
|
|
||||||
# return empty array if nothing is found
|
|
||||||
if not search_res:
|
|
||||||
return []
|
|
||||||
|
|
||||||
# parse results
|
|
||||||
for result in search_res:
|
|
||||||
link = result.xpath('.//td[@class="torrent_name"]//a')[0]
|
|
||||||
href = urljoin(url, link.attrib.get('href'))
|
|
||||||
title = escape(extract_text(link))
|
|
||||||
content = escape(extract_text(result.xpath('.//pre[@class="snippet"]')[0]))
|
|
||||||
content = "<br />".join(content.split("\n"))
|
|
||||||
|
|
||||||
filesize = result.xpath('.//span[@class="attr_val"]/text()')[0].split()[0]
|
|
||||||
filesize_multiplier = result.xpath('.//span[@class="attr_val"]/text()')[0].split()[1]
|
|
||||||
files = result.xpath('.//span[@class="attr_val"]/text()')[1]
|
|
||||||
seed = result.xpath('.//span[@class="attr_val"]/text()')[2]
|
|
||||||
|
|
||||||
# convert seed to int if possible
|
|
||||||
if seed.isdigit():
|
|
||||||
seed = int(seed)
|
|
||||||
else:
|
|
||||||
seed = 0
|
|
||||||
|
|
||||||
leech = 0
|
|
||||||
|
|
||||||
# convert filesize to byte if possible
|
|
||||||
try:
|
|
||||||
filesize = float(filesize)
|
|
||||||
|
|
||||||
# convert filesize to byte
|
|
||||||
if filesize_multiplier == 'TB':
|
|
||||||
filesize = int(filesize * 1024 * 1024 * 1024 * 1024)
|
|
||||||
elif filesize_multiplier == 'GB':
|
|
||||||
filesize = int(filesize * 1024 * 1024 * 1024)
|
|
||||||
elif filesize_multiplier == 'MB':
|
|
||||||
filesize = int(filesize * 1024 * 1024)
|
|
||||||
elif filesize_multiplier == 'KB':
|
|
||||||
filesize = int(filesize * 1024)
|
|
||||||
except:
|
|
||||||
filesize = None
|
|
||||||
|
|
||||||
# convert files to int if possible
|
|
||||||
if files.isdigit():
|
|
||||||
files = int(files)
|
|
||||||
else:
|
|
||||||
files = None
|
|
||||||
|
|
||||||
magnetlink = result.xpath('.//td[@class="ttth"]//a')[0].attrib['href']
|
|
||||||
|
|
||||||
# append result
|
|
||||||
results.append({'url': href,
|
|
||||||
'title': title,
|
|
||||||
'content': content,
|
|
||||||
'seed': seed,
|
|
||||||
'leech': leech,
|
|
||||||
'filesize': filesize,
|
|
||||||
'files': files,
|
|
||||||
'magnetlink': magnetlink,
|
|
||||||
'template': 'torrent.html'})
|
|
||||||
|
|
||||||
# return results sorted by seeder
|
|
||||||
return sorted(results, key=itemgetter('seed'), reverse=True)
|
|
|
@ -1,57 +0,0 @@
|
||||||
from datetime import datetime
|
|
||||||
import re
|
|
||||||
|
|
||||||
categories = []
|
|
||||||
url = 'http://finance.yahoo.com/d/quotes.csv?e=.csv&f=sl1d1t1&s={query}=X'
|
|
||||||
weight = 100
|
|
||||||
|
|
||||||
parser_re = re.compile(r'^\W*(\d+(?:\.\d+)?)\W*([a-z]{3})\W*(?:in)?\W*([a-z]{3})\W*$', re.I) # noqa
|
|
||||||
|
|
||||||
|
|
||||||
def request(query, params):
|
|
||||||
m = parser_re.match(query)
|
|
||||||
if not m:
|
|
||||||
# wrong query
|
|
||||||
return params
|
|
||||||
|
|
||||||
ammount, from_currency, to_currency = m.groups()
|
|
||||||
ammount = float(ammount)
|
|
||||||
|
|
||||||
q = (from_currency + to_currency).upper()
|
|
||||||
|
|
||||||
params['url'] = url.format(query=q)
|
|
||||||
params['ammount'] = ammount
|
|
||||||
params['from'] = from_currency
|
|
||||||
params['to'] = to_currency
|
|
||||||
|
|
||||||
return params
|
|
||||||
|
|
||||||
|
|
||||||
def response(resp):
|
|
||||||
results = []
|
|
||||||
try:
|
|
||||||
_, conversion_rate, _ = resp.text.split(',', 2)
|
|
||||||
conversion_rate = float(conversion_rate)
|
|
||||||
except:
|
|
||||||
return results
|
|
||||||
|
|
||||||
answer = '{0} {1} = {2} {3} (1 {1} = {4} {3})'.format(
|
|
||||||
resp.search_params['ammount'],
|
|
||||||
resp.search_params['from'],
|
|
||||||
resp.search_params['ammount'] * conversion_rate,
|
|
||||||
resp.search_params['to'],
|
|
||||||
conversion_rate
|
|
||||||
)
|
|
||||||
|
|
||||||
now_date = datetime.now().strftime('%Y%m%d')
|
|
||||||
url = 'http://finance.yahoo.com/currency/converter-results/{0}/{1}-{2}-to-{3}.html' # noqa
|
|
||||||
url = url.format(
|
|
||||||
now_date,
|
|
||||||
resp.search_params['ammount'],
|
|
||||||
resp.search_params['from'].lower(),
|
|
||||||
resp.search_params['to'].lower()
|
|
||||||
)
|
|
||||||
|
|
||||||
results.append({'answer': answer, 'url': url})
|
|
||||||
|
|
||||||
return results
|
|
|
@ -1,72 +0,0 @@
|
||||||
## Dailymotion (Videos)
|
|
||||||
#
|
|
||||||
# @website https://www.dailymotion.com
|
|
||||||
# @provide-api yes (http://www.dailymotion.com/developer)
|
|
||||||
#
|
|
||||||
# @using-api yes
|
|
||||||
# @results JSON
|
|
||||||
# @stable yes
|
|
||||||
# @parse url, title, thumbnail, publishedDate, embedded
|
|
||||||
#
|
|
||||||
# @todo set content-parameter with correct data
|
|
||||||
|
|
||||||
from urllib import urlencode
|
|
||||||
from json import loads
|
|
||||||
from cgi import escape
|
|
||||||
from datetime import datetime
|
|
||||||
|
|
||||||
# engine dependent config
|
|
||||||
categories = ['videos']
|
|
||||||
paging = True
|
|
||||||
language_support = True
|
|
||||||
|
|
||||||
# search-url
|
|
||||||
# see http://www.dailymotion.com/doc/api/obj-video.html
|
|
||||||
search_url = 'https://api.dailymotion.com/videos?fields=created_time,title,description,duration,url,thumbnail_360_url,id&sort=relevance&limit=5&page={pageno}&{query}' # noqa
|
|
||||||
embedded_url = '<iframe frameborder="0" width="540" height="304" ' +\
|
|
||||||
'data-src="//www.dailymotion.com/embed/video/{videoid}" allowfullscreen></iframe>'
|
|
||||||
|
|
||||||
|
|
||||||
# do search-request
|
|
||||||
def request(query, params):
|
|
||||||
if params['language'] == 'all':
|
|
||||||
locale = 'en-US'
|
|
||||||
else:
|
|
||||||
locale = params['language']
|
|
||||||
|
|
||||||
params['url'] = search_url.format(
|
|
||||||
query=urlencode({'search': query, 'localization': locale}),
|
|
||||||
pageno=params['pageno'])
|
|
||||||
|
|
||||||
return params
|
|
||||||
|
|
||||||
|
|
||||||
# get response from search-request
|
|
||||||
def response(resp):
|
|
||||||
results = []
|
|
||||||
|
|
||||||
search_res = loads(resp.text)
|
|
||||||
|
|
||||||
# return empty array if there are no results
|
|
||||||
if not 'list' in search_res:
|
|
||||||
return []
|
|
||||||
|
|
||||||
# parse results
|
|
||||||
for res in search_res['list']:
|
|
||||||
title = res['title']
|
|
||||||
url = res['url']
|
|
||||||
content = escape(res['description'])
|
|
||||||
thumbnail = res['thumbnail_360_url']
|
|
||||||
publishedDate = datetime.fromtimestamp(res['created_time'], None)
|
|
||||||
embedded = embedded_url.format(videoid=res['id'])
|
|
||||||
|
|
||||||
results.append({'template': 'videos.html',
|
|
||||||
'url': url,
|
|
||||||
'title': title,
|
|
||||||
'content': content,
|
|
||||||
'publishedDate': publishedDate,
|
|
||||||
'embedded': embedded,
|
|
||||||
'thumbnail': thumbnail})
|
|
||||||
|
|
||||||
# return results
|
|
||||||
return results
|
|
|
@ -1,61 +0,0 @@
|
||||||
## Deezer (Music)
|
|
||||||
#
|
|
||||||
# @website https://deezer.com
|
|
||||||
# @provide-api yes (http://developers.deezer.com/api/)
|
|
||||||
#
|
|
||||||
# @using-api yes
|
|
||||||
# @results JSON
|
|
||||||
# @stable yes
|
|
||||||
# @parse url, title, content, embedded
|
|
||||||
|
|
||||||
from json import loads
|
|
||||||
from urllib import urlencode
|
|
||||||
|
|
||||||
# engine dependent config
|
|
||||||
categories = ['music']
|
|
||||||
paging = True
|
|
||||||
|
|
||||||
# search-url
|
|
||||||
url = 'http://api.deezer.com/'
|
|
||||||
search_url = url + 'search?{query}&index={offset}'
|
|
||||||
|
|
||||||
embedded_url = '<iframe scrolling="no" frameborder="0" allowTransparency="true" ' +\
|
|
||||||
'data-src="http://www.deezer.com/plugins/player?type=tracks&id={audioid}" ' +\
|
|
||||||
'width="540" height="80"></iframe>'
|
|
||||||
|
|
||||||
|
|
||||||
# do search-request
|
|
||||||
def request(query, params):
|
|
||||||
offset = (params['pageno'] - 1) * 25
|
|
||||||
|
|
||||||
params['url'] = search_url.format(query=urlencode({'q': query}),
|
|
||||||
offset=offset)
|
|
||||||
|
|
||||||
return params
|
|
||||||
|
|
||||||
|
|
||||||
# get response from search-request
|
|
||||||
def response(resp):
|
|
||||||
results = []
|
|
||||||
|
|
||||||
search_res = loads(resp.text)
|
|
||||||
|
|
||||||
# parse results
|
|
||||||
for result in search_res.get('data', []):
|
|
||||||
if result['type'] == 'track':
|
|
||||||
title = result['title']
|
|
||||||
url = result['link']
|
|
||||||
content = result['artist']['name'] +\
|
|
||||||
" • " +\
|
|
||||||
result['album']['title'] +\
|
|
||||||
" • " + result['title']
|
|
||||||
embedded = embedded_url.format(audioid=result['id'])
|
|
||||||
|
|
||||||
# append result
|
|
||||||
results.append({'url': url,
|
|
||||||
'title': title,
|
|
||||||
'embedded': embedded,
|
|
||||||
'content': content})
|
|
||||||
|
|
||||||
# return results
|
|
||||||
return results
|
|
|
@ -1,67 +0,0 @@
|
||||||
## Deviantart (Images)
|
|
||||||
#
|
|
||||||
# @website https://www.deviantart.com/
|
|
||||||
# @provide-api yes (https://www.deviantart.com/developers/) (RSS)
|
|
||||||
#
|
|
||||||
# @using-api no (TODO, rewrite to api)
|
|
||||||
# @results HTML
|
|
||||||
# @stable no (HTML can change)
|
|
||||||
# @parse url, title, thumbnail_src, img_src
|
|
||||||
#
|
|
||||||
# @todo rewrite to api
|
|
||||||
|
|
||||||
from urllib import urlencode
|
|
||||||
from urlparse import urljoin
|
|
||||||
from lxml import html
|
|
||||||
import re
|
|
||||||
from searx.engines.xpath import extract_text
|
|
||||||
|
|
||||||
# engine dependent config
|
|
||||||
categories = ['images']
|
|
||||||
paging = True
|
|
||||||
|
|
||||||
# search-url
|
|
||||||
base_url = 'https://www.deviantart.com/'
|
|
||||||
search_url = base_url+'search?offset={offset}&{query}'
|
|
||||||
|
|
||||||
|
|
||||||
# do search-request
|
|
||||||
def request(query, params):
|
|
||||||
offset = (params['pageno'] - 1) * 24
|
|
||||||
|
|
||||||
params['url'] = search_url.format(offset=offset,
|
|
||||||
query=urlencode({'q': query}))
|
|
||||||
|
|
||||||
return params
|
|
||||||
|
|
||||||
|
|
||||||
# get response from search-request
|
|
||||||
def response(resp):
|
|
||||||
results = []
|
|
||||||
|
|
||||||
# return empty array if a redirection code is returned
|
|
||||||
if resp.status_code == 302:
|
|
||||||
return []
|
|
||||||
|
|
||||||
dom = html.fromstring(resp.text)
|
|
||||||
|
|
||||||
regex = re.compile('\/200H\/')
|
|
||||||
|
|
||||||
# parse results
|
|
||||||
for result in dom.xpath('//div[contains(@class, "tt-a tt-fh")]'):
|
|
||||||
link = result.xpath('.//a[contains(@class, "thumb")]')[0]
|
|
||||||
url = urljoin(base_url, link.attrib.get('href'))
|
|
||||||
title_links = result.xpath('.//span[@class="details"]//a[contains(@class, "t")]')
|
|
||||||
title = extract_text(title_links[0])
|
|
||||||
thumbnail_src = link.xpath('.//img')[0].attrib.get('src')
|
|
||||||
img_src = regex.sub('/', thumbnail_src)
|
|
||||||
|
|
||||||
# append result
|
|
||||||
results.append({'url': url,
|
|
||||||
'title': title,
|
|
||||||
'img_src': img_src,
|
|
||||||
'thumbnail_src': thumbnail_src,
|
|
||||||
'template': 'images.html'})
|
|
||||||
|
|
||||||
# return results
|
|
||||||
return results
|
|
|
@ -1,70 +0,0 @@
|
||||||
## Digg (News, Social media)
|
|
||||||
#
|
|
||||||
# @website https://digg.com/
|
|
||||||
# @provide-api no
|
|
||||||
#
|
|
||||||
# @using-api no
|
|
||||||
# @results HTML (using search portal)
|
|
||||||
# @stable no (HTML can change)
|
|
||||||
# @parse url, title, content, publishedDate, thumbnail
|
|
||||||
|
|
||||||
from urllib import quote_plus
|
|
||||||
from json import loads
|
|
||||||
from lxml import html
|
|
||||||
from cgi import escape
|
|
||||||
from dateutil import parser
|
|
||||||
|
|
||||||
# engine dependent config
|
|
||||||
categories = ['news', 'social media']
|
|
||||||
paging = True
|
|
||||||
|
|
||||||
# search-url
|
|
||||||
base_url = 'https://digg.com/'
|
|
||||||
search_url = base_url+'api/search/{query}.json?position={position}&format=html'
|
|
||||||
|
|
||||||
# specific xpath variables
|
|
||||||
results_xpath = '//article'
|
|
||||||
link_xpath = './/small[@class="time"]//a'
|
|
||||||
title_xpath = './/h2//a//text()'
|
|
||||||
content_xpath = './/p//text()'
|
|
||||||
pubdate_xpath = './/time'
|
|
||||||
|
|
||||||
|
|
||||||
# do search-request
|
|
||||||
def request(query, params):
|
|
||||||
offset = (params['pageno'] - 1) * 10
|
|
||||||
params['url'] = search_url.format(position=offset,
|
|
||||||
query=quote_plus(query))
|
|
||||||
return params
|
|
||||||
|
|
||||||
|
|
||||||
# get response from search-request
|
|
||||||
def response(resp):
|
|
||||||
results = []
|
|
||||||
|
|
||||||
search_result = loads(resp.text)
|
|
||||||
|
|
||||||
if 'html' not in search_result or search_result['html'] == '':
|
|
||||||
return results
|
|
||||||
|
|
||||||
dom = html.fromstring(search_result['html'])
|
|
||||||
|
|
||||||
# parse results
|
|
||||||
for result in dom.xpath(results_xpath):
|
|
||||||
url = result.attrib.get('data-contenturl')
|
|
||||||
thumbnail = result.xpath('.//img')[0].attrib.get('src')
|
|
||||||
title = ''.join(result.xpath(title_xpath))
|
|
||||||
content = escape(''.join(result.xpath(content_xpath)))
|
|
||||||
pubdate = result.xpath(pubdate_xpath)[0].attrib.get('datetime')
|
|
||||||
publishedDate = parser.parse(pubdate)
|
|
||||||
|
|
||||||
# append result
|
|
||||||
results.append({'url': url,
|
|
||||||
'title': title,
|
|
||||||
'content': content,
|
|
||||||
'template': 'videos.html',
|
|
||||||
'publishedDate': publishedDate,
|
|
||||||
'thumbnail': thumbnail})
|
|
||||||
|
|
||||||
# return results
|
|
||||||
return results
|
|
|
@ -1,76 +0,0 @@
|
||||||
## DuckDuckGo (Web)
|
|
||||||
#
|
|
||||||
# @website https://duckduckgo.com/
|
|
||||||
# @provide-api yes (https://duckduckgo.com/api),
|
|
||||||
# but not all results from search-site
|
|
||||||
#
|
|
||||||
# @using-api no
|
|
||||||
# @results HTML (using search portal)
|
|
||||||
# @stable no (HTML can change)
|
|
||||||
# @parse url, title, content
|
|
||||||
#
|
|
||||||
# @todo rewrite to api
|
|
||||||
# @todo language support
|
|
||||||
# (the current used site does not support language-change)
|
|
||||||
|
|
||||||
from urllib import urlencode
|
|
||||||
from lxml.html import fromstring
|
|
||||||
from searx.engines.xpath import extract_text
|
|
||||||
|
|
||||||
# engine dependent config
|
|
||||||
categories = ['general']
|
|
||||||
paging = True
|
|
||||||
language_support = True
|
|
||||||
|
|
||||||
# search-url
|
|
||||||
url = 'https://duckduckgo.com/html?{query}&s={offset}'
|
|
||||||
|
|
||||||
# specific xpath variables
|
|
||||||
result_xpath = '//div[@class="results_links results_links_deep web-result"]' # noqa
|
|
||||||
url_xpath = './/a[@class="large"]/@href'
|
|
||||||
title_xpath = './/a[@class="large"]'
|
|
||||||
content_xpath = './/div[@class="snippet"]'
|
|
||||||
|
|
||||||
|
|
||||||
# do search-request
|
|
||||||
def request(query, params):
|
|
||||||
offset = (params['pageno'] - 1) * 30
|
|
||||||
|
|
||||||
if params['language'] == 'all':
|
|
||||||
locale = 'en-us'
|
|
||||||
else:
|
|
||||||
locale = params['language'].replace('_', '-').lower()
|
|
||||||
|
|
||||||
params['url'] = url.format(
|
|
||||||
query=urlencode({'q': query, 'kl': locale}),
|
|
||||||
offset=offset)
|
|
||||||
|
|
||||||
return params
|
|
||||||
|
|
||||||
|
|
||||||
# get response from search-request
|
|
||||||
def response(resp):
|
|
||||||
results = []
|
|
||||||
|
|
||||||
doc = fromstring(resp.text)
|
|
||||||
|
|
||||||
# parse results
|
|
||||||
for r in doc.xpath(result_xpath):
|
|
||||||
try:
|
|
||||||
res_url = r.xpath(url_xpath)[-1]
|
|
||||||
except:
|
|
||||||
continue
|
|
||||||
|
|
||||||
if not res_url:
|
|
||||||
continue
|
|
||||||
|
|
||||||
title = extract_text(r.xpath(title_xpath))
|
|
||||||
content = extract_text(r.xpath(content_xpath))
|
|
||||||
|
|
||||||
# append result
|
|
||||||
results.append({'title': title,
|
|
||||||
'content': content,
|
|
||||||
'url': res_url})
|
|
||||||
|
|
||||||
# return results
|
|
||||||
return results
|
|
|
@ -1,149 +0,0 @@
|
||||||
import json
|
|
||||||
from urllib import urlencode
|
|
||||||
from lxml import html
|
|
||||||
from searx.utils import html_to_text
|
|
||||||
from searx.engines.xpath import extract_text
|
|
||||||
|
|
||||||
url = 'https://api.duckduckgo.com/'\
|
|
||||||
+ '?{query}&format=json&pretty=0&no_redirect=1&d=1'
|
|
||||||
|
|
||||||
|
|
||||||
def result_to_text(url, text, htmlResult):
|
|
||||||
# TODO : remove result ending with "Meaning" or "Category"
|
|
||||||
dom = html.fromstring(htmlResult)
|
|
||||||
a = dom.xpath('//a')
|
|
||||||
if len(a) >= 1:
|
|
||||||
return extract_text(a[0])
|
|
||||||
else:
|
|
||||||
return text
|
|
||||||
|
|
||||||
|
|
||||||
def request(query, params):
|
|
||||||
# TODO add kl={locale}
|
|
||||||
params['url'] = url.format(query=urlencode({'q': query}))
|
|
||||||
return params
|
|
||||||
|
|
||||||
|
|
||||||
def response(resp):
|
|
||||||
results = []
|
|
||||||
|
|
||||||
search_res = json.loads(resp.text)
|
|
||||||
|
|
||||||
content = ''
|
|
||||||
heading = search_res.get('Heading', '')
|
|
||||||
attributes = []
|
|
||||||
urls = []
|
|
||||||
infobox_id = None
|
|
||||||
relatedTopics = []
|
|
||||||
|
|
||||||
# add answer if there is one
|
|
||||||
answer = search_res.get('Answer', '')
|
|
||||||
if answer != '':
|
|
||||||
results.append({'answer': html_to_text(answer)})
|
|
||||||
|
|
||||||
# add infobox
|
|
||||||
if 'Definition' in search_res:
|
|
||||||
content = content + search_res.get('Definition', '')
|
|
||||||
|
|
||||||
if 'Abstract' in search_res:
|
|
||||||
content = content + search_res.get('Abstract', '')
|
|
||||||
|
|
||||||
# image
|
|
||||||
image = search_res.get('Image', '')
|
|
||||||
image = None if image == '' else image
|
|
||||||
|
|
||||||
# attributes
|
|
||||||
if 'Infobox' in search_res:
|
|
||||||
infobox = search_res.get('Infobox', None)
|
|
||||||
if 'content' in infobox:
|
|
||||||
for info in infobox.get('content'):
|
|
||||||
attributes.append({'label': info.get('label'),
|
|
||||||
'value': info.get('value')})
|
|
||||||
|
|
||||||
# urls
|
|
||||||
for ddg_result in search_res.get('Results', []):
|
|
||||||
if 'FirstURL' in ddg_result:
|
|
||||||
firstURL = ddg_result.get('FirstURL', '')
|
|
||||||
text = ddg_result.get('Text', '')
|
|
||||||
urls.append({'title': text, 'url': firstURL})
|
|
||||||
results.append({'title': heading, 'url': firstURL})
|
|
||||||
|
|
||||||
# related topics
|
|
||||||
for ddg_result in search_res.get('RelatedTopics', []):
|
|
||||||
if 'FirstURL' in ddg_result:
|
|
||||||
suggestion = result_to_text(ddg_result.get('FirstURL', None),
|
|
||||||
ddg_result.get('Text', None),
|
|
||||||
ddg_result.get('Result', None))
|
|
||||||
if suggestion != heading:
|
|
||||||
results.append({'suggestion': suggestion})
|
|
||||||
elif 'Topics' in ddg_result:
|
|
||||||
suggestions = []
|
|
||||||
relatedTopics.append({'name': ddg_result.get('Name', ''),
|
|
||||||
'suggestions': suggestions})
|
|
||||||
for topic_result in ddg_result.get('Topics', []):
|
|
||||||
suggestion = result_to_text(topic_result.get('FirstURL', None),
|
|
||||||
topic_result.get('Text', None),
|
|
||||||
topic_result.get('Result', None))
|
|
||||||
if suggestion != heading:
|
|
||||||
suggestions.append(suggestion)
|
|
||||||
|
|
||||||
# abstract
|
|
||||||
abstractURL = search_res.get('AbstractURL', '')
|
|
||||||
if abstractURL != '':
|
|
||||||
# add as result ? problem always in english
|
|
||||||
infobox_id = abstractURL
|
|
||||||
urls.append({'title': search_res.get('AbstractSource'),
|
|
||||||
'url': abstractURL})
|
|
||||||
|
|
||||||
# definition
|
|
||||||
definitionURL = search_res.get('DefinitionURL', '')
|
|
||||||
if definitionURL != '':
|
|
||||||
# add as result ? as answer ? problem always in english
|
|
||||||
infobox_id = definitionURL
|
|
||||||
urls.append({'title': search_res.get('DefinitionSource'),
|
|
||||||
'url': definitionURL})
|
|
||||||
|
|
||||||
# entity
|
|
||||||
entity = search_res.get('Entity', None)
|
|
||||||
# TODO continent / country / department / location / waterfall /
|
|
||||||
# mountain range :
|
|
||||||
# link to map search, get weather, near by locations
|
|
||||||
# TODO musician : link to music search
|
|
||||||
# TODO concert tour : ??
|
|
||||||
# TODO film / actor / television / media franchise :
|
|
||||||
# links to IMDB / rottentomatoes (or scrap result)
|
|
||||||
# TODO music : link tu musicbrainz / last.fm
|
|
||||||
# TODO book : ??
|
|
||||||
# TODO artist / playwright : ??
|
|
||||||
# TODO compagny : ??
|
|
||||||
# TODO software / os : ??
|
|
||||||
# TODO software engineer : ??
|
|
||||||
# TODO prepared food : ??
|
|
||||||
# TODO website : ??
|
|
||||||
# TODO performing art : ??
|
|
||||||
# TODO prepared food : ??
|
|
||||||
# TODO programming language : ??
|
|
||||||
# TODO file format : ??
|
|
||||||
|
|
||||||
if len(heading) > 0:
|
|
||||||
# TODO get infobox.meta.value where .label='article_title'
|
|
||||||
if image is None and len(attributes) == 0 and len(urls) == 1 and\
|
|
||||||
len(relatedTopics) == 0 and len(content) == 0:
|
|
||||||
results.append({
|
|
||||||
'url': urls[0]['url'],
|
|
||||||
'title': heading,
|
|
||||||
'content': content
|
|
||||||
})
|
|
||||||
else:
|
|
||||||
results.append({
|
|
||||||
'infobox': heading,
|
|
||||||
'id': infobox_id,
|
|
||||||
'entity': entity,
|
|
||||||
'content': content,
|
|
||||||
'img_src': image,
|
|
||||||
'attributes': attributes,
|
|
||||||
'urls': urls,
|
|
||||||
'relatedTopics': relatedTopics
|
|
||||||
})
|
|
||||||
|
|
||||||
return results
|
|
|
@@ -1,14 +0,0 @@
## Dummy
#
# @results empty array
# @stable yes


# do search-request
def request(query, params):
    return params


# get response from search-request
def response(resp):
    return []
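The dummy engine is the smallest possible implementation of the engine interface used throughout this commit: request(query, params) must return the (possibly modified) params dict, and response(resp) must return a list of result dicts. A hedged sketch of a slightly less empty engine following the same contract (illustrative only, not part of this commit; the URL and JSON field names are made up for the example):

# Illustrative sketch, not part of this commit: a minimal engine following
# the same request/response contract as the dummy engine above.
from urllib import urlencode
from json import loads

search_url = 'https://example.org/api/search?{query}'  # hypothetical endpoint

def request(query, params):
    params['url'] = search_url.format(query=urlencode({'q': query}))
    return params

def response(resp):
    results = []
    for item in loads(resp.text).get('items', []):  # hypothetical JSON shape
        results.append({'url': item.get('link'),
                        'title': item.get('title'),
                        'content': item.get('snippet', '')})
    return results
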
@ -1,114 +0,0 @@
|
||||||
## Faroo (Web, News)
|
|
||||||
#
|
|
||||||
# @website http://www.faroo.com
|
|
||||||
# @provide-api yes (http://www.faroo.com/hp/api/api.html), require API-key
|
|
||||||
#
|
|
||||||
# @using-api yes
|
|
||||||
# @results JSON
|
|
||||||
# @stable yes
|
|
||||||
# @parse url, title, content, publishedDate, img_src
|
|
||||||
|
|
||||||
from urllib import urlencode
|
|
||||||
from json import loads
|
|
||||||
import datetime
|
|
||||||
from searx.utils import searx_useragent
|
|
||||||
|
|
||||||
# engine dependent config
|
|
||||||
categories = ['general', 'news']
|
|
||||||
paging = True
|
|
||||||
language_support = True
|
|
||||||
number_of_results = 10
|
|
||||||
api_key = None
|
|
||||||
|
|
||||||
# search-url
|
|
||||||
url = 'http://www.faroo.com/'
|
|
||||||
search_url = url + 'api?{query}'\
|
|
||||||
'&start={offset}'\
|
|
||||||
'&length={number_of_results}'\
|
|
||||||
'&l={language}'\
|
|
||||||
'&src={categorie}'\
|
|
||||||
'&i=false'\
|
|
||||||
'&f=json'\
|
|
||||||
'&key={api_key}' # noqa
|
|
||||||
|
|
||||||
search_category = {'general': 'web',
|
|
||||||
'news': 'news'}
|
|
||||||
|
|
||||||
|
|
||||||
# do search-request
|
|
||||||
def request(query, params):
|
|
||||||
offset = (params['pageno'] - 1) * number_of_results + 1
|
|
||||||
categorie = search_category.get(params['category'], 'web')
|
|
||||||
|
|
||||||
if params['language'] == 'all':
|
|
||||||
language = 'en'
|
|
||||||
else:
|
|
||||||
language = params['language'].split('_')[0]
|
|
||||||
|
|
||||||
# if language is not supported, put it in english
|
|
||||||
if language != 'en' and\
|
|
||||||
language != 'de' and\
|
|
||||||
language != 'zh':
|
|
||||||
language = 'en'
|
|
||||||
|
|
||||||
params['url'] = search_url.format(offset=offset,
|
|
||||||
number_of_results=number_of_results,
|
|
||||||
query=urlencode({'q': query}),
|
|
||||||
language=language,
|
|
||||||
categorie=categorie,
|
|
||||||
api_key=api_key)
|
|
||||||
|
|
||||||
# using searx User-Agent
|
|
||||||
params['headers']['User-Agent'] = searx_useragent()
|
|
||||||
|
|
||||||
return params
|
|
||||||
|
|
||||||
|
|
||||||
# get response from search-request
|
|
||||||
def response(resp):
|
|
||||||
# HTTP-Code 401: api-key is not valide
|
|
||||||
if resp.status_code == 401:
|
|
||||||
raise Exception("API key is not valide")
|
|
||||||
|
|
||||||
# HTTP-Code 429: rate limit exceeded
|
|
||||||
if resp.status_code == 429:
|
|
||||||
raise Exception("rate limit has been exceeded!")
|
|
||||||
|
|
||||||
results = []
|
|
||||||
|
|
||||||
search_res = loads(resp.text)
|
|
||||||
|
|
||||||
# return empty array if there are no results
|
|
||||||
if not search_res.get('results', {}):
|
|
||||||
return []
|
|
||||||
|
|
||||||
# parse results
|
|
||||||
for result in search_res['results']:
|
|
||||||
if result['news']:
|
|
||||||
# timestamp (milliseconds since 1970)
|
|
||||||
publishedDate = datetime.datetime.fromtimestamp(result['date']/1000.0) # noqa
|
|
||||||
|
|
||||||
# append news result
|
|
||||||
results.append({'url': result['url'],
|
|
||||||
'title': result['title'],
|
|
||||||
'publishedDate': publishedDate,
|
|
||||||
'content': result['kwic']})
|
|
||||||
|
|
||||||
else:
|
|
||||||
# append general result
|
|
||||||
# TODO, publishedDate correct?
|
|
||||||
results.append({'url': result['url'],
|
|
||||||
'title': result['title'],
|
|
||||||
'content': result['kwic']})
|
|
||||||
|
|
||||||
# append image result if image url is set
|
|
||||||
# TODO, show results with an image like in faroo
|
|
||||||
if result['iurl']:
|
|
||||||
results.append({'template': 'images.html',
|
|
||||||
'url': result['url'],
|
|
||||||
'title': result['title'],
|
|
||||||
'content': result['kwic'],
|
|
||||||
'img_src': result['iurl']})
|
|
||||||
|
|
||||||
# return results
|
|
||||||
return results
|
|
|
@ -1,84 +0,0 @@
from urllib import urlencode
from HTMLParser import HTMLParser

url = 'http://www.filecrop.com/'
search_url = url + '/search.php?{query}&size_i=0&size_f=100000000&engine_r=1&engine_d=1&engine_e=1&engine_4=1&engine_m=1&pos={index}'  # noqa

paging = True


class FilecropResultParser(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.__start_processing = False

        self.results = []
        self.result = {}

        self.tr_counter = 0
        self.data_counter = 0

    def handle_starttag(self, tag, attrs):

        if tag == 'tr':
            if ('bgcolor', '#edeff5') in attrs or\
                    ('bgcolor', '#ffffff') in attrs:
                self.__start_processing = True

        if not self.__start_processing:
            return

        if tag == 'label':
            self.result['title'] = [attr[1] for attr in attrs
                                    if attr[0] == 'title'][0]
        elif tag == 'a' and ('rel', 'nofollow') in attrs\
                and ('class', 'sourcelink') in attrs:
            if 'content' in self.result:
                self.result['content'] += [attr[1] for attr in attrs
                                           if attr[0] == 'title'][0]
            else:
                self.result['content'] = [attr[1] for attr in attrs
                                          if attr[0] == 'title'][0]
            self.result['content'] += ' '
        elif tag == 'a':
            self.result['url'] = url + [attr[1] for attr in attrs
                                        if attr[0] == 'href'][0]

    def handle_endtag(self, tag):
        if self.__start_processing is False:
            return

        if tag == 'tr':
            self.tr_counter += 1

            if self.tr_counter == 2:
                self.__start_processing = False
                self.tr_counter = 0
                self.data_counter = 0
                self.results.append(self.result)
                self.result = {}

    def handle_data(self, data):
        if not self.__start_processing:
            return

        if 'content' in self.result:
            self.result['content'] += data + ' '
        else:
            self.result['content'] = data + ' '

        self.data_counter += 1


def request(query, params):
    index = 1 + (params['pageno'] - 1) * 30
    params['url'] = search_url.format(query=urlencode({'w': query}),
                                      index=index)
    return params


def response(resp):
    parser = FilecropResultParser()
    parser.feed(resp.text)

    return parser.results
@ -1,96 +0,0 @@
#!/usr/bin/env python

## Flickr (Images)
#
# @website     https://www.flickr.com
# @provide-api yes (https://secure.flickr.com/services/api/flickr.photos.search.html)
#
# @using-api   yes
# @results     JSON
# @stable      yes
# @parse       url, title, thumbnail, img_src
# More info on api-key: https://www.flickr.com/services/apps/create/

from urllib import urlencode
from json import loads

categories = ['images']

nb_per_page = 15
paging = True
api_key = None


url = 'https://api.flickr.com/services/rest/?method=flickr.photos.search' +\
      '&api_key={api_key}&{text}&sort=relevance' +\
      '&extras=description%2C+owner_name%2C+url_o%2C+url_n%2C+url_z' +\
      '&per_page={nb_per_page}&format=json&nojsoncallback=1&page={page}'
photo_url = 'https://www.flickr.com/photos/{userid}/{photoid}'

paging = True


def build_flickr_url(user_id, photo_id):
    return photo_url.format(userid=user_id, photoid=photo_id)


def request(query, params):
    params['url'] = url.format(text=urlencode({'text': query}),
                               api_key=api_key,
                               nb_per_page=nb_per_page,
                               page=params['pageno'])
    return params


def response(resp):
    results = []

    search_results = loads(resp.text)

    # return empty array if there are no results
    if 'photos' not in search_results:
        return []

    if 'photo' not in search_results['photos']:
        return []

    photos = search_results['photos']['photo']

    # parse results
    for photo in photos:
        if 'url_o' in photo:
            img_src = photo['url_o']
        elif 'url_z' in photo:
            img_src = photo['url_z']
        else:
            continue

        # For a bigger thumbnail, keep only the url_z, not the url_n
        if 'url_n' in photo:
            thumbnail_src = photo['url_n']
        elif 'url_z' in photo:
            thumbnail_src = photo['url_z']
        else:
            thumbnail_src = img_src

        url = build_flickr_url(photo['owner'], photo['id'])

        title = photo['title']

        content = '<span class="photo-author">' +\
                  photo['ownername'] +\
                  '</span><br />' +\
                  '<span class="description">' +\
                  photo['description']['_content'] +\
                  '</span>'

        # append result
        results.append({'url': url,
                        'title': title,
                        'img_src': img_src,
                        'thumbnail_src': thumbnail_src,
                        'content': content,
                        'template': 'images.html'})

    # return results
    return results
@ -1,109 +0,0 @@
#!/usr/bin/env python

# Flickr (Images)
#
# @website     https://www.flickr.com
# @provide-api yes (https://secure.flickr.com/services/api/flickr.photos.search.html)
#
# @using-api   no
# @results     HTML
# @stable      no
# @parse       url, title, thumbnail, img_src

from urllib import urlencode
from json import loads
import re
from searx.engines import logger


logger = logger.getChild('flickr-noapi')

categories = ['images']

url = 'https://secure.flickr.com/'
search_url = url + 'search/?{query}&page={page}'
photo_url = 'https://www.flickr.com/photos/{userid}/{photoid}'
regex = re.compile(r"\"search-photos-models\",\"photos\":(.*}),\"totalItems\":", re.DOTALL)
image_sizes = ('o', 'k', 'h', 'b', 'c', 'z', 'n', 'm', 't', 'q', 's')

paging = True


def build_flickr_url(user_id, photo_id):
    return photo_url.format(userid=user_id, photoid=photo_id)


def request(query, params):
    params['url'] = search_url.format(query=urlencode({'text': query}),
                                      page=params['pageno'])
    return params


def response(resp):
    results = []

    matches = regex.search(resp.text)

    if matches is None:
        return results

    match = matches.group(1)
    search_results = loads(match)

    if '_data' not in search_results:
        return []

    photos = search_results['_data']

    for photo in photos:

        # In paged configuration, the first pages' photos
        # are represented by a None object
        if photo is None:
            continue

        img_src = None
        # From the biggest to the lowest format
        for image_size in image_sizes:
            if image_size in photo['sizes']:
                img_src = photo['sizes'][image_size]['url']
                break

        if not img_src:
            logger.debug('cannot find valid image size: {0}'.format(repr(photo)))
            continue

        if 'id' not in photo['owner']:
            continue

        # For a bigger thumbnail, keep only the url_z, not the url_n
        if 'n' in photo['sizes']:
            thumbnail_src = photo['sizes']['n']['url']
        elif 'z' in photo['sizes']:
            thumbnail_src = photo['sizes']['z']['url']
        else:
            thumbnail_src = img_src

        url = build_flickr_url(photo['owner']['id'], photo['id'])

        title = photo.get('title', '')

        content = '<span class="photo-author">' +\
                  photo['owner']['username'] +\
                  '</span><br />'

        if 'description' in photo:
            content = content +\
                '<span class="description">' +\
                photo['description'] +\
                '</span>'

        # append result
        results.append({'url': url,
                        'title': title,
                        'img_src': img_src,
                        'thumbnail_src': thumbnail_src,
                        'content': content,
                        'template': 'images.html'})

    return results
@ -1,60 +0,0 @@
## General Files (Files)
#
# @website     http://www.general-files.org
# @provide-api no (nothing found)
#
# @using-api   no (because nothing found)
# @results     HTML (using search portal)
# @stable      no (HTML can change)
# @parse       url, title, content
#
# @todo        detect torrents?

from lxml import html

# engine dependent config
categories = ['files']
paging = True

# search-url
base_url = 'http://www.general-file.com'
search_url = base_url + '/files-{letter}/{query}/{pageno}'

# specific xpath variables
result_xpath = '//table[@class="block-file"]'
title_xpath = './/h2/a//text()'
url_xpath = './/h2/a/@href'
content_xpath = './/p//text()'


# do search-request
def request(query, params):

    params['url'] = search_url.format(query=query,
                                      letter=query[0],
                                      pageno=params['pageno'])

    return params


# get response from search-request
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath(result_xpath):
        url = result.xpath(url_xpath)[0]

        # skip fast download links
        if not url.startswith('/'):
            continue

        # append result
        results.append({'url': base_url + url,
                        'title': ''.join(result.xpath(title_xpath)),
                        'content': ''.join(result.xpath(content_xpath))})

    # return results
    return results
@ -1,59 +0,0 @@
## Github (It)
#
# @website     https://github.com/
# @provide-api yes (https://developer.github.com/v3/)
#
# @using-api   yes
# @results     JSON
# @stable      yes (using api)
# @parse       url, title, content

from urllib import urlencode
from json import loads
from cgi import escape

# engine dependent config
categories = ['it']

# search-url
search_url = 'https://api.github.com/search/repositories?sort=stars&order=desc&{query}'  # noqa

accept_header = 'application/vnd.github.preview.text-match+json'


# do search-request
def request(query, params):
    params['url'] = search_url.format(query=urlencode({'q': query}))

    params['headers']['Accept'] = accept_header

    return params


# get response from search-request
def response(resp):
    results = []

    search_res = loads(resp.text)

    # check if items are received
    if 'items' not in search_res:
        return []

    # parse results
    for res in search_res['items']:
        title = res['name']
        url = res['html_url']

        if res['description']:
            content = escape(res['description'][:500])
        else:
            content = ''

        # append result
        results.append({'url': url,
                        'title': title,
                        'content': content})

    # return results
    return results
@ -1,140 +0,0 @@
# Google (Web)
#
# @website     https://www.google.com
# @provide-api yes (https://developers.google.com/custom-search/)
#
# @using-api   no
# @results     HTML
# @stable      no (HTML can change)
# @parse       url, title, content, suggestion

from urllib import urlencode
from urlparse import urlparse, parse_qsl
from lxml import html
from searx.poolrequests import get
from searx.engines.xpath import extract_text, extract_url

# engine dependent config
categories = ['general']
paging = True
language_support = True

# search-url
google_hostname = 'www.google.com'
search_path = '/search'
redirect_path = '/url'
images_path = '/images'
search_url = ('https://' +
              google_hostname +
              search_path +
              '?{query}&start={offset}&gbv=1')

# specific xpath variables
results_xpath = '//li[@class="g"]'
url_xpath = './/h3/a/@href'
title_xpath = './/h3'
content_xpath = './/span[@class="st"]'
suggestion_xpath = '//p[@class="_Bmc"]'

images_xpath = './/div/a'
image_url_xpath = './@href'
image_img_src_xpath = './img/@src'

pref_cookie = ''


# see https://support.google.com/websearch/answer/873?hl=en
def get_google_pref_cookie():
    global pref_cookie
    if pref_cookie == '':
        resp = get('https://www.google.com/ncr', allow_redirects=False)
        pref_cookie = resp.cookies["PREF"]
    return pref_cookie


# remove google-specific tracking-url
def parse_url(url_string):
    parsed_url = urlparse(url_string)
    if (parsed_url.netloc in [google_hostname, '']
            and parsed_url.path == redirect_path):
        query = dict(parse_qsl(parsed_url.query))
        return query['q']
    else:
        return url_string


# do search-request
def request(query, params):
    offset = (params['pageno'] - 1) * 10

    if params['language'] == 'all':
        language = 'en'
    else:
        language = params['language'].replace('_', '-').lower()

    params['url'] = search_url.format(offset=offset,
                                      query=urlencode({'q': query}))

    params['headers']['Accept-Language'] = language
    params['cookies']['PREF'] = get_google_pref_cookie()

    return params


# get response from search-request
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath(results_xpath):
        title = extract_text(result.xpath(title_xpath)[0])
        try:
            url = parse_url(extract_url(result.xpath(url_xpath), search_url))
            parsed_url = urlparse(url)
            if (parsed_url.netloc == google_hostname
                    and parsed_url.path == search_path):
                # remove the link to google news
                continue

            # images result
            if (parsed_url.netloc == google_hostname
                    and parsed_url.path == images_path):
                # only thumbnail image provided,
                # so skipping image results
                # results = results + parse_images(result)
                pass
            else:
                # normal result
                content = extract_text(result.xpath(content_xpath)[0])
                # append result
                results.append({'url': url,
                                'title': title,
                                'content': content})
        except:
            continue

    # parse suggestion
    for suggestion in dom.xpath(suggestion_xpath):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    # return results
    return results


def parse_images(result):
    results = []
    for image in result.xpath(images_xpath):
        url = parse_url(extract_text(image.xpath(image_url_xpath)[0]))
        img_src = extract_text(image.xpath(image_img_src_xpath)[0])

        # append result
        results.append({'url': url,
                        'title': '',
                        'content': '',
                        'img_src': img_src,
                        'template': 'images.html'})

    return results
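Editor's note: the `parse_url` helper in the removed google.py above unwraps Google's `/url?q=...` redirect links so a result points at the real target instead of the tracking URL. A minimal standalone sketch of the same idea follows; the function name and the example URLs are hypothetical, chosen only for illustration.

from urlparse import urlparse, parse_qsl

def strip_google_redirect(url_string, hostname='www.google.com'):
    # if the link is a /url redirect on the google host, return its q= target
    parsed = urlparse(url_string)
    if parsed.netloc in (hostname, '') and parsed.path == '/url':
        return dict(parse_qsl(parsed.query)).get('q', url_string)
    return url_string

print strip_google_redirect('https://www.google.com/url?q=https://example.org/page')
# -> https://example.org/page
print strip_google_redirect('https://example.org/direct')
# -> https://example.org/direct (non-redirect links pass through unchanged)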
@ -1,68 +0,0 @@
## Google (Images)
#
# @website     https://www.google.com
# @provide-api yes (https://developers.google.com/web-search/docs/),
#              deprecated!
#
# @using-api   yes
# @results     JSON
# @stable      yes (but deprecated)
# @parse       url, title, img_src

from urllib import urlencode, unquote
from json import loads

# engine dependent config
categories = ['images']
paging = True
safesearch = True

# search-url
url = 'https://ajax.googleapis.com/'
search_url = url + 'ajax/services/search/images?v=1.0&start={offset}&rsz=large&safe={safesearch}&filter=off&{query}'


# do search-request
def request(query, params):
    offset = (params['pageno'] - 1) * 8

    if params['safesearch'] == 0:
        safesearch = 'off'
    else:
        safesearch = 'on'

    params['url'] = search_url.format(query=urlencode({'q': query}),
                                      offset=offset,
                                      safesearch=safesearch)

    return params


# get response from search-request
def response(resp):
    results = []

    search_res = loads(resp.text)

    # return empty array if there are no results
    if not search_res.get('responseData', {}).get('results'):
        return []

    # parse results
    for result in search_res['responseData']['results']:
        href = result['originalContextUrl']
        title = result['title']
        if 'url' not in result:
            continue
        thumbnail_src = result['tbUrl']

        # append result
        results.append({'url': href,
                        'title': title,
                        'content': result['content'],
                        'thumbnail_src': thumbnail_src,
                        'img_src': unquote(result['url']),
                        'template': 'images.html'})

    # return results
    return results
@ -1,65 +0,0 @@
## Google (News)
#
# @website     https://www.google.com
# @provide-api yes (https://developers.google.com/web-search/docs/),
#              deprecated!
#
# @using-api   yes
# @results     JSON
# @stable      yes (but deprecated)
# @parse       url, title, content, publishedDate

from urllib import urlencode
from json import loads
from dateutil import parser

# engine dependent config
categories = ['news']
paging = True
language_support = True

# search-url
url = 'https://ajax.googleapis.com/'
search_url = url + 'ajax/services/search/news?v=2.0&start={offset}&rsz=large&safe=off&filter=off&{query}&hl={lang}'


# do search-request
def request(query, params):
    offset = (params['pageno'] - 1) * 8

    language = 'en-US'
    if params['language'] != 'all':
        language = params['language'].replace('_', '-')

    params['url'] = search_url.format(offset=offset,
                                      query=urlencode({'q': query}),
                                      lang=language)

    return params


# get response from search-request
def response(resp):
    results = []

    search_res = loads(resp.text)

    # return empty array if there are no results
    if not search_res.get('responseData', {}).get('results'):
        return []

    # parse results
    for result in search_res['responseData']['results']:
        # parse publishedDate
        publishedDate = parser.parse(result['publishedDate'])
        if 'url' not in result:
            continue

        # append result
        results.append({'url': result['unescapedUrl'],
                        'title': result['titleNoFormatting'],
                        'publishedDate': publishedDate,
                        'content': result['content']})

    # return results
    return results
@ -1,87 +0,0 @@
from urllib import urlencode
from json import loads
from collections import Iterable

search_url = None
url_query = None
content_query = None
title_query = None
# suggestion_xpath = ''


def iterate(iterable):
    if type(iterable) == dict:
        it = iterable.iteritems()

    else:
        it = enumerate(iterable)
    for index, value in it:
        yield str(index), value


def is_iterable(obj):
    if type(obj) == str:
        return False
    if type(obj) == unicode:
        return False
    return isinstance(obj, Iterable)


def parse(query):
    q = []
    for part in query.split('/'):
        if part == '':
            continue
        else:
            q.append(part)
    return q


def do_query(data, q):
    ret = []
    if not q:
        return ret

    qkey = q[0]

    for key, value in iterate(data):

        if len(q) == 1:
            if key == qkey:
                ret.append(value)
            elif is_iterable(value):
                ret.extend(do_query(value, q))
        else:
            if not is_iterable(value):
                continue
            if key == qkey:
                ret.extend(do_query(value, q[1:]))
            else:
                ret.extend(do_query(value, q))
    return ret


def query(data, query_string):
    q = parse(query_string)

    return do_query(data, q)


def request(query, params):
    query = urlencode({'q': query})[2:]
    params['url'] = search_url.format(query=query)
    params['query'] = query
    return params


def response(resp):
    results = []

    json = loads(resp.text)

    urls = query(json, url_query)
    contents = query(json, content_query)
    titles = query(json, title_query)
    for url, title, content in zip(urls, titles, contents):
        results.append({'url': url, 'title': title, 'content': content})
    return results
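Editor's note: the `query` helper of the removed json_engine.py above walks an arbitrary JSON structure along a slash-separated key path and collects every match, regardless of how deeply the keys are nested inside lists or objects. A minimal sketch of how it behaves; the sample data and paths are made up purely for illustration.

# hypothetical payload, shaped like a typical JSON search response
sample = {'items': [{'page': {'url': 'http://example.com/1', 'title': 'First'}},
                    {'page': {'url': 'http://example.com/2', 'title': 'Second'}}]}

# every value reachable via the keys 'page' -> 'url' is collected,
# no matter which lists wrap them
print query(sample, 'page/url')    # ['http://example.com/1', 'http://example.com/2']
print query(sample, 'page/title')  # ['First', 'Second']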
@ -1,120 +0,0 @@
## Kickass Torrent (Videos, Music, Files)
#
# @website     https://kickass.so
# @provide-api no (nothing found)
#
# @using-api   no
# @results     HTML (using search portal)
# @stable      yes (HTML can change)
# @parse       url, title, content, seed, leech, magnetlink

from urlparse import urljoin
from cgi import escape
from urllib import quote
from lxml import html
from operator import itemgetter
from searx.engines.xpath import extract_text

# engine dependent config
categories = ['videos', 'music', 'files']
paging = True

# search-url
url = 'https://kickass.to/'
search_url = url + 'search/{search_term}/{pageno}/'

# specific xpath variables
magnet_xpath = './/a[@title="Torrent magnet link"]'
torrent_xpath = './/a[@title="Download torrent file"]'
content_xpath = './/span[@class="font11px lightgrey block"]'


# do search-request
def request(query, params):
    params['url'] = search_url.format(search_term=quote(query),
                                      pageno=params['pageno'])

    # FIX: SSLError: hostname 'kickass.so'
    # doesn't match either of '*.kickass.to', 'kickass.to'
    params['verify'] = False

    return params


# get response from search-request
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    search_res = dom.xpath('//table[@class="data"]//tr')

    # return empty array if nothing is found
    if not search_res:
        return []

    # parse results
    for result in search_res[1:]:
        link = result.xpath('.//a[@class="cellMainLink"]')[0]
        href = urljoin(url, link.attrib['href'])
        title = extract_text(link)
        content = escape(extract_text(result.xpath(content_xpath)))
        seed = result.xpath('.//td[contains(@class, "green")]/text()')[0]
        leech = result.xpath('.//td[contains(@class, "red")]/text()')[0]
        filesize = result.xpath('.//td[contains(@class, "nobr")]/text()')[0]
        filesize_multiplier = result.xpath('.//td[contains(@class, "nobr")]//span/text()')[0]
        files = result.xpath('.//td[contains(@class, "center")][2]/text()')[0]

        # convert seed to int if possible
        if seed.isdigit():
            seed = int(seed)
        else:
            seed = 0

        # convert leech to int if possible
        if leech.isdigit():
            leech = int(leech)
        else:
            leech = 0

        # convert filesize to byte if possible
        try:
            filesize = float(filesize)

            # convert filesize to byte
            if filesize_multiplier == 'TB':
                filesize = int(filesize * 1024 * 1024 * 1024 * 1024)
            elif filesize_multiplier == 'GB':
                filesize = int(filesize * 1024 * 1024 * 1024)
            elif filesize_multiplier == 'MB':
                filesize = int(filesize * 1024 * 1024)
            elif filesize_multiplier == 'KB':
                filesize = int(filesize * 1024)
        except:
            filesize = None

        # convert files to int if possible
        if files.isdigit():
            files = int(files)
        else:
            files = None

        magnetlink = result.xpath(magnet_xpath)[0].attrib['href']

        torrentfile = result.xpath(torrent_xpath)[0].attrib['href']
        torrentfileurl = quote(torrentfile, safe="%/:=&?~#+!$,;'@()*")

        # append result
        results.append({'url': href,
                        'title': title,
                        'content': content,
                        'seed': seed,
                        'leech': leech,
                        'filesize': filesize,
                        'files': files,
                        'magnetlink': magnetlink,
                        'torrentfile': torrentfileurl,
                        'template': 'torrent.html'})

    # return results sorted by seeder
    return sorted(results, key=itemgetter('seed'), reverse=True)
@ -1,81 +0,0 @@
## general mediawiki-engine (Web)
#
# @website     websites built on mediawiki (https://www.mediawiki.org)
# @provide-api yes (http://www.mediawiki.org/wiki/API:Search)
#
# @using-api   yes
# @results     JSON
# @stable      yes
# @parse       url, title
#
# @todo        content

from json import loads
from string import Formatter
from urllib import urlencode, quote

# engine dependent config
categories = ['general']
language_support = True
paging = True
number_of_results = 1

# search-url
base_url = 'https://{language}.wikipedia.org/'
search_url = base_url + 'w/api.php?action=query'\
                        '&list=search'\
                        '&{query}'\
                        '&srprop=timestamp'\
                        '&format=json'\
                        '&sroffset={offset}'\
                        '&srlimit={limit}'  # noqa


# do search-request
def request(query, params):
    offset = (params['pageno'] - 1) * number_of_results

    string_args = dict(query=urlencode({'srsearch': query}),
                       offset=offset,
                       limit=number_of_results)

    format_strings = list(Formatter().parse(base_url))

    if params['language'] == 'all':
        language = 'en'
    else:
        language = params['language'].split('_')[0]

    if len(format_strings) > 1:
        string_args['language'] = language

    # write search-language back to params, required in response
    params['language'] = language

    params['url'] = search_url.format(**string_args)

    return params


# get response from search-request
def response(resp):
    results = []

    search_results = loads(resp.text)

    # return empty array if there are no results
    if not search_results.get('query', {}).get('search'):
        return []

    # parse results
    for result in search_results['query']['search']:
        url = base_url.format(language=resp.search_params['language']) +\
            'wiki/' + quote(result['title'].replace(' ', '_').encode('utf-8'))

        # append result
        results.append({'url': url,
                        'title': result['title'],
                        'content': ''})

    # return results
    return results
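Editor's note: the `len(format_strings) > 1` check in the removed mediawiki.py above is how the generic engine detects whether the configured `base_url` actually contains a `{language}` placeholder: `string.Formatter().parse()` splits a format string into (literal, field, spec, conversion) tuples, and a URL without any placeholder yields only one tuple. A small illustration; the second, fixed-language URL is a hypothetical example.

from string import Formatter

# a base_url with a {language} placeholder splits into two parse tuples
print len(list(Formatter().parse('https://{language}.wikipedia.org/')))  # 2

# a fixed, single-language base_url yields just one tuple,
# so no 'language' argument needs to be passed to format()
print len(list(Formatter().parse('https://en.wikipedia.org/')))          # 1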
@ -1,59 +0,0 @@
## Mixcloud (Music)
#
# @website     https://www.mixcloud.com/
# @provide-api yes (http://www.mixcloud.com/developers/)
#
# @using-api   yes
# @results     JSON
# @stable      yes
# @parse       url, title, content, embedded, publishedDate

from json import loads
from urllib import urlencode
from dateutil import parser

# engine dependent config
categories = ['music']
paging = True

# search-url
url = 'http://api.mixcloud.com/'
search_url = url + 'search/?{query}&type=cloudcast&limit=10&offset={offset}'

embedded_url = '<iframe scrolling="no" frameborder="0" allowTransparency="true" ' +\
    'data-src="https://www.mixcloud.com/widget/iframe/?feed={url}" width="300" height="300"></iframe>'


# do search-request
def request(query, params):
    offset = (params['pageno'] - 1) * 10

    params['url'] = search_url.format(query=urlencode({'q': query}),
                                      offset=offset)

    return params


# get response from search-request
def response(resp):
    results = []

    search_res = loads(resp.text)

    # parse results
    for result in search_res.get('data', []):
        title = result['name']
        url = result['url']
        content = result['user']['name']
        embedded = embedded_url.format(url=url)
        publishedDate = parser.parse(result['created_time'])

        # append result
        results.append({'url': url,
                        'title': title,
                        'embedded': embedded,
                        'publishedDate': publishedDate,
                        'content': content})

    # return results
    return results
@ -1,97 +0,0 @@
## OpenStreetMap (Map)
#
# @website     https://openstreetmap.org/
# @provide-api yes (http://wiki.openstreetmap.org/wiki/Nominatim)
#
# @using-api   yes
# @results     JSON
# @stable      yes
# @parse       url, title

from json import loads
from searx.utils import searx_useragent

# engine dependent config
categories = ['map']
paging = False

# search-url
base_url = 'https://nominatim.openstreetmap.org/'
search_string = 'search/{query}?format=json&polygon_geojson=1&addressdetails=1'
result_base_url = 'https://openstreetmap.org/{osm_type}/{osm_id}'


# do search-request
def request(query, params):
    params['url'] = base_url + search_string.format(query=query)

    # using searx User-Agent
    params['headers']['User-Agent'] = searx_useragent()

    return params


# get response from search-request
def response(resp):
    results = []
    json = loads(resp.text)

    # parse results
    for r in json:
        if 'display_name' not in r:
            continue

        title = r['display_name']
        osm_type = r.get('osm_type', r.get('type'))
        url = result_base_url.format(osm_type=osm_type,
                                     osm_id=r['osm_id'])

        osm = {'type': osm_type,
               'id': r['osm_id']}

        geojson = r.get('geojson')

        # if no geojson is found and osm_type is a node, add geojson Point
        if not geojson and osm_type == 'node':
            geojson = {u'type': u'Point', u'coordinates': [r['lon'], r['lat']]}

        address_raw = r.get('address')
        address = {}

        # get name
        if r['class'] == 'amenity' or\
           r['class'] == 'shop' or\
           r['class'] == 'tourism' or\
           r['class'] == 'leisure':
            if address_raw.get('address29'):
                address = {'name': address_raw.get('address29')}
            else:
                address = {'name': address_raw.get(r['type'])}

        # add the rest of the address data, if a name was already found
        if address.get('name'):
            address.update({'house_number': address_raw.get('house_number'),
                            'road': address_raw.get('road'),
                            'locality': address_raw.get('city',
                                        address_raw.get('town',          # noqa
                                        address_raw.get('village'))),    # noqa
                            'postcode': address_raw.get('postcode'),
                            'country': address_raw.get('country'),
                            'country_code': address_raw.get('country_code')})
        else:
            address = None

        # append result
        results.append({'template': 'map.html',
                        'title': title,
                        'content': '',
                        'longitude': r['lon'],
                        'latitude': r['lat'],
                        'boundingbox': r['boundingbox'],
                        'geojson': geojson,
                        'address': address,
                        'osm': osm,
                        'url': url})

    # return results
    return results
@ -1,132 +0,0 @@
## Photon (Map)
#
# @website     https://photon.komoot.de
# @provide-api yes (https://photon.komoot.de/)
#
# @using-api   yes
# @results     JSON
# @stable      yes
# @parse       url, title

from urllib import urlencode
from json import loads
from searx.utils import searx_useragent

# engine dependent config
categories = ['map']
paging = False
language_support = True
number_of_results = 10

# search-url
base_url = 'https://photon.komoot.de/'
search_string = 'api/?{query}&limit={limit}'
result_base_url = 'https://openstreetmap.org/{osm_type}/{osm_id}'

# list of supported languages
allowed_languages = ['de', 'en', 'fr', 'it']


# do search-request
def request(query, params):
    params['url'] = base_url +\
        search_string.format(query=urlencode({'q': query}),
                             limit=number_of_results)

    if params['language'] != 'all':
        language = params['language'].split('_')[0]
        if language in allowed_languages:
            params['url'] = params['url'] + "&lang=" + language

    # using searx User-Agent
    params['headers']['User-Agent'] = searx_useragent()

    # FIX: SSLError: SSL3_GET_SERVER_CERTIFICATE:certificate verify failed
    params['verify'] = False

    return params


# get response from search-request
def response(resp):
    results = []
    json = loads(resp.text)

    # parse results
    for r in json.get('features', {}):

        properties = r.get('properties')

        if not properties:
            continue

        # get title
        title = properties.get('name')

        # get osm-type
        if properties.get('osm_type') == 'N':
            osm_type = 'node'
        elif properties.get('osm_type') == 'W':
            osm_type = 'way'
        elif properties.get('osm_type') == 'R':
            osm_type = 'relation'
        else:
            # continue if invalid osm-type
            continue

        url = result_base_url.format(osm_type=osm_type,
                                     osm_id=properties.get('osm_id'))

        osm = {'type': osm_type,
               'id': properties.get('osm_id')}

        geojson = r.get('geometry')

        if properties.get('extent'):
            boundingbox = [properties.get('extent')[3],
                           properties.get('extent')[1],
                           properties.get('extent')[0],
                           properties.get('extent')[2]]
        else:
            # TODO: better boundingbox calculation
            boundingbox = [geojson['coordinates'][1],
                           geojson['coordinates'][1],
                           geojson['coordinates'][0],
                           geojson['coordinates'][0]]

        # address calculation
        address = {}

        # get name
        if properties.get('osm_key') == 'amenity' or\
           properties.get('osm_key') == 'shop' or\
           properties.get('osm_key') == 'tourism' or\
           properties.get('osm_key') == 'leisure':
            address = {'name': properties.get('name')}

        # add the rest of the address data, if a name was already found
        if address.get('name'):
            address.update({'house_number': properties.get('housenumber'),
                            'road': properties.get('street'),
                            'locality': properties.get('city',
                                        properties.get('town',           # noqa
                                        properties.get('village'))),     # noqa
                            'postcode': properties.get('postcode'),
                            'country': properties.get('country')})
        else:
            address = None

        # append result
        results.append({'template': 'map.html',
                        'title': title,
                        'content': '',
                        'longitude': geojson['coordinates'][0],
                        'latitude': geojson['coordinates'][1],
                        'boundingbox': boundingbox,
                        'geojson': geojson,
                        'address': address,
                        'osm': osm,
                        'url': url})

    # return results
    return results
@ -1,94 +0,0 @@
## Piratebay (Videos, Music, Files)
#
# @website     https://thepiratebay.se
# @provide-api no (nothing found)
#
# @using-api   no
# @results     HTML (using search portal)
# @stable      yes (HTML can change)
# @parse       url, title, content, seed, leech, magnetlink

from urlparse import urljoin
from cgi import escape
from urllib import quote
from lxml import html
from operator import itemgetter
from searx.engines.xpath import extract_text

# engine dependent config
categories = ['videos', 'music', 'files']
paging = True

# search-url
url = 'https://thepiratebay.se/'
search_url = url + 'search/{search_term}/{pageno}/99/{search_type}'

# piratebay specific type-definitions
search_types = {'files': '0',
                'music': '100',
                'videos': '200'}

# specific xpath variables
magnet_xpath = './/a[@title="Download this torrent using magnet"]'
torrent_xpath = './/a[@title="Download this torrent"]'
content_xpath = './/font[@class="detDesc"]'


# do search-request
def request(query, params):
    search_type = search_types.get(params['category'], '0')

    params['url'] = search_url.format(search_term=quote(query),
                                      search_type=search_type,
                                      pageno=params['pageno'] - 1)

    return params


# get response from search-request
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    search_res = dom.xpath('//table[@id="searchResult"]//tr')

    # return empty array if nothing is found
    if not search_res:
        return []

    # parse results
    for result in search_res[1:]:
        link = result.xpath('.//div[@class="detName"]//a')[0]
        href = urljoin(url, link.attrib.get('href'))
        title = extract_text(link)
        content = escape(extract_text(result.xpath(content_xpath)))
        seed, leech = result.xpath('.//td[@align="right"]/text()')[:2]

        # convert seed to int if possible
        if seed.isdigit():
            seed = int(seed)
        else:
            seed = 0

        # convert leech to int if possible
        if leech.isdigit():
            leech = int(leech)
        else:
            leech = 0

        magnetlink = result.xpath(magnet_xpath)[0]
        torrentfile = result.xpath(torrent_xpath)[0]

        # append result
        results.append({'url': href,
                        'title': title,
                        'content': content,
                        'seed': seed,
                        'leech': leech,
                        'magnetlink': magnetlink.attrib.get('href'),
                        'torrentfile': torrentfile.attrib.get('href'),
                        'template': 'torrent.html'})

    # return results sorted by seeder
    return sorted(results, key=itemgetter('seed'), reverse=True)
@ -1,68 +0,0 @@
## Searchcode (It)
#
# @website     https://searchcode.com/
# @provide-api yes (https://searchcode.com/api/)
#
# @using-api   yes
# @results     JSON
# @stable      yes
# @parse       url, title, content

from urllib import urlencode
from json import loads


# engine dependent config
categories = ['it']
paging = True

# search-url
url = 'https://searchcode.com/'
search_url = url+'api/codesearch_I/?{query}&p={pageno}'

# special code-endings which are not recognised by the file ending
code_endings = {'cs': 'c#',
                'h': 'c',
                'hpp': 'cpp',
                'cxx': 'cpp'}


# do search-request
def request(query, params):
    params['url'] = search_url.format(query=urlencode({'q': query}),
                                      pageno=params['pageno']-1)

    return params


# get response from search-request
def response(resp):
    results = []

    search_results = loads(resp.text)

    # parse results
    for result in search_results.get('results', []):
        href = result['url']
        title = "" + result['name'] + " - " + result['filename']
        repo = result['repo']

        lines = dict()
        for line, code in result['lines'].items():
            lines[int(line)] = code

        code_language = code_endings.get(
            result['filename'].split('.')[-1].lower(),
            result['filename'].split('.')[-1].lower())

        # append result
        results.append({'url': href,
                        'title': title,
                        'content': '',
                        'repository': repo,
                        'codelines': sorted(lines.items()),
                        'code_language': code_language,
                        'template': 'code.html'})

    # return results
    return results
@ -1,56 +0,0 @@
## Searchcode (It)
#
# @website     https://searchcode.com/
# @provide-api yes (https://searchcode.com/api/)
#
# @using-api   yes
# @results     JSON
# @stable      yes
# @parse       url, title, content

from urllib import urlencode
from json import loads

# engine dependent config
categories = ['it']
paging = True

# search-url
url = 'https://searchcode.com/'
search_url = url+'api/search_IV/?{query}&p={pageno}'


# do search-request
def request(query, params):
    params['url'] = search_url.format(query=urlencode({'q': query}),
                                      pageno=params['pageno']-1)

    return params


# get response from search-request
def response(resp):
    results = []

    search_results = loads(resp.text)

    # parse results
    for result in search_results.get('results', []):
        href = result['url']
        title = "[" + result['type'] + "] " +\
                result['namespace'] +\
                " " + result['name']
        content = '<span class="highlight">[' +\
                  result['type'] + "] " +\
                  result['name'] + " " +\
                  result['synopsis'] +\
                  "</span><br />" +\
                  result['description']

        # append result
        results.append({'url': href,
                        'title': title,
                        'content': content})

    # return results
    return results
@ -1,70 +0,0 @@
## Soundcloud (Music)
#
# @website     https://soundcloud.com
# @provide-api yes (https://developers.soundcloud.com/)
#
# @using-api   yes
# @results     JSON
# @stable      yes
# @parse       url, title, content, publishedDate, embedded

from json import loads
from urllib import urlencode, quote_plus
from dateutil import parser

# engine dependent config
categories = ['music']
paging = True

# api-key
guest_client_id = 'b45b1aa10f1ac2941910a7f0d10f8e28'

# search-url
url = 'https://api.soundcloud.com/'
search_url = url + 'search?{query}'\
                   '&facet=model'\
                   '&limit=20'\
                   '&offset={offset}'\
                   '&linked_partitioning=1'\
                   '&client_id={client_id}'  # noqa

embedded_url = '<iframe width="100%" height="166" ' +\
    'scrolling="no" frameborder="no" ' +\
    'data-src="https://w.soundcloud.com/player/?url={uri}"></iframe>'


# do search-request
def request(query, params):
    offset = (params['pageno'] - 1) * 20

    params['url'] = search_url.format(query=urlencode({'q': query}),
                                      offset=offset,
                                      client_id=guest_client_id)

    return params


# get response from search-request
def response(resp):
    results = []

    search_res = loads(resp.text)

    # parse results
    for result in search_res.get('collection', []):
        if result['kind'] in ('track', 'playlist'):
            title = result['title']
            content = result['description']
            publishedDate = parser.parse(result['last_modified'])
            uri = quote_plus(result['uri'])
            embedded = embedded_url.format(uri=uri)

            # append result
            results.append({'url': result['permalink_url'],
                            'title': title,
                            'publishedDate': publishedDate,
                            'embedded': embedded,
                            'content': content})

    # return results
    return results
@ -1,58 +0,0 @@
## Stackoverflow (It)
#
# @website     https://stackoverflow.com/
# @provide-api not clear (https://api.stackexchange.com/docs/advanced-search)
#
# @using-api   no
# @results     HTML
# @stable      no (HTML can change)
# @parse       url, title, content

from urlparse import urljoin
from cgi import escape
from urllib import urlencode
from lxml import html
from searx.engines.xpath import extract_text

# engine dependent config
categories = ['it']
paging = True

# search-url
url = 'http://stackoverflow.com/'
search_url = url+'search?{query}&page={pageno}'

# specific xpath variables
results_xpath = '//div[contains(@class,"question-summary")]'
link_xpath = './/div[@class="result-link"]//a|.//div[@class="summary"]//h3//a'
content_xpath = './/div[@class="excerpt"]'


# do search-request
def request(query, params):
    params['url'] = search_url.format(query=urlencode({'q': query}),
                                      pageno=params['pageno'])

    return params


# get response from search-request
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath(results_xpath):
        link = result.xpath(link_xpath)[0]
        href = urljoin(url, link.attrib.get('href'))
        title = escape(extract_text(link))
        content = escape(extract_text(result.xpath(content_xpath)))

        # append result
        results.append({'url': href,
                        'title': title,
                        'content': content})

    # return results
    return results
@ -1,85 +0,0 @@
# Startpage (Web)
#
# @website     https://startpage.com
# @provide-api no (nothing found)
#
# @using-api   no
# @results     HTML
# @stable      no (HTML can change)
# @parse       url, title, content
#
# @todo        paging

from lxml import html
from cgi import escape
import re
from searx.engines.xpath import extract_text

# engine dependent config
categories = ['general']
# there is a mechanism to block "bot" search
# (probably the parameter qid); it requires
# storing of qid's between multiple search-calls

# paging = False
language_support = True

# search-url
base_url = 'https://startpage.com/'
search_url = base_url + 'do/search'

# specific xpath variables
# ads xpath //div[@id="results"]/div[@id="sponsored"]//div[@class="result"]
# not ads: div[@class="result"] are the direct children of div[@id="results"]
results_xpath = '//div[@class="result"]'
link_xpath = './/h3/a'


# do search-request
def request(query, params):
    offset = (params['pageno'] - 1) * 10

    params['url'] = search_url
    params['method'] = 'POST'
    params['data'] = {'query': query,
                      'startat': offset}

    # set language if specified
    if params['language'] != 'all':
        params['data']['with_language'] = ('lang_' + params['language'].split('_')[0])

    return params


# get response from search-request
def response(resp):
    results = []

    dom = html.fromstring(resp.content)

    # parse results
    for result in dom.xpath(results_xpath):
        links = result.xpath(link_xpath)
        if not links:
            continue
        link = links[0]
        url = link.attrib.get('href')

        # block google-ad url's
        if re.match("^http(s|)://www.google.[a-z]+/aclk.*$", url):
            continue

        title = escape(extract_text(link))

        if result.xpath('./p[@class="desc"]'):
            content = escape(extract_text(result.xpath('./p[@class="desc"]')))
        else:
            content = ''

        # append result
        results.append({'url': url,
                        'title': title,
                        'content': content})

    # return results
    return results
@ -1,79 +0,0 @@
## Subtitleseeker (Video)
#
# @website     http://www.subtitleseeker.com
# @provide-api no
#
# @using-api   no
# @results     HTML
# @stable      no (HTML can change)
# @parse       url, title, content

from cgi import escape
from urllib import quote_plus
from lxml import html
from searx.languages import language_codes
from searx.engines.xpath import extract_text

# engine dependent config
categories = ['videos']
paging = True
language = ""

# search-url
url = 'http://www.subtitleseeker.com/'
search_url = url + 'search/TITLES/{query}&p={pageno}'

# specific xpath variables
results_xpath = '//div[@class="boxRows"]'


# do search-request
def request(query, params):
    params['url'] = search_url.format(query=quote_plus(query),
                                      pageno=params['pageno'])
    return params


# get response from search-request
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    search_lang = ""

    if resp.search_params['language'] != 'all':
        search_lang = [lc[1]
                       for lc in language_codes
                       if lc[0][:2] == resp.search_params['language'].split('_')[0]][0]

    # parse results
    for result in dom.xpath(results_xpath):
        link = result.xpath(".//a")[0]
        href = link.attrib.get('href')

        if language != "":
            href = href + language + '/'
        elif search_lang:
            href = href + search_lang + '/'

        title = escape(extract_text(link))

        content = extract_text(result.xpath('.//div[contains(@class,"red")]'))
        content = content + " - "
        text = extract_text(result.xpath('.//div[contains(@class,"grey-web")]')[0])
        content = content + text

        if result.xpath(".//span") != []:
            content = content +\
                " - (" +\
                extract_text(result.xpath(".//span")) +\
                ")"

        # append result
        results.append({'url': href,
                        'title': title,
                        'content': escape(content)})

    # return results
    return results
@@ -1,77 +0,0 @@
## Twitter (Social media)
#
# @website     https://twitter.com/
# @provide-api yes (https://dev.twitter.com/docs/using-search)
#
# @using-api   no
# @results     HTML (using search portal)
# @stable      no (HTML can change)
# @parse       url, title, content
#
# @todo        publishedDate

from urlparse import urljoin
from urllib import urlencode
from lxml import html
from datetime import datetime
from searx.engines.xpath import extract_text

# engine dependent config
categories = ['social media']
language_support = True

# search-url
base_url = 'https://twitter.com/'
search_url = base_url + 'search?'

# specific xpath variables
results_xpath = '//li[@data-item-type="tweet"]'
link_xpath = './/small[@class="time"]//a'
title_xpath = './/span[@class="username js-action-profile-name"]'
content_xpath = './/p[@class="js-tweet-text tweet-text"]'
timestamp_xpath = './/span[contains(@class,"_timestamp")]'


# do search-request
def request(query, params):
    params['url'] = search_url + urlencode({'q': query})

    # set language if specified
    if params['language'] != 'all':
        params['cookies']['lang'] = params['language'].split('_')[0]
    else:
        params['cookies']['lang'] = 'en'

    return params


# get response from search-request
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for tweet in dom.xpath(results_xpath):
        link = tweet.xpath(link_xpath)[0]
        url = urljoin(base_url, link.attrib.get('href'))
        title = extract_text(tweet.xpath(title_xpath))
        content = extract_text(tweet.xpath(content_xpath)[0])

        pubdate = tweet.xpath(timestamp_xpath)
        if len(pubdate) > 0:
            timestamp = float(pubdate[0].attrib.get('data-time'))
            publishedDate = datetime.fromtimestamp(timestamp, None)
            # append result
            results.append({'url': url,
                            'title': title,
                            'content': content,
                            'publishedDate': publishedDate})
        else:
            # append result
            results.append({'url': url,
                            'title': title,
                            'content': content})

    # return results
    return results
@@ -1,75 +0,0 @@
# Vimeo (Videos)
#
# @website     https://vimeo.com/
# @provide-api yes (http://developer.vimeo.com/api),
#              they have a maximum count of queries/hour
#
# @using-api   no (TODO, rewrite to api)
# @results     HTML (using search portal)
# @stable      no (HTML can change)
# @parse       url, title, publishedDate, thumbnail, embedded
#
# @todo        rewrite to api
# @todo        set content-parameter with correct data

from urllib import urlencode
from lxml import html
from HTMLParser import HTMLParser
from searx.engines.xpath import extract_text
from dateutil import parser

# engine dependent config
categories = ['videos']
paging = True

# search-url
base_url = 'http://vimeo.com'
search_url = base_url + '/search/page:{pageno}?{query}'

# specific xpath variables
results_xpath = '//div[@id="browse_content"]/ol/li'
url_xpath = './a/@href'
title_xpath = './a/div[@class="data"]/p[@class="title"]'
content_xpath = './a/img/@src'
publishedDate_xpath = './/p[@class="meta"]//attribute::datetime'

embedded_url = '<iframe data-src="//player.vimeo.com/video{videoid}" ' +\
    'width="540" height="304" frameborder="0" ' +\
    'webkitallowfullscreen mozallowfullscreen allowfullscreen></iframe>'


# do search-request
def request(query, params):
    params['url'] = search_url.format(pageno=params['pageno'],
                                      query=urlencode({'q': query}))

    return params


# get response from search-request
def response(resp):
    results = []

    dom = html.fromstring(resp.text)
    p = HTMLParser()

    # parse results
    for result in dom.xpath(results_xpath):
        videoid = result.xpath(url_xpath)[0]
        url = base_url + videoid
        title = p.unescape(extract_text(result.xpath(title_xpath)))
        thumbnail = extract_text(result.xpath(content_xpath)[0])
        publishedDate = parser.parse(extract_text(result.xpath(publishedDate_xpath)[0]))
        embedded = embedded_url.format(videoid=videoid)

        # append result
        results.append({'url': url,
                        'title': title,
                        'content': '',
                        'template': 'videos.html',
                        'publishedDate': publishedDate,
                        'embedded': embedded,
                        'thumbnail': thumbnail})

    # return results
    return results
@@ -1,305 +0,0 @@
import json
from urllib import urlencode
from searx.poolrequests import get
from searx.utils import format_date_by_locale

result_count = 1
wikidata_host = 'https://www.wikidata.org'
wikidata_api = wikidata_host + '/w/api.php'
url_search = wikidata_api \
    + '?action=query&list=search&format=json'\
    + '&srnamespace=0&srprop=sectiontitle&{query}'
url_detail = wikidata_api\
    + '?action=wbgetentities&format=json'\
    + '&props=labels%7Cinfo%7Csitelinks'\
    + '%7Csitelinks%2Furls%7Cdescriptions%7Cclaims'\
    + '&{query}'
url_map = 'https://www.openstreetmap.org/'\
    + '?lat={latitude}&lon={longitude}&zoom={zoom}&layers=M'


def request(query, params):
    params['url'] = url_search.format(
        query=urlencode({'srsearch': query,
                         'srlimit': result_count}))
    return params


def response(resp):
    results = []
    search_res = json.loads(resp.text)

    wikidata_ids = set()
    for r in search_res.get('query', {}).get('search', {}):
        wikidata_ids.add(r.get('title', ''))

    language = resp.search_params['language'].split('_')[0]
    if language == 'all':
        language = 'en'

    url = url_detail.format(query=urlencode({'ids': '|'.join(wikidata_ids),
                                             'languages': language + '|en'}))

    htmlresponse = get(url)
    jsonresponse = json.loads(htmlresponse.content)
    for wikidata_id in wikidata_ids:
        results = results + getDetail(jsonresponse, wikidata_id, language, resp.search_params['language'])

    return results


def getDetail(jsonresponse, wikidata_id, language, locale):
    results = []
    urls = []
    attributes = []

    result = jsonresponse.get('entities', {}).get(wikidata_id, {})

    title = result.get('labels', {}).get(language, {}).get('value', None)
    if title is None:
        title = result.get('labels', {}).get('en', {}).get('value', None)
    if title is None:
        return results

    description = result\
        .get('descriptions', {})\
        .get(language, {})\
        .get('value', None)

    if description is None:
        description = result\
            .get('descriptions', {})\
            .get('en', {})\
            .get('value', '')

    claims = result.get('claims', {})
    official_website = get_string(claims, 'P856', None)
    if official_website is not None:
        urls.append({'title': 'Official site', 'url': official_website})
        results.append({'title': title, 'url': official_website})

    wikipedia_link_count = 0
    if language != 'en':
        wikipedia_link_count += add_url(urls,
                                        'Wikipedia (' + language + ')',
                                        get_wikilink(result, language +
                                                     'wiki'))
    wikipedia_en_link = get_wikilink(result, 'enwiki')
    wikipedia_link_count += add_url(urls,
                                    'Wikipedia (en)',
                                    wikipedia_en_link)
    if wikipedia_link_count == 0:
        misc_language = get_wiki_firstlanguage(result, 'wiki')
        if misc_language is not None:
            add_url(urls,
                    'Wikipedia (' + misc_language + ')',
                    get_wikilink(result, misc_language + 'wiki'))

    if language != 'en':
        add_url(urls,
                'Wiki voyage (' + language + ')',
                get_wikilink(result, language + 'wikivoyage'))

    add_url(urls,
            'Wiki voyage (en)',
            get_wikilink(result, 'enwikivoyage'))

    if language != 'en':
        add_url(urls,
                'Wikiquote (' + language + ')',
                get_wikilink(result, language + 'wikiquote'))

    add_url(urls,
            'Wikiquote (en)',
            get_wikilink(result, 'enwikiquote'))

    add_url(urls,
            'Commons wiki',
            get_wikilink(result, 'commonswiki'))

    add_url(urls,
            'Location',
            get_geolink(claims, 'P625', None))

    add_url(urls,
            'Wikidata',
            'https://www.wikidata.org/wiki/'
            + wikidata_id + '?uselang=' + language)

    musicbrainz_work_id = get_string(claims, 'P435')
    if musicbrainz_work_id is not None:
        add_url(urls,
                'MusicBrainz',
                'http://musicbrainz.org/work/'
                + musicbrainz_work_id)

    musicbrainz_artist_id = get_string(claims, 'P434')
    if musicbrainz_artist_id is not None:
        add_url(urls,
                'MusicBrainz',
                'http://musicbrainz.org/artist/'
                + musicbrainz_artist_id)

    musicbrainz_release_group_id = get_string(claims, 'P436')
    if musicbrainz_release_group_id is not None:
        add_url(urls,
                'MusicBrainz',
                'http://musicbrainz.org/release-group/'
                + musicbrainz_release_group_id)

    musicbrainz_label_id = get_string(claims, 'P966')
    if musicbrainz_label_id is not None:
        add_url(urls,
                'MusicBrainz',
                'http://musicbrainz.org/label/'
                + musicbrainz_label_id)

    # musicbrainz_area_id = get_string(claims, 'P982')
    # P1407 MusicBrainz series ID
    # P1004 MusicBrainz place ID
    # P1330 MusicBrainz instrument ID
    # P1407 MusicBrainz series ID

    postal_code = get_string(claims, 'P281', None)
    if postal_code is not None:
        attributes.append({'label': 'Postal code(s)', 'value': postal_code})

    date_of_birth = get_time(claims, 'P569', None)
    if date_of_birth is not None:
        date_of_birth = format_date_by_locale(date_of_birth[8:], locale)
        attributes.append({'label': 'Date of birth', 'value': date_of_birth})

    date_of_death = get_time(claims, 'P570', None)
    if date_of_death is not None:
        date_of_death = format_date_by_locale(date_of_death[8:], locale)
        attributes.append({'label': 'Date of death', 'value': date_of_death})

    if len(attributes) == 0 and len(urls) == 2 and len(description) == 0:
        results.append({
            'url': urls[0]['url'],
            'title': title,
            'content': description
        })
    else:
        results.append({
            'infobox': title,
            'id': wikipedia_en_link,
            'content': description,
            'attributes': attributes,
            'urls': urls
        })

    return results


def add_url(urls, title, url):
    if url is not None:
        urls.append({'title': title, 'url': url})
        return 1
    else:
        return 0


def get_mainsnak(claims, propertyName):
    propValue = claims.get(propertyName, {})
    if len(propValue) == 0:
        return None

    propValue = propValue[0].get('mainsnak', None)
    return propValue


def get_string(claims, propertyName, defaultValue=None):
    propValue = claims.get(propertyName, {})
    if len(propValue) == 0:
        return defaultValue

    result = []
    for e in propValue:
        mainsnak = e.get('mainsnak', {})

        datavalue = mainsnak.get('datavalue', {})
        if datavalue is not None:
            result.append(datavalue.get('value', ''))

    if len(result) == 0:
        return defaultValue
    else:
        # TODO handle multiple urls
        return result[0]


def get_time(claims, propertyName, defaultValue=None):
    propValue = claims.get(propertyName, {})
    if len(propValue) == 0:
        return defaultValue

    result = []
    for e in propValue:
        mainsnak = e.get('mainsnak', {})

        datavalue = mainsnak.get('datavalue', {})
        if datavalue is not None:
            value = datavalue.get('value', '')
            result.append(value.get('time', ''))

    if len(result) == 0:
        return defaultValue
    else:
        return ', '.join(result)


def get_geolink(claims, propertyName, defaultValue=''):
    mainsnak = get_mainsnak(claims, propertyName)

    if mainsnak is None:
        return defaultValue

    datatype = mainsnak.get('datatype', '')
    datavalue = mainsnak.get('datavalue', {})

    if datatype != 'globe-coordinate':
        return defaultValue

    value = datavalue.get('value', {})

    precision = value.get('precision', 0.0002)

    # there is no zoom information, deduce from precision (error prone)
    # samples :
    # 13 --> 5
    # 1 --> 6
    # 0.016666666666667 --> 9
    # 0.00027777777777778 --> 19
    # wolframalpha :
    # quadratic fit { {13, 5}, {1, 6}, {0.0166666, 9}, {0.0002777777,19}}
    # 14.1186-8.8322 x+0.625447 x^2
    if precision < 0.0003:
        zoom = 19
    else:
        zoom = int(15 - precision*8.8322 + precision*precision*0.625447)
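    # (illustrative spot check, not part of the original file) with the
    # formula as written, precision = 0.0166666 gives
    # int(15 - 0.1472 + 0.0002) = 14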

    url = url_map\
        .replace('{latitude}', str(value.get('latitude', 0)))\
        .replace('{longitude}', str(value.get('longitude', 0)))\
        .replace('{zoom}', str(zoom))

    return url


def get_wikilink(result, wikiid):
    url = result.get('sitelinks', {}).get(wikiid, {}).get('url', None)
    if url is None:
        return url
    elif url.startswith('http://'):
        url = url.replace('http://', 'https://')
    elif url.startswith('//'):
        url = 'https:' + url
    return url


def get_wiki_firstlanguage(result, wikipatternid):
    for k in result.get('sitelinks', {}).keys():
        if k.endswith(wikipatternid) and len(k) == (2+len(wikipatternid)):
            return k[0:2]
    return None
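
# (illustrative, not from the original file) with sitelinks keys such as
# 'frwiki' or 'enwikivoyage', get_wiki_firstlanguage(result, 'wiki') matches
# 'frwiki' (2 + len('wiki') characters) and returns 'fr'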
@@ -1,82 +0,0 @@
## 1x (Images)
#
# @website     http://1x.com/
# @provide-api no
#
# @using-api   no
# @results     HTML
# @stable      no (HTML can change)
# @parse       url, title, thumbnail, img_src, content


from urllib import urlencode
from urlparse import urljoin
from lxml import html
import string
import re

# engine dependent config
categories = ['images']
paging = False

# search-url
base_url = 'http://1x.com'
search_url = base_url+'/backend/search.php?{query}'


# do search-request
def request(query, params):
    params['url'] = search_url.format(query=urlencode({'q': query}))

    return params


# get response from search-request
def response(resp):
    results = []

    # get links from result-text
    regex = re.compile('(</a>|<a)')
    results_parts = re.split(regex, resp.text)

    cur_element = ''

    # iterate over link parts
    for result_part in results_parts:
        # detect the start and end of a link
        if result_part == '<a':
            cur_element = result_part
            continue
        elif result_part != '</a>':
            cur_element += result_part
            continue

        cur_element += result_part

        # fix xml-error
        cur_element = string.replace(cur_element, '"></a>', '"/></a>')

        dom = html.fromstring(cur_element)
        link = dom.xpath('//a')[0]

        url = urljoin(base_url, link.attrib.get('href'))
        title = link.attrib.get('title', '')

        thumbnail_src = urljoin(base_url, link.xpath('.//img')[0].attrib['src'])
        # TODO: get image with higher resolution
        img_src = thumbnail_src

        # check if url points to a photo
        if '/photo/' not in url:
            continue

        # append result
        results.append({'url': url,
                        'title': title,
                        'img_src': img_src,
                        'content': '',
                        'thumbnail_src': thumbnail_src,
                        'template': 'images.html'})

    # return results
    return results
@@ -1,64 +0,0 @@
## 500px (Images)
#
# @website     https://500px.com
# @provide-api yes (https://developers.500px.com/)
#
# @using-api   no
# @results     HTML
# @stable      no (HTML can change)
# @parse       url, title, thumbnail, img_src, content
#
# @todo        rewrite to api


from urllib import urlencode
from urlparse import urljoin
from lxml import html
import re
from searx.engines.xpath import extract_text

# engine dependent config
categories = ['images']
paging = True

# search-url
base_url = 'https://500px.com'
search_url = base_url + '/search?search?page={pageno}&type=photos&{query}'


# do search-request
def request(query, params):
    params['url'] = search_url.format(pageno=params['pageno'],
                                      query=urlencode({'q': query}))

    return params


# get response from search-request
def response(resp):
    results = []

    dom = html.fromstring(resp.text)
    regex = re.compile('3\.jpg.*$')

    # parse results
    for result in dom.xpath('//div[@class="photo"]'):
        link = result.xpath('.//a')[0]
        url = urljoin(base_url, link.attrib.get('href'))
        title = extract_text(result.xpath('.//div[@class="title"]'))
        thumbnail_src = link.xpath('.//img')[0].attrib.get('src')
        # To have a bigger thumbnail, uncomment the next line
        # thumbnail_src = regex.sub('4.jpg', thumbnail_src)
        content = extract_text(result.xpath('.//div[@class="info"]'))
        img_src = regex.sub('2048.jpg', thumbnail_src)

        # append result
        results.append({'url': url,
                        'title': title,
                        'img_src': img_src,
                        'content': content,
                        'thumbnail_src': thumbnail_src,
                        'template': 'images.html'})

    # return results
    return results
@@ -1,106 +0,0 @@
from lxml import html
from urllib import urlencode, unquote
from urlparse import urlparse, urljoin
from lxml.etree import _ElementStringResult, _ElementUnicodeResult
from searx.utils import html_to_text

search_url = None
url_xpath = None
content_xpath = None
title_xpath = None
suggestion_xpath = ''
results_xpath = ''


'''
if xpath_results is a list, extract the text from each result and concatenate them
if xpath_results is an xml element, extract all of its text nodes
(text_content() method from lxml)
if xpath_results is a string result, it is already text
'''


def extract_text(xpath_results):
    if type(xpath_results) == list:
        # it's a list of results: concatenate everything using a recursive call
        if not xpath_results:
            raise Exception('Empty url resultset')
        result = ''
        for e in xpath_results:
            result = result + extract_text(e)
        return result.strip()
    elif type(xpath_results) in [_ElementStringResult, _ElementUnicodeResult]:
        # it's a string
        return ''.join(xpath_results)
    else:
        # it's an element
        return html_to_text(xpath_results.text_content()).strip()


def extract_url(xpath_results, search_url):
    url = extract_text(xpath_results)

    if url.startswith('//'):
        # add http or https to this kind of url //example.com/
        parsed_search_url = urlparse(search_url)
        url = parsed_search_url.scheme+url
    elif url.startswith('/'):
        # fix relative url to the search engine
        url = urljoin(search_url, url)

    # normalize url
    url = normalize_url(url)

    return url
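
# (illustrative, not from the original file) when the href extracted from the
# page is protocol-relative, e.g. '//example.org/page', the search url's
# scheme is prepended, giving 'https://example.org/page' for an https engine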


def normalize_url(url):
    parsed_url = urlparse(url)

    # add a / at the end of the url if there is no path
    if not parsed_url.netloc:
        raise Exception('Cannot parse url')
    if not parsed_url.path:
        url += '/'

    # FIXME : hack for yahoo
    if parsed_url.hostname == 'search.yahoo.com'\
            and parsed_url.path.startswith('/r'):
        p = parsed_url.path
        mark = p.find('/**')
        if mark != -1:
            return unquote(p[mark+3:]).decode('utf-8')

    return url


def request(query, params):
    query = urlencode({'q': query})[2:]
    params['url'] = search_url.format(query=query)
    params['query'] = query
    return params


def response(resp):
    results = []
    dom = html.fromstring(resp.text)
    if results_xpath:
        for result in dom.xpath(results_xpath):
            url = extract_url(result.xpath(url_xpath), search_url)
            title = extract_text(result.xpath(title_xpath)[0])
            content = extract_text(result.xpath(content_xpath)[0])
            results.append({'url': url, 'title': title, 'content': content})
    else:
        for url, title, content in zip(
            (extract_url(x, search_url) for
             x in dom.xpath(url_xpath)),
            map(extract_text, dom.xpath(title_xpath)),
            map(extract_text, dom.xpath(content_xpath))
        ):
            results.append({'url': url, 'title': title, 'content': content})

    if not suggestion_xpath:
        return results
    for suggestion in dom.xpath(suggestion_xpath):
        results.append({'suggestion': extract_text(suggestion)})
    return results
@@ -1,97 +0,0 @@
## Yacy (Web, Images, Videos, Music, Files)
#
# @website     http://yacy.net
# @provide-api yes
#              (http://www.yacy-websuche.de/wiki/index.php/Dev:APIyacysearch)
#
# @using-api   yes
# @results     JSON
# @stable      yes
# @parse       (general) url, title, content, publishedDate
# @parse       (images) url, title, img_src
#
# @todo        parse video, audio and file results

from json import loads
from urllib import urlencode
from dateutil import parser

# engine dependent config
categories = ['general', 'images']  # TODO , 'music', 'videos', 'files'
paging = True
language_support = True
number_of_results = 5

# search-url
base_url = 'http://localhost:8090'
search_url = '/yacysearch.json?{query}'\
    '&startRecord={offset}'\
    '&maximumRecords={limit}'\
    '&contentdom={search_type}'\
    '&resource=global'

# yacy specific type-definitions
search_types = {'general': 'text',
                'images': 'image',
                'files': 'app',
                'music': 'audio',
                'videos': 'video'}


# do search-request
def request(query, params):
    offset = (params['pageno'] - 1) * number_of_results
    search_type = search_types.get(params.get('category'), '0')

    params['url'] = base_url +\
        search_url.format(query=urlencode({'query': query}),
                          offset=offset,
                          limit=number_of_results,
                          search_type=search_type)

    # add language tag if specified
    if params['language'] != 'all':
        params['url'] += '&lr=lang_' + params['language'].split('_')[0]

    return params
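
# (illustrative, not from the original file) for query 'test', page 1 and the
# 'general' category the request above builds:
# http://localhost:8090/yacysearch.json?query=test&startRecord=0&maximumRecords=5&contentdom=text&resource=global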


# get response from search-request
def response(resp):
    results = []

    raw_search_results = loads(resp.text)

    # return empty array if there are no results
    if not raw_search_results:
        return []

    search_results = raw_search_results.get('channels', [])

    if len(search_results) == 0:
        return []

    for result in search_results[0].get('items', []):
        # parse image results
        if result.get('image'):
            # append result
            results.append({'url': result['url'],
                            'title': result['title'],
                            'content': '',
                            'img_src': result['image'],
                            'template': 'images.html'})

        # parse general results
        else:
            publishedDate = parser.parse(result['pubDate'])

            # append result
            results.append({'url': result['link'],
                            'title': result['title'],
                            'content': result['description'],
                            'publishedDate': publishedDate})

        # TODO parse video, audio and file results

    # return results
    return results
@@ -1,103 +0,0 @@
## Yahoo (Web)
#
# @website     https://search.yahoo.com/web
# @provide-api yes (https://developer.yahoo.com/boss/search/),
#              $0.80/1000 queries
#
# @using-api   no (because pricing)
# @results     HTML (using search portal)
# @stable      no (HTML can change)
# @parse       url, title, content, suggestion

from urllib import urlencode
from urlparse import unquote
from lxml import html
from searx.engines.xpath import extract_text, extract_url

# engine dependent config
categories = ['general']
paging = True
language_support = True

# search-url
base_url = 'https://search.yahoo.com/'
search_url = 'search?{query}&b={offset}&fl=1&vl=lang_{lang}'

# specific xpath variables
results_xpath = '//div[@class="res"]'
url_xpath = './/h3/a/@href'
title_xpath = './/h3/a'
content_xpath = './/div[@class="abstr"]'
suggestion_xpath = '//div[@id="satat"]//a'


# remove yahoo-specific tracking-url
def parse_url(url_string):
    endings = ['/RS', '/RK']
    endpositions = []
    start = url_string.find('http', url_string.find('/RU=') + 1)

    for ending in endings:
        endpos = url_string.rfind(ending)
        if endpos > -1:
            endpositions.append(endpos)

    if start == 0 or len(endpositions) == 0:
        return url_string
    else:
        end = min(endpositions)
        return unquote(url_string[start:end])
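
# (illustrative, hypothetical input) for a tracking wrapper of the shape
# 'http://r.search.yahoo.com/_ylt=X/RU=http%3a%2f%2fexample.org%2f/RK=0/RS=Y'
# the slice between '/RU=' and the earlier of '/RK' or '/RS' is unquoted,
# giving 'http://example.org/'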


# do search-request
def request(query, params):
    offset = (params['pageno'] - 1) * 10 + 1

    if params['language'] == 'all':
        language = 'en'
    else:
        language = params['language'].split('_')[0]

    params['url'] = base_url + search_url.format(offset=offset,
                                                 query=urlencode({'p': query}),
                                                 lang=language)

    # TODO required?
    params['cookies']['sB'] = 'fl=1&vl=lang_{lang}&sh=1&rw=new&v=1'\
        .format(lang=language)

    return params


# get response from search-request
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath(results_xpath):
        try:
            url = parse_url(extract_url(result.xpath(url_xpath), search_url))
            title = extract_text(result.xpath(title_xpath)[0])
        except:
            continue

        content = extract_text(result.xpath(content_xpath)[0])

        # append result
        results.append({'url': url,
                        'title': title,
                        'content': content})

    # if no suggestion found, return results
    if not dom.xpath(suggestion_xpath):
        return results

    # parse suggestion
    for suggestion in dom.xpath(suggestion_xpath):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    # return results
    return results
@@ -1,93 +0,0 @@
# Yahoo (News)
#
# @website     https://news.yahoo.com
# @provide-api yes (https://developer.yahoo.com/boss/search/)
#              $0.80/1000 queries
#
# @using-api   no (because pricing)
# @results     HTML (using search portal)
# @stable      no (HTML can change)
# @parse       url, title, content, publishedDate

from urllib import urlencode
from lxml import html
from searx.engines.xpath import extract_text, extract_url
from searx.engines.yahoo import parse_url
from datetime import datetime, timedelta
import re
from dateutil import parser

# engine dependent config
categories = ['news']
paging = True
language_support = True

# search-url
search_url = 'https://news.search.yahoo.com/search?{query}&b={offset}&fl=1&vl=lang_{lang}'  # noqa

# specific xpath variables
results_xpath = '//div[@class="res"]'
url_xpath = './/h3/a/@href'
title_xpath = './/h3/a'
content_xpath = './/div[@class="abstr"]'
publishedDate_xpath = './/span[@class="timestamp"]'
suggestion_xpath = '//div[@id="satat"]//a'


# do search-request
def request(query, params):
    offset = (params['pageno'] - 1) * 10 + 1

    if params['language'] == 'all':
        language = 'en'
    else:
        language = params['language'].split('_')[0]

    params['url'] = search_url.format(offset=offset,
                                      query=urlencode({'p': query}),
                                      lang=language)

    # TODO required?
    params['cookies']['sB'] = 'fl=1&vl=lang_{lang}&sh=1&rw=new&v=1'\
        .format(lang=language)
    return params


# get response from search-request
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath(results_xpath):
        url = parse_url(extract_url(result.xpath(url_xpath), search_url))
        title = extract_text(result.xpath(title_xpath)[0])
        content = extract_text(result.xpath(content_xpath)[0])

        # parse publishedDate
        publishedDate = extract_text(result.xpath(publishedDate_xpath)[0])

        if re.match("^[0-9]+ minute(s|) ago$", publishedDate):
            publishedDate = datetime.now() - timedelta(minutes=int(re.match(r'\d+', publishedDate).group()))  # noqa
        else:
            if re.match("^[0-9]+ hour(s|), [0-9]+ minute(s|) ago$",
                        publishedDate):
                timeNumbers = re.findall(r'\d+', publishedDate)
                publishedDate = datetime.now()\
                    - timedelta(hours=int(timeNumbers[0]))\
                    - timedelta(minutes=int(timeNumbers[1]))
            else:
                publishedDate = parser.parse(publishedDate)

        if publishedDate.year == 1900:
            publishedDate = publishedDate.replace(year=datetime.now().year)
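        # (illustrative) '3 minutes ago' becomes now - 3 minutes and
        # '2 hours, 5 minutes ago' becomes now - 2h05m; anything else is
        # handed to dateutil's parser, whose missing-year default of 1900
        # is replaced by the current year above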

        # append result
        results.append({'url': url,
                        'title': title,
                        'content': content,
                        'publishedDate': publishedDate})

    # return results
    return results
@@ -1,93 +0,0 @@
## Youtube (Videos)
#
# @website     https://www.youtube.com/
# @provide-api yes (http://gdata-samples-youtube-search-py.appspot.com/)
#
# @using-api   yes
# @results     JSON
# @stable      yes
# @parse       url, title, content, publishedDate, thumbnail, embedded

from json import loads
from urllib import urlencode
from dateutil import parser

# engine dependent config
categories = ['videos', 'music']
paging = True
language_support = True

# search-url
base_url = 'https://gdata.youtube.com/feeds/api/videos'
search_url = base_url + '?alt=json&{query}&start-index={index}&max-results=5'

embedded_url = '<iframe width="540" height="304" ' +\
    'data-src="//www.youtube-nocookie.com/embed/{videoid}" ' +\
    'frameborder="0" allowfullscreen></iframe>'


# do search-request
def request(query, params):
    index = (params['pageno'] - 1) * 5 + 1

    params['url'] = search_url.format(query=urlencode({'q': query}),
                                      index=index)

    # add language tag if specified
    if params['language'] != 'all':
        params['url'] += '&lr=' + params['language'].split('_')[0]

    return params


# get response from search-request
def response(resp):
    results = []

    search_results = loads(resp.text)

    # return empty array if there are no results
    if 'feed' not in search_results:
        return []

    feed = search_results['feed']

    # parse results
    for result in feed['entry']:
        url = [x['href'] for x in result['link'] if x['type'] == 'text/html']

        if not url:
            continue

        # remove tracking
        url = url[0].replace('feature=youtube_gdata', '')
        if url.endswith('&'):
            url = url[:-1]

        videoid = url[32:]
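        # (assumption, not stated in the original file) links of the form
        # 'https://www.youtube.com/watch?v=<id>' have 32 characters before
        # the id, which is what the slice above relies on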

        title = result['title']['$t']
        content = ''
        thumbnail = ''

        pubdate = result['published']['$t']
        publishedDate = parser.parse(pubdate)

        if 'media$thumbnail' in result['media$group']:
            thumbnail = result['media$group']['media$thumbnail'][0]['url']

        content = result['content']['$t']

        embedded = embedded_url.format(videoid=videoid)

        # append result
        results.append({'url': url,
                        'title': title,
                        'content': content,
                        'template': 'videos.html',
                        'publishedDate': publishedDate,
                        'embedded': embedded,
                        'thumbnail': thumbnail})

    # return results
    return results
@@ -1,209 +0,0 @@
'''
searx is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

searx is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with searx. If not, see < http://www.gnu.org/licenses/ >.

(C) 2013- by Adam Tauber, <asciimoo@gmail.com>
'''

import re
from urlparse import urlparse
from lxml import etree
from os import listdir
from os.path import isfile, isdir, join
from searx import logger


logger = logger.getChild("https_rewrite")

# https://gitweb.torproject.org/\
# pde/https-everywhere.git/tree/4.0:/src/chrome/content/rules

# HTTPS rewrite rules
https_rules = []


# load single ruleset from an xml file
def load_single_https_ruleset(filepath):
    ruleset = ()

    # init parser
    parser = etree.XMLParser()

    # load and parse xml-file
    try:
        tree = etree.parse(filepath, parser)
    except:
        # TODO, error message
        return ()

    # get root node
    root = tree.getroot()

    # check if root is a node with the name ruleset
    # TODO improve parsing
    if root.tag != 'ruleset':
        return ()

    # check if rule is deactivated by default
    if root.attrib.get('default_off'):
        return ()

    # check if the rule only works for specific platforms
    if root.attrib.get('platform'):
        return ()

    hosts = []
    rules = []
    exclusions = []

    # parse children from ruleset
    for ruleset in root:
        # this child defines a target
        if ruleset.tag == 'target':
            # check if required tags available
            if not ruleset.attrib.get('host'):
                continue

            # convert host-rule to valid regex
            host = ruleset.attrib.get('host')\
                .replace('.', '\.').replace('*', '.*')

            # append to host list
            hosts.append(host)

        # this child defines a rule
        elif ruleset.tag == 'rule':
            # check if required tags available
            if not ruleset.attrib.get('from')\
                    or not ruleset.attrib.get('to'):
                continue

            # TODO hack, which converts a javascript regex group
            # into a valid python regex group
            rule_from = ruleset.attrib['from'].replace('$', '\\')
            if rule_from.endswith('\\'):
                rule_from = rule_from[:-1]+'$'
            rule_to = ruleset.attrib['to'].replace('$', '\\')
            if rule_to.endswith('\\'):
                rule_to = rule_to[:-1]+'$'
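            # (illustrative, not from the original file) a ruleset value such
            # as 'https://$1bing.com/' becomes 'https://\1bing.com/', the
            # back-reference syntax that re.sub() expects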

            # TODO, not working yet because of the hack above,
            # currently doing that in webapp.py
            # rule_from_rgx = re.compile(rule_from, re.I)

            # append rule
            try:
                rules.append((re.compile(rule_from, re.I | re.U), rule_to))
            except:
                # TODO log regex error
                continue

        # this child defines an exclusion
        elif ruleset.tag == 'exclusion':
            # check if required tags available
            if not ruleset.attrib.get('pattern'):
                continue

            exclusion_rgx = re.compile(ruleset.attrib.get('pattern'))

            # append exclusion
            exclusions.append(exclusion_rgx)

    # convert list of possible hosts to a simple regex
    # TODO compress regex to improve performance
    try:
        target_hosts = re.compile('^(' + '|'.join(hosts) + ')', re.I | re.U)
    except:
        return ()

    # return ruleset
    return (target_hosts, rules, exclusions)


# load all https rewrite rules
def load_https_rules(rules_path):
    # check if directory exists
    if not isdir(rules_path):
        logger.error("directory not found: '" + rules_path + "'")
        return

    # search all xml files which are stored in the https rule directory
    xml_files = [join(rules_path, f)
                 for f in listdir(rules_path)
                 if isfile(join(rules_path, f)) and f[-4:] == '.xml']

    # load xml-files
    for ruleset_file in xml_files:
        # calculate rewrite-rules
        ruleset = load_single_https_ruleset(ruleset_file)

        # skip if no ruleset returned
        if not ruleset:
            continue

        # append ruleset
        https_rules.append(ruleset)

    logger.info('{n} rules loaded'.format(n=len(https_rules)))


def https_url_rewrite(result):
    skip_https_rewrite = False
    # check if HTTPS rewrite is possible
    for target, rules, exclusions in https_rules:

        # check if the target regex matches the url
        if target.match(result['parsed_url'].netloc):
            # process exclusions
            for exclusion in exclusions:
                # check if an exclusion matches the url
                if exclusion.match(result['url']):
                    skip_https_rewrite = True
                    break

            # skip https rewrite if required
            if skip_https_rewrite:
                break

            # process rules
            for rule in rules:
                try:
                    new_result_url = rule[0].sub(rule[1], result['url'])
                except:
                    break

                # parse new url
                new_parsed_url = urlparse(new_result_url)

                # continue if nothing was rewritten
                if result['url'] == new_result_url:
                    continue

                # get domainname from result
                # TODO, only works correctly with TLDs like
                # asdf.com, not for asdf.com.de
                # TODO, using publicsuffix instead of this rewrite rule
                old_result_domainname = '.'.join(
                    result['parsed_url'].hostname.split('.')[-2:])
                new_result_domainname = '.'.join(
                    new_parsed_url.hostname.split('.')[-2:])

                # check if rewritten hostname is the same,
                # to protect against wrong or malicious rewrite rules
                if old_result_domainname == new_result_domainname:
                    # set new url
                    result['url'] = new_result_url

            # target has matched, do not search over the other rules
            break
    return result
@@ -1,17 +0,0 @@
<!--
    This directory contains web site rewriting rules for the
    HTTPS Everywhere software, available from
    https://www.eff.org/https-everywhere

    These rules were contributed to the project by users and aim to
    enable routine secure access to as many different web sites as
    possible. They are automatically installed together with the
    HTTPS Everywhere software. The presence of these rules does not
    mean that an HTTPS Everywhere user accessed, or intended to
    access, any particular web site.

    For information about how to create additional HTTPS Everywhere
    rewriting rules to add support for new sites, please see

    https://www.eff.org/https-everywhere/rulesets
-->
@@ -1,56 +0,0 @@
<!--
    For other Microsoft coverage, see Microsoft.xml.


    CDN buckets:

        - a134.lm.akamai.net

        - akam.bing.com
        - *.mm.bing.net


    Nonfunctional domains:

        - m2.cn.bing.com
        - origin.bj1.bing.com
        - blogs.bing.com


    Fully covered domains:

        - bing.com subdomains:

            - (www.)
            - c.bing (tracking beacons)
            - cn.bing
            - h.bing
            - ssl
            - testfamilysafety.bing
            - udc.bing
            - (www.)bing

        - *.mm.bing.net
        - api.bing.com

-->
<ruleset name="Bing">

    <target host="bing.com" />
    <target host="*.bing.com" />
    <target host="*.mm.bing.net" />


    <securecookie host=".*\.bing\.com$" name=".+" />


    <rule from="^http://((?:c|cn|h|ssl|testfamilysafety|udc|www)\.)?bing\.com/"
          to="https://$1bing.com/" />

    <rule from="^http://([^/:@]*)\.mm\.bing\.net/"
          to="https://$1.mm.bing.com/"/>

    <rule from="^http://([^/:@]*)\.api\.bing\.net/"
          to="https://$1.api.bing.com/"/>

</ruleset>
@@ -1,69 +0,0 @@
<!--
    Nonfunctional domains:

        - blog.dailymotion.com
        - press.dailymotion.com (shows steaw.com, CN: www.steaw.com)
        - proxy-46.dailymotion.com
        - publicite.dailymotion.com
        - publisher.dailymotion.com (reset)
        - vid.ak.dmcdn.net (403, Akamai)
        - vid2.ak.dmcdn.net (504, akamai)


    Problematic domains:

        - ak2.static.dailymotion.com (mismatched, CN: *.dmcdn.net)
        - support.dmcloud.net (mismatched, CN: *.zendesk.com)


    Partially covered domains:

        - (www.)dailymotion.com

            - cdn/manifest/video/\w+.mnft 403s
            - crossdomain.xml breaks videos

-->
<ruleset name="Dailymotion (default off)" default_off="breaks some embedded videos">

    <target host="dailymotion.com" />
    <!--
        * for cross-domain cookie.
    -->
    <target host="*.dailymotion.com" />
    <!--
        https://mail1.eff.org/pipermail/https-everywhere-rules/2012-July/001241.html
    -->
    <exclusion pattern="^http://(?:www\.)?dailymotion\.com/(?:cdn/[\w-]+/video/|crossdomain\.xml$)" />
    <target host="ak2.static.dailymotion.com" />
    <target host="*.dmcdn.net" />
    <target host="dmcloud.net" />
    <target host="*.dmcloud.net" />


    <!-- Testing wrt embedded breakage.

    securecookie host="^.*\.dailymotion\.com$" name=".+" /-->
    <!--
        Omniture tracking cookies:
    -->
    <securecookie host="^\.dailymotion\.com$" name="^s_\w+$" />
    <securecookie host="^www\.dailymotion\.com$" name=".+" />


    <rule from="^http://(erroracct\.|www\.)?dailymotion\.com/"
          to="https://$1dailymotion.com/" />

    <rule from="^http://(s\d|static(?:\d|s\d-ssl))\.dmcdn\.net/"
          to="https://$1.dmcdn.net/" />

    <rule from="^https?://ak2\.static\.dailymotion\.com/"
          to="https://static1-ssl.dmcdn.net/" />

    <rule from="^http://(s\.|www\.)?dmcloud\.net/"
          to="https://$1dmcloud.net/" />

    <rule from="^https?://support\.dmcloud\.net/"
          to="https://dmcloud.zendesk.com/" />

</ruleset>
@@ -1,53 +0,0 @@
<!--
    For problematic rules, see Deviantart-mismatches.xml.


    Other deviantArt rulesets:

        - Sta.sh.xml


    ToDo: Find edgecast URL for /(fc|th)\d+.


    Mixed content:

        - Images on *.....com from e.deviantart.net *

    * Secured by us

-->
<ruleset name="DeviantArt (pending)" default_off="site operator says not ready yet">

    <target host="deviantart.com" />
    <target host="*.deviantart.com" />
    <target host="deviantart.net" />
    <target host="*.deviantart.net" />


    <!-- Not secured by server:
    -->
    <!--securecookie host="^\.deviantart\.com$" name="^userinfo$" /-->

    <securecookie host="^\.deviantart\.com$" name=".*" />


    <!-- Redirects from com to net, but does so successfully by itself.
    -->
    <rule from="^http://([aei]|fc\d\d|s[ht]|th\d\d)\.deviantart\.(com|net)/"
          to="https://$1.deviantart.$2/" />

    <!-- This handles everything that isn't in the first rule.
         Namely, usernames, backend, fc, th, and (www.).
         These domains present a cert that is only
         valid for .com.
         Note that .net isn't used on DA, but .net does
         redirect to .com, and we shouldn't break what would
         otherwise work.
         Mustn't rewrite from https here, as doing so
         would conflict with the first rule.
    -->
    <rule from="^http://([^/:@\.]+\.)?deviantart\.(?:com|net)/"
          to="https://$1deviantart.com/" />

</ruleset>
@@ -1,38 +0,0 @@
<!--
    Problematic domains:

        - www.dukgo.com (mismatched, CN: dukgo.com)


    Fully covered domains:

        - (www.)dukgo.com (www → ^)

-->
<ruleset name="DuckDuckGo">
    <target host="duckduckgo.com" />
    <target host="*.duckduckgo.com" />
    <target host="ddg.gg" />
    <target host="duck.co" />
    <target host="i.duck.co" />
    <target host="dukgo.com" />
    <target host="www.dukgo.com" />

    <exclusion pattern="^http://(help|meme)\.duckduckgo\.com/" />

    <securecookie host="^duck\.co$" name=".*"/>

    <rule from="^http://duckduckgo\.com/" to="https://duckduckgo.com/"/>
    <rule from="^http://([^/:@\.]+)\.duckduckgo\.com/" to="https://$1.duckduckgo.com/"/>
    <!-- TODO: What does ddg.gg/foo do? Runs query foo, redirects to homepage, or error? -->
    <rule from="^http://ddg\.gg/$" to="https://duckduckgo.com/" />

    <rule from="^http://duck\.co/" to="https://duck.co/" />

    <rule from="^http://i\.duck\.co/"
          to="https://duckduckgo.com/"/>

    <rule from="^http://(?:www\.)?dukgo\.com/"
          to="https://dukgo.com/" />

</ruleset>
@@ -1,44 +0,0 @@
<!--
    For other Yahoo coverage, see Yahoo.xml.


    These altnames don't exist:

        - www.blog.flickr.net
        - www.code.flickr.net

-->
<ruleset name="Flickr">

    <target host="flic.kr" />
    <target host="*.flic.kr" />
    <target host="flickr.com" />
    <target host="*.flickr.com" />
    <target host="*.flickr.net" />
    <target host="*.staticflickr.com" />


    <!-- Not secured by server:
    -->
    <!--securecookie host="^\.flic\.kr$" name="^BX$" /-->

    <securecookie host="^\.flic\.kr$" name=".+" />
    <securecookie host=".*\.flickr\.com$" name=".+" />


    <rule from="^http://flic\.kr/"
          to="https://flic.kr/" />

    <rule from="^http://(api\.|www\.)?flickr\.com/"
          to="https://$1flickr.com/" />

    <rule from="^http://s(ecure|tatic)\.flickr\.com/"
          to="https://s$1.flickr.com/" />

    <rule from="^http://(c2|farm\d+)\.static(\.)?flickr\.com/"
          to="https://$1.static$2flickr.com/" />

    <rule from="^http://(blog|code)\.flickr\.net/"
          to="https://$1.flickr.net/" />

</ruleset>
@@ -1,11 +0,0 @@
<!--
    For other GitHub coverage, see Github.xml.
-->
<ruleset name="GitHub Pages">

    <target host="*.github.io" />

    <rule from="^http://([^/@:\.]+)\.github\.io/"
          to="https://$1.github.io/" />

</ruleset>
@@ -1,94 +0,0 @@
<!--
    Other GitHub rulesets:

        - Github-Pages.xml
        - Guag.es.xml
        - Speaker_Deck.com.xml


    CDN buckets:

        - github-images.s3.amazonaws.com
        - github.global.ssl.fastly.net
        - a248.e.akamai.net/assets.github.com/
        - a248.e.akamai.net/camo.github.com/
        - s3.amazonaws.com/github/ | d24z2fz21y4fag.cloudfront.net
        - github.myshopify.com


    Fully covered domains:

        - github.com subdomains:

            - (www.)
            - assets\d+
            - assets-cdn
            - bounty
            - cloud
            - f.cloud
            - codeload
            - developer
            - eclipse
            - enterprise
            - gist
            - gist-assets
            - help
            - identicons
            - jobs
            - mac
            - mobile
            - nodeload
            - octodex
            - pages
            - raw
            - rg3
            - shop
            - status
            - support
            - training
            - try
            - wiki
            - windows

        - collector.githubapp.com

        - githubusercontent.com

-->
<ruleset name="GitHub">

    <target host="github.com" />
    <target host="*.github.com" />
    <target host="github.io" />
    <target host="*.githubusercontent.com" />
    <target host="collector.githubapp.com" />


    <!-- Secured by server:
    -->
    <!--securecookie host="^github\.com$" name="^(_gh_sess|tz|user_session)$" /-->
    <!--securecookie host="^\.github\.com$" name="^(dotcom_user|logged_in)$" /-->
    <!--securecookie host="^enterprise\.github\.com$" name="^(_enterprise_web|request_method)$" /-->
    <!--securecookie host="^gist\.github\.com$" name="^_gist_session$" /-->
    <!--securecookie host="^help\.github\.com$" name="^_help_session$" /-->
    <!--
        Not secured by server:
    -->
    <!--securecookie host="^status\.github\.com$" name="^rack\.session$" /-->

    <securecookie host="^(?:.*\.)?github\.com$" name=".+" />


    <rule from="^http://((?:assets\d+|assets-cdn|bounty|cloud|f\.cloud|codeload|developer|eclipse|enterprise|gist|gist-assets|help|identicons|jobs|mac|mobile|nodeload|octodex|pages|raw|rg3|shop|status|support|training|try|wiki|windows|www)\.)?github\.com/"
          to="https://$1github.com/" />

    <rule from="^http://collector\.githubapp\.com/"
          to="https://collector.githubapp.com/" />

    <rule from="^https?://github\.io/"
          to="https://pages.github.com/" />

    <rule from="^http://([^/@:\.]+)\.githubusercontent\.com/"
          to="https://$1.githubusercontent.com/" />

</ruleset>
@ -1,26 +0,0 @@
|
||||||
<!--
|
|
||||||
|
|
||||||
Problematic domains:
|
|
||||||
|
|
||||||
- (www.)apture.com (works, mismatched, CN: *.google.com)
|
|
||||||
|
|
||||||
-->
|
|
||||||
<ruleset name="Google (mismatches)" default_off="mismatches">
|
|
||||||
|
|
||||||
<!-- Akamai -->
|
|
||||||
<target host="js.admeld.com"/>
|
|
||||||
<target host="apture.com" />
|
|
||||||
<target host="www.apture.com" />
|
|
||||||
<target host="googleartproject.com"/>
|
|
||||||
<target host="www.googleartproject.com"/>
|
|
||||||
|
|
||||||
<rule from="^http://js\.admeld\.com/"
|
|
||||||
to="https://js.admeld.com/"/>
|
|
||||||
|
|
||||||
<rule from="^https?://(?:www\.)?apture\.com/"
|
|
||||||
to="https://apture.com/" />
|
|
||||||
|
|
||||||
<rule from="^http://(?:www\.)?googleartproject\.com/"
|
|
||||||
to="https://www.googleartproject.com/"/>
|
|
||||||
|
|
||||||
</ruleset>
|
|
|
@ -1,14 +0,0 @@
|
||||||
<!--
|
|
||||||
For other Google coverage, see GoogleServices.xml.
|
|
||||||
|
|
||||||
-->
|
|
||||||
<ruleset name="Google.org">
|
|
||||||
|
|
||||||
<target host="google.org" />
|
|
||||||
<target host="www.google.org" />
|
|
||||||
|
|
||||||
|
|
||||||
<rule from="^http://(www\.)?google\.org/"
|
|
||||||
to="https://$1google.org/" />
|
|
||||||
|
|
||||||
</ruleset>
|
|
|
@ -1,143 +0,0 @@
|
||||||
<!--
|
|
||||||
For other Google coverage, see GoogleServices.xml.
|
|
||||||
|
|
||||||
|
|
||||||
Nonfunctional domains:
|
|
||||||
|
|
||||||
- hosted.gmodules.com *
|
|
||||||
- img0.gmodules.com *
|
|
||||||
- p.gmodules.com *
|
|
||||||
|
|
||||||
* 404; mismatched, CN: *.googleusercontent.com
|
|
||||||
|
|
||||||
|
|
||||||
Problematic domains:
|
|
||||||
|
|
||||||
- gmodules.com (503, CN: www.google.com)
|
|
||||||
- www.gmodules.com (503, CN: *.googleusercontent.com)
|
|
||||||
- gstatic.com (404, valid cert)
|
|
||||||
- api.recaptcha.net (works; mismatched, CN: google.com)
|
|
||||||
|
|
||||||
|
|
||||||
Partially covered domains:
|
|
||||||
|
|
||||||
- (www.)gmodules.com (→ www.google.com)
|
|
||||||
- (www.)google.com
|
|
||||||
- chart.apis.google.com (→ chart.googleapis.com)
|
|
||||||
|
|
||||||
|
|
||||||
Fully covered domains:
|
|
||||||
|
|
||||||
- api.google.com
|
|
||||||
|
|
||||||
- *.clients.google.com:
|
|
||||||
|
|
||||||
- linkhelp
|
|
||||||
|
|
||||||
- ssl.google-analytics.com
|
|
||||||
- www.google-analytics.com
|
|
||||||
|
|
||||||
- googleapis.com subdomains:
|
|
||||||
|
|
||||||
- ajax
|
|
||||||
- chart
|
|
||||||
- *.commondatastorage
|
|
||||||
- fonts
|
|
||||||
- *.storage
|
|
||||||
- www
|
|
||||||
|
|
||||||
- gstatic.com subdomains:
|
|
||||||
|
|
||||||
- (www.) (^ → www)
|
|
||||||
- csi
|
|
||||||
- encrypted-tbn\d
|
|
||||||
- g0
|
|
||||||
- *.metric
|
|
||||||
- ssl
|
|
||||||
- t\d
|
|
||||||
|
|
||||||
- api.recaptcha.net (→ www.google.com)
|
|
||||||
- api-secure.recaptcha.net
|
|
||||||
- gdata.youtube.com
|
|
||||||
|
|
||||||
|
|
||||||
ssl.google-analytics.com/ga.js sets __utm\w wildcard
|
|
||||||
cookies on whichever domain it is loaded from.
|
|
||||||
|
|
||||||
-->
|
|
||||||
<ruleset name="Google APIs">
|
|
||||||
|
|
||||||
<target host="gmodules.com" />
|
|
||||||
<target host="www.gmodules.com" />
|
|
||||||
<target host="google.com" />
|
|
||||||
<target host="apis.google.com" />
|
|
||||||
<target host="*.apis.google.com" />
|
|
||||||
<target host="*.clients.google.com" />
|
|
||||||
<target host="www.google.com" />
|
|
||||||
<target host="*.google-analytics.com" />
|
|
||||||
<target host="*.googleapis.com" />
|
|
||||||
<target host="gstatic.com" />
|
|
||||||
<target host="*.gstatic.com" />
|
|
||||||
<!-- Captive portal detection redirects to this URL, and many captive
|
|
||||||
portals break TLS, so exempt this redirect URL.
|
|
||||||
See GitHub bug #368
|
|
||||||
-->
|
|
||||||
<exclusion pattern="^http://www\.gstatic\.com/generate_204" />
|
|
||||||
<target host="*.recaptcha.net" />
|
|
||||||
<target host="gdata.youtube.com" />
|
|
||||||
<exclusion pattern="^http://gdata\.youtube\.com/crossdomain\.xml" />
|
|
||||||
|
|
||||||
|
|
||||||
<securecookie host="^ssl\.google-analytics\.com$" name=".+" />
|
|
||||||
|
|
||||||
|
|
||||||
<rule from="^http://(?:www\.)?gmodules\.com/ig/images/"
|
|
||||||
to="https://www.google.com/ig/images/" />
|
|
||||||
|
|
||||||
<!-- jsapi was causing problems on some sites that embed google maps:
|
|
||||||
https://trac.torproject.org/projects/tor/ticket/2335
|
|
||||||
Apparently now fixed; thanks, Google!
|
|
||||||
-->
|
|
||||||
<rule from="^http://(?:www\.)?google\.com/(afsonline/|chart|jsapi|recaptcha/|uds)"
|
|
||||||
to="https://www.google.com/$1" />
|
|
||||||
|
|
||||||
<rule from="^http://(api|[\w-]+\.client)s\.google\.com/"
|
|
||||||
to="https://$1s.google.com/" />
|
|
||||||
|
|
||||||
<rule from="^http://chart\.apis\.google\.com/chart"
|
|
||||||
to="https://chart.googleapis.com/chart" />
|
|
||||||
|
|
||||||
<rule from="^http://(ssl|www)\.google-analytics\.com/"
|
|
||||||
to="https://$1.google-analytics.com/" />
|
|
||||||
|
|
||||||
<rule from="^http://(ajax|chart|fonts|www)\.googleapis\.com/"
|
|
||||||
to="https://$1.googleapis.com/" />
|
|
||||||
|
|
||||||
<rule from="^http://([^@:\./]+\.)?(commondata)?storage\.googleapis\.com/"
|
|
||||||
to="https://$1$2storage.googleapis.com/" />
|
|
||||||
|
|
||||||
<!-- There is an interesting question about whether we should
|
|
||||||
append &strip=1 to all cache URLs. This causes them to load
|
|
||||||
without images and styles, which is more secure but can look
|
|
||||||
worse.
|
|
||||||
Without &strip=1, the images and styles from the cached
|
|
||||||
pages still load from the original, typically unencrypted, page.
|
|
||||||
With &strip=1, the cached page will be text-only and
|
|
||||||
will come exclusively from Google's HTTPS server.
|
|
||||||
-->
|
|
||||||
<rule from="^http://(?:www\.)?gstatic\.com/"
|
|
||||||
to="https://www.gstatic.com/" />
|
|
||||||
|
|
||||||
<rule from="^http://(csi|encrypted-tbn\d|g0|[\w-]+\.metric|ssl|t\d)\.gstatic\.com/"
|
|
||||||
to="https://$1.gstatic.com/" />
|
|
||||||
|
|
||||||
<rule from="^http://api\.recaptcha\.net/"
|
|
||||||
to="https://www.google.com/recaptcha/api/" />
|
|
||||||
|
|
||||||
<rule from="^http://api-secure\.recaptcha\.net/"
|
|
||||||
to="https://api-secure.recaptcha.net/" />
|
|
||||||
|
|
||||||
<rule from="^http://gdata\.youtube\.com/"
|
|
||||||
to="https://gdata.youtube.com/" />
|
|
||||||
|
|
||||||
</ruleset>
|
|
|
@ -1,6 +0,0 @@
|
||||||
<ruleset name="GoogleCanada">
|
|
||||||
<target host="google.ca" />
|
|
||||||
<target host="*.google.ca" />
|
|
||||||
<rule from="^http://([^/:@\.]+)\.google\.ca/finance" to="https://$1.google.ca/finance"/>
|
|
||||||
</ruleset>
|
|
||||||
|
|
|
@ -1,65 +0,0 @@
|
||||||
<!--
|
|
||||||
For other Google coverage, see GoogleServices.xml.
|
|
||||||
|
|
||||||
|
|
||||||
Problematic domains:
|
|
||||||
|
|
||||||
- www.google.bo *
|
|
||||||
- www.google.co *
|
|
||||||
- www.google.ec *
|
|
||||||
- www.google.in *
|
|
||||||
- www.google.kr *
|
|
||||||
- www.google.com.kz **
|
|
||||||
- www.google.com.lk *
|
|
||||||
- www.google.mx **
|
|
||||||
- www.google.sg *
|
|
||||||
- www.google.sl *
|
|
||||||
- www.google.ug *
|
|
||||||
- www.google.vn *
|
|
||||||
|
|
||||||
* 404; mismatched, CN: google.com
|
|
||||||
** Works; mismatched, CN: google.com
|
|
||||||
|
|
||||||
-->
|
|
||||||
<ruleset name="Google Images">
|
|
||||||
|
|
||||||
<target host="google.*" />
|
|
||||||
<target host="www.google.*" />
|
|
||||||
<target host="google.co.*" />
|
|
||||||
<target host="www.google.co.*" />
|
|
||||||
<target host="google.com" />
|
|
||||||
<target host="images.google.com" />
|
|
||||||
<target host="google.com.*" />
|
|
||||||
<target host="www.google.com.*" />
|
|
||||||
<!--
|
|
||||||
Only handle image-related paths in this ruleset:
|
|
||||||
-->
|
|
||||||
<exclusion pattern="^http://(?:www\.)?google(?:\.com?)?\.\w{2,3}/(?!(?:advanced_image_search|imghp|.*tb(?:m=isch|s=sbi)))" />
|
|
||||||
|
|
||||||
|
|
||||||
<rule from="^http://(?:www\.)?google\.com/"
|
|
||||||
to="https://www.google.com/" />
|
|
||||||
|
|
||||||
<rule from="^http://images\.google\.com/"
|
|
||||||
to="https://images.google.com/" />
|
|
||||||
|
|
||||||
<!-- First handle problematic domains:
|
|
||||||
-->
|
|
||||||
<rule from="^http://(?:www\.)?google\.co/"
|
|
||||||
to="https://www.google.com/" />
|
|
||||||
|
|
||||||
<rule from="^http://(?:www\.)?google\.(?:co\.)?(in|kr|ug)/"
|
|
||||||
to="https://www.google.co.$1/" />
|
|
||||||
|
|
||||||
<rule from="^http://(?:www\.)?google\.(?:com\.)?(kz|lk)/"
|
|
||||||
to="https://www.google.$1/" />
|
|
||||||
|
|
||||||
<rule from="^http://(?:www\.)?google\.(?:com\.)?(bo|ec|mx|sg|sl|vn)/"
|
|
||||||
to="https://www.google.com.$1/" />
|
|
||||||
|
|
||||||
<!-- And then the rest:
|
|
||||||
-->
|
|
||||||
<rule from="^http://(?:www\.)?google\.(com?\.)?(ae|ar|at|au|bg|bh|br|ca|ch|cl|co|cr|cu|de|eg|es|fi|fr|gh|gt|hr|id|ie|il|it|jo|jp|jm|ke|kw|lb|ly|my|na|ng|nl|no|nz|om|pa|pe|pk|pl|pt|py|qa|ro|ru|rw|sa|se|sv|th|tr|uk|uy|ve|za|zw)/"
|
|
||||||
to="https://www.google.$1$2/" />
|
|
||||||
|
|
||||||
</ruleset>
|
|
|
@ -1,78 +0,0 @@
|
||||||
<ruleset name="Search www.google.com">
|
|
||||||
|
|
||||||
<!--
|
|
||||||
Enabling this ruleset should cause searches to go to
|
|
||||||
https://www.google.com rather than https://encrypted.google.com. Note that
|
|
||||||
the filename is important; it must be before GoogleSearch.xml in a bash
|
|
||||||
expansion of src/chrome/content/rules/*.xml in order to take precedence.
|
|
||||||
-->
|
|
||||||
|
|
||||||
<target host="*.google.com" />
|
|
||||||
<target host="google.com" />
|
|
||||||
<target host="www.google.com.*" />
|
|
||||||
<target host="google.com.*" />
|
|
||||||
<target host="www.google.co.*" />
|
|
||||||
<target host="google.co.*" />
|
|
||||||
<target host="www.google.*" />
|
|
||||||
<target host="google.*" />
|
|
||||||
<!-- beyond clients1 these do not currently exist in the ccTLDs,
|
|
||||||
but just in case... -->
|
|
||||||
<target host="clients1.google.com.*" />
|
|
||||||
<target host="clients2.google.com.*" />
|
|
||||||
<target host="clients3.google.com.*" />
|
|
||||||
<target host="clients4.google.com.*" />
|
|
||||||
<target host="clients5.google.com.*" />
|
|
||||||
<target host="clients6.google.com.*" />
|
|
||||||
<target host="clients1.google.co.*" />
|
|
||||||
<target host="clients2.google.co.*" />
|
|
||||||
<target host="clients3.google.co.*" />
|
|
||||||
<target host="clients4.google.co.*" />
|
|
||||||
<target host="clients5.google.co.*" />
|
|
||||||
<target host="clients6.google.co.*" />
|
|
||||||
<target host="clients1.google.*" />
|
|
||||||
<target host="clients2.google.*" />
|
|
||||||
<target host="clients3.google.*" />
|
|
||||||
<target host="clients4.google.*" />
|
|
||||||
<target host="clients5.google.*" />
|
|
||||||
<target host="clients6.google.*" />
|
|
||||||
|
|
||||||
<rule from="^http://www\.google\.com/$"
|
|
||||||
to="https://www.google.com/"/>
|
|
||||||
|
|
||||||
<!-- The most basic case. -->
|
|
||||||
|
|
||||||
<rule from="^http://(?:www\.)?google\.com/search"
|
|
||||||
to="https://www.google.com/search"/>
|
|
||||||
|
|
||||||
<!-- A very annoying exception that we seem to need for the basic case -->
|
|
||||||
|
|
||||||
<exclusion pattern="^http://(?:www\.)?google\.com/search.*tbs=shop" />
|
|
||||||
<exclusion pattern="^http://clients[0-9]\.google\.com/.*client=products.*" />
|
|
||||||
<exclusion pattern="^http://suggestqueries\.google\.com/.*client=.*" />
|
|
||||||
|
|
||||||
<!-- https://trac.torproject.org/projects/tor/ticket/9713 -->
|
|
||||||
|
|
||||||
<exclusion pattern="^http://clients[0-9]\.google\.com/ocsp" />
|
|
||||||
|
|
||||||
<!-- This is necessary for image results links from web search results -->
|
|
||||||
|
|
||||||
<exclusion pattern="^http://(?:www\.)?google\.com/search.*tbm=isch.*" />
|
|
||||||
|
|
||||||
<rule from="^http://(?:www\.)?google\.com/webhp"
|
|
||||||
to="https://www.google.com/webhp"/>
|
|
||||||
|
|
||||||
<rule from="^http://(?:www\.)?google\.com/#"
|
|
||||||
to="https://www.google.com/#"/>
|
|
||||||
|
|
||||||
<rule from="^http://(?:www\.)?google\.com/$"
|
|
||||||
to="https://www.google.com/"/>
|
|
||||||
|
|
||||||
<!-- Completion urls look like this:
|
|
||||||
|
|
||||||
http://clients2.google.co.jp/complete/search?hl=ja&client=hp&expIds=17259,24660,24729,24745&q=m&cp=1 HTTP/1.1\r\n
|
|
||||||
|
|
||||||
-->
|
|
||||||
<rule from="^http://clients[0-9]\.google\.com/complete/search"
|
|
||||||
to="https://clients1.google.com/complete/search"/>
|
|
||||||
|
|
||||||
</ruleset>
|
|
|
@ -1,67 +0,0 @@
|
||||||
<!--
|
|
||||||
Problematic domains:
|
|
||||||
|
|
||||||
- khms *
|
|
||||||
- khms[0-3] *
|
|
||||||
|
|
||||||
* $ 404s
|
|
||||||
|
|
||||||
|
|
||||||
Fully covered domains:
|
|
||||||
|
|
||||||
- google.com subdomains:
|
|
||||||
|
|
||||||
- khms
|
|
||||||
- khms[0-3]
|
|
||||||
|
|
||||||
-->
|
|
||||||
<ruleset name="Google Maps">
|
|
||||||
|
|
||||||
<target host="maps.google.*" />
|
|
||||||
<!--
|
|
||||||
https://trac.torproject.org/projects/tor/ticket/8627
|
|
||||||
-->
|
|
||||||
<exclusion pattern="^http://maps\.google\.com/local_url" />
|
|
||||||
<exclusion pattern="^http://maps\.google\.gr/transitathens" />
|
|
||||||
<target host="maps.google.co.*" />
|
|
||||||
<target host="khms.google.com" />
|
|
||||||
<target host="khms0.google.com" />
|
|
||||||
<target host="khms1.google.com" />
|
|
||||||
<target host="khms2.google.com" />
|
|
||||||
<target host="khms3.google.com" />
|
|
||||||
<target host="maps-api-ssl.google.com" />
|
|
||||||
<target host="mw2.google.com" />
|
|
||||||
<target host="maps.google.com.*" />
|
|
||||||
<target host="maps.googleapis.com" />
|
|
||||||
<!--
|
|
||||||
https://mail1.eff.org/pipermail/https-everywhere-rules/2012-September/001317.html
|
|
||||||
-->
|
|
||||||
<!--exclusion pattern="^http://maps\.googleapis\.com/map(files/lib/map_1_20\.swf|sapi/publicapi\?file=flashapi)" /-->
|
|
||||||
<exclusion pattern="^http://maps\.googleapis\.com/map(?:files/lib/map_\d+_\d+\.swf|sapi/publicapi\?file=flashapi)" />
|
|
||||||
<target host="maps.gstatic.com" />
|
|
||||||
|
|
||||||
|
|
||||||
<!--securecookie host="^maps\.google\.(com?\.)?(au|ca|gh|ie|in|jm|ke|lk|my|n[agz]|pk|rw|sl|sg|ug|uk|za|zw)$" name=".+" /-->
|
|
||||||
<securecookie host="^maps\.google\.[\w.]{2,6}$" name=".+" />
|
|
||||||
<securecookie host="^maps\.g(?:oogle|oogleapis|static)\.com$" name=".+" />
|
|
||||||
<securecookie host="^maps-api-ssl\.google\.com$" name=".+" />
|
|
||||||
|
|
||||||
|
|
||||||
<rule from="^http://maps\.google\.([^/]+)/"
|
|
||||||
to="https://maps.google.$1/" />
|
|
||||||
|
|
||||||
<!-- http://khms.../$ 404s:
|
|
||||||
-->
|
|
||||||
<rule from="^http://khms\d?\.google\.com/+\??$"
|
|
||||||
to="https://www.google.com/" />
|
|
||||||
|
|
||||||
<rule from="^http://(khms\d?|maps-api-ssl|mw2)\.google\.com/"
|
|
||||||
to="https://$1.google.com/" />
|
|
||||||
|
|
||||||
<rule from="^http://maps\.g(oogleapis|static)\.com/"
|
|
||||||
to="https://maps.g$1.com/" />
|
|
||||||
|
|
||||||
<rule from="^https://maps\.googleapis\.com/map(?=files/lib/map_\d+_\d+\.swf|sapi/publicapi\?file=flashapi)"
|
|
||||||
to="http://maps.googleapis.com/map" downgrade="1" />
|
|
||||||
|
|
||||||
</ruleset>
|
|
|
@ -1,6 +0,0 @@
|
||||||
<ruleset name="GoogleMelange">
|
|
||||||
<target host="www.google-melange.com" />
|
|
||||||
<target host="google-melange.com" />
|
|
||||||
|
|
||||||
<rule from="^http://(www\.)?google-melange\.com/" to="https://www.google-melange.com/" />
|
|
||||||
</ruleset>
|
|
|
@ -1,135 +0,0 @@
|
||||||
<ruleset name="Google Search">
|
|
||||||
|
|
||||||
<target host="google.com" />
|
|
||||||
<target host="*.google.com" />
|
|
||||||
<target host="google.com.*" />
|
|
||||||
<target host="www.google.com.*" />
|
|
||||||
<target host="google.co.*" />
|
|
||||||
<target host="www.google.co.*" />
|
|
||||||
<target host="google.*" />
|
|
||||||
<target host="www.google.*" />
|
|
||||||
<!--
|
|
||||||
Beyond clients1 these do not currently
|
|
||||||
exist in the ccTLDs, but just in case...
|
|
||||||
-->
|
|
||||||
<target host="clients1.google.com.*" />
|
|
||||||
<target host="clients2.google.com.*" />
|
|
||||||
<target host="clients3.google.com.*" />
|
|
||||||
<target host="clients4.google.com.*" />
|
|
||||||
<target host="clients5.google.com.*" />
|
|
||||||
<target host="clients6.google.com.*" />
|
|
||||||
<target host="clients1.google.co.*" />
|
|
||||||
<target host="clients2.google.co.*" />
|
|
||||||
<target host="clients3.google.co.*" />
|
|
||||||
<target host="clients4.google.co.*" />
|
|
||||||
<target host="clients5.google.co.*" />
|
|
||||||
<target host="clients6.google.co.*" />
|
|
||||||
<target host="clients1.google.*" />
|
|
||||||
<target host="clients2.google.*" />
|
|
||||||
<target host="clients3.google.*" />
|
|
||||||
<target host="clients4.google.*" />
|
|
||||||
<target host="clients5.google.*" />
|
|
||||||
<target host="clients6.google.*" />
|
|
||||||
|
|
||||||
|
|
||||||
<!-- Some Google pages can generate naive links back to the
|
|
||||||
unencrypted version of encrypted.google.com, which is
|
|
||||||
a 301 but theoretically vulnerable to SSL stripping.
|
|
||||||
-->
|
|
||||||
<rule from="^http://encrypted\.google\.com/"
|
|
||||||
to="https://encrypted.google.com/" />
|
|
||||||
|
|
||||||
<!-- The most basic case.
|
|
||||||
-->
|
|
||||||
<rule from="^http://(?:www\.)?google\.com/search"
|
|
||||||
to="https://encrypted.google.com/search" />
|
|
||||||
|
|
||||||
<!-- A very annoying exception that we
|
|
||||||
seem to need for the basic case
|
|
||||||
-->
|
|
||||||
<exclusion pattern="^http://(?:www\.)?google\.com/search.*tbs=shop" />
|
|
||||||
<exclusion pattern="^http://clients\d\.google\.com/.*client=products.*" />
|
|
||||||
<exclusion pattern="^http://suggestqueries\.google\.com/.*client=.*" />
|
|
||||||
|
|
||||||
<!-- https://trac.torproject.org/projects/tor/ticket/9713
|
|
||||||
-->
|
|
||||||
|
|
||||||
<exclusion pattern="^http://clients[0-9]\.google\.com/ocsp" />
|
|
||||||
|
|
||||||
|
|
||||||
<!-- This is necessary for image results
|
|
||||||
links from web search results
|
|
||||||
-->
|
|
||||||
<exclusion pattern="^http://(?:www\.)?google\.com/search.*tbm=isch.*" />
|
|
||||||
|
|
||||||
<rule from="^http://(?:www\.)?google\.com/about"
|
|
||||||
to="https://www.google.com/about" />
|
|
||||||
|
|
||||||
<!-- There are two distinct cases for these firefox searches -->
|
|
||||||
|
|
||||||
<rule from="^http://(?:www\.)?google(?:\.com?)?\.[a-z]{2}/firefox/?$"
|
|
||||||
to="https://encrypted.google.com/" />
|
|
||||||
|
|
||||||
<rule from="^http://(?:www\.)?google(?:\.com?)?\.[a-z]{2}/firefox"
|
|
||||||
to="https://encrypted.google.com/webhp" />
|
|
||||||
|
|
||||||
<rule from="^http://(?:www\.)?google\.com/webhp"
|
|
||||||
to="https://encrypted.google.com/webhp" />
|
|
||||||
|
|
||||||
<rule from="^http://codesearch\.google\.com/"
|
|
||||||
to="https://codesearch.google.com/" />
|
|
||||||
|
|
||||||
<rule from="^http://(?:www\.)?google\.com/codesearch"
|
|
||||||
to="https://www.google.com/codesearch" />
|
|
||||||
|
|
||||||
<rule from="^http://(?:www\.)?google\.com/#"
|
|
||||||
to="https://encrypted.google.com/#" />
|
|
||||||
|
|
||||||
<rule from="^http://(?:www\.)?google\.com/$"
|
|
||||||
to="https://encrypted.google.com/" />
|
|
||||||
|
|
||||||
<!-- Google supports IPv6 search, including
|
|
||||||
HTTPS with a valid certificate! -->
|
|
||||||
<rule from="^http://ipv6\.google\.com/"
|
|
||||||
to="https://ipv6.google.com/" />
|
|
||||||
|
|
||||||
<!-- most google international sites look like
|
|
||||||
"google.fr", some look like "google.co.jp",
|
|
||||||
and some crazy ones like "google.com.au" -->
|
|
||||||
|
|
||||||
<rule from="^http://(www\.)?google(\.com?)?\.([a-z]{2})/(search\?|#)"
|
|
||||||
to="https://$1google$2.$3/$4" />
|
|
||||||
|
|
||||||
<!-- Language preference setting -->
|
|
||||||
<rule from="^http://(www\.)?google(\.com?)?\.([a-z]{2})/setprefs"
|
|
||||||
to="https://$1google$2.$3/setprefs" />
|
|
||||||
|
|
||||||
<!-- Completion urls look like this:
|
|
||||||
|
|
||||||
http://clients2.google.co.jp/complete/search?hl=ja&client=hp&expIds=17259,24660,24729,24745&q=m&cp=1 HTTP/1.1\r\n
|
|
||||||
|
|
||||||
-->
|
|
||||||
<rule from="^http://clients\d\.google\.com/complete/search"
|
|
||||||
to="https://clients1.google.com/complete/search" />
|
|
||||||
|
|
||||||
<rule from="^http://clients\d\.google(\.com?\.[a-z]{2})/complete/search"
|
|
||||||
to="https://clients1.google.$1/complete/search" />
|
|
||||||
|
|
||||||
<rule from="^http://clients\d\.google\.([a-z]{2})/complete/search"
|
|
||||||
to="https://clients1.google.$1/complete/search" />
|
|
||||||
|
|
||||||
<rule from="^http://suggestqueries\.google\.com/complete/search"
|
|
||||||
to="https://clients1.google.com/complete/search" />
|
|
||||||
|
|
||||||
<rule from="^http://(www\.)?google\.(com?\.)?([a-z]{2})/(?:webhp)?$"
|
|
||||||
to="https://$1google.$2$3/" />
|
|
||||||
|
|
||||||
<!-- If there are URL parameters, keep them. -->
|
|
||||||
<rule from="^http://(www\.)?google\.(com?\.)?([a-z]{2})/(?:webhp)?\?"
|
|
||||||
to="https://$1google.$2$3/webhp?" />
|
|
||||||
|
|
||||||
<!-- teapot -->
|
|
||||||
<rule from="^http://(www\.)?google(\.com?)?\.([a-z]{2})/teapot"
|
|
||||||
to="https://$1google$2.$3/teapot" />
|
|
||||||
|
|
||||||
</ruleset>
|
|
|
@ -1,345 +0,0 @@
|
||||||
<!--
|
|
||||||
Other Google rulesets:
|
|
||||||
|
|
||||||
- 2mdn.net.xml
|
|
||||||
- Admeld.xml
|
|
||||||
- ChannelIntelligence.com.xml
|
|
||||||
- Doubleclick.net.xml
|
|
||||||
- FeedBurner.xml
|
|
||||||
- Google.org.xml
|
|
||||||
- GoogleAPIs.xml
|
|
||||||
- Google_App_Engine.xml
|
|
||||||
- GoogleImages.xml
|
|
||||||
- GoogleShopping.xml
|
|
||||||
- Ingress.xml
|
|
||||||
- Meebo.xml
|
|
||||||
- Orkut.xml
|
|
||||||
- Postini.xml
|
|
||||||
- WebM_Project.org.xml
|
|
||||||
|
|
||||||
|
|
||||||
Nonfunctional domains:
|
|
||||||
|
|
||||||
- feedproxy.google.com (404, valid cert)
|
|
||||||
- partnerpage.google.com *
|
|
||||||
- safebrowsing.clients.google.com (404, mismatched)
|
|
||||||
- (www.)googlesyndicatedsearch.com (404; mismatched, CN: google.com)
|
|
||||||
- buttons.googlesyndication.com *
|
|
||||||
|
|
||||||
* 404, valid cert
|
|
||||||
|
|
||||||
|
|
||||||
Nonfunctional google.com paths:
|
|
||||||
|
|
||||||
- analytics (redirects to http)
|
|
||||||
- imgres
|
|
||||||
- gadgets *
|
|
||||||
- hangouts (404)
|
|
||||||
- u/ (404)
|
|
||||||
|
|
||||||
* Redirects to http
|
|
||||||
|
|
||||||
|
|
||||||
Problematic domains:
|
|
||||||
|
|
||||||
- www.goo.gl (404; mismatched, CN: *.google.com)
|
|
||||||
|
|
||||||
- google.com subdomains:
|
|
||||||
|
|
||||||
- books (googlebooks/, images/, & intl/ 404, but works when rewritten to www)
|
|
||||||
- cbks0 ****
|
|
||||||
- earth *
|
|
||||||
- gg ($ 404s)
|
|
||||||
- knoll *
|
|
||||||
- scholar **
|
|
||||||
- trends *
|
|
||||||
|
|
||||||
- news.google.cctld **
|
|
||||||
- scholar.google.cctld **
|
|
||||||
- *-opensocial.googleusercontent.com ***
|
|
||||||
|
|
||||||
**** $ 404s
|
|
||||||
* 404, valid cert
|
|
||||||
** Redirects to http, valid cert
|
|
||||||
*** Breaks followers widget - https://trac.torproject.org/projects/tor/ticket/7294
|
|
||||||
|
|
||||||
|
|
||||||
Partially covered domains:
|
|
||||||
|
|
||||||
- google.cctld subdomains:
|
|
||||||
|
|
||||||
- scholar (→ www)
|
|
||||||
|
|
||||||
- google.com subdomains:
|
|
||||||
|
|
||||||
- (www.)
|
|
||||||
- cbks0 ($ 404s)
|
|
||||||
- gg ($ 404s)
|
|
||||||
- news (→ www)
|
|
||||||
- scholar (→ www)
|
|
||||||
|
|
||||||
- *.googleusercontent.com (*-opensocial excluded)
|
|
||||||
|
|
||||||
|
|
||||||
Fully covered domains:
|
|
||||||
|
|
||||||
- lh[3-6].ggpht.com
|
|
||||||
- (www.)goo.gl (www → ^)
|
|
||||||
|
|
||||||
- google.com subdomains:
|
|
||||||
|
|
||||||
- accounts
|
|
||||||
- adwords
|
|
||||||
- apis
|
|
||||||
- appengine
|
|
||||||
- books (→ encrypted)
|
|
||||||
- calendar
|
|
||||||
- checkout
|
|
||||||
- chrome
|
|
||||||
- clients[12]
|
|
||||||
- code
|
|
||||||
- *.corp
|
|
||||||
- developers
|
|
||||||
- dl
|
|
||||||
- docs
|
|
||||||
- docs\d
|
|
||||||
- \d.docs
|
|
||||||
- drive
|
|
||||||
- earth (→ www)
|
|
||||||
- encrypted
|
|
||||||
- encrypted-tbn[123]
|
|
||||||
- feedburner
|
|
||||||
- fiber
|
|
||||||
- finance
|
|
||||||
- glass
|
|
||||||
- groups
|
|
||||||
- health
|
|
||||||
- helpouts
|
|
||||||
- history
|
|
||||||
- hostedtalkgadget
|
|
||||||
- id
|
|
||||||
- investor
|
|
||||||
- knol
|
|
||||||
- knoll (→ knol)
|
|
||||||
- lh\d
|
|
||||||
- mail
|
|
||||||
- chatenabled.mail
|
|
||||||
- pack
|
|
||||||
- picasaweb
|
|
||||||
- pki
|
|
||||||
- play
|
|
||||||
- plus
|
|
||||||
- plusone
|
|
||||||
- productforums
|
|
||||||
- profiles
|
|
||||||
- safebrowsing-cache
|
|
||||||
- cert-test.sandbox
|
|
||||||
- plus.sandbox
|
|
||||||
- sb-ssl
|
|
||||||
- script
|
|
||||||
- security
|
|
||||||
- services
|
|
||||||
- servicessites
|
|
||||||
- sites
|
|
||||||
- spreadsheets
|
|
||||||
- spreadsheets\d
|
|
||||||
- support
|
|
||||||
- talk
|
|
||||||
- talkgadget
|
|
||||||
- tbn2 (→ encrypted-tbn2)
|
|
||||||
- tools
|
|
||||||
- trends (→ www)
|
|
||||||
|
|
||||||
- partner.googleadservices.com
|
|
||||||
- (www.)googlecode.com
|
|
||||||
- *.googlecode.com (per-project subdomains)
|
|
||||||
- googlesource.com
|
|
||||||
- *.googlesource.com
|
|
||||||
- pagead2.googlesyndication.com
|
|
||||||
- tpc.googlesyndication.com
|
|
||||||
- mail-attachment.googleusercontent.com
|
|
||||||
- webcache.googleusercontent.com
|
|
||||||
|
|
||||||
|
|
||||||
XXX: Needs more testing
|
|
||||||
|
|
||||||
-->
|
|
||||||
<ruleset name="Google Services">
|
|
||||||
|
|
||||||
<target host="*.ggpht.com" />
|
|
||||||
<target host="gmail.com" />
|
|
||||||
<target host="www.gmail.com" />
|
|
||||||
<target host="goo.gl" />
|
|
||||||
<target host="www.goo.gl" />
|
|
||||||
<target host="google.*" />
|
|
||||||
<target host="accounts.google.*" />
|
|
||||||
<target host="adwords.google.*" />
|
|
||||||
<target host="finance.google.*" />
|
|
||||||
<target host="groups.google.*" />
|
|
||||||
<target host="it.google.*" />
|
|
||||||
<target host="news.google.*" />
|
|
||||||
<exclusion pattern="^http://(?:news\.)?google\.com/(?:archivesearch|newspapers)" />
|
|
||||||
<target host="picasaweb.google.*" />
|
|
||||||
<target host="scholar.google.*" />
|
|
||||||
<target host="www.google.*" />
|
|
||||||
<target host="*.google.ca" />
|
|
||||||
<target host="google.co.*" />
|
|
||||||
<target host="accounts.google.co.*" />
|
|
||||||
<target host="adwords.google.co.*" />
|
|
||||||
<target host="finance.google.co.*" />
|
|
||||||
<target host="groups.google.co.*" />
|
|
||||||
<target host="id.google.co.*" />
|
|
||||||
<target host="news.google.co.*" />
|
|
||||||
<target host="picasaweb.google.co.*" />
|
|
||||||
<target host="scholar.google.co.*" />
|
|
||||||
<target host="www.google.co.*" />
|
|
||||||
<target host="google.com" />
|
|
||||||
<target host="*.google.com" />
|
|
||||||
<exclusion pattern="^http://(?:www\.)?google\.com/analytics/*(?:/[^/]+)?(?:\?.*)?$" />
|
|
||||||
<!--exclusion pattern="^http://books\.google\.com/(?!books/(\w+\.js|css/|javascript/)|favicon\.ico|googlebooks/|images/|intl/)" /-->
|
|
||||||
<exclusion pattern="^http://cbks0\.google\.com/(?:$|\?)" />
|
|
||||||
<exclusion pattern="^http://gg\.google\.com/(?!csi(?:$|\?))" />
|
|
||||||
<target host="google.com.*" />
|
|
||||||
<target host="accounts.google.com.*" />
|
|
||||||
<target host="adwords.google.com.*" />
|
|
||||||
<target host="groups.google.com.*" />
|
|
||||||
<target host="id.google.com.*" />
|
|
||||||
<target host="news.google.com.*" />
|
|
||||||
<target host="picasaweb.google.com.*" />
|
|
||||||
<target host="scholar.google.com.*" />
|
|
||||||
<target host="www.google.com.*" />
|
|
||||||
<target host="partner.googleadservices.com" />
|
|
||||||
<target host="googlecode.com" />
|
|
||||||
<target host="*.googlecode.com" />
|
|
||||||
<target host="googlemail.com" />
|
|
||||||
<target host="www.googlemail.com" />
|
|
||||||
<target host="googlesource.com" />
|
|
||||||
<target host="*.googlesource.com" />
|
|
||||||
<target host="*.googlesyndication.com" />
|
|
||||||
<target host="www.googletagservices.com" />
|
|
||||||
<target host="googleusercontent.com" />
|
|
||||||
<target host="*.googleusercontent.com" />
|
|
||||||
<!--
|
|
||||||
Necessary for the Followers widget:
|
|
||||||
|
|
||||||
https://trac.torproject.org/projects/tor/ticket/7294
|
|
||||||
-->
|
|
||||||
<exclusion pattern="http://[^@:\./]+-opensocial\.googleusercontent\.com" />
|
|
||||||
|
|
||||||
|
|
||||||
<!-- Can we secure any of these wildcard cookies safely?
|
|
||||||
-->
|
|
||||||
<!--securecookie host="^\.google\.com$" name="^(hl|I4SUserLocale|NID|PREF|S)$" /-->
|
|
||||||
<!--securecookie host="^\.google\.[\w.]{2,6}$" name="^(hl|I4SUserLocale|NID|PREF|S|S_awfe)$" /-->
|
|
||||||
<securecookie host="^(?:accounts|adwords|\.code|login\.corp|developers|docs|\d\.docs|fiber|mail|picasaweb|plus|\.?productforums|support)\.google\.[\w.]{2,6}$" name=".+" />
|
|
||||||
<securecookie host="^www\.google\.com$" name="^GoogleAccountsLocale_session$" />
|
|
||||||
<securecookie host="^mail-attachment\.googleusercontent\.com$" name=".+" />
|
|
||||||
<securecookie host="^gmail\.com$" name=".+" />
|
|
||||||
<securecookie host="^www\.gmail\.com$" name=".+" />
|
|
||||||
<securecookie host="^googlemail\.com$" name=".+" />
|
|
||||||
<securecookie host="^www\.googlemail\.com$" name=".+" />
|
|
||||||
|
|
||||||
|
|
||||||
<!-- - lh 3-6 exist
|
|
||||||
- All appear identical
|
|
||||||
- Identical to lh\d.googleusercontent.com
|
|
||||||
-->
|
|
||||||
<rule from="^http://lh(\d)\.ggpht\.com/"
|
|
||||||
to="https://lh$1.ggpht.com/" />
|
|
||||||
|
|
||||||
<rule from="^http://lh(\d)\.google\.ca/"
|
|
||||||
to="https://lh$1.google.ca/" />
|
|
||||||
|
|
||||||
|
|
||||||
<rule from="^http://(www\.)?g(oogle)?mail\.com/"
|
|
||||||
to="https://$1g$2mail.com/" />
|
|
||||||
|
|
||||||
<rule from="^http://(?:www\.)?goo\.gl/"
|
|
||||||
to="https://goo.gl/" />
|
|
||||||
|
|
||||||
|
|
||||||
<!-- Redirects to http when rewritten to www:
|
|
||||||
-->
|
|
||||||
<rule from="^http://books\.google\.com/"
|
|
||||||
to="https://encrypted.google.com/" />
|
|
||||||
|
|
||||||
<!-- tisp$ 404s:
|
|
||||||
-->
|
|
||||||
<rule from="^http://(?:www\.)?google\.((?:com?\.)?\w{2,3})/tisp(?=$|\?)"
|
|
||||||
to="https://www.google.$1/tisp/" />
|
|
||||||
|
|
||||||
<!-- Paths that work on all in google.*
|
|
||||||
-->
|
|
||||||
<rule from="^http://(?:www\.)?google\.((?:com?\.)?\w{2,3})/(accounts|adplanner|ads|adsense|adwords|analytics|bookmarks|chrome|contacts|coop|cse|css|culturalinstitute|doodles|earth|favicon\.ico|finance|get|goodtoknow|googleblogs|grants|green|hostednews|images|intl|js|landing|logos|mapmaker|newproducts|news|nexus|patents|policies|prdhp|profiles|products|reader|s2|settings|shopping|support|tisp|tools|transparencyreport|trends|urchin|webmasters)(?=$|[?/])"
|
|
||||||
to="https://www.google.$1/$2" />
|
|
||||||
|
|
||||||
<!-- Paths that 404 on .ccltd, but work on .com:
|
|
||||||
-->
|
|
||||||
<rule from="^http://(?:www\.)?google\.(?:com?\.)?\w{2,3}/(?=calendar|dictionary|doubleclick|help|ideas|pacman|postini|powermeter|url)"
|
|
||||||
to="https://www.google.com/" />
|
|
||||||
|
|
||||||
<rule from="^http://(?:www\.)?google\.(?:com?\.)?\w{2,3}/custom"
|
|
||||||
to="https://www.google.com/cse" />
|
|
||||||
|
|
||||||
<!-- Paths that only exist/work on .com
|
|
||||||
-->
|
|
||||||
<rule from="^http://(?:www\.)?google\.com/(\+|appsstatus|books|buzz|extern_js|glass|googlebooks|ig|insights|moderator|phone|safebrowsing|videotargetting|webfonts)(?=$|[?/])"
|
|
||||||
to="https://www.google.com/$1" />
|
|
||||||
|
|
||||||
<!-- Subdomains that work on all in google.*
|
|
||||||
-->
|
|
||||||
<rule from="^http://(accounts|adwords|finance|groups|id|picasaweb|)\.google\.((?:com?\.)?\w{2,3})/"
|
|
||||||
to="https://$1.google.$2/" />
|
|
||||||
|
|
||||||
<!-- Subdomains that only exist/work on .com
|
|
||||||
-->
|
|
||||||
<rule from="^http://(apis|appengine|books|calendar|cbks0|chat|checkout|chrome|clients[12]|code|[\w-]+\.corp|developers|dl|docs\d?|\d\.docs|drive|encrypted|encrypted-tbn[123]|feedburner|fiber|fonts|gg|glass||health|helpouts|history|(?:hosted)?talkgadget|investor|lh\d|(?:chatenabled\.)?mail|pack|pki|play|plus(?:\.sandbox)?|plusone|productforums|profiles|safebrowsing-cache|cert-test\.sandbox|sb-ssl|script|security|services|servicessites|sites|spreadsheets\d?|support|talk|tools)\.google\.com/"
|
|
||||||
to="https://$1.google.com/" />
|
|
||||||
|
|
||||||
<exclusion pattern="^http://clients[0-9]\.google\.com/ocsp"/>
|
|
||||||
|
|
||||||
<rule from="^http://earth\.google\.com/"
|
|
||||||
to="https://www.google.com/earth/" />
|
|
||||||
|
|
||||||
<rule from="^http://scholar\.google\.((?:com?\.)?\w{2,3})/intl/"
|
|
||||||
to="https://www.google.$1/intl/" />
|
|
||||||
|
|
||||||
<rule from="^http://(?:encrypted-)?tbn2\.google\.com/"
|
|
||||||
to="https://encrypted-tbn2.google.com/" />
|
|
||||||
|
|
||||||
|
|
||||||
<rule from="^http://knoll?\.google\.com/"
|
|
||||||
to="https://knol.google.com/" />
|
|
||||||
|
|
||||||
|
|
||||||
<rule from="^http://news\.google\.(?:com?\.)?\w{2,3}/(?:$|news|newshp)"
|
|
||||||
to="https://www.google.com/news" />
|
|
||||||
|
|
||||||
<rule from="^http://trends\.google\.com/"
|
|
||||||
to="https://www.google.com/trends" />
|
|
||||||
|
|
||||||
|
|
||||||
<rule from="^http://([^/:@\.]+\.)?googlecode\.com/"
|
|
||||||
to="https://$1googlecode.com/" />
|
|
||||||
|
|
||||||
<rule from="^http://([^\./]\.)?googlesource\.com/"
|
|
||||||
to="https://$1googlesource.com/" />
|
|
||||||
|
|
||||||
|
|
||||||
<rule from="^http://partner\.googleadservices\.com/"
|
|
||||||
to="https://partner.googleadservices.com/" />
|
|
||||||
|
|
||||||
<rule from="^http://(pagead2|tpc)\.googlesyndication\.com/"
|
|
||||||
to="https://$1.googlesyndication.com/" />
|
|
||||||
|
|
||||||
<!-- !www doesn't exist.
|
|
||||||
-->
|
|
||||||
<rule from="^http://www\.googletagservices\.com/tag/js/"
|
|
||||||
to="https://www.googletagservices.com/tag/js/" />
|
|
||||||
|
|
||||||
|
|
||||||
<rule from="^http://([^@:\./]+)\.googleusercontent\.com/"
|
|
||||||
to="https://$1.googleusercontent.com/" />
|
|
||||||
|
|
||||||
|
|
||||||
</ruleset>
|
|
|
@ -1,28 +0,0 @@
|
||||||
<!--
|
|
||||||
For other Google coverage, see GoogleServices.xml.
|
|
||||||
|
|
||||||
-->
|
|
||||||
<ruleset name="Google Shopping">
|
|
||||||
|
|
||||||
<target host="google.*" />
|
|
||||||
<target host="www.google.*" />
|
|
||||||
<target host="google.co.*" />
|
|
||||||
<target host="www.google.co.*" />
|
|
||||||
<target host="*.google.com" />
|
|
||||||
<target host="google.com.*" />
|
|
||||||
<target host="www.google.com.*" />
|
|
||||||
|
|
||||||
|
|
||||||
<rule from="^http://encrypted\.google\.com/(prdhp|shopping)"
|
|
||||||
to="https://www.google.com/$1" />
|
|
||||||
|
|
||||||
<rule from="^http://shopping\.google\.com/"
|
|
||||||
to="https://shopping.google.com/" />
|
|
||||||
|
|
||||||
<rule from="^http://(?:encrypted|www)\.google\.com/(.*tbm=shop)"
|
|
||||||
to="https://www.google.com/$1" />
|
|
||||||
|
|
||||||
<rule from="^http://(?:www\.)?google\.((?:com?\.)?(?:ae|ar|at|au|bg|bh|bo|br|ca|ch|cl|cr|co|cu|de|ec|eg|es|fi|fr|gh|gt|hr|id|ie|il|in|it|jm|jo|jp|ke|kr|kw|kz|lb|lk|ly|mx|my|na|ng|nl|no|nz|om|pa|pe|pk|pl|pt|py|qa|ro|ru|rw|sa|sg|sl|se|sv|th|tr|ug|uk|uy|ve|vn|za|zw))/(?=prdhp|shopping)"
|
|
||||||
to="https://www.google.com/$1" />
|
|
||||||
|
|
||||||
</ruleset>
|
|
|
@ -1,7 +0,0 @@
|
||||||
<ruleset name="GoogleSorry">
|
|
||||||
<target host="sorry.google.com" />
|
|
||||||
<target host="www.google.com" />
|
|
||||||
<target host="google.com" />
|
|
||||||
|
|
||||||
<rule from="^http://((sorry|www)\.)?google\.com/sorry/" to="https://sorry.google.com/sorry/" />
|
|
||||||
</ruleset>
|
|
|
@ -1,8 +0,0 @@
|
||||||
<ruleset name="Google Translate (broken)" default_off="redirect loops">
|
|
||||||
<target host="translate.googleapis.com" />
|
|
||||||
<target host="translate.google.com" />
|
|
||||||
|
|
||||||
<rule from="^http://translate\.googleapis\.com/" to="https://translate.googleapis.com/"/>
|
|
||||||
<rule from="^http://translate\.google\.com/"
|
|
||||||
to="https://translate.google.com/" />
|
|
||||||
</ruleset>
|
|
|
@ -1,83 +0,0 @@
|
||||||
<ruleset name="Google Videos">
|
|
||||||
<target host="*.google.com" />
|
|
||||||
<target host="google.com" />
|
|
||||||
<target host="www.google.com.*" />
|
|
||||||
<target host="google.com.*" />
|
|
||||||
<target host="www.google.co.*" />
|
|
||||||
<target host="google.co.*" />
|
|
||||||
<target host="www.google.*" />
|
|
||||||
<target host="google.*" />
|
|
||||||
|
|
||||||
<rule from="^http://encrypted\.google\.com/videohp"
|
|
||||||
to="https://encrypted.google.com/videohp" />
|
|
||||||
|
|
||||||
<!-- https://videos.google.com is currently broken; work around that... -->
|
|
||||||
<rule from="^https?://videos?\.google\.com/$"
|
|
||||||
to="https://encrypted.google.com/videohp" />
|
|
||||||
<rule from="^http://(?:www\.)?google\.com/videohp"
|
|
||||||
to="https://encrypted.google.com/videohp" />
|
|
||||||
<rule from="^http://(?:images|www|encrypted)\.google\.com/(.*tbm=isch)"
|
|
||||||
to="https://encrypted.google.com/$1" />
|
|
||||||
|
|
||||||
<rule
|
|
||||||
from="^http://(?:www\.)?google\.(?:com?\.)?(?:au|ca|gh|ie|in|jm|ke|lk|my|na|ng|nz|pk|rw|sl|sg|ug|uk|za|zw)/videohp"
|
|
||||||
to="https://encrypted.google.com/videohp" />
|
|
||||||
<rule
|
|
||||||
from="^http://(?:www\.)?google\.(?:com?\.)?(?:ar|bo|cl|co|cu|cr|ec|es|gt|mx|pa|pe|py|sv|uy|ve)/videohp$"
|
|
||||||
to="https://encrypted.google.com/videohp?hl=es" />
|
|
||||||
<rule
|
|
||||||
from="^http://(?:www\.)?google\.(?:com\.)?(?:ae|bh|eg|jo|kw|lb|ly|om|qa|sa)/videohp$"
|
|
||||||
to="https://encrypted.google.com/videohp?hl=ar" />
|
|
||||||
<rule from="^http://(?:www\.)?google\.(?:at|ch|de)/videohp$"
|
|
||||||
to="https://encrypted.google.com/videohp?hl=de" />
|
|
||||||
<rule from="^http://(?:www\.)?google\.(fr|nl|it|pl|ru|bg|pt|ro|hr|fi|no)/videohp$"
|
|
||||||
to="https://encrypted.google.com/videohp?hl=$1" />
|
|
||||||
<rule from="^http://(?:www\.)?google\.com?\.(id|th|tr)/videohp$"
|
|
||||||
to="https://encrypted.google.com/videohp?hl=$1" />
|
|
||||||
<rule from="^http://(?:www\.)?google\.com\.il/videohp$"
|
|
||||||
to="https://encrypted.google.com/videohp?hl=he" />
|
|
||||||
<rule from="^http://(?:www\.)?google\.com\.kr/videohp$"
|
|
||||||
to="https://encrypted.google.com/videohp?hl=ko" />
|
|
||||||
<rule from="^http://(?:www\.)?google\.com\.kz/videohp$"
|
|
||||||
to="https://encrypted.google.com/videohp?hl=kk" />
|
|
||||||
<rule from="^http://(?:www\.)?google\.com\.jp/videohp$"
|
|
||||||
to="https://encrypted.google.com/videohp?hl=ja" />
|
|
||||||
<rule from="^http://(?:www\.)?google\.com\.vn/videohp$"
|
|
||||||
to="https://encrypted.google.com/videohp?hl=vi" />
|
|
||||||
<rule from="^http://(?:www\.)?google\.com\.br/videohp$"
|
|
||||||
to="https://encrypted.google.com/videohp?hl=pt-BR" />
|
|
||||||
<rule from="^http://(?:www\.)?google\.se/videohp$"
|
|
||||||
to="https://encrypted.google.com/videohp?hl=sv" />
|
|
||||||
|
|
||||||
<!-- If there are URL parameters, keep them. -->
|
|
||||||
<rule
|
|
||||||
from="^http://(?:www\.)?google\.(?:com?\.)?(?:ar|bo|cl|co|cu|cr|ec|es|gt|mx|pa|pe|py|sv|uy|ve)/videohp\?"
|
|
||||||
to="https://encrypted.google.com/videohp?hl=es&" />
|
|
||||||
<rule
|
|
||||||
from="^http://(?:www\.)?google\.(?:com\.)?(?:ae|bh|eg|jo|kw|lb|ly|om|qa|sa)/videohp\?"
|
|
||||||
to="https://encrypted.google.com/videohp?hl=ar&" />
|
|
||||||
<rule from="^http://(?:www\.)?google\.(?:at|ch|de)/videohp\?"
|
|
||||||
to="https://encrypted.google.com/videohp?hl=de&" />
|
|
||||||
<rule from="^http://(?:www\.)?google\.(fr|nl|it|pl|ru|bg|pt|ro|hr|fi|no)/videohp\?"
|
|
||||||
to="https://encrypted.google.com/videohp?hl=$1&" />
|
|
||||||
<rule from="^http://(?:www\.)?google\.com?\.(id|th|tr)/videohp\?"
|
|
||||||
to="https://encrypted.google.com/videohp?hl=$1&" />
|
|
||||||
<rule from="^http://(?:www\.)?google\.com\.il/videohp\?"
|
|
||||||
to="https://encrypted.google.com/videohp?hl=he&" />
|
|
||||||
<rule from="^http://(?:www\.)?google\.com\.kr/videohp\?"
|
|
||||||
to="https://encrypted.google.com/videohp?hl=ko&" />
|
|
||||||
<rule from="^http://(?:www\.)?google\.com\.kz/videohp\?"
|
|
||||||
to="https://encrypted.google.com/videohp?hl=kk&" />
|
|
||||||
<rule from="^http://(?:www\.)?google\.com\.jp/videohp\?"
|
|
||||||
to="https://encrypted.google.com/videohp?hl=ja&" />
|
|
||||||
<rule from="^http://(?:www\.)?google\.com\.vn/videohp\?"
|
|
||||||
to="https://encrypted.google.com/videohp?hl=vi&" />
|
|
||||||
<rule from="^http://(?:www\.)?google\.com\.br/videohp\?"
|
|
||||||
to="https://encrypted.google.com/videohp?hl=pt-BR&" />
|
|
||||||
<rule from="^http://(?:www\.)?google\.se/videohp\?"
|
|
||||||
to="https://encrypted.google.com/videohp?hl=sv&" />
|
|
||||||
|
|
||||||
<rule from="^http://video\.google\.com/ThumbnailServer2"
|
|
||||||
to="https://video.google.com/ThumbnailServer2" />
|
|
||||||
|
|
||||||
</ruleset>
|
|
|
@ -1,17 +0,0 @@
|
||||||
<!--
|
|
||||||
gwbhrd.appspot.com
|
|
||||||
|
|
||||||
-->
|
|
||||||
<ruleset name="GoogleWatchBlog">
|
|
||||||
|
|
||||||
<target host="googlewatchblog.de" />
|
|
||||||
<target host="*.googlewatchblog.de" />
|
|
||||||
|
|
||||||
|
|
||||||
<securecookie host="^(?:www)?\.googlewatchblog\.de$" name=".+" />
|
|
||||||
|
|
||||||
|
|
||||||
<rule from="^http://(static\.|www\.)?googlewatchblog\.de/"
|
|
||||||
to="https://$1googlewatchblog.de/" />
|
|
||||||
|
|
||||||
</ruleset>
|
|
|
@ -1,21 +0,0 @@
|
||||||
<!--
|
|
||||||
For other Google coverage, see GoogleServices.xml.
|
|
||||||
|
|
||||||
-->
|
|
||||||
<ruleset name="Google App Engine">
|
|
||||||
|
|
||||||
<target host="appspot.com" />
|
|
||||||
<target host="*.appspot.com" />
|
|
||||||
<!--
|
|
||||||
Redirects to http for some reason.
|
|
||||||
-->
|
|
||||||
<exclusion pattern="^http://photomunchers\.appspot\.com/" />
|
|
||||||
|
|
||||||
|
|
||||||
<securecookie host="^.+\.appspot\.com$" name=".+" />
|
|
||||||
|
|
||||||
|
|
||||||
<rule from="^http://([^@:\./]+\.)?appspot\.com/"
|
|
||||||
to="https://$1appspot.com/" />
|
|
||||||
|
|
||||||
</ruleset>
|
|
|
@ -1,16 +0,0 @@
|
||||||
<!-- This rule was automatically generated based on an HSTS
|
|
||||||
preload rule in the Chromium browser. See
|
|
||||||
https://src.chromium.org/viewvc/chrome/trunk/src/net/base/transport_security_state.cc
|
|
||||||
for the list of preloads. Sites are added to the Chromium HSTS
|
|
||||||
preload list on request from their administrators, so HTTPS should
|
|
||||||
work properly everywhere on this site.
|
|
||||||
|
|
||||||
Because Chromium and derived browsers automatically force HTTPS for
|
|
||||||
every access to this site, this rule applies only to Firefox. -->
|
|
||||||
<ruleset name="Googleplex.com (default off)" platform="firefox" default_off="Certificate error">
|
|
||||||
<target host="googleplex.com" />
|
|
||||||
|
|
||||||
<securecookie host="^googleplex\.com$" name=".+" />
|
|
||||||
|
|
||||||
<rule from="^http://googleplex\.com/" to="https://googleplex.com/" />
|
|
||||||
</ruleset>
|
|
|
@ -1,15 +0,0 @@
|
||||||
<ruleset name="OpenStreetMap">
|
|
||||||
|
|
||||||
<target host="openstreetmap.org"/>
|
|
||||||
<target host="*.openstreetmap.org"/>
|
|
||||||
|
|
||||||
<rule from="^http://(?:www\.)?openstreetmap\.org/"
|
|
||||||
to="https://www.openstreetmap.org/"/>
|
|
||||||
|
|
||||||
<rule from="^http://tile\.openstreetmap\.org/"
|
|
||||||
to="https://a.tile.openstreetmap.org/"/>
|
|
||||||
|
|
||||||
<rule from="^http://(blog|help|lists|nominatim|piwik|taginfo|[abc]\.tile|trac|wiki)\.openstreetmap\.org/"
|
|
||||||
to="https://$1.openstreetmap.org/"/>
|
|
||||||
|
|
||||||
</ruleset>
|
|
|
@ -1,14 +0,0 @@
|
||||||
<!--
|
|
||||||
www: cert only matches ^rawgithub.com
|
|
||||||
|
|
||||||
-->
|
|
||||||
<ruleset name="rawgithub.com">
|
|
||||||
|
|
||||||
<target host="rawgithub.com" />
|
|
||||||
<target host="www.rawgithub.com" />
|
|
||||||
|
|
||||||
|
|
||||||
<rule from="^http://(?:www\.)?rawgithub\.com/"
|
|
||||||
to="https://rawgithub.com/" />
|
|
||||||
|
|
||||||
</ruleset>
|
|
|
@ -1,101 +0,0 @@
|
||||||
<!--
|
|
||||||
|
|
||||||
CDN buckets:
|
|
||||||
|
|
||||||
- akmedia-a.akamaihd.net
|
|
||||||
|
|
||||||
- soundcloud.assistly.com
|
|
||||||
|
|
||||||
- help.soundcloud.com
|
|
||||||
|
|
||||||
- cs70.wac.edgecastcdn.net
|
|
||||||
|
|
||||||
- a1.sndcdn.com
|
|
||||||
- i1.sndcdn.com
|
|
||||||
- w1.sndcdn.com
|
|
||||||
|
|
||||||
- wpc.658D.edgecastcdn.net
|
|
||||||
- m-a.sndcdn.com.edgesuite.net
|
|
||||||
- soundcloud.gettyimages.com
|
|
||||||
|
|
||||||
- scbackstage.wpengine.netdna-cdn.com
|
|
||||||
|
|
||||||
- ssl doesn't exist
|
|
||||||
- backstage.soundcloud.com
|
|
||||||
|
|
||||||
- soundcloud.wpengine.netdna-cdn.com
|
|
||||||
|
|
||||||
- -ssl doesn't exist
|
|
||||||
- blog.soundcloud.com
|
|
||||||
|
|
||||||
- gs1.wpc.v2cdn.netcdn.net
|
|
||||||
- gs1.wpc.v2cdn.net
|
|
||||||
|
|
||||||
- ec-media.soundcloud.com
|
|
||||||
|
|
||||||
Nonfunctional soundcloud.com subdomains:
|
|
||||||
|
|
||||||
- help (redirects to http, mismatched, CN: *.assistly.com)
|
|
||||||
- m (redirects to http)
|
|
||||||
- media
|
|
||||||
- status (times out)
|
|
||||||
|
|
||||||
|
|
||||||
Problematic domains:
|
|
||||||
|
|
||||||
- m-a.sndcdn.com (works, akamai)
|
|
||||||
|
|
||||||
|
|
||||||
Partially covered domains:
|
|
||||||
|
|
||||||
- backstage.soundcloud.com
|
|
||||||
|
|
||||||
|
|
||||||
Fully covered domains:
|
|
||||||
|
|
||||||
- sndcdn.com subdomains:
|
|
||||||
|
|
||||||
- a[12]
|
|
||||||
- api
|
|
||||||
- i[1-4]
|
|
||||||
- w[12]
|
|
||||||
- wis
|
|
||||||
|
|
||||||
- soundcloud.com subdomains:
|
|
||||||
|
|
||||||
- (www.)
|
|
||||||
- api
|
|
||||||
- blog
|
|
||||||
- connect
|
|
||||||
- developers
|
|
||||||
- ec-media
|
|
||||||
- eventlogger
|
|
||||||
- help-assets
|
|
||||||
- media
|
|
||||||
- visuals
|
|
||||||
- w
|
|
||||||
|
|
||||||
-->
|
|
||||||
<ruleset name="Soundcloud (partial)">
|
|
||||||
|
|
||||||
<target host="scbackstage.wpengine.netdna-cdn.com" />
|
|
||||||
<target host="soundcloud.wpengine.netdna-cdn.com" />
|
|
||||||
<target host="*.sndcdn.com" />
|
|
||||||
<target host="soundcloud.com" />
|
|
||||||
<target host="*.soundcloud.com" />
|
|
||||||
<exclusion pattern="^https?://(?:scbackstage\.wpengine\.netdna-cdn|backstage\.soundcloud)\.com/(?!wp-content/)" />
|
|
||||||
|
|
||||||
|
|
||||||
<rule from="^http://([aiw]\d|api|wis)\.sndcdn\.com/"
|
|
||||||
to="https://$1.sndcdn.com/" />
|
|
||||||
|
|
||||||
<rule from="^http://((?:api|backstage|blog|connect|developers|ec-media|eventlogger|help-assets|media|visuals|w|www)\.|)soundcloud\.com/"
|
|
||||||
to="https://$1soundcloud.com/" />
|
|
||||||
|
|
||||||
<rule from="^https?://scbackstage\.wpengine\.netdna-cdn\.com/"
|
|
||||||
to="https://backstage.soundcloud.com/" />
|
|
||||||
|
|
||||||
<rule from="^https?://soundcloud\.wpengine\.netdna-cdn\.com/"
|
|
||||||
to="https://blog.soundcloud.com/" />
|
|
||||||
|
|
||||||
</ruleset>
|
|
|
@ -1,36 +0,0 @@
|
||||||
<!--
|
|
||||||
Nonfunctional:
|
|
||||||
|
|
||||||
- image.bayimg.com
|
|
||||||
- (www.)thepiratebay.sx (http reply)
|
|
||||||
|
|
||||||
|
|
||||||
For problematic rules, see ThePirateBay-mismatches.xml.
|
|
||||||
|
|
||||||
-->
|
|
||||||
<ruleset name="The Pirate Bay (partial)">
|
|
||||||
|
|
||||||
<target host="suprbay.org" />
|
|
||||||
<target host="*.suprbay.org" />
|
|
||||||
<!-- * for cross-domain cookie -->
|
|
||||||
<target host="*.forum.suprbay.org" />
|
|
||||||
<target host="thepiratebay.org"/>
|
|
||||||
<target host="*.thepiratebay.org"/>
|
|
||||||
<target host="thepiratebay.se"/>
|
|
||||||
<target host="*.thepiratebay.se"/>
|
|
||||||
|
|
||||||
<securecookie host="^.*\.suprbay\.org$" name=".*" />
|
|
||||||
<securecookie host="^(.*\.)?thepiratebay\.se$" name=".*"/>
|
|
||||||
|
|
||||||
|
|
||||||
<!-- Cert doesn't match (www.), redirects like so. -->
|
|
||||||
<rule from="^https?://(?:forum\.|www\.)?suprbay\.org/"
|
|
||||||
to="https://forum.suprbay.org/" />
|
|
||||||
|
|
||||||
<rule from="^http://(?:www\.)?thepiratebay\.(?:org|se)/"
|
|
||||||
to="https://thepiratebay.se/"/>
|
|
||||||
|
|
||||||
<rule from="^http://(rss|static|torrents)\.thepiratebay\.(?:org|se)/"
|
|
||||||
to="https://$1.thepiratebay.se/"/>
|
|
||||||
|
|
||||||
</ruleset>
|
|
|
@ -1,18 +0,0 @@
|
||||||
<ruleset name="Tor Project">
|
|
||||||
|
|
||||||
<target host="torproject.org" />
|
|
||||||
<target host="*.torproject.org" />
|
|
||||||
<exclusion pattern="^http://torperf\.torproject\.org/" />
|
|
||||||
|
|
||||||
|
|
||||||
<!-- Not secured by server:
|
|
||||||
-->
|
|
||||||
<!--securecookie host="^\.blog\.torproject\.org$" name="^SESS[0-9a-f]{32}$" /-->
|
|
||||||
|
|
||||||
<securecookie host="^(?:.*\.)?torproject\.org$" name=".+" />
|
|
||||||
|
|
||||||
|
|
||||||
<rule from="^http://([^/:@\.]+\.)?torproject\.org/"
|
|
||||||
to="https://$1torproject.org/" />
|
|
||||||
|
|
||||||
</ruleset>
|
|
|
@ -1,169 +0,0 @@
|
||||||
<!--
|
|
||||||
Other Twitter rulesets:
|
|
||||||
|
|
||||||
- Twitter_Community.com.xml
|
|
||||||
|
|
||||||
|
|
||||||
Nonfunctional domains:
|
|
||||||
|
|
||||||
- status.twitter.com *
|
|
||||||
- status.twitter.jp *
|
|
||||||
|
|
||||||
* Tumblr
|
|
||||||
|
|
||||||
|
|
||||||
CDN buckets:
|
|
||||||
|
|
||||||
- a1095.g.akamai.net/=/1095/134446/1d/platform.twitter.com/ | platform2.twitter.com.edgesuite.net
|
|
||||||
|
|
||||||
- platform2.twitter.com
|
|
||||||
|
|
||||||
- twitter-any.s3.amazonaws.com
|
|
||||||
- twitter-blog.s3.amazonaws.com
|
|
||||||
|
|
||||||
- d2rdfnizen5apl.cloudfront.net
|
|
||||||
|
|
||||||
- s.twimg.com
|
|
||||||
|
|
||||||
- ssl2.twitter.com.edgekey.net
|
|
||||||
- twitter.github.com
|
|
||||||
|
|
||||||
|
|
||||||
Problematic domains:
|
|
||||||
|
|
||||||
	- twimg.com subdomains:

		- a5 *
		- s (cloudfront)

	- twitter.com subdomains:

		- platform[0-3] (403, akamai)

	* akamai


Fully covered domains:

	- (www.)t.co (www → ^)

	- twimg.com subdomains:

		- a[5-9] (→ si0)
		- a\d
		- abs
		- dnt
		- ea
		- g
		- g2
		- gu
		- hca
		- jp
		- ma
		- ma[0123]
		- o
		- p
		- pbs
		- r
		- s (→ d2rdfnizen5apl.cloudfront.net)
		- si[0-5]
		- syndication
		- cdn.syndication
		- tailfeather
		- ton
		- v
		- widgets

	- twitter.com subdomains:

		- (www.)
		- 201[012]
		- about
		- ads
		- analytics
		- api
		- cdn.api
		- urls.api
		- blog
		- business
		- preview.cdn
		- preview-dev.cdn
		- preview-stage.cdn
		- de
		- dev
		- en
		- engineering
		- es
		- firefox
		- fr
		- it
		- ja
		- jp
		- m
		- media
		- mobile
		- music
		- oauth
		- p
		- pic
		- platform
		- platform[0-3] (→ platform)
		- widgets.platform
		- search
		- static
		- support
		- transparency
		- upload


These altnames don't exist:

	- i3.twimg.com
	- p-dev.twimg.com
	- vmtc.twimg.com

	- cdn-dev.api.twitter.com

-->
<ruleset name="Twitter">

	<target host="t.co" />
	<target host="*.t.co" />
	<target host="*.twimg.com" />
	<target host="twitter.com" />
	<target host="*.twitter.com" />


	<!-- Secured by server:
				-->
	<!--securecookie host="^\.twitter\.com$" name="^_twitter_sess$" /-->
	<!--securecookie host="^support\.twitter\.com$" name="^_help_center_session$" /-->
	<!--
		Not secured by server:
				-->
	<!--securecookie host="^\.t\.co$" name="^muc$" /-->
	<!--securecookie host="^\.twitter\.com$" name="^guest_id$" /-->

	<securecookie host="^\.t\.co$" name=".+" />
	<securecookie host="^(?:.*\.)?twitter\.com$" name=".+" />


	<rule from="^http://(?:www\.)?t\.co/"
		to="https://t.co/" />

	<rule from="^http://a[5-9]\.twimg\.com/"
		to="https://si0.twimg.com/" />

	<rule from="^http://(abs|a\d|dnt|ea|g[2u]?|hca|jp|ma\d?|o|p|pbs|r|si\d|(?:cdn\.)?syndication|tailfeather|ton|v|widgets)\.twimg\.com/"
		to="https://$1.twimg.com/" />

	<rule from="^http://s\.twimg\.com/"
		to="https://d2rdfnizen5apl.cloudfront.net/" />

	<rule from="^http://((?:201\d|about|ads|analytics|blog|(?:cdn\.|urls\.)?api|business|preview(?:-dev|-stage)?\.cdn|de|dev|engineering|en|es|firefox|fr|it|ja|jp|m|media|mobile|music|oauth|p|pic|platform|widgets\.platform|search|static|support|transparency|upload|www)\.)?twitter\.com/"
		to="https://$1twitter.com/" />

	<rule from="^http://platform\d\.twitter\.com/"
		to="https://platform.twitter.com/" />

</ruleset>
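The from/to pairs above are applied in the browser by HTTPS Everywhere; purely as an illustration (not part of this ruleset or of the commit), two of them translated to Python's re module, assuming re semantics are close enough to the ruleset's PCRE patterns:

import re

# first matching rule wins; '$1'-style backreferences would become '\1' in Python
rules = [
    (r'^http://a[5-9]\.twimg\.com/', 'https://si0.twimg.com/'),
    (r'^http://platform\d\.twitter\.com/', 'https://platform.twitter.com/'),
]

def rewrite(url):
    for pattern, target in rules:
        if re.match(pattern, url):
            return re.sub(pattern, target, url, count=1)
    return url

print(rewrite('http://a7.twimg.com/a.png'))          # https://si0.twimg.com/a.png
print(rewrite('http://platform0.twitter.com/w.js'))  # https://platform.twitter.com/w.js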
@@ -1,75 +0,0 @@
<!--
	CDN buckets:

		- av.vimeo.com.edgesuite.net

			- a808.g.akamai.net

		- pdl.vimeocdn.com.edgesuite.net

			- a1189.g.akamai.net


	Problematic subdomains:

		- av	(pdl.../crossdomain.xml restricts to port 80)
		- pdl	(works, akamai)


	Partially covered subdomains:

		- developer	(some pages redirect to http)
		- pdl		(→ akamai)


	Fully covered subdomains:

		- (www.)
		- secure


	Default off per https://trac.torproject.org/projects/tor/ticket/7569 -->
<ruleset name="Vimeo (default off)" default_off="breaks some video embedding">

	<target host="vimeo.com" />
	<target host="*.vimeo.com" />
		<exclusion pattern="^http://av\.vimeo\.com/crossdomain\.xml" />
		<!--exclusion pattern="^http://developer\.vimeo\.com/($|\?|(apps|guidelines|help|player)($|[?/]))" /-->
		<exclusion pattern="^http://developer\.vimeo\.com/(?!apis(?:$|[?/])|favicon\.ico)" />
	<target host="*.vimeocdn.com" />
		<!--
			Uses crossdomain.xml from s3.amazonaws.com, which sets secure="false"

			https://mail1.eff.org/pipermail/https-everywhere/2012-October/001583.html
		-->
		<exclusion pattern="^http://a\.vimeocdn\.com/p/flash/moogaloop/" />

		<!-- We cannot secure streams because crossdomain.xml
			restricts to port 80 :(
						-->
		<exclusion pattern="^http://pdl\.vimeocdn\.com/(?!crossdomain\.xml)" />


	<!-- Tracking cookies:
				-->
	<securecookie host="^\.(?:player\.)?vimeo\.com$" name="^__utm\w$" />


	<rule from="^http://((?:developer|player|secure|www)\.)?vimeo\.com/"
		to="https://$1vimeo.com/" />

	<rule from="^http://av\.vimeo\.com/"
		to="https://a248.e.akamai.net/f/808/9207/8m/av.vimeo.com/" />

	<!-- a & b: Akamai -->
	<rule from="^http://(?:secure-)?([ab])\.vimeocdn\.com/"
		to="https://secure-$1.vimeocdn.com/" />

	<rule from="^http://i\.vimeocdn\.com/"
		to="https://i.vimeocdn.com/" />

	<rule from="^http://pdl\.vimeocdn\.com/"
		to="https://a248.e.akamai.net/f/1189/4415/8d/pdl.vimeocdn.com/" />

</ruleset>
@@ -1,13 +0,0 @@
<ruleset name="WikiLeaks">

	<target host="wikileaks.org" />
	<target host="*.wikileaks.org" />


	<securecookie host="^(?:w*\.)?wikileaks\.org$" name=".+" />


	<rule from="^http://((?:chat|search|shop|www)\.)?wikileaks\.org/"
		to="https://$1wikileaks.org/" />

</ruleset>
@@ -1,107 +0,0 @@
<!--
	Wikipedia and other Wikimedia Foundation wikis previously had no real HTTPS support, and
	URLs had to be rewritten to https://secure.wikimedia.org/$wikitype/$language/ . This is no
	longer the case, see https://blog.wikimedia.org/2011/10/03/native-https-support-enabled-for-all-wikimedia-foundation-wikis/ ,
	so this file is a lot simpler these days.


	Mixed content:

		- Images, on:

			- stats.wikimedia.org from upload.wikimedia.org *
			- stats.wikimedia.org from wikimediafoundation.org *

		* Secured by us

-->
<ruleset name="Wikimedia">

	<target host="enwp.org" />
	<target host="frwp.org" />

	<target host="mediawiki.org" />
	<target host="www.mediawiki.org" />
	<target host="wikimedia.org" />
	<target host="*.wikimedia.org" />
		<exclusion pattern="^http://(?:apt|cs|cz|parsoid-lb\.eqiad|status|torrus|ubuntu)\.wikimedia\.org" />
		<!-- https://mail1.eff.org/pipermail/https-everywhere-rules/2012-June/001189.html -->
		<exclusion pattern="^http://lists\.wikimedia\.org/pipermail(?:$|/)" />
	<target host="wikimediafoundation.org" />
	<target host="www.wikimediafoundation.org" />

	<!-- Wikimedia projects (also some wikimedia.org subdomains) -->
	<target host="wikibooks.org" />
	<target host="*.wikibooks.org" />
	<target host="wikidata.org" />
	<target host="*.wikidata.org" />
	<target host="wikinews.org" />
	<target host="*.wikinews.org" />
	<target host="wikipedia.org" />
	<target host="*.wikipedia.org" />
	<target host="wikiquote.org" />
	<target host="*.wikiquote.org" />
	<target host="wikisource.org" />
	<target host="*.wikisource.org" />
	<target host="wikiversity.org" />
	<target host="*.wikiversity.org" />
	<target host="wikivoyage.org" />
	<target host="*.wikivoyage.org" />
	<target host="wiktionary.org" />
	<target host="*.wiktionary.org" />

	<!-- Wikimedia chapters -->
	<target host="wikimedia.ca" />
	<target host="www.wikimedia.ca" />

	<!-- Wikimedia Tool Labs -->
	<target host="tools.wmflabs.org" />
	<target host="icinga.wmflabs.org" />
	<target host="ganglia.wmflabs.org" />

	<!-- Not secured by server:
				-->
	<!--securecookie host="^\.wiki(books|ipedia)\.org$" name="^GeoIP$" /-->

	<securecookie host="^^\.wik(?:ibooks|idata|imedia|inews|ipedia|iquote|isource|iversity|ivoyage|tionary)\.org$" name="^GeoIP$" />
	<securecookie host="^([^@:/]+\.)?wik(ibooks|idata|inews|ipedia|iquote|isource|iversity|ivoyage|tionary)\.org$" name=".*" />
	<securecookie host="^(species|commons|meta|incubator|wikitech).wikimedia.org$" name=".*" />
	<securecookie host="^(?:www\.)?mediawiki\.org$" name=".*" />
	<securecookie host="^wikimediafoundation.org$" name=".*" />

	<rule from="^http://(en|fr)wp\.org/"
		to="https://$1.wikipedia.org/wiki/" />

	<rule from="^http://(?:www\.)?mediawiki\.org/"
		to="https://www.mediawiki.org/" />

	<rule from="^https?://download\.wikipedia\.org/"
		to="https://dumps.wikimedia.org/" />

	<rule from="^https?://(download|dataset2|sitemap)\.wikimedia\.org/"
		to="https://dumps.wikimedia.org/" />

	<rule from="^https?://(labs-ns[01]|virt0)\.wikimedia\.org/"
		to="https://wikitech.wikimedia.org/" />

	<rule from="^https?://noboard\.chapters\.wikimedia\.org/"
		to="https://noboard-chapters.wikimedia.org/" />

	<rule from="^https?://wg\.en\.wikipedia\.org/"
		to="https://wg-en.wikipedia.org/" />

	<rule from="^https?://arbcom\.(de|en|fi|nl)\.wikipedia\.org/"
		to="https://arbcom-$1.wikipedia.org/" />

	<rule from="^http://([^@:/]+\.)?wik(ibooks|idata|imedia|inews|ipedia|iquote|isource|iversity|ivoyage|tionary)\.org/"
		to="https://$1wik$2.org/" />

	<rule from="^http://(www\.)?wikimediafoundation\.org/"
		to="https://$1wikimediafoundation.org/" />

	<rule from="^http://(www\.)?wikimedia\.ca/"
		to="https://wikimedia.ca/" />

	<rule from="^http://([^@:/]+)\.wmflabs\.org/"
		to="https://$1.wmflabs.org/" />

</ruleset>
File diff suppressed because it is too large
@@ -1,46 +0,0 @@
<ruleset name="YouTube (partial)">

	<target host="youtube.com" />
	<target host="*.youtube.com" />
		<exclusion pattern="^http://(?:www\.)?youtube\.com/crossdomain\.xml"/>
		<exclusion pattern="^http://(?:www\.)?youtube\.com/(?:apiplayer|api_video_info)"/>
		<exclusion pattern="^http://(?:[^/@:\.]+\.)?ytimg\.com/.*apiplayer[0-9]*\.swf"/>
	<target host="*.ytimg.com" />
	<target host="youtu.be" />
	<target host="youtube-nocookie.com"/>
	<target host="www.youtube-nocookie.com"/>
	<target host="*.googlevideo.com"/>
		<exclusion pattern="^http://([^/@:\.]+)\.googlevideo\.com/crossdomain\.xml"/>


	<!-- Not secured by server:
				-->
	<!--securecookie host="^\.youtube\.com$" name="^(GEUP|PREF|VISITOR_INFO1_LIVE|YSC)$" /-->

	<!-- observed ^. cookies:
		- use_hitbox
		- VISITOR_INFO1_LIVE
		- recently_watched_video_id_list
		- .youtube.com -->
	<securecookie host="^\.youtube\.com" name=".*"/>


	<rule from="^http://(www\.)?youtube\.com/"
		to="https://$1youtube.com/"/>

	<rule from="^http://(br|de|es|fr|il|img|insight|jp|m|nl|uk)\.youtube\.com/"
		to="https://$1.youtube.com/"/>

	<rule from="^http://([^/@:\.]+)\.ytimg\.com/"
		to="https://$1.ytimg.com/"/>

	<rule from="^http://youtu\.be/"
		to="https://youtu.be/"/>

	<rule from="^http://(?:www\.)?youtube-nocookie\.com/"
		to="https://www.youtube-nocookie.com/"/>

	<rule from="^http://([^/@:\.]+)\.googlevideo\.com/"
		to="https://$1.googlevideo.com/"/>

</ruleset>
@@ -1,77 +0,0 @@
'''
searx is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

searx is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with searx. If not, see < http://www.gnu.org/licenses/ >.

(C) 2013- by Adam Tauber, <asciimoo@gmail.com>
'''

# list of language codes
language_codes = (
    ("ar_XA", "Arabic", "Arabia"),
    ("bg_BG", "Bulgarian", "Bulgaria"),
    ("cs_CZ", "Czech", "Czech Republic"),
    ("de_DE", "German", "Germany"),
    ("da_DK", "Danish", "Denmark"),
    ("de_AT", "German", "Austria"),
    ("de_CH", "German", "Switzerland"),
    ("el_GR", "Greek", "Greece"),
    ("en_AU", "English", "Australia"),
    ("en_CA", "English", "Canada"),
    ("en_GB", "English", "United Kingdom"),
    ("en_ID", "English", "Indonesia"),
    ("en_IE", "English", "Ireland"),
    ("en_IN", "English", "India"),
    ("en_MY", "English", "Malaysia"),
    ("en_NZ", "English", "New Zealand"),
    ("en_PH", "English", "Philippines"),
    ("en_SG", "English", "Singapore"),
    ("en_US", "English", "United States"),
    ("en_XA", "English", "Arabia"),
    ("en_ZA", "English", "South Africa"),
    ("es_AR", "Spanish", "Argentina"),
    ("es_CL", "Spanish", "Chile"),
    ("es_ES", "Spanish", "Spain"),
    ("es_MX", "Spanish", "Mexico"),
    ("es_US", "Spanish", "United States"),
    ("es_XL", "Spanish", "Latin America"),
    ("et_EE", "Estonian", "Estonia"),
    ("fi_FI", "Finnish", "Finland"),
    ("fr_BE", "French", "Belgium"),
    ("fr_CA", "French", "Canada"),
    ("fr_CH", "French", "Switzerland"),
    ("fr_FR", "French", "France"),
    ("he_IL", "Hebrew", "Israel"),
    ("hr_HR", "Croatian", "Croatia"),
    ("hu_HU", "Hungarian", "Hungary"),
    ("it_IT", "Italian", "Italy"),
    ("ja_JP", "Japanese", "Japan"),
    ("ko_KR", "Korean", "Korea"),
    ("lt_LT", "Lithuanian", "Lithuania"),
    ("lv_LV", "Latvian", "Latvia"),
    ("nb_NO", "Norwegian", "Norway"),
    ("nl_BE", "Dutch", "Belgium"),
    ("nl_NL", "Dutch", "Netherlands"),
    ("pl_PL", "Polish", "Poland"),
    ("pt_BR", "Portuguese", "Brazil"),
    ("pt_PT", "Portuguese", "Portugal"),
    ("ro_RO", "Romanian", "Romania"),
    ("ru_RU", "Russian", "Russia"),
    ("sk_SK", "Slovak", "Slovak Republic"),
    ("sl_SL", "Slovenian", "Slovenia"),
    ("sv_SE", "Swedish", "Sweden"),
    ("th_TH", "Thai", "Thailand"),
    ("tr_TR", "Turkish", "Turkey"),
    ("uk_UA", "Ukrainian", "Ukraine"),
    ("zh_CN", "Chinese", "China"),
    ("zh_HK", "Chinese", "Hong Kong SAR"),
    ("zh_TW", "Chinese", "Taiwan"))
@@ -1,61 +0,0 @@
import requests


the_http_adapter = requests.adapters.HTTPAdapter(pool_connections=100)
the_https_adapter = requests.adapters.HTTPAdapter(pool_connections=100)


class SessionSinglePool(requests.Session):

    def __init__(self):
        global the_https_adapter, the_http_adapter
        super(SessionSinglePool, self).__init__()

        # reuse the same adapters
        self.adapters.clear()
        self.mount('https://', the_https_adapter)
        self.mount('http://', the_http_adapter)

    def close(self):
        """Call super, but clear adapters since they are managed globally"""
        self.adapters.clear()
        super(SessionSinglePool, self).close()


def request(method, url, **kwargs):
    """same as requests/requests/api.py request(...) except it uses SessionSinglePool"""
    session = SessionSinglePool()
    response = session.request(method=method, url=url, **kwargs)
    session.close()
    return response


def get(url, **kwargs):
    kwargs.setdefault('allow_redirects', True)
    return request('get', url, **kwargs)


def options(url, **kwargs):
    kwargs.setdefault('allow_redirects', True)
    return request('options', url, **kwargs)


def head(url, **kwargs):
    kwargs.setdefault('allow_redirects', False)
    return request('head', url, **kwargs)


def post(url, data=None, **kwargs):
    return request('post', url, data=data, **kwargs)


def put(url, data=None, **kwargs):
    return request('put', url, data=data, **kwargs)


def patch(url, data=None, **kwargs):
    return request('patch', url, data=data, **kwargs)


def delete(url, **kwargs):
    return request('delete', url, **kwargs)
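Roughly how the pooled helpers above are consumed elsewhere in searx (an illustrative sketch only, not part of this commit; the URLs are placeholders): every call builds a throw-away SessionSinglePool, so all requests share the two module-level adapters and their connection pool.

import searx.poolrequests as requests_lib

# both calls reuse the same module-level HTTP/HTTPS adapters
resp = requests_lib.get('https://example.org/search?q=test', timeout=2.0)
head = requests_lib.head('https://example.org/robots.txt')
print(resp.status_code, head.status_code)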
sources/query.py
@@ -1,132 +0,0 @@
#!/usr/bin/env python

'''
searx is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

searx is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with searx. If not, see < http://www.gnu.org/licenses/ >.

(C) 2014 by Thomas Pointhuber, <thomas.pointhuber@gmx.at>
'''

from searx.languages import language_codes
from searx.engines import (
    categories, engines, engine_shortcuts
)
import string
import re


class Query(object):
    """parse query"""

    def __init__(self, query, blocked_engines):
        self.query = query
        self.blocked_engines = []

        if blocked_engines:
            self.blocked_engines = blocked_engines

        self.query_parts = []
        self.engines = []
        self.languages = []
        self.specific = False

    # parse query, if tags are set, which
    # change the search engine or search-language
    def parse_query(self):
        self.query_parts = []

        # split query, including whitespaces
        raw_query_parts = re.split(r'(\s+)', self.query)

        parse_next = True

        for query_part in raw_query_parts:
            if not parse_next:
                self.query_parts[-1] += query_part
                continue

            parse_next = False

            # part only contains spaces, skip
            if query_part.isspace()\
               or query_part == '':
                parse_next = True
                self.query_parts.append(query_part)
                continue

            # this forces a language
            if query_part[0] == ':':
                lang = query_part[1:].lower()

                # check if any language-code is equal with
                # declared language-codes
                for lc in language_codes:
                    lang_id, lang_name, country = map(str.lower, lc)

                    # if correct language-code is found
                    # set it as new search-language
                    if lang == lang_id\
                       or lang_id.startswith(lang)\
                       or lang == lang_name\
                       or lang.replace('_', ' ') == country:
                        parse_next = True
                        self.languages.append(lang)
                        break

            # this forces an engine or category
            if query_part[0] == '!' or query_part[0] == '?':
                prefix = query_part[1:].replace('_', ' ')

                # check if prefix is equal with engine shortcut
                if prefix in engine_shortcuts:
                    parse_next = True
                    self.engines.append({'category': 'none',
                                         'name': engine_shortcuts[prefix]})

                # check if prefix is equal with engine name
                elif prefix in engines:
                    parse_next = True
                    self.engines.append({'category': 'none',
                                         'name': prefix})

                # check if prefix is equal with category name
                elif prefix in categories:
                    # using all engines for that search, which
                    # are declared under that category name
                    parse_next = True
                    self.engines.extend({'category': prefix,
                                         'name': engine.name}
                                        for engine in categories[prefix]
                                        if (engine.name, prefix) not in self.blocked_engines)

            if query_part[0] == '!':
                self.specific = True

            # append query part to query_part list
            self.query_parts.append(query_part)

    def changeSearchQuery(self, search_query):
        if len(self.query_parts):
            self.query_parts[-1] = search_query
        else:
            self.query_parts.append(search_query)

    def getSearchQuery(self):
        if len(self.query_parts):
            return self.query_parts[-1]
        else:
            return ''

    def getFullQuery(self):
        # get full query including whitespaces
        return string.join(self.query_parts, '')
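An illustrative sketch (not part of this commit) of what the parser above does with ! and : prefixes, assuming 'ddg' is configured as an engine shortcut in settings.yml; the comments show what the logic yields:

from searx.query import Query

q = Query('!ddg :de privacy friendly search', blocked_engines=[])
q.parse_query()

print(q.languages)         # ['de']  -- ':de' matched a language code prefix
print(q.specific)          # True    -- a '!' prefix was used
print(q.getSearchQuery())  # 'privacy friendly search'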
@@ -1,556 +0,0 @@
'''
searx is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

searx is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with searx. If not, see < http://www.gnu.org/licenses/ >.

(C) 2013- by Adam Tauber, <asciimoo@gmail.com>
'''

import threading
import re
import searx.poolrequests as requests_lib
from itertools import izip_longest, chain
from operator import itemgetter
from Queue import Queue
from time import time
from urlparse import urlparse, unquote
from searx.engines import (
    categories, engines
)
from searx.languages import language_codes
from searx.utils import gen_useragent, get_blocked_engines
from searx.query import Query
from searx import logger

logger = logger.getChild('search')

number_of_searches = 0


def search_request_wrapper(fn, url, engine_name, **kwargs):
    try:
        return fn(url, **kwargs)
    except:
        # increase errors stats
        engines[engine_name].stats['errors'] += 1

        # print engine name and specific error message
        logger.exception('engine crash: {0}'.format(engine_name))
        return


def threaded_requests(requests):
    timeout_limit = max(r[2]['timeout'] for r in requests)
    search_start = time()
    for fn, url, request_args, engine_name in requests:
        request_args['timeout'] = timeout_limit
        th = threading.Thread(
            target=search_request_wrapper,
            args=(fn, url, engine_name),
            kwargs=request_args,
            name='search_request',
        )
        th._engine_name = engine_name
        th.start()

    for th in threading.enumerate():
        if th.name == 'search_request':
            remaining_time = max(0.0, timeout_limit - (time() - search_start))
            th.join(remaining_time)
            if th.isAlive():
                logger.warning('engine timeout: {0}'.format(th._engine_name))


# get default request parameters
def default_request_params():
    return {
        'method': 'GET',
        'headers': {},
        'data': {},
        'url': '',
        'cookies': {},
        'verify': True
    }


# create a callback wrapper for the search engine results
def make_callback(engine_name, results_queue, callback, params):

    # creating a callback wrapper for the search engine results
    def process_callback(response, **kwargs):
        # check if redirect comparing to the True value,
        # because resp can be a Mock object, and any attribute name returns something.
        if response.is_redirect is True:
            logger.debug('{0} redirect on: {1}'.format(engine_name, response))
            return

        response.search_params = params

        timeout_overhead = 0.2  # seconds
        search_duration = time() - params['started']
        timeout_limit = engines[engine_name].timeout + timeout_overhead
        if search_duration > timeout_limit:
            engines[engine_name].stats['page_load_time'] += timeout_limit
            engines[engine_name].stats['errors'] += 1
            return

        # callback
        search_results = callback(response)

        # add results
        for result in search_results:
            result['engine'] = engine_name

        results_queue.put_nowait((engine_name, search_results))

        # update stats with current page-load-time
        engines[engine_name].stats['page_load_time'] += search_duration

    return process_callback


# return the meaningful length of the content for a result
def content_result_len(content):
    if isinstance(content, basestring):
        content = re.sub('[,;:!?\./\\\\ ()-_]', '', content)
        return len(content)
    else:
        return 0


# score results and remove duplications
def score_results(results):
    # calculate scoring parameters
    flat_res = filter(
        None, chain.from_iterable(izip_longest(*results.values())))
    flat_len = len(flat_res)
    engines_len = len(results)

    results = []

    # pass 1: deduplication + scoring
    for i, res in enumerate(flat_res):

        res['parsed_url'] = urlparse(res['url'])

        res['host'] = res['parsed_url'].netloc

        if res['host'].startswith('www.'):
            res['host'] = res['host'].replace('www.', '', 1)

        res['engines'] = [res['engine']]

        weight = 1.0

        # strip multiple spaces and carriage returns from content
        if res.get('content'):
            res['content'] = re.sub(' +', ' ',
                                    res['content'].strip().replace('\n', ''))

        # get weight of this engine if possible
        if hasattr(engines[res['engine']], 'weight'):
            weight = float(engines[res['engine']].weight)

        # calculate score for that engine
        score = int((flat_len - i) / engines_len) * weight + 1

        # check for duplicates
        duplicated = False
        for new_res in results:
            # remove / from the end of the url if required
            p1 = res['parsed_url'].path[:-1]\
                if res['parsed_url'].path.endswith('/')\
                else res['parsed_url'].path
            p2 = new_res['parsed_url'].path[:-1]\
                if new_res['parsed_url'].path.endswith('/')\
                else new_res['parsed_url'].path

            # check if that result is a duplicate
            if res['host'] == new_res['host'] and\
               unquote(p1) == unquote(p2) and\
               res['parsed_url'].query == new_res['parsed_url'].query and\
               res.get('template') == new_res.get('template'):
                duplicated = new_res
                break

        # merge duplicates together
        if duplicated:
            # using content with more text
            if content_result_len(res.get('content', '')) >\
                    content_result_len(duplicated.get('content', '')):
                duplicated['content'] = res['content']

            # increase result-score
            duplicated['score'] += score

            # add engine to list of result-engines
            duplicated['engines'].append(res['engine'])

            # using https if possible
            if duplicated['parsed_url'].scheme == 'https':
                continue
            elif res['parsed_url'].scheme == 'https':
                duplicated['url'] = res['parsed_url'].geturl()
                duplicated['parsed_url'] = res['parsed_url']

        # if there is no duplicate found, append result
        else:
            res['score'] = score
            results.append(res)

    results = sorted(results, key=itemgetter('score'), reverse=True)

    # pass 2 : group results by category and template
    gresults = []
    categoryPositions = {}

    for i, res in enumerate(results):
        # FIXME : handle more than one category per engine
        category = engines[res['engine']].categories[0] + ':' + ''\
            if 'template' not in res\
            else res['template']

        current = None if category not in categoryPositions\
            else categoryPositions[category]

        # group with previous results using the same category
        # if the group can accept more result and is not too far
        # from the current position
        if current is not None and (current['count'] > 0)\
                and (len(gresults) - current['index'] < 20):
            # group with the previous results using
            # the same category with this one
            index = current['index']
            gresults.insert(index, res)

            # update every index after the current one
            # (including the current one)
            for k in categoryPositions:
                v = categoryPositions[k]['index']
                if v >= index:
                    categoryPositions[k]['index'] = v+1

            # update this category
            current['count'] -= 1

        else:
            # same category
            gresults.append(res)

            # update categoryIndex
            categoryPositions[category] = {'index': len(gresults), 'count': 8}

    # return gresults
    return gresults


def merge_two_infoboxes(infobox1, infobox2):
    if 'urls' in infobox2:
        urls1 = infobox1.get('urls', None)
        if urls1 is None:
            urls1 = []
            infobox1.set('urls', urls1)

        urlSet = set()
        for url in infobox1.get('urls', []):
            urlSet.add(url.get('url', None))

        for url in infobox2.get('urls', []):
            if url.get('url', None) not in urlSet:
                urls1.append(url)

    if 'attributes' in infobox2:
        attributes1 = infobox1.get('attributes', None)
        if attributes1 is None:
            attributes1 = []
            infobox1.set('attributes', attributes1)

        attributeSet = set()
        for attribute in infobox1.get('attributes', []):
            if attribute.get('label', None) not in attributeSet:
                attributeSet.add(attribute.get('label', None))

        for attribute in infobox2.get('attributes', []):
            attributes1.append(attribute)

    if 'content' in infobox2:
        content1 = infobox1.get('content', None)
        content2 = infobox2.get('content', '')
        if content1 is not None:
            if content_result_len(content2) > content_result_len(content1):
                infobox1['content'] = content2
        else:
            infobox1.set('content', content2)


def merge_infoboxes(infoboxes):
    results = []
    infoboxes_id = {}
    for infobox in infoboxes:
        add_infobox = True
        infobox_id = infobox.get('id', None)
        if infobox_id is not None:
            existingIndex = infoboxes_id.get(infobox_id, None)
            if existingIndex is not None:
                merge_two_infoboxes(results[existingIndex], infobox)
                add_infobox = False

        if add_infobox:
            results.append(infobox)
            infoboxes_id[infobox_id] = len(results)-1

    return results


class Search(object):

    """Search information container"""

    def __init__(self, request):
        # init vars
        super(Search, self).__init__()
        self.query = None
        self.engines = []
        self.categories = []
        self.paging = False
        self.pageno = 1
        self.lang = 'all'

        # set blocked engines
        self.blocked_engines = get_blocked_engines(engines, request.cookies)

        self.results = []
        self.suggestions = []
        self.answers = []
        self.infoboxes = []
        self.request_data = {}

        # set specific language if set
        if request.cookies.get('language')\
           and request.cookies['language'] in (x[0] for x in language_codes):
            self.lang = request.cookies['language']

        # set request method
        if request.method == 'POST':
            self.request_data = request.form
        else:
            self.request_data = request.args

        # TODO better exceptions
        if not self.request_data.get('q'):
            raise Exception('noquery')

        # set pagenumber
        pageno_param = self.request_data.get('pageno', '1')
        if not pageno_param.isdigit() or int(pageno_param) < 1:
            raise Exception('wrong pagenumber')

        self.pageno = int(pageno_param)

        # parse query, if tags are set, which change
        # the search engine or search-language
        query_obj = Query(self.request_data['q'], self.blocked_engines)
        query_obj.parse_query()

        # set query
        self.query = query_obj.getSearchQuery()

        # get last selected language in query, if possible
        # TODO support search with multiple languages
        if len(query_obj.languages):
            self.lang = query_obj.languages[-1]

        self.engines = query_obj.engines

        self.categories = []

        # if engines are calculated from query,
        # set categories by using that information
        if self.engines and query_obj.specific:
            self.categories = list(set(engine['category']
                                       for engine in self.engines))

        # otherwise, using defined categories to
        # calculate which engines should be used
        else:
            # set used categories
            for pd_name, pd in self.request_data.items():
                if pd_name.startswith('category_'):
                    category = pd_name[9:]

                    # if category is not found in list, skip
                    if category not in categories:
                        continue

                    if pd != 'off':
                        # add category to list
                        self.categories.append(category)
                    elif category in self.categories:
                        # remove category from list if property is set to 'off'
                        self.categories.remove(category)

            # if no category is specified for this search,
            # using user-defined default-configuration which
            # (is stored in cookie)
            if not self.categories:
                cookie_categories = request.cookies.get('categories', '')
                cookie_categories = cookie_categories.split(',')
                for ccateg in cookie_categories:
                    if ccateg in categories:
                        self.categories.append(ccateg)

            # if still no category is specified, using general
            # as default-category
            if not self.categories:
                self.categories = ['general']

            # using all engines for that search, which are
            # declared under the specific categories
            for categ in self.categories:
                self.engines.extend({'category': categ,
                                     'name': engine.name}
                                    for engine in categories[categ]
                                    if (engine.name, categ) not in self.blocked_engines)

    # do search-request
    def search(self, request):
        global number_of_searches

        # init vars
        requests = []
        results_queue = Queue()
        results = {}
        suggestions = set()
        answers = set()
        infoboxes = []

        # increase number of searches
        number_of_searches += 1

        # set default useragent
        # user_agent = request.headers.get('User-Agent', '')
        user_agent = gen_useragent()

        # start search-request for all selected engines
        for selected_engine in self.engines:
            if selected_engine['name'] not in engines:
                continue

            engine = engines[selected_engine['name']]

            # if paging is not supported, skip
            if self.pageno > 1 and not engine.paging:
                continue

            # if search-language is set and engine does not
            # provide language-support, skip
            if self.lang != 'all' and not engine.language_support:
                continue

            # set default request parameters
            request_params = default_request_params()
            request_params['headers']['User-Agent'] = user_agent
            request_params['category'] = selected_engine['category']
            request_params['started'] = time()
            request_params['pageno'] = self.pageno
            request_params['language'] = self.lang
            try:
                # 0 = None, 1 = Moderate, 2 = Strict
                request_params['safesearch'] = int(request.cookies.get('safesearch', 1))
            except ValueError:
                request_params['safesearch'] = 1

            # update request parameters dependent on
            # search-engine (contained in engines folder)
            engine.request(self.query.encode('utf-8'), request_params)

            if request_params['url'] is None:
                # TODO add support of offline engines
                pass

            # create a callback wrapper for the search engine results
            callback = make_callback(
                selected_engine['name'],
                results_queue,
                engine.response,
                request_params)

            # create dictionary which contains all
            # information about the request
            request_args = dict(
                headers=request_params['headers'],
                hooks=dict(response=callback),
                cookies=request_params['cookies'],
                timeout=engine.timeout,
                verify=request_params['verify']
            )

            # specific type of request (GET or POST)
            if request_params['method'] == 'GET':
                req = requests_lib.get
            else:
                req = requests_lib.post
                request_args['data'] = request_params['data']

            # ignoring empty urls
            if not request_params['url']:
                continue

            # append request to list
            requests.append((req, request_params['url'],
                             request_args,
                             selected_engine['name']))

        if not requests:
            return results, suggestions, answers, infoboxes
        # send all search-request
        threaded_requests(requests)

        while not results_queue.empty():
            engine_name, engine_results = results_queue.get_nowait()

            # TODO type checks
            [suggestions.add(x['suggestion'])
             for x in list(engine_results)
             if 'suggestion' in x
             and engine_results.remove(x) is None]

            [answers.add(x['answer'])
             for x in list(engine_results)
             if 'answer' in x
             and engine_results.remove(x) is None]

            infoboxes.extend(x for x in list(engine_results)
                             if 'infobox' in x
                             and engine_results.remove(x) is None)

            results[engine_name] = engine_results

        # update engine-specific stats
        for engine_name, engine_results in results.items():
            engines[engine_name].stats['search_count'] += 1
            engines[engine_name].stats['result_count'] += len(engine_results)

        # score results and remove duplications
        results = score_results(results)

        # merge infoboxes according to their ids
        infoboxes = merge_infoboxes(infoboxes)

        # update engine stats, using calculated score
        for result in results:
            for res_engine in result['engines']:
                engines[result['engine']]\
                    .stats['score_count'] += result['score']

        # return results, suggestions, answers and infoboxes
        return results, suggestions, answers, infoboxes
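To make the pass-1 scoring above concrete, here is the position-based score for a toy run with two engines and six interleaved results (plain arithmetic sketch, weight left at its 1.0 default; duplicates later add their scores together in the merge branch):

flat_len, engines_len, weight = 6, 2, 1.0

for i in range(flat_len):
    # same formula as in score_results()
    score = int((flat_len - i) / engines_len) * weight + 1
    print(i, score)
# -> 4.0, 3.0, 3.0, 2.0, 2.0, 1.0: earlier (higher-ranked) results score higher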
@@ -1,57 +0,0 @@
## 500px (Images)
#
# @website     https://500px.com
# @provide-api yes (https://developers.500px.com/)
#
# @using-api   no
# @results     HTML
# @stable      no (HTML can change)
# @parse       url, title, thumbnail, img_src, content
#
# @todo        rewrite to api

from urllib import urlencode
from urlparse import urljoin
from lxml import html

# engine dependent config
categories = ['images']
paging = True

# search-url
base_url = 'https://500px.com'
search_url = base_url+'/search?search?page={pageno}&type=photos&{query}'


# do search-request
def request(query, params):
    params['url'] = search_url.format(pageno=params['pageno'],
                                      query=urlencode({'q': query}))

    return params


# get response from search-request
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath('//div[@class="photo"]'):
        link = result.xpath('.//a')[0]
        url = urljoin(base_url, link.attrib.get('href'))
        title = result.xpath('.//div[@class="title"]//text()')[0]
        img_src = link.xpath('.//img')[0].attrib['src']
        content = result.xpath('.//div[@class="info"]//text()')[0]

        # append result
        results.append({'url': url,
                        'title': title,
                        'img_src': img_src,
                        'content': content,
                        'template': 'images.html'})

    # return results
    return results
@@ -21,12 +21,18 @@ import re
# engine dependent config
categories = ['images']
paging = True
+safesearch = True

# search-url
base_url = 'https://www.bing.com/'
search_string = 'images/search?{query}&count=10&first={offset}'
thumb_url = "http://ts1.mm.bing.net/th?id={ihk}"

+# safesearch definitions
+safesearch_types = {2: 'STRICT',
+                    1: 'DEMOTE',
+                    0: 'OFF'}
+

# do search-request
def request(query, params):
@@ -43,7 +49,8 @@ def request(query, params):
                                      offset=offset)

    params['cookies']['SRCHHPGUSR'] = \
-        'NEWWND=0&NRSLT=-1&SRCHLANG=' + language.split('-')[0]
+        'NEWWND=0&NRSLT=-1&SRCHLANG=' + language.split('-')[0] +\
+        '&ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE')

    params['url'] = base_url + search_path
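For reference, the cookie string the change above produces for each safesearch level, using 'de-DE' as a stand-in language value (illustrative sketch only; the mapping mirrors the safesearch_types dict added above):

safesearch_types = {2: 'STRICT', 1: 'DEMOTE', 0: 'OFF'}
language = 'de-DE'  # example value

for level in (0, 1, 2):
    cookie = 'NEWWND=0&NRSLT=-1&SRCHLANG=' + language.split('-')[0] +\
        '&ADLT=' + safesearch_types.get(level, 'DEMOTE')
    print(level, cookie)
# level 0 ends in &ADLT=OFF, level 1 in &ADLT=DEMOTE, level 2 in &ADLT=STRICT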
@@ -13,12 +13,9 @@ def request(query, params):
    if not m:
        # wrong query
        return params
-    try:
    ammount, from_currency, to_currency = m.groups()
    ammount = float(ammount)
-    except:
-        # wrong params
-        return params

    q = (from_currency + to_currency).upper()

@@ -15,7 +15,7 @@

from urllib import urlencode
from lxml.html import fromstring
-from searx.utils import html_to_text
+from searx.engines.xpath import extract_text

# engine dependent config
categories = ['general']
@@ -28,8 +28,8 @@ url = 'https://duckduckgo.com/html?{query}&s={offset}'
# specific xpath variables
result_xpath = '//div[@class="results_links results_links_deep web-result"]'  # noqa
url_xpath = './/a[@class="large"]/@href'
-title_xpath = './/a[@class="large"]//text()'
-content_xpath = './/div[@class="snippet"]//text()'
+title_xpath = './/a[@class="large"]'
+content_xpath = './/div[@class="snippet"]'


# do search-request
@@ -64,8 +64,8 @@ def response(resp):
        if not res_url:
            continue

-        title = html_to_text(''.join(r.xpath(title_xpath)))
-        content = html_to_text(''.join(r.xpath(content_xpath)))
+        title = extract_text(r.xpath(title_xpath))
+        content = extract_text(r.xpath(content_xpath))

        # append result
        results.append({'title': title,
@@ -25,9 +25,10 @@ def request(query, params):


def response(resp):
-    search_res = json.loads(resp.text)
    results = []

+    search_res = json.loads(resp.text)
+
    content = ''
    heading = search_res.get('Heading', '')
    attributes = []
@@ -68,7 +69,7 @@ def response(resp):
        results.append({'title': heading, 'url': firstURL})

    # related topics
-    for ddg_result in search_res.get('RelatedTopics', None):
+    for ddg_result in search_res.get('RelatedTopics', []):
        if 'FirstURL' in ddg_result:
            suggestion = result_to_text(ddg_result.get('FirstURL', None),
                                        ddg_result.get('Text', None),
@@ -37,7 +37,7 @@ search_category = {'general': 'web',

# do search-request
def request(query, params):
-    offset = (params['pageno']-1) * number_of_results + 1
+    offset = (params['pageno'] - 1) * number_of_results + 1
    categorie = search_category.get(params['category'], 'web')

    if params['language'] == 'all':
@@ -45,11 +45,11 @@ def request(query, params):
    else:
        language = params['language'].split('_')[0]

-    # skip, if language is not supported
+    # if language is not supported, put it in english
    if language != 'en' and\
       language != 'de' and\
       language != 'zh':
-        return params
+        language = 'en'

    params['url'] = search_url.format(offset=offset,
                                      number_of_results=number_of_results,
@@ -69,12 +69,10 @@ def response(resp):
    # HTTP-Code 401: api-key is not valide
    if resp.status_code == 401:
        raise Exception("API key is not valide")
-        return []

    # HTTP-Code 429: rate limit exceeded
    if resp.status_code == 429:
        raise Exception("rate limit has been exceeded!")
-        return []

    results = []

@@ -1,95 +0,0 @@
#!/usr/bin/env python

# Flickr (Images)
#
# @website     https://www.flickr.com
# @provide-api yes (https://secure.flickr.com/services/api/flickr.photos.search.html)
#
# @using-api   no
# @results     HTML
# @stable      no
# @parse       url, title, thumbnail, img_src

from urllib import urlencode
from json import loads
import re

categories = ['images']

url = 'https://secure.flickr.com/'
search_url = url+'search/?{query}&page={page}'
photo_url = 'https://www.flickr.com/photos/{userid}/{photoid}'
regex = re.compile(r"\"search-photos-models\",\"photos\":(.*}),\"totalItems\":", re.DOTALL)
image_sizes = ('o', 'k', 'h', 'b', 'c', 'z', 'n', 'm', 't', 'q', 's')

paging = True


def build_flickr_url(user_id, photo_id):
    return photo_url.format(userid=user_id, photoid=photo_id)


def request(query, params):
    params['url'] = search_url.format(query=urlencode({'text': query}),
                                      page=params['pageno'])
    return params


def response(resp):
    results = []

    matches = regex.search(resp.text)

    if matches is None:
        return results

    match = matches.group(1)
    search_results = loads(match)

    if '_data' not in search_results:
        return []

    photos = search_results['_data']

    for photo in photos:

        # In paged configuration, the first pages' photos
        # are represented by a None object
        if photo is None:
            continue

        img_src = None
        # From the biggest to the lowest format
        for image_size in image_sizes:
            if image_size in photo['sizes']:
                img_src = photo['sizes'][image_size]['displayUrl']
                break

        if not img_src:
            continue

        if 'id' not in photo['owner']:
            continue

        url = build_flickr_url(photo['owner']['id'], photo['id'])

        title = photo.get('title', '')

        content = '<span class="photo-author">' +\
                  photo['owner']['username'] +\
                  '</span><br />'

        if 'description' in photo:
            content = content +\
                      '<span class="description">' +\
                      photo['description'] +\
                      '</span>'

        # append result
        results.append({'url': url,
                        'title': title,
                        'img_src': img_src,
                        'content': content,
                        'template': 'images.html'})

    return results
Some files were not shown because too many files have changed in this diff