update sources

Adrien Beudin 2015-02-17 12:45:54 +01:00
parent 2357ffbf73
commit d265aca41c
745 changed files with 3627 additions and 52651 deletions


@ -33,3 +33,4 @@ generally made searx better:
- Benjamin Sonntag
- @opi
- @dimqua
- Giorgos Logiotatidis

sources/Dockerfile Normal file

@ -0,0 +1,21 @@
FROM debian:stable
RUN apt-get update && \
apt-get install -y --no-install-recommends \
python-dev python2.7-minimal python-virtualenv \
python-pybabel python-pip zlib1g-dev \
libxml2-dev libxslt1-dev build-essential \
openssl
RUN useradd searx
WORKDIR /app
RUN pip install uwsgi
COPY requirements.txt /app/requirements.txt
RUN pip install -r requirements.txt
COPY . /app
RUN sed -i -e "s/ultrasecretkey/`openssl rand -hex 16`/g" searx/settings.yml
EXPOSE 5000
CMD ["/usr/local/bin/uwsgi", "--uid", "searx", "--gid", "searx", "--http", ":5000", "-w", "searx.webapp"]
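The sed line above swaps the placeholder 'ultrasecretkey' in searx/settings.yml for a random hex string at image build time. A rough Python equivalent of just that step, for illustration only (the helper name is hypothetical; the Dockerfile itself relies on sed and openssl):

import secrets

def randomize_secret_key(path='searx/settings.yml'):
    # replace the shipped placeholder with a fresh 16-byte hex value,
    # mirroring `sed` + `openssl rand -hex 16` above
    with open(path) as f:
        text = f.read()
    with open(path, 'w') as f:
        f.write(text.replace('ultrasecretkey', secrets.token_hex(16)))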


@ -46,7 +46,9 @@ minimal: bin/buildout minimal.cfg setup.py
styles:
	@lessc -x searx/static/themes/default/less/style.less > searx/static/themes/default/css/style.css
	@lessc -x searx/static/themes/default/less/style-rtl.less > searx/static/themes/default/css/style-rtl.css
	@lessc -x searx/static/themes/courgette/less/style.less > searx/static/themes/courgette/css/style.css
	@lessc -x searx/static/themes/courgette/less/style-rtl.less > searx/static/themes/courgette/css/style-rtl.css
	@lessc -x searx/static/less/bootstrap/bootstrap.less > searx/static/css/bootstrap.min.css
	@lessc -x searx/static/themes/oscar/less/oscar/oscar.less > searx/static/themes/oscar/css/oscar.min.css


@ -1,61 +0,0 @@
'''
searx is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
searx is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with searx. If not, see <http://www.gnu.org/licenses/>.
(C) 2013- by Adam Tauber, <asciimoo@gmail.com>
'''
import logging
from os import environ
from os.path import realpath, dirname, join, abspath
try:
from yaml import load
except:
from sys import exit, stderr
stderr.write('[E] install pyyaml\n')
exit(2)
searx_dir = abspath(dirname(__file__))
engine_dir = dirname(realpath(__file__))
# if possible set path to settings using the
# environment variable SEARX_SETTINGS_PATH
if 'SEARX_SETTINGS_PATH' in environ:
settings_path = environ['SEARX_SETTINGS_PATH']
# otherwise using default path
else:
settings_path = join(searx_dir, 'settings.yml')
if 'SEARX_HTTPS_REWRITE_PATH' in environ:
https_rewrite_path = environ['SEARX_HTTPS_REWRITE_PATH']
else:
https_rewrite_path = join(searx_dir, 'https_rules')
# load settings
with open(settings_path) as settings_yaml:
settings = load(settings_yaml)
if settings.get('server', {}).get('debug'):
logging.basicConfig(level=logging.DEBUG)
else:
logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger('searx')
# load https rules only if https rewrite is enabled
if settings.get('server', {}).get('https_rewrite'):
# load https rules
from searx.https_rewrite import load_https_rules
load_https_rules(https_rewrite_path)
logger.info('Initialisation done')
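Because the settings path is resolved at import time, an alternative settings file can be selected through the environment variable read above. A minimal sketch, assuming a valid settings.yml exists at the chosen path:

import os
os.environ['SEARX_SETTINGS_PATH'] = '/etc/searx/settings.yml'  # hypothetical path

import searx  # settings.yml is loaded and logging configured during this import
print(searx.settings.get('server', {}).get('debug'))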


@ -1,162 +0,0 @@
'''
searx is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
searx is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with searx. If not, see <http://www.gnu.org/licenses/>.
(C) 2013- by Adam Tauber, <asciimoo@gmail.com>
'''
from lxml import etree
from json import loads
from urllib import urlencode
from searx.languages import language_codes
from searx.engines import (
categories, engines, engine_shortcuts
)
from searx.poolrequests import get
def searx_bang(full_query):
'''check if the search query contains a bang, and create fitting autocompleter results'''
# check if there is a query which can be parsed
if len(full_query.getSearchQuery()) == 0:
return []
results = []
# check if current query starts with !bang
first_char = full_query.getSearchQuery()[0]
if first_char == '!' or first_char == '?':
if len(full_query.getSearchQuery()) == 1:
# show some example queries
# TODO, check if engine is not available
results.append(first_char + "images")
results.append(first_char + "wikipedia")
results.append(first_char + "osm")
else:
engine_query = full_query.getSearchQuery()[1:]
# check if query starts with category name
for categorie in categories:
if categorie.startswith(engine_query):
results.append(first_char+'{categorie}'.format(categorie=categorie))
# check if query starts with engine name
for engine in engines:
if engine.startswith(engine_query.replace('_', ' ')):
results.append(first_char+'{engine}'.format(engine=engine.replace(' ', '_')))
# check if query starts with engine shortcut
for engine_shortcut in engine_shortcuts:
if engine_shortcut.startswith(engine_query):
results.append(first_char+'{engine_shortcut}'.format(engine_shortcut=engine_shortcut))
# check if current query starts with :bang
elif first_char == ':':
if len(full_query.getSearchQuery()) == 1:
# show some example queries
results.append(":en")
results.append(":en_us")
results.append(":english")
results.append(":united_kingdom")
else:
engine_query = full_query.getSearchQuery()[1:]
for lc in language_codes:
lang_id, lang_name, country = map(str.lower, lc)
# check if query starts with language-id
if lang_id.startswith(engine_query):
if len(engine_query) <= 2:
results.append(':{lang_id}'.format(lang_id=lang_id.split('_')[0]))
else:
results.append(':{lang_id}'.format(lang_id=lang_id))
# check if query starts with language name
if lang_name.startswith(engine_query):
results.append(':{lang_name}'.format(lang_name=lang_name))
# check if query starts with country
if country.startswith(engine_query.replace('_', ' ')):
results.append(':{country}'.format(country=country.replace(' ', '_')))
# remove duplicates
result_set = set(results)
# remove results which are already contained in the query
for query_part in full_query.query_parts:
if query_part in result_set:
result_set.remove(query_part)
# convert result_set back to list
return list(result_set)
def dbpedia(query):
# dbpedia autocompleter
autocomplete_url = 'http://lookup.dbpedia.org/api/search.asmx/KeywordSearch?' # noqa
response = get(autocomplete_url
+ urlencode(dict(QueryString=query)))
results = []
if response.ok:
dom = etree.fromstring(response.content)
results = dom.xpath('//a:Result/a:Label//text()',
namespaces={'a': 'http://lookup.dbpedia.org/'})
return results
def duckduckgo(query):
# duckduckgo autocompleter
url = 'https://ac.duckduckgo.com/ac/?{0}&type=list'
resp = loads(get(url.format(urlencode(dict(q=query)))).text)
if len(resp) > 1:
return resp[1]
return []
def google(query):
# google autocompleter
autocomplete_url = 'http://suggestqueries.google.com/complete/search?client=toolbar&' # noqa
response = get(autocomplete_url
+ urlencode(dict(q=query)))
results = []
if response.ok:
dom = etree.fromstring(response.text)
results = dom.xpath('//suggestion/@data')
return results
def wikipedia(query):
# wikipedia autocompleter
url = 'https://en.wikipedia.org/w/api.php?action=opensearch&{0}&limit=10&namespace=0&format=json' # noqa
resp = loads(get(url.format(urlencode(dict(search=query)))).text)
if len(resp) > 1:
return resp[1]
return []
backends = {'dbpedia': dbpedia,
'duckduckgo': duckduckgo,
'google': google,
'wikipedia': wikipedia
}
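Each backend above maps a plain query string to a list of suggestion strings. A minimal usage sketch, assuming network access and that the module is importable as searx.autocomplete:

from searx.autocomplete import backends

suggestions = backends['duckduckgo']('searx')  # e.g. ['searx', 'searx engine', ...]
print(suggestions[:3])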


@ -1,210 +0,0 @@
'''
searx is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
searx is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with searx. If not, see <http://www.gnu.org/licenses/>.
(C) 2013- by Adam Tauber, <asciimoo@gmail.com>
'''
from os.path import realpath, dirname, splitext, join
import sys
from imp import load_source
from flask.ext.babel import gettext
from operator import itemgetter
from searx import settings
from searx import logger
logger = logger.getChild('engines')
engine_dir = dirname(realpath(__file__))
engines = {}
categories = {'general': []}
engine_shortcuts = {}
def load_module(filename):
modname = splitext(filename)[0]
if modname in sys.modules:
del sys.modules[modname]
filepath = join(engine_dir, filename)
module = load_source(modname, filepath)
module.name = modname
return module
def load_engine(engine_data):
engine_name = engine_data['engine']
engine = load_module(engine_name + '.py')
for param_name in engine_data:
if param_name == 'engine':
continue
if param_name == 'categories':
if engine_data['categories'] == 'none':
engine.categories = []
else:
engine.categories = map(
str.strip, engine_data['categories'].split(','))
continue
setattr(engine, param_name, engine_data[param_name])
if not hasattr(engine, 'paging'):
engine.paging = False
if not hasattr(engine, 'categories'):
engine.categories = ['general']
if not hasattr(engine, 'language_support'):
engine.language_support = True
if not hasattr(engine, 'timeout'):
engine.timeout = settings['server']['request_timeout']
if not hasattr(engine, 'shortcut'):
engine.shortcut = ''
if not hasattr(engine, 'disabled'):
engine.disabled = False
# checking required variables
for engine_attr in dir(engine):
if engine_attr.startswith('_'):
continue
if getattr(engine, engine_attr) is None:
logger.error('Missing engine config attribute: "{0}.{1}"'
.format(engine.name, engine_attr))
sys.exit(1)
engine.stats = {
'result_count': 0,
'search_count': 0,
'page_load_time': 0,
'score_count': 0,
'errors': 0
}
if hasattr(engine, 'categories'):
for category_name in engine.categories:
categories.setdefault(category_name, []).append(engine)
else:
categories['general'].append(engine)
if engine.shortcut:
if engine.shortcut in engine_shortcuts:
logger.error('Engine config error: ambiguous shortcut: {0}'
.format(engine.shortcut))
sys.exit(1)
engine_shortcuts[engine.shortcut] = engine.name
return engine
def get_engines_stats():
# TODO refactor
pageloads = []
results = []
scores = []
errors = []
scores_per_result = []
max_pageload = max_results = max_score = max_errors = max_score_per_result = 0 # noqa
for engine in engines.values():
if engine.stats['search_count'] == 0:
continue
results_num = \
engine.stats['result_count'] / float(engine.stats['search_count'])
load_times = engine.stats['page_load_time'] / float(engine.stats['search_count']) # noqa
if results_num:
score = engine.stats['score_count'] / float(engine.stats['search_count']) # noqa
score_per_result = score / results_num
else:
score = score_per_result = 0.0
max_results = max(results_num, max_results)
max_pageload = max(load_times, max_pageload)
max_score = max(score, max_score)
max_score_per_result = max(score_per_result, max_score_per_result)
max_errors = max(max_errors, engine.stats['errors'])
pageloads.append({'avg': load_times, 'name': engine.name})
results.append({'avg': results_num, 'name': engine.name})
scores.append({'avg': score, 'name': engine.name})
errors.append({'avg': engine.stats['errors'], 'name': engine.name})
scores_per_result.append({
'avg': score_per_result,
'name': engine.name
})
for engine in pageloads:
if max_pageload:
engine['percentage'] = int(engine['avg'] / max_pageload * 100)
else:
engine['percentage'] = 0
for engine in results:
if max_results:
engine['percentage'] = int(engine['avg'] / max_results * 100)
else:
engine['percentage'] = 0
for engine in scores:
if max_score:
engine['percentage'] = int(engine['avg'] / max_score * 100)
else:
engine['percentage'] = 0
for engine in scores_per_result:
if max_score_per_result:
engine['percentage'] = int(engine['avg']
/ max_score_per_result * 100)
else:
engine['percentage'] = 0
for engine in errors:
if max_errors:
engine['percentage'] = int(float(engine['avg']) / max_errors * 100)
else:
engine['percentage'] = 0
return [
(
gettext('Page loads (sec)'),
sorted(pageloads, key=itemgetter('avg'))
),
(
gettext('Number of results'),
sorted(results, key=itemgetter('avg'), reverse=True)
),
(
gettext('Scores'),
sorted(scores, key=itemgetter('avg'), reverse=True)
),
(
gettext('Scores per result'),
sorted(scores_per_result, key=itemgetter('avg'), reverse=True)
),
(
gettext('Errors'),
sorted(errors, key=itemgetter('avg'), reverse=True)
),
]
if 'engines' not in settings or not settings['engines']:
logger.error('No engines found. Edit your settings.yml')
exit(2)
for engine_data in settings['engines']:
engine = load_engine(engine_data)
engines[engine.name] = engine
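For reference, load_engine() consumes one entry of settings['engines']. A minimal entry, written here as the dict the YAML parser would produce (values are illustrative; the 'engine' name must match a file in searx/engines/):

engine_data = {
    'name': 'duckduckgo',
    'engine': 'duckduckgo',   # loads searx/engines/duckduckgo.py
    'shortcut': 'ddg',
    'categories': 'general',
}
engine = load_engine(engine_data)
# load_engine() fills in missing defaults (paging, timeout, an empty stats dict)
# and registers the engine under its categories and its shortcut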


@ -1,84 +0,0 @@
## Bing (Web)
#
# @website https://www.bing.com
# @provide-api yes (http://datamarket.azure.com/dataset/bing/search),
# max. 5000 query/month
#
# @using-api no (because of query limit)
# @results HTML (using search portal)
# @stable no (HTML can change)
# @parse url, title, content
#
# @todo publishedDate
from urllib import urlencode
from cgi import escape
from lxml import html
from searx.engines.xpath import extract_text
# engine dependent config
categories = ['general']
paging = True
language_support = True
# search-url
base_url = 'https://www.bing.com/'
search_string = 'search?{query}&first={offset}'
# do search-request
def request(query, params):
offset = (params['pageno'] - 1) * 10 + 1
if params['language'] == 'all':
language = 'en-US'
else:
language = params['language'].replace('_', '-')
search_path = search_string.format(
query=urlencode({'q': query, 'setmkt': language}),
offset=offset)
params['cookies']['SRCHHPGUSR'] = \
'NEWWND=0&NRSLT=-1&SRCHLANG=' + language.split('-')[0]
params['url'] = base_url + search_path
return params
# get response from search-request
def response(resp):
results = []
dom = html.fromstring(resp.content)
# parse results
for result in dom.xpath('//div[@class="sa_cc"]'):
link = result.xpath('.//h3/a')[0]
url = link.attrib.get('href')
title = extract_text(link)
content = escape(extract_text(result.xpath('.//p')))
# append result
results.append({'url': url,
'title': title,
'content': content})
# return results if something is found
if results:
return results
# parse results again if nothing is found yet
for result in dom.xpath('//li[@class="b_algo"]'):
link = result.xpath('.//h2/a')[0]
url = link.attrib.get('href')
title = extract_text(link)
content = escape(extract_text(result.xpath('.//p')))
# append result
results.append({'url': url,
'title': title,
'content': content})
# return results
return results
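The request()/response() pair above follows the common searx engine contract: request() only fills in the outgoing params (URL, cookies), and response() turns the fetched page into result dicts. A simplified, hypothetical driver to show the flow (searx's real search loop uses its own request pool and error handling):

import requests

params = {'pageno': 2, 'language': 'fr_FR', 'cookies': {}, 'headers': {}}
params = request('free software', params)           # builds the Bing URL for page 2
resp = requests.get(params['url'], cookies=params['cookies'])
for r in response(resp):                            # list of {'url', 'title', 'content'}
    print(r['title'], r['url'])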


@ -1,96 +0,0 @@
## Bing (Images)
#
# @website https://www.bing.com/images
# @provide-api yes (http://datamarket.azure.com/dataset/bing/search),
# max. 5000 query/month
#
# @using-api no (because of query limit)
# @results HTML (using search portal)
# @stable no (HTML can change)
# @parse url, title, img_src
#
# @todo currently up to 35 images are received per page,
# because bing does not respect count=10;
# the response is therefore limited to 10 images below
from urllib import urlencode
from lxml import html
from yaml import load
import re
# engine dependent config
categories = ['images']
paging = True
safesearch = True
# search-url
base_url = 'https://www.bing.com/'
search_string = 'images/search?{query}&count=10&first={offset}'
thumb_url = "http://ts1.mm.bing.net/th?id={ihk}"
# safesearch definitions
safesearch_types = {2: 'STRICT',
1: 'DEMOTE',
0: 'OFF'}
# do search-request
def request(query, params):
offset = (params['pageno'] - 1) * 10 + 1
# required for cookie
if params['language'] == 'all':
language = 'en-US'
else:
language = params['language'].replace('_', '-')
search_path = search_string.format(
query=urlencode({'q': query}),
offset=offset)
params['cookies']['SRCHHPGUSR'] = \
'NEWWND=0&NRSLT=-1&SRCHLANG=' + language.split('-')[0] +\
'&ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE')
params['url'] = base_url + search_path
return params
# get response from search-request
def response(resp):
results = []
dom = html.fromstring(resp.content)
# init regex for yaml-parsing
p = re.compile('({|,)([a-z]+):(")')
# parse results
for result in dom.xpath('//div[@class="dg_u"]'):
link = result.xpath('./a')[0]
# parse yaml-data (a space has to be added after each colon to make it parsable)
yaml_data = load(p.sub(r'\1\2: \3', link.attrib.get('m')))
title = link.attrib.get('t1')
ihk = link.attrib.get('ihk')
#url = 'http://' + link.attrib.get('t3')
url = yaml_data.get('surl')
img_src = yaml_data.get('imgurl')
# append result
results.append({'template': 'images.html',
'url': url,
'title': title,
'content': '',
'thumbnail_src': thumb_url.format(ihk=ihk),
'img_src': img_src})
# TODO stop parsing if 10 images are found
if len(results) >= 10:
break
# return results
return results
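The regex substitution above exists because the 'm' attribute holds JSON-ish data without spaces after the colons; roughly, with an illustrative attribute value:

import re
from yaml import load

p = re.compile('({|,)([a-z]+):(")')
raw = '{surl:"http://example.org/page",imgurl:"http://example.org/img.jpg"}'  # illustrative
fixed = p.sub(r'\1\2: \3', raw)   # '{surl: "http://...",imgurl: "http://..."}'
data = load(fixed)                # {'surl': 'http://...', 'imgurl': 'http://...'}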


@ -1,98 +0,0 @@
## Bing (News)
#
# @website https://www.bing.com/news
# @provide-api yes (http://datamarket.azure.com/dataset/bing/search),
# max. 5000 query/month
#
# @using-api no (because of query limit)
# @results HTML (using search portal)
# @stable no (HTML can change)
# @parse url, title, content, publishedDate
from urllib import urlencode
from cgi import escape
from lxml import html
from datetime import datetime, timedelta
from dateutil import parser
import re
from searx.engines.xpath import extract_text
# engine dependent config
categories = ['news']
paging = True
language_support = True
# search-url
base_url = 'https://www.bing.com/'
search_string = 'news/search?{query}&first={offset}'
# do search-request
def request(query, params):
offset = (params['pageno'] - 1) * 10 + 1
if params['language'] == 'all':
language = 'en-US'
else:
language = params['language'].replace('_', '-')
search_path = search_string.format(
query=urlencode({'q': query, 'setmkt': language}),
offset=offset)
params['cookies']['_FP'] = "ui=en-US"
params['url'] = base_url + search_path
return params
# get response from search-request
def response(resp):
results = []
dom = html.fromstring(resp.content)
# parse results
for result in dom.xpath('//div[@class="sn_r"]'):
link = result.xpath('.//div[@class="newstitle"]/a')[0]
url = link.attrib.get('href')
title = extract_text(link)
contentXPath = result.xpath('.//div[@class="sn_txt"]/div//span[@class="sn_snip"]')
content = escape(extract_text(contentXPath))
# parse publishedDate
publishedDateXPath = result.xpath('.//div[@class="sn_txt"]/div'
'//span[contains(@class,"sn_ST")]'
'//span[contains(@class,"sn_tm")]')
publishedDate = escape(extract_text(publishedDateXPath))
if re.match("^[0-9]+ minute(s|) ago$", publishedDate):
timeNumbers = re.findall(r'\d+', publishedDate)
publishedDate = datetime.now() - timedelta(minutes=int(timeNumbers[0]))
elif re.match("^[0-9]+ hour(s|) ago$", publishedDate):
timeNumbers = re.findall(r'\d+', publishedDate)
publishedDate = datetime.now() - timedelta(hours=int(timeNumbers[0]))
elif re.match("^[0-9]+ hour(s|), [0-9]+ minute(s|) ago$", publishedDate):
timeNumbers = re.findall(r'\d+', publishedDate)
publishedDate = datetime.now()\
- timedelta(hours=int(timeNumbers[0]))\
- timedelta(minutes=int(timeNumbers[1]))
elif re.match("^[0-9]+ day(s|) ago$", publishedDate):
timeNumbers = re.findall(r'\d+', publishedDate)
publishedDate = datetime.now() - timedelta(days=int(timeNumbers[0]))
else:
try:
publishedDate = parser.parse(publishedDate, dayfirst=False)
except TypeError:
publishedDate = datetime.now()
# append result
results.append({'url': url,
'title': title,
'publishedDate': publishedDate,
'content': content})
# return results
return results
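The branch chain above normalises Bing's relative timestamps; informally, the mapping it implements:

#   '5 minutes ago'           -> datetime.now() - timedelta(minutes=5)
#   '2 hours ago'             -> datetime.now() - timedelta(hours=2)
#   '1 hour, 30 minutes ago'  -> datetime.now() - timedelta(hours=1, minutes=30)
#   '3 days ago'              -> datetime.now() - timedelta(days=3)
#   any other string          -> dateutil.parser.parse(), or datetime.now() on TypeError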


@ -1,104 +0,0 @@
## BTDigg (Videos, Music, Files)
#
# @website https://btdigg.org
# @provide-api yes (on demand)
#
# @using-api no
# @results HTML (using search portal)
# @stable no (HTML can change)
# @parse url, title, content, seed, leech, magnetlink
from urlparse import urljoin
from cgi import escape
from urllib import quote
from lxml import html
from operator import itemgetter
from searx.engines.xpath import extract_text
# engine dependent config
categories = ['videos', 'music', 'files']
paging = True
# search-url
url = 'https://btdigg.org'
search_url = url + '/search?q={search_term}&p={pageno}'
# do search-request
def request(query, params):
params['url'] = search_url.format(search_term=quote(query),
pageno=params['pageno']-1)
return params
# get response from search-request
def response(resp):
results = []
dom = html.fromstring(resp.text)
search_res = dom.xpath('//div[@id="search_res"]/table/tr')
# return empty array if nothing is found
if not search_res:
return []
# parse results
for result in search_res:
link = result.xpath('.//td[@class="torrent_name"]//a')[0]
href = urljoin(url, link.attrib.get('href'))
title = escape(extract_text(link))
content = escape(extract_text(result.xpath('.//pre[@class="snippet"]')[0]))
content = "<br />".join(content.split("\n"))
filesize = result.xpath('.//span[@class="attr_val"]/text()')[0].split()[0]
filesize_multiplier = result.xpath('.//span[@class="attr_val"]/text()')[0].split()[1]
files = result.xpath('.//span[@class="attr_val"]/text()')[1]
seed = result.xpath('.//span[@class="attr_val"]/text()')[2]
# convert seed to int if possible
if seed.isdigit():
seed = int(seed)
else:
seed = 0
leech = 0
# convert filesize to byte if possible
try:
filesize = float(filesize)
# convert filesize to byte
if filesize_multiplier == 'TB':
filesize = int(filesize * 1024 * 1024 * 1024 * 1024)
elif filesize_multiplier == 'GB':
filesize = int(filesize * 1024 * 1024 * 1024)
elif filesize_multiplier == 'MB':
filesize = int(filesize * 1024 * 1024)
elif filesize_multiplier == 'KB':
filesize = int(filesize * 1024)
except:
filesize = None
# convert files to int if possible
if files.isdigit():
files = int(files)
else:
files = None
magnetlink = result.xpath('.//td[@class="ttth"]//a')[0].attrib['href']
# append result
results.append({'url': href,
'title': title,
'content': content,
'seed': seed,
'leech': leech,
'filesize': filesize,
'files': files,
'magnetlink': magnetlink,
'template': 'torrent.html'})
# return results sorted by seeder
return sorted(results, key=itemgetter('seed'), reverse=True)
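The unit handling in the try block above amounts to a power-of-1024 conversion; the same logic as a standalone helper (hypothetical name, for illustration):

def to_bytes(value, unit):
    # '1.5', 'GB' -> 1610612736; unknown units fall back to the raw number
    multipliers = {'KB': 1024, 'MB': 1024 ** 2, 'GB': 1024 ** 3, 'TB': 1024 ** 4}
    return int(float(value) * multipliers.get(unit, 1))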


@ -1,57 +0,0 @@
from datetime import datetime
import re
categories = []
url = 'http://finance.yahoo.com/d/quotes.csv?e=.csv&f=sl1d1t1&s={query}=X'
weight = 100
parser_re = re.compile(r'^\W*(\d+(?:\.\d+)?)\W*([a-z]{3})\W*(?:in)?\W*([a-z]{3})\W*$', re.I) # noqa
def request(query, params):
m = parser_re.match(query)
if not m:
# wrong query
return params
ammount, from_currency, to_currency = m.groups()
ammount = float(ammount)
q = (from_currency + to_currency).upper()
params['url'] = url.format(query=q)
params['ammount'] = ammount
params['from'] = from_currency
params['to'] = to_currency
return params
def response(resp):
results = []
try:
_, conversion_rate, _ = resp.text.split(',', 2)
conversion_rate = float(conversion_rate)
except:
return results
answer = '{0} {1} = {2} {3} (1 {1} = {4} {3})'.format(
resp.search_params['ammount'],
resp.search_params['from'],
resp.search_params['ammount'] * conversion_rate,
resp.search_params['to'],
conversion_rate
)
now_date = datetime.now().strftime('%Y%m%d')
url = 'http://finance.yahoo.com/currency/converter-results/{0}/{1}-{2}-to-{3}.html' # noqa
url = url.format(
now_date,
resp.search_params['ammount'],
resp.search_params['from'].lower(),
resp.search_params['to'].lower()
)
results.append({'answer': answer, 'url': url})
return results
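A worked example of what parser_re and the answer template above produce (using parser_re defined above; the conversion rate itself comes from the Yahoo CSV response, 1.25 is an illustrative value):

m = parser_re.match('100 eur in usd')
# m.groups() == ('100', 'eur', 'usd')
# with conversion_rate = 1.25 the answer string becomes
# '100.0 eur = 125.0 usd (1 eur = 1.25 usd)'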


@ -1,72 +0,0 @@
## Dailymotion (Videos)
#
# @website https://www.dailymotion.com
# @provide-api yes (http://www.dailymotion.com/developer)
#
# @using-api yes
# @results JSON
# @stable yes
# @parse url, title, thumbnail, publishedDate, embedded
#
# @todo set content-parameter with correct data
from urllib import urlencode
from json import loads
from cgi import escape
from datetime import datetime
# engine dependent config
categories = ['videos']
paging = True
language_support = True
# search-url
# see http://www.dailymotion.com/doc/api/obj-video.html
search_url = 'https://api.dailymotion.com/videos?fields=created_time,title,description,duration,url,thumbnail_360_url,id&sort=relevance&limit=5&page={pageno}&{query}' # noqa
embedded_url = '<iframe frameborder="0" width="540" height="304" ' +\
'data-src="//www.dailymotion.com/embed/video/{videoid}" allowfullscreen></iframe>'
# do search-request
def request(query, params):
if params['language'] == 'all':
locale = 'en-US'
else:
locale = params['language']
params['url'] = search_url.format(
query=urlencode({'search': query, 'localization': locale}),
pageno=params['pageno'])
return params
# get response from search-request
def response(resp):
results = []
search_res = loads(resp.text)
# return empty array if there are no results
if not 'list' in search_res:
return []
# parse results
for res in search_res['list']:
title = res['title']
url = res['url']
content = escape(res['description'])
thumbnail = res['thumbnail_360_url']
publishedDate = datetime.fromtimestamp(res['created_time'], None)
embedded = embedded_url.format(videoid=res['id'])
results.append({'template': 'videos.html',
'url': url,
'title': title,
'content': content,
'publishedDate': publishedDate,
'embedded': embedded,
'thumbnail': thumbnail})
# return results
return results


@ -1,61 +0,0 @@
## Deezer (Music)
#
# @website https://deezer.com
# @provide-api yes (http://developers.deezer.com/api/)
#
# @using-api yes
# @results JSON
# @stable yes
# @parse url, title, content, embedded
from json import loads
from urllib import urlencode
# engine dependent config
categories = ['music']
paging = True
# search-url
url = 'http://api.deezer.com/'
search_url = url + 'search?{query}&index={offset}'
embedded_url = '<iframe scrolling="no" frameborder="0" allowTransparency="true" ' +\
'data-src="http://www.deezer.com/plugins/player?type=tracks&id={audioid}" ' +\
'width="540" height="80"></iframe>'
# do search-request
def request(query, params):
offset = (params['pageno'] - 1) * 25
params['url'] = search_url.format(query=urlencode({'q': query}),
offset=offset)
return params
# get response from search-request
def response(resp):
results = []
search_res = loads(resp.text)
# parse results
for result in search_res.get('data', []):
if result['type'] == 'track':
title = result['title']
url = result['link']
content = result['artist']['name'] +\
" &bull; " +\
result['album']['title'] +\
" &bull; " + result['title']
embedded = embedded_url.format(audioid=result['id'])
# append result
results.append({'url': url,
'title': title,
'embedded': embedded,
'content': content})
# return results
return results


@ -1,67 +0,0 @@
## Deviantart (Images)
#
# @website https://www.deviantart.com/
# @provide-api yes (https://www.deviantart.com/developers/) (RSS)
#
# @using-api no (TODO, rewrite to api)
# @results HTML
# @stable no (HTML can change)
# @parse url, title, thumbnail_src, img_src
#
# @todo rewrite to api
from urllib import urlencode
from urlparse import urljoin
from lxml import html
import re
from searx.engines.xpath import extract_text
# engine dependent config
categories = ['images']
paging = True
# search-url
base_url = 'https://www.deviantart.com/'
search_url = base_url+'search?offset={offset}&{query}'
# do search-request
def request(query, params):
offset = (params['pageno'] - 1) * 24
params['url'] = search_url.format(offset=offset,
query=urlencode({'q': query}))
return params
# get response from search-request
def response(resp):
results = []
# return empty array if a redirection code is returned
if resp.status_code == 302:
return []
dom = html.fromstring(resp.text)
regex = re.compile('\/200H\/')
# parse results
for result in dom.xpath('//div[contains(@class, "tt-a tt-fh")]'):
link = result.xpath('.//a[contains(@class, "thumb")]')[0]
url = urljoin(base_url, link.attrib.get('href'))
title_links = result.xpath('.//span[@class="details"]//a[contains(@class, "t")]')
title = extract_text(title_links[0])
thumbnail_src = link.xpath('.//img')[0].attrib.get('src')
img_src = regex.sub('/', thumbnail_src)
# append result
results.append({'url': url,
'title': title,
'img_src': img_src,
'thumbnail_src': thumbnail_src,
'template': 'images.html'})
# return results
return results


@ -1,70 +0,0 @@
## Digg (News, Social media)
#
# @website https://digg.com/
# @provide-api no
#
# @using-api no
# @results HTML (using search portal)
# @stable no (HTML can change)
# @parse url, title, content, publishedDate, thumbnail
from urllib import quote_plus
from json import loads
from lxml import html
from cgi import escape
from dateutil import parser
# engine dependent config
categories = ['news', 'social media']
paging = True
# search-url
base_url = 'https://digg.com/'
search_url = base_url+'api/search/{query}.json?position={position}&format=html'
# specific xpath variables
results_xpath = '//article'
link_xpath = './/small[@class="time"]//a'
title_xpath = './/h2//a//text()'
content_xpath = './/p//text()'
pubdate_xpath = './/time'
# do search-request
def request(query, params):
offset = (params['pageno'] - 1) * 10
params['url'] = search_url.format(position=offset,
query=quote_plus(query))
return params
# get response from search-request
def response(resp):
results = []
search_result = loads(resp.text)
if 'html' not in search_result or search_result['html'] == '':
return results
dom = html.fromstring(search_result['html'])
# parse results
for result in dom.xpath(results_xpath):
url = result.attrib.get('data-contenturl')
thumbnail = result.xpath('.//img')[0].attrib.get('src')
title = ''.join(result.xpath(title_xpath))
content = escape(''.join(result.xpath(content_xpath)))
pubdate = result.xpath(pubdate_xpath)[0].attrib.get('datetime')
publishedDate = parser.parse(pubdate)
# append result
results.append({'url': url,
'title': title,
'content': content,
'template': 'videos.html',
'publishedDate': publishedDate,
'thumbnail': thumbnail})
# return results
return results


@ -1,76 +0,0 @@
## DuckDuckGo (Web)
#
# @website https://duckduckgo.com/
# @provide-api yes (https://duckduckgo.com/api),
# but not all results from search-site
#
# @using-api no
# @results HTML (using search portal)
# @stable no (HTML can change)
# @parse url, title, content
#
# @todo rewrite to api
# @todo language support
# (the current used site does not support language-change)
from urllib import urlencode
from lxml.html import fromstring
from searx.engines.xpath import extract_text
# engine dependent config
categories = ['general']
paging = True
language_support = True
# search-url
url = 'https://duckduckgo.com/html?{query}&s={offset}'
# specific xpath variables
result_xpath = '//div[@class="results_links results_links_deep web-result"]' # noqa
url_xpath = './/a[@class="large"]/@href'
title_xpath = './/a[@class="large"]'
content_xpath = './/div[@class="snippet"]'
# do search-request
def request(query, params):
offset = (params['pageno'] - 1) * 30
if params['language'] == 'all':
locale = 'en-us'
else:
locale = params['language'].replace('_', '-').lower()
params['url'] = url.format(
query=urlencode({'q': query, 'kl': locale}),
offset=offset)
return params
# get response from search-request
def response(resp):
results = []
doc = fromstring(resp.text)
# parse results
for r in doc.xpath(result_xpath):
try:
res_url = r.xpath(url_xpath)[-1]
except:
continue
if not res_url:
continue
title = extract_text(r.xpath(title_xpath))
content = extract_text(r.xpath(content_xpath))
# append result
results.append({'title': title,
'content': content,
'url': res_url})
# return results
return results


@ -1,149 +0,0 @@
import json
from urllib import urlencode
from lxml import html
from searx.utils import html_to_text
from searx.engines.xpath import extract_text
url = 'https://api.duckduckgo.com/'\
+ '?{query}&format=json&pretty=0&no_redirect=1&d=1'
def result_to_text(url, text, htmlResult):
# TODO : remove result ending with "Meaning" or "Category"
dom = html.fromstring(htmlResult)
a = dom.xpath('//a')
if len(a) >= 1:
return extract_text(a[0])
else:
return text
def request(query, params):
# TODO add kl={locale}
params['url'] = url.format(query=urlencode({'q': query}))
return params
def response(resp):
results = []
search_res = json.loads(resp.text)
content = ''
heading = search_res.get('Heading', '')
attributes = []
urls = []
infobox_id = None
relatedTopics = []
# add answer if there is one
answer = search_res.get('Answer', '')
if answer != '':
results.append({'answer': html_to_text(answer)})
# add infobox
if 'Definition' in search_res:
content = content + search_res.get('Definition', '')
if 'Abstract' in search_res:
content = content + search_res.get('Abstract', '')
# image
image = search_res.get('Image', '')
image = None if image == '' else image
# attributes
if 'Infobox' in search_res:
infobox = search_res.get('Infobox', None)
if 'content' in infobox:
for info in infobox.get('content'):
attributes.append({'label': info.get('label'),
'value': info.get('value')})
# urls
for ddg_result in search_res.get('Results', []):
if 'FirstURL' in ddg_result:
firstURL = ddg_result.get('FirstURL', '')
text = ddg_result.get('Text', '')
urls.append({'title': text, 'url': firstURL})
results.append({'title': heading, 'url': firstURL})
# related topics
for ddg_result in search_res.get('RelatedTopics', []):
if 'FirstURL' in ddg_result:
suggestion = result_to_text(ddg_result.get('FirstURL', None),
ddg_result.get('Text', None),
ddg_result.get('Result', None))
if suggestion != heading:
results.append({'suggestion': suggestion})
elif 'Topics' in ddg_result:
suggestions = []
relatedTopics.append({'name': ddg_result.get('Name', ''),
'suggestions': suggestions})
for topic_result in ddg_result.get('Topics', []):
suggestion = result_to_text(topic_result.get('FirstURL', None),
topic_result.get('Text', None),
topic_result.get('Result', None))
if suggestion != heading:
suggestions.append(suggestion)
# abstract
abstractURL = search_res.get('AbstractURL', '')
if abstractURL != '':
# add as result ? problem always in english
infobox_id = abstractURL
urls.append({'title': search_res.get('AbstractSource'),
'url': abstractURL})
# definition
definitionURL = search_res.get('DefinitionURL', '')
if definitionURL != '':
# add as result ? as answer ? problem always in english
infobox_id = definitionURL
urls.append({'title': search_res.get('DefinitionSource'),
'url': definitionURL})
# entity
entity = search_res.get('Entity', None)
# TODO continent / country / department / location / waterfall /
# mountain range :
# link to map search, get weather, near by locations
# TODO musician : link to music search
# TODO concert tour : ??
# TODO film / actor / television / media franchise :
# links to IMDB / rottentomatoes (or scrape result)
# TODO music : link to musicbrainz / last.fm
# TODO book : ??
# TODO artist / playwright : ??
# TODO company : ??
# TODO software / os : ??
# TODO software engineer : ??
# TODO prepared food : ??
# TODO website : ??
# TODO performing art : ??
# TODO prepared food : ??
# TODO programming language : ??
# TODO file format : ??
if len(heading) > 0:
# TODO get infobox.meta.value where .label='article_title'
if image is None and len(attributes) == 0 and len(urls) == 1 and\
len(relatedTopics) == 0 and len(content) == 0:
results.append({
'url': urls[0]['url'],
'title': heading,
'content': content
})
else:
results.append({
'infobox': heading,
'id': infobox_id,
'entity': entity,
'content': content,
'img_src': image,
'attributes': attributes,
'urls': urls,
'relatedTopics': relatedTopics
})
return results


@ -1,14 +0,0 @@
## Dummy
#
# @results empty array
# @stable yes
# do search-request
def request(query, params):
return params
# get response from search-request
def response(resp):
return []


@ -1,114 +0,0 @@
## Faroo (Web, News)
#
# @website http://www.faroo.com
# @provide-api yes (http://www.faroo.com/hp/api/api.html), require API-key
#
# @using-api yes
# @results JSON
# @stable yes
# @parse url, title, content, publishedDate, img_src
from urllib import urlencode
from json import loads
import datetime
from searx.utils import searx_useragent
# engine dependent config
categories = ['general', 'news']
paging = True
language_support = True
number_of_results = 10
api_key = None
# search-url
url = 'http://www.faroo.com/'
search_url = url + 'api?{query}'\
'&start={offset}'\
'&length={number_of_results}'\
'&l={language}'\
'&src={categorie}'\
'&i=false'\
'&f=json'\
'&key={api_key}' # noqa
search_category = {'general': 'web',
'news': 'news'}
# do search-request
def request(query, params):
offset = (params['pageno'] - 1) * number_of_results + 1
categorie = search_category.get(params['category'], 'web')
if params['language'] == 'all':
language = 'en'
else:
language = params['language'].split('_')[0]
# if language is not supported, put it in english
if language != 'en' and\
language != 'de' and\
language != 'zh':
language = 'en'
params['url'] = search_url.format(offset=offset,
number_of_results=number_of_results,
query=urlencode({'q': query}),
language=language,
categorie=categorie,
api_key=api_key)
# using searx User-Agent
params['headers']['User-Agent'] = searx_useragent()
return params
# get response from search-request
def response(resp):
# HTTP-Code 401: api-key is not valid
if resp.status_code == 401:
raise Exception("API key is not valid")
# HTTP-Code 429: rate limit exceeded
if resp.status_code == 429:
raise Exception("rate limit has been exceeded!")
results = []
search_res = loads(resp.text)
# return empty array if there are no results
if not search_res.get('results', {}):
return []
# parse results
for result in search_res['results']:
if result['news']:
# timestamp (milliseconds since 1970)
publishedDate = datetime.datetime.fromtimestamp(result['date']/1000.0) # noqa
# append news result
results.append({'url': result['url'],
'title': result['title'],
'publishedDate': publishedDate,
'content': result['kwic']})
else:
# append general result
# TODO, publishedDate correct?
results.append({'url': result['url'],
'title': result['title'],
'content': result['kwic']})
# append image result if image url is set
# TODO, show results with an image like in faroo
if result['iurl']:
results.append({'template': 'images.html',
'url': result['url'],
'title': result['title'],
'content': result['kwic'],
'img_src': result['iurl']})
# return results
return results


@ -1,84 +0,0 @@
from urllib import urlencode
from HTMLParser import HTMLParser
url = 'http://www.filecrop.com/'
search_url = url + '/search.php?{query}&size_i=0&size_f=100000000&engine_r=1&engine_d=1&engine_e=1&engine_4=1&engine_m=1&pos={index}' # noqa
paging = True
class FilecropResultParser(HTMLParser):
def __init__(self):
HTMLParser.__init__(self)
self.__start_processing = False
self.results = []
self.result = {}
self.tr_counter = 0
self.data_counter = 0
def handle_starttag(self, tag, attrs):
if tag == 'tr':
if ('bgcolor', '#edeff5') in attrs or\
('bgcolor', '#ffffff') in attrs:
self.__start_processing = True
if not self.__start_processing:
return
if tag == 'label':
self.result['title'] = [attr[1] for attr in attrs
if attr[0] == 'title'][0]
elif tag == 'a' and ('rel', 'nofollow') in attrs\
and ('class', 'sourcelink') in attrs:
if 'content' in self.result:
self.result['content'] += [attr[1] for attr in attrs
if attr[0] == 'title'][0]
else:
self.result['content'] = [attr[1] for attr in attrs
if attr[0] == 'title'][0]
self.result['content'] += ' '
elif tag == 'a':
self.result['url'] = url + [attr[1] for attr in attrs
if attr[0] == 'href'][0]
def handle_endtag(self, tag):
if self.__start_processing is False:
return
if tag == 'tr':
self.tr_counter += 1
if self.tr_counter == 2:
self.__start_processing = False
self.tr_counter = 0
self.data_counter = 0
self.results.append(self.result)
self.result = {}
def handle_data(self, data):
if not self.__start_processing:
return
if 'content' in self.result:
self.result['content'] += data + ' '
else:
self.result['content'] = data + ' '
self.data_counter += 1
def request(query, params):
index = 1 + (params['pageno'] - 1) * 30
params['url'] = search_url.format(query=urlencode({'w': query}),
index=index)
return params
def response(resp):
parser = FilecropResultParser()
parser.feed(resp.text)
return parser.results


@ -1,96 +0,0 @@
#!/usr/bin/env python
## Flickr (Images)
#
# @website https://www.flickr.com
# @provide-api yes (https://secure.flickr.com/services/api/flickr.photos.search.html)
#
# @using-api yes
# @results JSON
# @stable yes
# @parse url, title, thumbnail, img_src
#More info on api-key : https://www.flickr.com/services/apps/create/
from urllib import urlencode
from json import loads
categories = ['images']
nb_per_page = 15
paging = True
api_key = None
url = 'https://api.flickr.com/services/rest/?method=flickr.photos.search' +\
'&api_key={api_key}&{text}&sort=relevance' +\
'&extras=description%2C+owner_name%2C+url_o%2C+url_n%2C+url_z' +\
'&per_page={nb_per_page}&format=json&nojsoncallback=1&page={page}'
photo_url = 'https://www.flickr.com/photos/{userid}/{photoid}'
paging = True
def build_flickr_url(user_id, photo_id):
return photo_url.format(userid=user_id, photoid=photo_id)
def request(query, params):
params['url'] = url.format(text=urlencode({'text': query}),
api_key=api_key,
nb_per_page=nb_per_page,
page=params['pageno'])
return params
def response(resp):
results = []
search_results = loads(resp.text)
# return empty array if there are no results
if not 'photos' in search_results:
return []
if not 'photo' in search_results['photos']:
return []
photos = search_results['photos']['photo']
# parse results
for photo in photos:
if 'url_o' in photo:
img_src = photo['url_o']
elif 'url_z' in photo:
img_src = photo['url_z']
else:
continue
# For a bigger thumbnail, keep only the url_z, not the url_n
if 'url_n' in photo:
thumbnail_src = photo['url_n']
elif 'url_z' in photo:
thumbnail_src = photo['url_z']
else:
thumbnail_src = img_src
url = build_flickr_url(photo['owner'], photo['id'])
title = photo['title']
content = '<span class="photo-author">' +\
photo['ownername'] +\
'</span><br />' +\
'<span class="description">' +\
photo['description']['_content'] +\
'</span>'
# append result
results.append({'url': url,
'title': title,
'img_src': img_src,
'thumbnail_src': thumbnail_src,
'content': content,
'template': 'images.html'})
# return results
return results


@ -1,109 +0,0 @@
#!/usr/bin/env python
# Flickr (Images)
#
# @website https://www.flickr.com
# @provide-api yes (https://secure.flickr.com/services/api/flickr.photos.search.html)
#
# @using-api no
# @results HTML
# @stable no
# @parse url, title, thumbnail, img_src
from urllib import urlencode
from json import loads
import re
from searx.engines import logger
logger = logger.getChild('flickr-noapi')
categories = ['images']
url = 'https://secure.flickr.com/'
search_url = url + 'search/?{query}&page={page}'
photo_url = 'https://www.flickr.com/photos/{userid}/{photoid}'
regex = re.compile(r"\"search-photos-models\",\"photos\":(.*}),\"totalItems\":", re.DOTALL)
image_sizes = ('o', 'k', 'h', 'b', 'c', 'z', 'n', 'm', 't', 'q', 's')
paging = True
def build_flickr_url(user_id, photo_id):
return photo_url.format(userid=user_id, photoid=photo_id)
def request(query, params):
params['url'] = search_url.format(query=urlencode({'text': query}),
page=params['pageno'])
return params
def response(resp):
results = []
matches = regex.search(resp.text)
if matches is None:
return results
match = matches.group(1)
search_results = loads(match)
if '_data' not in search_results:
return []
photos = search_results['_data']
for photo in photos:
# In paged configuration, the first pages' photos
# are represented by a None object
if photo is None:
continue
img_src = None
# From the biggest to the lowest format
for image_size in image_sizes:
if image_size in photo['sizes']:
img_src = photo['sizes'][image_size]['url']
break
if not img_src:
logger.debug('cannot find valid image size: {0}'.format(repr(photo)))
continue
if 'id' not in photo['owner']:
continue
# For a bigger thumbnail, keep only the url_z, not the url_n
if 'n' in photo['sizes']:
thumbnail_src = photo['sizes']['n']['url']
elif 'z' in photo['sizes']:
thumbnail_src = photo['sizes']['z']['url']
else:
thumbnail_src = img_src
url = build_flickr_url(photo['owner']['id'], photo['id'])
title = photo.get('title', '')
content = '<span class="photo-author">' +\
photo['owner']['username'] +\
'</span><br />'
if 'description' in photo:
content = content +\
'<span class="description">' +\
photo['description'] +\
'</span>'
# append result
results.append({'url': url,
'title': title,
'img_src': img_src,
'thumbnail_src': thumbnail_src,
'content': content,
'template': 'images.html'})
return results


@ -1,60 +0,0 @@
## General Files (Files)
#
# @website http://www.general-files.org
# @provide-api no (nothing found)
#
# @using-api no (because nothing found)
# @results HTML (using search portal)
# @stable no (HTML can change)
# @parse url, title, content
#
# @todo detect torrents?
from lxml import html
# engine dependent config
categories = ['files']
paging = True
# search-url
base_url = 'http://www.general-file.com'
search_url = base_url + '/files-{letter}/{query}/{pageno}'
# specific xpath variables
result_xpath = '//table[@class="block-file"]'
title_xpath = './/h2/a//text()'
url_xpath = './/h2/a/@href'
content_xpath = './/p//text()'
# do search-request
def request(query, params):
params['url'] = search_url.format(query=query,
letter=query[0],
pageno=params['pageno'])
return params
# get response from search-request
def response(resp):
results = []
dom = html.fromstring(resp.text)
# parse results
for result in dom.xpath(result_xpath):
url = result.xpath(url_xpath)[0]
# skip fast download links
if not url.startswith('/'):
continue
# append result
results.append({'url': base_url + url,
'title': ''.join(result.xpath(title_xpath)),
'content': ''.join(result.xpath(content_xpath))})
# return results
return results


@ -1,59 +0,0 @@
## Github (It)
#
# @website https://github.com/
# @provide-api yes (https://developer.github.com/v3/)
#
# @using-api yes
# @results JSON
# @stable yes (using api)
# @parse url, title, content
from urllib import urlencode
from json import loads
from cgi import escape
# engine dependent config
categories = ['it']
# search-url
search_url = 'https://api.github.com/search/repositories?sort=stars&order=desc&{query}' # noqa
accept_header = 'application/vnd.github.preview.text-match+json'
# do search-request
def request(query, params):
params['url'] = search_url.format(query=urlencode({'q': query}))
params['headers']['Accept'] = accept_header
return params
# get response from search-request
def response(resp):
results = []
search_res = loads(resp.text)
# check if items are received
if not 'items' in search_res:
return []
# parse results
for res in search_res['items']:
title = res['name']
url = res['html_url']
if res['description']:
content = escape(res['description'][:500])
else:
content = ''
# append result
results.append({'url': url,
'title': title,
'content': content})
# return results
return results


@ -1,140 +0,0 @@
# Google (Web)
#
# @website https://www.google.com
# @provide-api yes (https://developers.google.com/custom-search/)
#
# @using-api no
# @results HTML
# @stable no (HTML can change)
# @parse url, title, content, suggestion
from urllib import urlencode
from urlparse import urlparse, parse_qsl
from lxml import html
from searx.poolrequests import get
from searx.engines.xpath import extract_text, extract_url
# engine dependent config
categories = ['general']
paging = True
language_support = True
# search-url
google_hostname = 'www.google.com'
search_path = '/search'
redirect_path = '/url'
images_path = '/images'
search_url = ('https://' +
google_hostname +
search_path +
'?{query}&start={offset}&gbv=1')
# specific xpath variables
results_xpath = '//li[@class="g"]'
url_xpath = './/h3/a/@href'
title_xpath = './/h3'
content_xpath = './/span[@class="st"]'
suggestion_xpath = '//p[@class="_Bmc"]'
images_xpath = './/div/a'
image_url_xpath = './@href'
image_img_src_xpath = './img/@src'
pref_cookie = ''
# see https://support.google.com/websearch/answer/873?hl=en
def get_google_pref_cookie():
global pref_cookie
if pref_cookie == '':
resp = get('https://www.google.com/ncr', allow_redirects=False)
pref_cookie = resp.cookies["PREF"]
return pref_cookie
# remove google-specific tracking-url
def parse_url(url_string):
parsed_url = urlparse(url_string)
if (parsed_url.netloc in [google_hostname, '']
and parsed_url.path == redirect_path):
query = dict(parse_qsl(parsed_url.query))
return query['q']
else:
return url_string
# do search-request
def request(query, params):
offset = (params['pageno'] - 1) * 10
if params['language'] == 'all':
language = 'en'
else:
language = params['language'].replace('_', '-').lower()
params['url'] = search_url.format(offset=offset,
query=urlencode({'q': query}))
params['headers']['Accept-Language'] = language
params['cookies']['PREF'] = get_google_pref_cookie()
return params
# get response from search-request
def response(resp):
results = []
dom = html.fromstring(resp.text)
# parse results
for result in dom.xpath(results_xpath):
title = extract_text(result.xpath(title_xpath)[0])
try:
url = parse_url(extract_url(result.xpath(url_xpath), search_url))
parsed_url = urlparse(url)
if (parsed_url.netloc == google_hostname
and parsed_url.path == search_path):
# remove the link to google news
continue
# images result
if (parsed_url.netloc == google_hostname
and parsed_url.path == images_path):
# only thumbnail image provided,
# so skipping image results
# results = results + parse_images(result)
pass
else:
# normal result
content = extract_text(result.xpath(content_xpath)[0])
# append result
results.append({'url': url,
'title': title,
'content': content})
except:
continue
# parse suggestion
for suggestion in dom.xpath(suggestion_xpath):
# append suggestion
results.append({'suggestion': extract_text(suggestion)})
# return results
return results
def parse_images(result):
results = []
for image in result.xpath(images_xpath):
url = parse_url(extract_text(image.xpath(image_url_xpath)[0]))
img_src = extract_text(image.xpath(image_img_src_xpath)[0])
# append result
results.append({'url': url,
'title': '',
'content': '',
'img_src': img_src,
'template': 'images.html'})
return results
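Illustrative behaviour of parse_url() above, which strips Google's /url redirect wrapper and leaves other URLs untouched:

parse_url('https://www.google.com/url?q=https://example.org/&sa=U')   # -> 'https://example.org/'
parse_url('https://example.org/page')                                 # -> 'https://example.org/page'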


@ -1,68 +0,0 @@
## Google (Images)
#
# @website https://www.google.com
# @provide-api yes (https://developers.google.com/web-search/docs/),
# deprecated!
#
# @using-api yes
# @results JSON
# @stable yes (but deprecated)
# @parse url, title, img_src
from urllib import urlencode, unquote
from json import loads
# engine dependent config
categories = ['images']
paging = True
safesearch = True
# search-url
url = 'https://ajax.googleapis.com/'
search_url = url + 'ajax/services/search/images?v=1.0&start={offset}&rsz=large&safe={safesearch}&filter=off&{query}'
# do search-request
def request(query, params):
offset = (params['pageno'] - 1) * 8
if params['safesearch'] == 0:
safesearch = 'off'
else:
safesearch = 'on'
params['url'] = search_url.format(query=urlencode({'q': query}),
offset=offset,
safesearch=safesearch)
return params
# get response from search-request
def response(resp):
results = []
search_res = loads(resp.text)
# return empty array if there are no results
if not search_res.get('responseData', {}).get('results'):
return []
# parse results
for result in search_res['responseData']['results']:
href = result['originalContextUrl']
title = result['title']
if 'url' not in result:
continue
thumbnail_src = result['tbUrl']
# append result
results.append({'url': href,
'title': title,
'content': result['content'],
'thumbnail_src': thumbnail_src,
'img_src': unquote(result['url']),
'template': 'images.html'})
# return results
return results


@ -1,65 +0,0 @@
## Google (News)
#
# @website https://www.google.com
# @provide-api yes (https://developers.google.com/web-search/docs/),
# deprecated!
#
# @using-api yes
# @results JSON
# @stable yes (but deprecated)
# @parse url, title, content, publishedDate
from urllib import urlencode
from json import loads
from dateutil import parser
# search-url
categories = ['news']
paging = True
language_support = True
# engine dependent config
url = 'https://ajax.googleapis.com/'
search_url = url + 'ajax/services/search/news?v=2.0&start={offset}&rsz=large&safe=off&filter=off&{query}&hl={lang}'
# do search-request
def request(query, params):
offset = (params['pageno'] - 1) * 8
language = 'en-US'
if params['language'] != 'all':
language = params['language'].replace('_', '-')
params['url'] = search_url.format(offset=offset,
query=urlencode({'q': query}),
lang=language)
return params
# get response from search-request
def response(resp):
results = []
search_res = loads(resp.text)
# return empty array if there are no results
if not search_res.get('responseData', {}).get('results'):
return []
# parse results
for result in search_res['responseData']['results']:
# parse publishedDate
publishedDate = parser.parse(result['publishedDate'])
if 'url' not in result:
continue
# append result
results.append({'url': result['unescapedUrl'],
'title': result['titleNoFormatting'],
'publishedDate': publishedDate,
'content': result['content']})
# return results
return results


@ -1,87 +0,0 @@
from urllib import urlencode
from json import loads
from collections import Iterable
search_url = None
url_query = None
content_query = None
title_query = None
#suggestion_xpath = ''
def iterate(iterable):
if type(iterable) == dict:
it = iterable.iteritems()
else:
it = enumerate(iterable)
for index, value in it:
yield str(index), value
def is_iterable(obj):
if type(obj) == str:
return False
if type(obj) == unicode:
return False
return isinstance(obj, Iterable)
def parse(query):
q = []
for part in query.split('/'):
if part == '':
continue
else:
q.append(part)
return q
def do_query(data, q):
ret = []
if not q:
return ret
qkey = q[0]
for key, value in iterate(data):
if len(q) == 1:
if key == qkey:
ret.append(value)
elif is_iterable(value):
ret.extend(do_query(value, q))
else:
if not is_iterable(value):
continue
if key == qkey:
ret.extend(do_query(value, q[1:]))
else:
ret.extend(do_query(value, q))
return ret
def query(data, query_string):
q = parse(query_string)
return do_query(data, q)
def request(query, params):
query = urlencode({'q': query})[2:]
params['url'] = search_url.format(query=query)
params['query'] = query
return params
def response(resp):
results = []
json = loads(resp.text)
urls = query(json, url_query)
contents = query(json, content_query)
titles = query(json, title_query)
for url, title, content in zip(urls, titles, contents):
results.append({'url': url, 'title': title, 'content': content})
return results
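A worked example of the query()/do_query() helpers above, which walk a parsed JSON document along a '/'-separated key path (data below is illustrative):

data = {'items': [{'title': 'a', 'link': 'http://a'},
                  {'title': 'b', 'link': 'http://b'}]}
query(data, 'items/title')   # -> ['a', 'b']
query(data, 'items/link')    # -> ['http://a', 'http://b']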


@ -1,120 +0,0 @@
## Kickass Torrent (Videos, Music, Files)
#
# @website https://kickass.so
# @provide-api no (nothing found)
#
# @using-api no
# @results HTML (using search portal)
# @stable no (HTML can change)
# @parse url, title, content, seed, leech, magnetlink
from urlparse import urljoin
from cgi import escape
from urllib import quote
from lxml import html
from operator import itemgetter
from searx.engines.xpath import extract_text
# engine dependent config
categories = ['videos', 'music', 'files']
paging = True
# search-url
url = 'https://kickass.to/'
search_url = url + 'search/{search_term}/{pageno}/'
# specific xpath variables
magnet_xpath = './/a[@title="Torrent magnet link"]'
torrent_xpath = './/a[@title="Download torrent file"]'
content_xpath = './/span[@class="font11px lightgrey block"]'
# do search-request
def request(query, params):
params['url'] = search_url.format(search_term=quote(query),
pageno=params['pageno'])
# FIX: SSLError: hostname 'kickass.so'
# doesn't match either of '*.kickass.to', 'kickass.to'
params['verify'] = False
return params
# get response from search-request
def response(resp):
results = []
dom = html.fromstring(resp.text)
search_res = dom.xpath('//table[@class="data"]//tr')
# return empty array if nothing is found
if not search_res:
return []
# parse results
for result in search_res[1:]:
link = result.xpath('.//a[@class="cellMainLink"]')[0]
href = urljoin(url, link.attrib['href'])
title = extract_text(link)
content = escape(extract_text(result.xpath(content_xpath)))
seed = result.xpath('.//td[contains(@class, "green")]/text()')[0]
leech = result.xpath('.//td[contains(@class, "red")]/text()')[0]
filesize = result.xpath('.//td[contains(@class, "nobr")]/text()')[0]
filesize_multiplier = result.xpath('.//td[contains(@class, "nobr")]//span/text()')[0]
files = result.xpath('.//td[contains(@class, "center")][2]/text()')[0]
# convert seed to int if possible
if seed.isdigit():
seed = int(seed)
else:
seed = 0
# convert leech to int if possible
if leech.isdigit():
leech = int(leech)
else:
leech = 0
# convert filesize to byte if possible
try:
filesize = float(filesize)
# convert filesize to byte
if filesize_multiplier == 'TB':
filesize = int(filesize * 1024 * 1024 * 1024 * 1024)
elif filesize_multiplier == 'GB':
filesize = int(filesize * 1024 * 1024 * 1024)
elif filesize_multiplier == 'MB':
filesize = int(filesize * 1024 * 1024)
elif filesize_multiplier == 'KB':
filesize = int(filesize * 1024)
except:
filesize = None
# convert files to int if possible
if files.isdigit():
files = int(files)
else:
files = None
magnetlink = result.xpath(magnet_xpath)[0].attrib['href']
torrentfile = result.xpath(torrent_xpath)[0].attrib['href']
torrentfileurl = quote(torrentfile, safe="%/:=&?~#+!$,;'@()*")
# append result
results.append({'url': href,
'title': title,
'content': content,
'seed': seed,
'leech': leech,
'filesize': filesize,
'files': files,
'magnetlink': magnetlink,
'torrentfile': torrentfileurl,
'template': 'torrent.html'})
# return results sorted by seeder
return sorted(results, key=itemgetter('seed'), reverse=True)
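The seed/leech/filesize normalisation above is a chain of if/elif branches; the same unit conversion could be written table-driven. A hedged sketch, not what the engine ships (unknown units yield None here, whereas the original keeps the raw float):
# sketch: table-driven variant of the filesize conversion above
_multipliers = {'KB': 1024,
                'MB': 1024 ** 2,
                'GB': 1024 ** 3,
                'TB': 1024 ** 4}


def to_bytes(filesize, filesize_multiplier):
    try:
        return int(float(filesize) * _multipliers[filesize_multiplier])
    except (ValueError, KeyError):
        return None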

View File

@@ -1,81 +0,0 @@
## general mediawiki-engine (Web)
#
# @website websites built on mediawiki (https://www.mediawiki.org)
# @provide-api yes (http://www.mediawiki.org/wiki/API:Search)
#
# @using-api yes
# @results JSON
# @stable yes
# @parse url, title
#
# @todo content
from json import loads
from string import Formatter
from urllib import urlencode, quote
# engine dependent config
categories = ['general']
language_support = True
paging = True
number_of_results = 1
# search-url
base_url = 'https://{language}.wikipedia.org/'
search_url = base_url + 'w/api.php?action=query'\
'&list=search'\
'&{query}'\
'&srprop=timestamp'\
'&format=json'\
'&sroffset={offset}'\
'&srlimit={limit}' # noqa
# do search-request
def request(query, params):
offset = (params['pageno'] - 1) * number_of_results
string_args = dict(query=urlencode({'srsearch': query}),
offset=offset,
limit=number_of_results)
format_strings = list(Formatter().parse(base_url))
if params['language'] == 'all':
language = 'en'
else:
language = params['language'].split('_')[0]
if len(format_strings) > 1:
string_args['language'] = language
# write search-language back to params, required in response
params['language'] = language
params['url'] = search_url.format(**string_args)
return params
# get response from search-request
def response(resp):
results = []
search_results = loads(resp.text)
# return empty array if there are no results
if not search_results.get('query', {}).get('search'):
return []
# parse results
for result in search_results['query']['search']:
url = base_url.format(language=resp.search_params['language']) +\
'wiki/' + quote(result['title'].replace(' ', '_').encode('utf-8'))
# append result
results.append({'url': url,
'title': result['title'],
'content': ''})
# return results
return results
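The Formatter().parse(base_url) call in request() is what detects whether base_url still contains a {language} placeholder: parse() splits a format string into (literal, field, spec, conversion) tuples, so a URL with one placeholder yields more than one tuple. A standalone sketch:
# sketch: placeholder detection via string.Formatter, as used in request()
from string import Formatter

with_placeholder = 'https://{language}.wikipedia.org/'
without_placeholder = 'https://www.mediawiki.org/'

print(len(list(Formatter().parse(with_placeholder))))     # 2 -> substitute language
print(len(list(Formatter().parse(without_placeholder))))  # 1 -> use the URL as-is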

View File

@@ -1,59 +0,0 @@
## Mixcloud (Music)
#
# @website https://www.mixcloud.com/
# @provide-api yes (http://www.mixcloud.com/developers/)
#
# @using-api yes
# @results JSON
# @stable yes
# @parse url, title, content, embedded, publishedDate
from json import loads
from urllib import urlencode
from dateutil import parser
# engine dependent config
categories = ['music']
paging = True
# search-url
url = 'http://api.mixcloud.com/'
search_url = url + 'search/?{query}&type=cloudcast&limit=10&offset={offset}'
embedded_url = '<iframe scrolling="no" frameborder="0" allowTransparency="true" ' +\
'data-src="https://www.mixcloud.com/widget/iframe/?feed={url}" width="300" height="300"></iframe>'
# do search-request
def request(query, params):
offset = (params['pageno'] - 1) * 10
params['url'] = search_url.format(query=urlencode({'q': query}),
offset=offset)
return params
# get response from search-request
def response(resp):
results = []
search_res = loads(resp.text)
# parse results
for result in search_res.get('data', []):
title = result['name']
url = result['url']
content = result['user']['name']
embedded = embedded_url.format(url=url)
publishedDate = parser.parse(result['created_time'])
# append result
results.append({'url': url,
'title': title,
'embedded': embedded,
'publishedDate': publishedDate,
'content': content})
# return results
return results

View File

@@ -1,97 +0,0 @@
## OpenStreetMap (Map)
#
# @website https://openstreetmap.org/
# @provide-api yes (http://wiki.openstreetmap.org/wiki/Nominatim)
#
# @using-api yes
# @results JSON
# @stable yes
# @parse url, title
from json import loads
from searx.utils import searx_useragent
# engine dependent config
categories = ['map']
paging = False
# search-url
base_url = 'https://nominatim.openstreetmap.org/'
search_string = 'search/{query}?format=json&polygon_geojson=1&addressdetails=1'
result_base_url = 'https://openstreetmap.org/{osm_type}/{osm_id}'
# do search-request
def request(query, params):
params['url'] = base_url + search_string.format(query=query)
# using searx User-Agent
params['headers']['User-Agent'] = searx_useragent()
return params
# get response from search-request
def response(resp):
results = []
json = loads(resp.text)
# parse results
for r in json:
if 'display_name' not in r:
continue
title = r['display_name']
osm_type = r.get('osm_type', r.get('type'))
url = result_base_url.format(osm_type=osm_type,
osm_id=r['osm_id'])
osm = {'type': osm_type,
'id': r['osm_id']}
geojson = r.get('geojson')
# if no geojson is found and osm_type is a node, add geojson Point
if not geojson and osm_type == 'node':
geojson = {u'type': u'Point', u'coordinates': [r['lon'], r['lat']]}
address_raw = r.get('address', {})
address = {}
# get name
if r['class'] == 'amenity' or\
r['class'] == 'shop' or\
r['class'] == 'tourism' or\
r['class'] == 'leisure':
if address_raw.get('address29'):
address = {'name': address_raw.get('address29')}
else:
address = {'name': address_raw.get(r['type'])}
# add the rest of the address data, if a name was already found
if address.get('name'):
address.update({'house_number': address_raw.get('house_number'),
'road': address_raw.get('road'),
'locality': address_raw.get('city',
address_raw.get('town', # noqa
address_raw.get('village'))), # noqa
'postcode': address_raw.get('postcode'),
'country': address_raw.get('country'),
'country_code': address_raw.get('country_code')})
else:
address = None
# append result
results.append({'template': 'map.html',
'title': title,
'content': '',
'longitude': r['lon'],
'latitude': r['lat'],
'boundingbox': r['boundingbox'],
'geojson': geojson,
'address': address,
'osm': osm,
'url': url})
# return results
return results
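For orientation, a hedged sketch of the fields response() actually reads from one Nominatim entry and what it builds from them (the values below are made up, not real Nominatim output):
# illustrative Nominatim entry covering only the keys response() touches
nominatim_entry = {
    'display_name': 'Example Cafe, 1, Example Road, Example Town',
    'osm_type': 'node',
    'osm_id': 123456,
    'lon': '13.3888',
    'lat': '52.5170',
    'boundingbox': ['52.5169', '52.5171', '13.3887', '13.3889'],
    'class': 'amenity',
    'type': 'cafe',
    'address': {'cafe': 'Example Cafe', 'road': 'Example Road',
                'house_number': '1', 'town': 'Example Town',
                'postcode': '10117', 'country': 'Germany',
                'country_code': 'de'},
}
# response() would map this to a 'map.html' result whose url is
# https://openstreetmap.org/node/123456, whose geojson is a Point built from
# lon/lat (no 'geojson' key and osm_type == 'node'), and whose address
# 'name' comes from address[type] because class == 'amenity'.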

View File

@@ -1,132 +0,0 @@
## Photon (Map)
#
# @website https://photon.komoot.de
# @provide-api yes (https://photon.komoot.de/)
#
# @using-api yes
# @results JSON
# @stable yes
# @parse url, title
from urllib import urlencode
from json import loads
from searx.utils import searx_useragent
# engine dependent config
categories = ['map']
paging = False
language_support = True
number_of_results = 10
# search-url
base_url = 'https://photon.komoot.de/'
search_string = 'api/?{query}&limit={limit}'
result_base_url = 'https://openstreetmap.org/{osm_type}/{osm_id}'
# list of supported languages
allowed_languages = ['de', 'en', 'fr', 'it']
# do search-request
def request(query, params):
params['url'] = base_url +\
search_string.format(query=urlencode({'q': query}),
limit=number_of_results)
if params['language'] != 'all':
language = params['language'].split('_')[0]
if language in allowed_languages:
params['url'] = params['url'] + "&lang=" + language
# using searx User-Agent
params['headers']['User-Agent'] = searx_useragent()
# FIX: SSLError: SSL3_GET_SERVER_CERTIFICATE:certificate verify failed
params['verify'] = False
return params
# get response from search-request
def response(resp):
results = []
json = loads(resp.text)
# parse results
for r in json.get('features', {}):
properties = r.get('properties')
if not properties:
continue
# get title
title = properties.get('name')
# get osm-type
if properties.get('osm_type') == 'N':
osm_type = 'node'
elif properties.get('osm_type') == 'W':
osm_type = 'way'
elif properties.get('osm_type') == 'R':
osm_type = 'relation'
else:
# continue if invalid osm-type
continue
url = result_base_url.format(osm_type=osm_type,
osm_id=properties.get('osm_id'))
osm = {'type': osm_type,
'id': properties.get('osm_id')}
geojson = r.get('geometry')
if properties.get('extent'):
boundingbox = [properties.get('extent')[3],
properties.get('extent')[1],
properties.get('extent')[0],
properties.get('extent')[2]]
else:
# TODO: better boundingbox calculation
boundingbox = [geojson['coordinates'][1],
geojson['coordinates'][1],
geojson['coordinates'][0],
geojson['coordinates'][0]]
# address calculation
address = {}
# get name
if properties.get('osm_key') == 'amenity' or\
properties.get('osm_key') == 'shop' or\
properties.get('osm_key') == 'tourism' or\
properties.get('osm_key') == 'leisure':
address = {'name': properties.get('name')}
# add the rest of the address data, if a name was already found
if address.get('name'):
address.update({'house_number': properties.get('housenumber'),
'road': properties.get('street'),
'locality': properties.get('city',
properties.get('town', # noqa
properties.get('village'))), # noqa
'postcode': properties.get('postcode'),
'country': properties.get('country')})
else:
address = None
# append result
results.append({'template': 'map.html',
'title': title,
'content': '',
'longitude': geojson['coordinates'][0],
'latitude': geojson['coordinates'][1],
'boundingbox': boundingbox,
'geojson': geojson,
'address': address,
'osm': osm,
'url': url})
# return results
return results

View File

@@ -1,94 +0,0 @@
## Piratebay (Videos, Music, Files)
#
# @website https://thepiratebay.se
# @provide-api no (nothing found)
#
# @using-api no
# @results HTML (using search portal)
# @stable yes (HTML can change)
# @parse url, title, content, seed, leech, magnetlink
from urlparse import urljoin
from cgi import escape
from urllib import quote
from lxml import html
from operator import itemgetter
from searx.engines.xpath import extract_text
# engine dependent config
categories = ['videos', 'music', 'files']
paging = True
# search-url
url = 'https://thepiratebay.se/'
search_url = url + 'search/{search_term}/{pageno}/99/{search_type}'
# piratebay specific type-definitions
search_types = {'files': '0',
'music': '100',
'videos': '200'}
# specific xpath variables
magnet_xpath = './/a[@title="Download this torrent using magnet"]'
torrent_xpath = './/a[@title="Download this torrent"]'
content_xpath = './/font[@class="detDesc"]'
# do search-request
def request(query, params):
search_type = search_types.get(params['category'], '0')
params['url'] = search_url.format(search_term=quote(query),
search_type=search_type,
pageno=params['pageno'] - 1)
return params
# get response from search-request
def response(resp):
results = []
dom = html.fromstring(resp.text)
search_res = dom.xpath('//table[@id="searchResult"]//tr')
# return empty array if nothing is found
if not search_res:
return []
# parse results
for result in search_res[1:]:
link = result.xpath('.//div[@class="detName"]//a')[0]
href = urljoin(url, link.attrib.get('href'))
title = extract_text(link)
content = escape(extract_text(result.xpath(content_xpath)))
seed, leech = result.xpath('.//td[@align="right"]/text()')[:2]
# convert seed to int if possible
if seed.isdigit():
seed = int(seed)
else:
seed = 0
# convert leech to int if possible
if leech.isdigit():
leech = int(leech)
else:
leech = 0
magnetlink = result.xpath(magnet_xpath)[0]
torrentfile = result.xpath(torrent_xpath)[0]
# append result
results.append({'url': href,
'title': title,
'content': content,
'seed': seed,
'leech': leech,
'magnetlink': magnetlink.attrib.get('href'),
'torrentfile': torrentfile.attrib.get('href'),
'template': 'torrent.html'})
# return results sorted by seeder
return sorted(results, key=itemgetter('seed'), reverse=True)

View File

@@ -1,68 +0,0 @@
## Searchcode (It)
#
# @website https://searchcode.com/
# @provide-api yes (https://searchcode.com/api/)
#
# @using-api yes
# @results JSON
# @stable yes
# @parse url, title, content
from urllib import urlencode
from json import loads
# engine dependent config
categories = ['it']
paging = True
# search-url
url = 'https://searchcode.com/'
search_url = url+'api/codesearch_I/?{query}&p={pageno}'
# file extensions whose language cannot be read off the extension itself
code_endings = {'cs': 'c#',
'h': 'c',
'hpp': 'cpp',
'cxx': 'cpp'}
# do search-request
def request(query, params):
params['url'] = search_url.format(query=urlencode({'q': query}),
pageno=params['pageno']-1)
return params
# get response from search-request
def response(resp):
results = []
search_results = loads(resp.text)
# parse results
for result in search_results.get('results', []):
href = result['url']
title = "" + result['name'] + " - " + result['filename']
repo = result['repo']
lines = dict()
for line, code in result['lines'].items():
lines[int(line)] = code
code_language = code_endings.get(
result['filename'].split('.')[-1].lower(),
result['filename'].split('.')[-1].lower())
# append result
results.append({'url': href,
'title': title,
'content': '',
'repository': repo,
'codelines': sorted(lines.items()),
'code_language': code_language,
'template': 'code.html'})
# return results
return results
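The code_endings table only overrides extensions whose language name differs from the extension; everything else falls back to the lowercased extension, as this small sketch shows:
# sketch: language lookup used for result['filename'] above
for filename in ('Program.cs', 'util.hpp', 'main.py'):
    ext = filename.split('.')[-1].lower()
    print(filename + ' -> ' + code_endings.get(ext, ext))  # c#, cpp, py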

View File

@@ -1,56 +0,0 @@
## Searchcode (It)
#
# @website https://searchcode.com/
# @provide-api yes (https://searchcode.com/api/)
#
# @using-api yes
# @results JSON
# @stable yes
# @parse url, title, content
from urllib import urlencode
from json import loads
# engine dependent config
categories = ['it']
paging = True
# search-url
url = 'https://searchcode.com/'
search_url = url+'api/search_IV/?{query}&p={pageno}'
# do search-request
def request(query, params):
params['url'] = search_url.format(query=urlencode({'q': query}),
pageno=params['pageno']-1)
return params
# get response from search-request
def response(resp):
results = []
search_results = loads(resp.text)
# parse results
for result in search_results.get('results', []):
href = result['url']
title = "[" + result['type'] + "] " +\
result['namespace'] +\
" " + result['name']
content = '<span class="highlight">[' +\
result['type'] + "] " +\
result['name'] + " " +\
result['synopsis'] +\
"</span><br />" +\
result['description']
# append result
results.append({'url': href,
'title': title,
'content': content})
# return results
return results

View File

@@ -1,70 +0,0 @@
## Soundcloud (Music)
#
# @website https://soundcloud.com
# @provide-api yes (https://developers.soundcloud.com/)
#
# @using-api yes
# @results JSON
# @stable yes
# @parse url, title, content, publishedDate, embedded
from json import loads
from urllib import urlencode, quote_plus
from dateutil import parser
# engine dependent config
categories = ['music']
paging = True
# api-key
guest_client_id = 'b45b1aa10f1ac2941910a7f0d10f8e28'
# search-url
url = 'https://api.soundcloud.com/'
search_url = url + 'search?{query}'\
'&facet=model'\
'&limit=20'\
'&offset={offset}'\
'&linked_partitioning=1'\
'&client_id={client_id}' # noqa
embedded_url = '<iframe width="100%" height="166" ' +\
'scrolling="no" frameborder="no" ' +\
'data-src="https://w.soundcloud.com/player/?url={uri}"></iframe>'
# do search-request
def request(query, params):
offset = (params['pageno'] - 1) * 20
params['url'] = search_url.format(query=urlencode({'q': query}),
offset=offset,
client_id=guest_client_id)
return params
# get response from search-request
def response(resp):
results = []
search_res = loads(resp.text)
# parse results
for result in search_res.get('collection', []):
if result['kind'] in ('track', 'playlist'):
title = result['title']
content = result['description']
publishedDate = parser.parse(result['last_modified'])
uri = quote_plus(result['uri'])
embedded = embedded_url.format(uri=uri)
# append result
results.append({'url': result['permalink_url'],
'title': title,
'publishedDate': publishedDate,
'embedded': embedded,
'content': content})
# return results
return results

View File

@@ -1,58 +0,0 @@
## Stackoverflow (It)
#
# @website https://stackoverflow.com/
# @provide-api not clear (https://api.stackexchange.com/docs/advanced-search)
#
# @using-api no
# @results HTML
# @stable no (HTML can change)
# @parse url, title, content
from urlparse import urljoin
from cgi import escape
from urllib import urlencode
from lxml import html
from searx.engines.xpath import extract_text
# engine dependent config
categories = ['it']
paging = True
# search-url
url = 'http://stackoverflow.com/'
search_url = url+'search?{query}&page={pageno}'
# specific xpath variables
results_xpath = '//div[contains(@class,"question-summary")]'
link_xpath = './/div[@class="result-link"]//a|.//div[@class="summary"]//h3//a'
content_xpath = './/div[@class="excerpt"]'
# do search-request
def request(query, params):
params['url'] = search_url.format(query=urlencode({'q': query}),
pageno=params['pageno'])
return params
# get response from search-request
def response(resp):
results = []
dom = html.fromstring(resp.text)
# parse results
for result in dom.xpath(results_xpath):
link = result.xpath(link_xpath)[0]
href = urljoin(url, link.attrib.get('href'))
title = escape(extract_text(link))
content = escape(extract_text(result.xpath(content_xpath)))
# append result
results.append({'url': href,
'title': title,
'content': content})
# return results
return results

View File

@@ -1,85 +0,0 @@
# Startpage (Web)
#
# @website https://startpage.com
# @provide-api no (nothing found)
#
# @using-api no
# @results HTML
# @stable no (HTML can change)
# @parse url, title, content
#
# @todo paging
from lxml import html
from cgi import escape
import re
from searx.engines.xpath import extract_text
# engine dependent config
categories = ['general']
# there is a mechanism to block "bot" searches
# (probably the parameter qid); it requires
# storing qids between multiple search calls
# paging = False
language_support = True
# search-url
base_url = 'https://startpage.com/'
search_url = base_url + 'do/search'
# specific xpath variables
# ads xpath //div[@id="results"]/div[@id="sponsored"]//div[@class="result"]
# not ads: div[@class="result"] are the direct children of div[@id="results"]
results_xpath = '//div[@class="result"]'
link_xpath = './/h3/a'
# do search-request
def request(query, params):
offset = (params['pageno'] - 1) * 10
params['url'] = search_url
params['method'] = 'POST'
params['data'] = {'query': query,
'startat': offset}
# set language if specified
if params['language'] != 'all':
params['data']['with_language'] = ('lang_' + params['language'].split('_')[0])
return params
# get response from search-request
def response(resp):
results = []
dom = html.fromstring(resp.content)
# parse results
for result in dom.xpath(results_xpath):
links = result.xpath(link_xpath)
if not links:
continue
link = links[0]
url = link.attrib.get('href')
# block google-ad url's
if re.match("^http(s|)://www.google.[a-z]+/aclk.*$", url):
continue
title = escape(extract_text(link))
if result.xpath('./p[@class="desc"]'):
content = escape(extract_text(result.xpath('./p[@class="desc"]')))
else:
content = ''
# append result
results.append({'url': url,
'title': title,
'content': content})
# return results
return results

View File

@@ -1,79 +0,0 @@
## Subtitleseeker (Video)
#
# @website http://www.subtitleseeker.com
# @provide-api no
#
# @using-api no
# @results HTML
# @stable no (HTML can change)
# @parse url, title, content
from cgi import escape
from urllib import quote_plus
from lxml import html
from searx.languages import language_codes
from searx.engines.xpath import extract_text
# engine dependent config
categories = ['videos']
paging = True
language = ""
# search-url
url = 'http://www.subtitleseeker.com/'
search_url = url + 'search/TITLES/{query}&p={pageno}'
# specific xpath variables
results_xpath = '//div[@class="boxRows"]'
# do search-request
def request(query, params):
params['url'] = search_url.format(query=quote_plus(query),
pageno=params['pageno'])
return params
# get response from search-request
def response(resp):
results = []
dom = html.fromstring(resp.text)
search_lang = ""
if resp.search_params['language'] != 'all':
search_lang = [lc[1]
for lc in language_codes
if lc[0][:2] == resp.search_params['language'].split('_')[0]][0]
# parse results
for result in dom.xpath(results_xpath):
link = result.xpath(".//a")[0]
href = link.attrib.get('href')
if language != "":
href = href + language + '/'
elif search_lang:
href = href + search_lang + '/'
title = escape(extract_text(link))
content = extract_text(result.xpath('.//div[contains(@class,"red")]'))
content = content + " - "
text = extract_text(result.xpath('.//div[contains(@class,"grey-web")]')[0])
content = content + text
if result.xpath(".//span") != []:
content = content +\
" - (" +\
extract_text(result.xpath(".//span")) +\
")"
# append result
results.append({'url': href,
'title': title,
'content': escape(content)})
# return results
return results

View File

@@ -1,77 +0,0 @@
## Twitter (Social media)
#
# @website https://twitter.com/
# @provide-api yes (https://dev.twitter.com/docs/using-search)
#
# @using-api no
# @results HTML (using search portal)
# @stable no (HTML can change)
# @parse url, title, content
#
# @todo publishedDate
from urlparse import urljoin
from urllib import urlencode
from lxml import html
from datetime import datetime
from searx.engines.xpath import extract_text
# engine dependent config
categories = ['social media']
language_support = True
# search-url
base_url = 'https://twitter.com/'
search_url = base_url + 'search?'
# specific xpath variables
results_xpath = '//li[@data-item-type="tweet"]'
link_xpath = './/small[@class="time"]//a'
title_xpath = './/span[@class="username js-action-profile-name"]'
content_xpath = './/p[@class="js-tweet-text tweet-text"]'
timestamp_xpath = './/span[contains(@class,"_timestamp")]'
# do search-request
def request(query, params):
params['url'] = search_url + urlencode({'q': query})
# set language if specified
if params['language'] != 'all':
params['cookies']['lang'] = params['language'].split('_')[0]
else:
params['cookies']['lang'] = 'en'
return params
# get response from search-request
def response(resp):
results = []
dom = html.fromstring(resp.text)
# parse results
for tweet in dom.xpath(results_xpath):
link = tweet.xpath(link_xpath)[0]
url = urljoin(base_url, link.attrib.get('href'))
title = extract_text(tweet.xpath(title_xpath))
content = extract_text(tweet.xpath(content_xpath)[0])
pubdate = tweet.xpath(timestamp_xpath)
if len(pubdate) > 0:
timestamp = float(pubdate[0].attrib.get('data-time'))
publishedDate = datetime.fromtimestamp(timestamp, None)
# append result
results.append({'url': url,
'title': title,
'content': content,
'publishedDate': publishedDate})
else:
# append result
results.append({'url': url,
'title': title,
'content': content})
# return results
return results

View File

@@ -1,75 +0,0 @@
# Vimeo (Videos)
#
# @website https://vimeo.com/
# @provide-api yes (http://developer.vimeo.com/api),
# they have a maximum count of queries/hour
#
# @using-api no (TODO, rewrite to api)
# @results HTML (using search portal)
# @stable no (HTML can change)
# @parse url, title, publishedDate, thumbnail, embedded
#
# @todo rewrite to api
# @todo set content-parameter with correct data
from urllib import urlencode
from lxml import html
from HTMLParser import HTMLParser
from searx.engines.xpath import extract_text
from dateutil import parser
# engine dependent config
categories = ['videos']
paging = True
# search-url
base_url = 'http://vimeo.com'
search_url = base_url + '/search/page:{pageno}?{query}'
# specific xpath variables
results_xpath = '//div[@id="browse_content"]/ol/li'
url_xpath = './a/@href'
title_xpath = './a/div[@class="data"]/p[@class="title"]'
content_xpath = './a/img/@src'
publishedDate_xpath = './/p[@class="meta"]//attribute::datetime'
embedded_url = '<iframe data-src="//player.vimeo.com/video{videoid}" ' +\
'width="540" height="304" frameborder="0" ' +\
'webkitallowfullscreen mozallowfullscreen allowfullscreen></iframe>'
# do search-request
def request(query, params):
params['url'] = search_url.format(pageno=params['pageno'],
query=urlencode({'q': query}))
return params
# get response from search-request
def response(resp):
results = []
dom = html.fromstring(resp.text)
p = HTMLParser()
# parse results
for result in dom.xpath(results_xpath):
videoid = result.xpath(url_xpath)[0]
url = base_url + videoid
title = p.unescape(extract_text(result.xpath(title_xpath)))
thumbnail = extract_text(result.xpath(content_xpath)[0])
publishedDate = parser.parse(extract_text(result.xpath(publishedDate_xpath)[0]))
embedded = embedded_url.format(videoid=videoid)
# append result
results.append({'url': url,
'title': title,
'content': '',
'template': 'videos.html',
'publishedDate': publishedDate,
'embedded': embedded,
'thumbnail': thumbnail})
# return results
return results

View File

@@ -1,305 +0,0 @@
import json
from urllib import urlencode
from searx.poolrequests import get
from searx.utils import format_date_by_locale
result_count = 1
wikidata_host = 'https://www.wikidata.org'
wikidata_api = wikidata_host + '/w/api.php'
url_search = wikidata_api \
+ '?action=query&list=search&format=json'\
+ '&srnamespace=0&srprop=sectiontitle&{query}'
url_detail = wikidata_api\
+ '?action=wbgetentities&format=json'\
+ '&props=labels%7Cinfo%7Csitelinks'\
+ '%7Csitelinks%2Furls%7Cdescriptions%7Cclaims'\
+ '&{query}'
url_map = 'https://www.openstreetmap.org/'\
+ '?lat={latitude}&lon={longitude}&zoom={zoom}&layers=M'
def request(query, params):
params['url'] = url_search.format(
query=urlencode({'srsearch': query,
'srlimit': result_count}))
return params
def response(resp):
results = []
search_res = json.loads(resp.text)
wikidata_ids = set()
for r in search_res.get('query', {}).get('search', {}):
wikidata_ids.add(r.get('title', ''))
language = resp.search_params['language'].split('_')[0]
if language == 'all':
language = 'en'
url = url_detail.format(query=urlencode({'ids': '|'.join(wikidata_ids),
'languages': language + '|en'}))
htmlresponse = get(url)
jsonresponse = json.loads(htmlresponse.content)
for wikidata_id in wikidata_ids:
results = results + getDetail(jsonresponse, wikidata_id, language, resp.search_params['language'])
return results
def getDetail(jsonresponse, wikidata_id, language, locale):
results = []
urls = []
attributes = []
result = jsonresponse.get('entities', {}).get(wikidata_id, {})
title = result.get('labels', {}).get(language, {}).get('value', None)
if title is None:
title = result.get('labels', {}).get('en', {}).get('value', None)
if title is None:
return results
description = result\
.get('descriptions', {})\
.get(language, {})\
.get('value', None)
if description is None:
description = result\
.get('descriptions', {})\
.get('en', {})\
.get('value', '')
claims = result.get('claims', {})
official_website = get_string(claims, 'P856', None)
if official_website is not None:
urls.append({'title': 'Official site', 'url': official_website})
results.append({'title': title, 'url': official_website})
wikipedia_link_count = 0
if language != 'en':
wikipedia_link_count += add_url(urls,
'Wikipedia (' + language + ')',
get_wikilink(result, language +
'wiki'))
wikipedia_en_link = get_wikilink(result, 'enwiki')
wikipedia_link_count += add_url(urls,
'Wikipedia (en)',
wikipedia_en_link)
if wikipedia_link_count == 0:
misc_language = get_wiki_firstlanguage(result, 'wiki')
if misc_language is not None:
add_url(urls,
'Wikipedia (' + misc_language + ')',
get_wikilink(result, misc_language + 'wiki'))
if language != 'en':
add_url(urls,
'Wiki voyage (' + language + ')',
get_wikilink(result, language + 'wikivoyage'))
add_url(urls,
'Wiki voyage (en)',
get_wikilink(result, 'enwikivoyage'))
if language != 'en':
add_url(urls,
'Wikiquote (' + language + ')',
get_wikilink(result, language + 'wikiquote'))
add_url(urls,
'Wikiquote (en)',
get_wikilink(result, 'enwikiquote'))
add_url(urls,
'Commons wiki',
get_wikilink(result, 'commonswiki'))
add_url(urls,
'Location',
get_geolink(claims, 'P625', None))
add_url(urls,
'Wikidata',
'https://www.wikidata.org/wiki/'
+ wikidata_id + '?uselang=' + language)
musicbrainz_work_id = get_string(claims, 'P435')
if musicbrainz_work_id is not None:
add_url(urls,
'MusicBrainz',
'http://musicbrainz.org/work/'
+ musicbrainz_work_id)
musicbrainz_artist_id = get_string(claims, 'P434')
if musicbrainz_artist_id is not None:
add_url(urls,
'MusicBrainz',
'http://musicbrainz.org/artist/'
+ musicbrainz_artist_id)
musicbrainz_release_group_id = get_string(claims, 'P436')
if musicbrainz_release_group_id is not None:
add_url(urls,
'MusicBrainz',
'http://musicbrainz.org/release-group/'
+ musicbrainz_release_group_id)
musicbrainz_label_id = get_string(claims, 'P966')
if musicbrainz_label_id is not None:
add_url(urls,
'MusicBrainz',
'http://musicbrainz.org/label/'
+ musicbrainz_label_id)
# musicbrainz_area_id = get_string(claims, 'P982')
# P1407 MusicBrainz series ID
# P1004 MusicBrainz place ID
# P1330 MusicBrainz instrument ID
# P1407 MusicBrainz series ID
postal_code = get_string(claims, 'P281', None)
if postal_code is not None:
attributes.append({'label': 'Postal code(s)', 'value': postal_code})
date_of_birth = get_time(claims, 'P569', None)
if date_of_birth is not None:
date_of_birth = format_date_by_locale(date_of_birth[8:], locale)
attributes.append({'label': 'Date of birth', 'value': date_of_birth})
date_of_death = get_time(claims, 'P570', None)
if date_of_death is not None:
date_of_death = format_date_by_locale(date_of_death[8:], locale)
attributes.append({'label': 'Date of death', 'value': date_of_death})
if len(attributes) == 0 and len(urls) == 2 and len(description) == 0:
results.append({
'url': urls[0]['url'],
'title': title,
'content': description
})
else:
results.append({
'infobox': title,
'id': wikipedia_en_link,
'content': description,
'attributes': attributes,
'urls': urls
})
return results
def add_url(urls, title, url):
if url is not None:
urls.append({'title': title, 'url': url})
return 1
else:
return 0
def get_mainsnak(claims, propertyName):
propValue = claims.get(propertyName, {})
if len(propValue) == 0:
return None
propValue = propValue[0].get('mainsnak', None)
return propValue
def get_string(claims, propertyName, defaultValue=None):
propValue = claims.get(propertyName, {})
if len(propValue) == 0:
return defaultValue
result = []
for e in propValue:
mainsnak = e.get('mainsnak', {})
datavalue = mainsnak.get('datavalue', {})
if datavalue is not None:
result.append(datavalue.get('value', ''))
if len(result) == 0:
return defaultValue
else:
# TODO handle multiple urls
return result[0]
def get_time(claims, propertyName, defaultValue=None):
propValue = claims.get(propertyName, {})
if len(propValue) == 0:
return defaultValue
result = []
for e in propValue:
mainsnak = e.get('mainsnak', {})
datavalue = mainsnak.get('datavalue', {})
if datavalue is not None:
value = datavalue.get('value', '')
result.append(value.get('time', ''))
if len(result) == 0:
return defaultValue
else:
return ', '.join(result)
def get_geolink(claims, propertyName, defaultValue=''):
mainsnak = get_mainsnak(claims, propertyName)
if mainsnak is None:
return defaultValue
datatype = mainsnak.get('datatype', '')
datavalue = mainsnak.get('datavalue', {})
if datatype != 'globe-coordinate':
return defaultValue
value = datavalue.get('value', {})
precision = value.get('precision', 0.0002)
# there is no zoom information, deduce from precision (error prone)
# samples :
# 13 --> 5
# 1 --> 6
# 0.016666666666667 --> 9
# 0.00027777777777778 --> 19
# wolframalpha :
# quadratic fit { {13, 5}, {1, 6}, {0.0166666, 9}, {0.0002777777,19}}
# 14.1186-8.8322 x+0.625447 x^2
if precision < 0.0003:
zoom = 19
else:
zoom = int(15 - precision*8.8322 + precision*precision*0.625447)
url = url_map\
.replace('{latitude}', str(value.get('latitude', 0)))\
.replace('{longitude}', str(value.get('longitude', 0)))\
.replace('{zoom}', str(zoom))
return url
def get_wikilink(result, wikiid):
url = result.get('sitelinks', {}).get(wikiid, {}).get('url', None)
if url is None:
return url
elif url.startswith('http://'):
url = url.replace('http://', 'https://')
elif url.startswith('//'):
url = 'https:' + url
return url
def get_wiki_firstlanguage(result, wikipatternid):
for k in result.get('sitelinks', {}).keys():
if k.endswith(wikipatternid) and len(k) == (2+len(wikipatternid)):
return k[0:2]
return None
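To make the zoom heuristic in get_geolink() concrete, here is the same formula as a standalone sketch, evaluated at the sample precisions listed in its comment (the fit is approximate, as the comment itself warns):
# sketch: the precision -> zoom heuristic used in get_geolink()
def zoom_from_precision(precision):
    if precision < 0.0003:
        return 19
    return int(15 - precision * 8.8322 + precision * precision * 0.625447)

for precision in (13, 1, 0.016666666666667, 0.00027777777777778):
    print('%s -> %s' % (precision, zoom_from_precision(precision)))
# roughly: 13 -> 5, 1 -> 6, 0.0166... -> 14, 0.00027... -> 19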

View File

@@ -1,82 +0,0 @@
## 1x (Images)
#
# @website http://1x.com/
# @provide-api no
#
# @using-api no
# @results HTML
# @stable no (HTML can change)
# @parse url, title, thumbnail, img_src, content
from urllib import urlencode
from urlparse import urljoin
from lxml import html
import string
import re
# engine dependent config
categories = ['images']
paging = False
# search-url
base_url = 'http://1x.com'
search_url = base_url+'/backend/search.php?{query}'
# do search-request
def request(query, params):
params['url'] = search_url.format(query=urlencode({'q': query}))
return params
# get response from search-request
def response(resp):
results = []
# get links from result-text
regex = re.compile('(</a>|<a)')
results_parts = re.split(regex, resp.text)
cur_element = ''
# iterate over link parts
for result_part in results_parts:
# processed start and end of link
if result_part == '<a':
cur_element = result_part
continue
elif result_part != '</a>':
cur_element += result_part
continue
cur_element += result_part
# fix xml-error
cur_element = string.replace(cur_element, '"></a>', '"/></a>')
dom = html.fromstring(cur_element)
link = dom.xpath('//a')[0]
url = urljoin(base_url, link.attrib.get('href'))
title = link.attrib.get('title', '')
thumbnail_src = urljoin(base_url, link.xpath('.//img')[0].attrib['src'])
# TODO: get image with higher resolution
img_src = thumbnail_src
# check if url is showing to a photo
if '/photo/' not in url:
continue
# append result
results.append({'url': url,
'title': title,
'img_src': img_src,
'content': '',
'thumbnail_src': thumbnail_src,
'template': 'images.html'})
# return results
return results

View File

@@ -1,64 +0,0 @@
## 500px (Images)
#
# @website https://500px.com
# @provide-api yes (https://developers.500px.com/)
#
# @using-api no
# @results HTML
# @stable no (HTML can change)
# @parse url, title, thumbnail, img_src, content
#
# @todo rewrite to api
from urllib import urlencode
from urlparse import urljoin
from lxml import html
import re
from searx.engines.xpath import extract_text
# engine dependent config
categories = ['images']
paging = True
# search-url
base_url = 'https://500px.com'
search_url = base_url + '/search?search?page={pageno}&type=photos&{query}'
# do search-request
def request(query, params):
params['url'] = search_url.format(pageno=params['pageno'],
query=urlencode({'q': query}))
return params
# get response from search-request
def response(resp):
results = []
dom = html.fromstring(resp.text)
regex = re.compile('3\.jpg.*$')
# parse results
for result in dom.xpath('//div[@class="photo"]'):
link = result.xpath('.//a')[0]
url = urljoin(base_url, link.attrib.get('href'))
title = extract_text(result.xpath('.//div[@class="title"]'))
thumbnail_src = link.xpath('.//img')[0].attrib.get('src')
# To have a bigger thumbnail, uncomment the next line
# thumbnail_src = regex.sub('4.jpg', thumbnail_src)
content = extract_text(result.xpath('.//div[@class="info"]'))
img_src = regex.sub('2048.jpg', thumbnail_src)
# append result
results.append({'url': url,
'title': title,
'img_src': img_src,
'content': content,
'thumbnail_src': thumbnail_src,
'template': 'images.html'})
# return results
return results

View File

@@ -1,106 +0,0 @@
from lxml import html
from urllib import urlencode, unquote
from urlparse import urlparse, urljoin
from lxml.etree import _ElementStringResult, _ElementUnicodeResult
from searx.utils import html_to_text
search_url = None
url_xpath = None
content_xpath = None
title_xpath = None
suggestion_xpath = ''
results_xpath = ''
'''
if xpath_results is list, extract the text from each result and concat the list
if xpath_results is a xml element, extract all the text node from it
( text_content() method from lxml )
if xpath_results is a string element, then it's already done
'''
def extract_text(xpath_results):
if type(xpath_results) == list:
# it's a list of results: concatenate everything using recursive calls
if not xpath_results:
raise Exception('Empty url resultset')
result = ''
for e in xpath_results:
result = result + extract_text(e)
return result.strip()
elif type(xpath_results) in [_ElementStringResult, _ElementUnicodeResult]:
# it's a string
return ''.join(xpath_results)
else:
# it's an element
return html_to_text(xpath_results.text_content()).strip()
def extract_url(xpath_results, search_url):
url = extract_text(xpath_results)
if url.startswith('//'):
# add http or https to this kind of url //example.com/
parsed_search_url = urlparse(search_url)
url = parsed_search_url.scheme+url
elif url.startswith('/'):
# fix relative url to the search engine
url = urljoin(search_url, url)
# normalize url
url = normalize_url(url)
return url
def normalize_url(url):
parsed_url = urlparse(url)
# add a / at the end of the url if there is no path
if not parsed_url.netloc:
raise Exception('Cannot parse url')
if not parsed_url.path:
url += '/'
# FIXME : hack for yahoo
if parsed_url.hostname == 'search.yahoo.com'\
and parsed_url.path.startswith('/r'):
p = parsed_url.path
mark = p.find('/**')
if mark != -1:
return unquote(p[mark+3:]).decode('utf-8')
return url
def request(query, params):
query = urlencode({'q': query})[2:]
params['url'] = search_url.format(query=query)
params['query'] = query
return params
def response(resp):
results = []
dom = html.fromstring(resp.text)
if results_xpath:
for result in dom.xpath(results_xpath):
url = extract_url(result.xpath(url_xpath), search_url)
title = extract_text(result.xpath(title_xpath)[0])
content = extract_text(result.xpath(content_xpath)[0])
results.append({'url': url, 'title': title, 'content': content})
else:
for url, title, content in zip(
(extract_url(x, search_url) for
x in dom.xpath(url_xpath)),
map(extract_text, dom.xpath(title_xpath)),
map(extract_text, dom.xpath(content_xpath))
):
results.append({'url': url, 'title': title, 'content': content})
if not suggestion_xpath:
return results
for suggestion in dom.xpath(suggestion_xpath):
results.append({'suggestion': extract_text(suggestion)})
return results
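A couple of hedged examples of the URL handling above (example.com is just illustrative): normalize_url() can be called directly, while extract_url() first resolves scheme-relative and site-relative results against the search URL.
# sketch: what normalize_url() does with the shapes it special-cases
print(normalize_url('http://example.com'))        # 'http://example.com/' (path added)
print(normalize_url('https://example.com/page'))  # unchanged

# extract_url(['//example.com/x'], 'https://host.example/search?q=...') would
# prepend the search URL's scheme; extract_url(['/x'], ...) would urljoin()
# onto the search URL; both go through normalize_url() afterwards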

View File

@@ -1,97 +0,0 @@
## Yacy (Web, Images, Videos, Music, Files)
#
# @website http://yacy.net
# @provide-api yes
# (http://www.yacy-websuche.de/wiki/index.php/Dev:APIyacysearch)
#
# @using-api yes
# @results JSON
# @stable yes
# @parse (general) url, title, content, publishedDate
# @parse (images) url, title, img_src
#
# @todo parse video, audio and file results
from json import loads
from urllib import urlencode
from dateutil import parser
# engine dependent config
categories = ['general', 'images'] # TODO , 'music', 'videos', 'files'
paging = True
language_support = True
number_of_results = 5
# search-url
base_url = 'http://localhost:8090'
search_url = '/yacysearch.json?{query}'\
'&startRecord={offset}'\
'&maximumRecords={limit}'\
'&contentdom={search_type}'\
'&resource=global'
# yacy specific type-definitions
search_types = {'general': 'text',
'images': 'image',
'files': 'app',
'music': 'audio',
'videos': 'video'}
# do search-request
def request(query, params):
offset = (params['pageno'] - 1) * number_of_results
search_type = search_types.get(params.get('category'), '0')
params['url'] = base_url +\
search_url.format(query=urlencode({'query': query}),
offset=offset,
limit=number_of_results,
search_type=search_type)
# add language tag if specified
if params['language'] != 'all':
params['url'] += '&lr=lang_' + params['language'].split('_')[0]
return params
# get response from search-request
def response(resp):
results = []
raw_search_results = loads(resp.text)
# return empty array if there are no results
if not raw_search_results:
return []
search_results = raw_search_results.get('channels', [])
if len(search_results) == 0:
return []
for result in search_results[0].get('items', []):
# parse image results
if result.get('image'):
# append result
results.append({'url': result['url'],
'title': result['title'],
'content': '',
'img_src': result['image'],
'template': 'images.html'})
# parse general results
else:
publishedDate = parser.parse(result['pubDate'])
# append result
results.append({'url': result['link'],
'title': result['title'],
'content': result['description'],
'publishedDate': publishedDate})
# TODO parse video, audio and file results
# return results
return results

View File

@@ -1,103 +0,0 @@
## Yahoo (Web)
#
# @website https://search.yahoo.com/web
# @provide-api yes (https://developer.yahoo.com/boss/search/),
# $0.80/1000 queries
#
# @using-api no (because pricing)
# @results HTML (using search portal)
# @stable no (HTML can change)
# @parse url, title, content, suggestion
from urllib import urlencode
from urlparse import unquote
from lxml import html
from searx.engines.xpath import extract_text, extract_url
# engine dependent config
categories = ['general']
paging = True
language_support = True
# search-url
base_url = 'https://search.yahoo.com/'
search_url = 'search?{query}&b={offset}&fl=1&vl=lang_{lang}'
# specific xpath variables
results_xpath = '//div[@class="res"]'
url_xpath = './/h3/a/@href'
title_xpath = './/h3/a'
content_xpath = './/div[@class="abstr"]'
suggestion_xpath = '//div[@id="satat"]//a'
# remove yahoo-specific tracking-url
def parse_url(url_string):
endings = ['/RS', '/RK']
endpositions = []
start = url_string.find('http', url_string.find('/RU=') + 1)
for ending in endings:
endpos = url_string.rfind(ending)
if endpos > -1:
endpositions.append(endpos)
if start == 0 or len(endpositions) == 0:
return url_string
else:
end = min(endpositions)
return unquote(url_string[start:end])
# do search-request
def request(query, params):
offset = (params['pageno'] - 1) * 10 + 1
if params['language'] == 'all':
language = 'en'
else:
language = params['language'].split('_')[0]
params['url'] = base_url + search_url.format(offset=offset,
query=urlencode({'p': query}),
lang=language)
# TODO required?
params['cookies']['sB'] = 'fl=1&vl=lang_{lang}&sh=1&rw=new&v=1'\
.format(lang=language)
return params
# get response from search-request
def response(resp):
results = []
dom = html.fromstring(resp.text)
# parse results
for result in dom.xpath(results_xpath):
try:
url = parse_url(extract_url(result.xpath(url_xpath), search_url))
title = extract_text(result.xpath(title_xpath)[0])
except:
continue
content = extract_text(result.xpath(content_xpath)[0])
# append result
results.append({'url': url,
'title': title,
'content': content})
# if no suggestion found, return results
if not dom.xpath(suggestion_xpath):
return results
# parse suggestion
for suggestion in dom.xpath(suggestion_xpath):
# append suggestion
results.append({'suggestion': extract_text(suggestion)})
# return results
return results
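To illustrate parse_url() above: Yahoo wraps the real target between an /RU= marker and trailing /RK=... and /RS=... segments, and the helper slices that out and unquotes it. The wrapper below is constructed to match that shape and is not a captured Yahoo URL:
# illustrative tracking wrapper of the /RU=<quoted target>/RK=/RS= shape
wrapped = ('https://r.search.yahoo.com/_ylt=abc/RU='
           'http%3a%2f%2fexample.com%2fpage/RK=0/RS=xyz')
print(parse_url(wrapped))  # http://example.com/page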

View File

@@ -1,93 +0,0 @@
# Yahoo (News)
#
# @website https://news.yahoo.com
# @provide-api yes (https://developer.yahoo.com/boss/search/)
# $0.80/1000 queries
#
# @using-api no (because pricing)
# @results HTML (using search portal)
# @stable no (HTML can change)
# @parse url, title, content, publishedDate
from urllib import urlencode
from lxml import html
from searx.engines.xpath import extract_text, extract_url
from searx.engines.yahoo import parse_url
from datetime import datetime, timedelta
import re
from dateutil import parser
# engine dependent config
categories = ['news']
paging = True
language_support = True
# search-url
search_url = 'https://news.search.yahoo.com/search?{query}&b={offset}&fl=1&vl=lang_{lang}' # noqa
# specific xpath variables
results_xpath = '//div[@class="res"]'
url_xpath = './/h3/a/@href'
title_xpath = './/h3/a'
content_xpath = './/div[@class="abstr"]'
publishedDate_xpath = './/span[@class="timestamp"]'
suggestion_xpath = '//div[@id="satat"]//a'
# do search-request
def request(query, params):
offset = (params['pageno'] - 1) * 10 + 1
if params['language'] == 'all':
language = 'en'
else:
language = params['language'].split('_')[0]
params['url'] = search_url.format(offset=offset,
query=urlencode({'p': query}),
lang=language)
# TODO required?
params['cookies']['sB'] = 'fl=1&vl=lang_{lang}&sh=1&rw=new&v=1'\
.format(lang=language)
return params
# get response from search-request
def response(resp):
results = []
dom = html.fromstring(resp.text)
# parse results
for result in dom.xpath(results_xpath):
url = parse_url(extract_url(result.xpath(url_xpath), search_url))
title = extract_text(result.xpath(title_xpath)[0])
content = extract_text(result.xpath(content_xpath)[0])
# parse publishedDate
publishedDate = extract_text(result.xpath(publishedDate_xpath)[0])
if re.match("^[0-9]+ minute(s|) ago$", publishedDate):
publishedDate = datetime.now() - timedelta(minutes=int(re.match(r'\d+', publishedDate).group())) # noqa
else:
if re.match("^[0-9]+ hour(s|), [0-9]+ minute(s|) ago$",
publishedDate):
timeNumbers = re.findall(r'\d+', publishedDate)
publishedDate = datetime.now()\
- timedelta(hours=int(timeNumbers[0]))\
- timedelta(minutes=int(timeNumbers[1]))
else:
publishedDate = parser.parse(publishedDate)
if publishedDate.year == 1900:
publishedDate = publishedDate.replace(year=datetime.now().year)
# append result
results.append({'url': url,
'title': title,
'content': content,
'publishedDate': publishedDate})
# return results
return results
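The publishedDate handling above covers three timestamp shapes; the same branching as a standalone sketch (the sample strings are illustrative):
# sketch: the three timestamp shapes handled in response() above
import re
from datetime import datetime, timedelta
from dateutil import parser


def parse_published(text):
    if re.match("^[0-9]+ minute(s|) ago$", text):
        return datetime.now() - timedelta(minutes=int(re.match(r'\d+', text).group()))
    if re.match("^[0-9]+ hour(s|), [0-9]+ minute(s|) ago$", text):
        hours, minutes = re.findall(r'\d+', text)
        return datetime.now() - timedelta(hours=int(hours), minutes=int(minutes))
    date = parser.parse(text)
    if date.year == 1900:  # no year parsed; the engine patches in the current year
        date = date.replace(year=datetime.now().year)
    return date


print(parse_published('12 minutes ago'))
print(parse_published('1 hour, 23 minutes ago'))
print(parse_published('Sat, 03 May 2014 10:05:00 GMT'))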

View File

@@ -1,93 +0,0 @@
## Youtube (Videos)
#
# @website https://www.youtube.com/
# @provide-api yes (http://gdata-samples-youtube-search-py.appspot.com/)
#
# @using-api yes
# @results JSON
# @stable yes
# @parse url, title, content, publishedDate, thumbnail, embedded
from json import loads
from urllib import urlencode
from dateutil import parser
# engine dependent config
categories = ['videos', 'music']
paging = True
language_support = True
# search-url
base_url = 'https://gdata.youtube.com/feeds/api/videos'
search_url = base_url + '?alt=json&{query}&start-index={index}&max-results=5'
embedded_url = '<iframe width="540" height="304" ' +\
'data-src="//www.youtube-nocookie.com/embed/{videoid}" ' +\
'frameborder="0" allowfullscreen></iframe>'
# do search-request
def request(query, params):
index = (params['pageno'] - 1) * 5 + 1
params['url'] = search_url.format(query=urlencode({'q': query}),
index=index)
# add language tag if specified
if params['language'] != 'all':
params['url'] += '&lr=' + params['language'].split('_')[0]
return params
# get response from search-request
def response(resp):
results = []
search_results = loads(resp.text)
# return empty array if there are no results
if 'feed' not in search_results:
return []
feed = search_results['feed']
# parse results
for result in feed['entry']:
url = [x['href'] for x in result['link'] if x['type'] == 'text/html']
if not url:
continue
# remove tracking
url = url[0].replace('feature=youtube_gdata', '')
if url.endswith('&'):
url = url[:-1]
# the prefix 'https://www.youtube.com/watch?v=' is 32 characters long
videoid = url[32:]
title = result['title']['$t']
content = ''
thumbnail = ''
pubdate = result['published']['$t']
publishedDate = parser.parse(pubdate)
if 'media$thumbnail' in result['media$group']:
thumbnail = result['media$group']['media$thumbnail'][0]['url']
content = result['content']['$t']
embedded = embedded_url.format(videoid=videoid)
# append result
results.append({'url': url,
'title': title,
'content': content,
'template': 'videos.html',
'publishedDate': publishedDate,
'embedded': embedded,
'thumbnail': thumbnail})
# return results
return results

View File

@@ -1,209 +0,0 @@
'''
searx is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
searx is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with searx. If not, see < http://www.gnu.org/licenses/ >.
(C) 2013- by Adam Tauber, <asciimoo@gmail.com>
'''
import re
from urlparse import urlparse
from lxml import etree
from os import listdir
from os.path import isfile, isdir, join
from searx import logger
logger = logger.getChild("https_rewrite")
# https://gitweb.torproject.org/\
# pde/https-everywhere.git/tree/4.0:/src/chrome/content/rules
# HTTPS rewrite rules
https_rules = []
# load single ruleset from a xml file
def load_single_https_ruleset(filepath):
ruleset = ()
# init parser
parser = etree.XMLParser()
# load and parse xml-file
try:
tree = etree.parse(filepath, parser)
except:
# TODO, error message
return ()
# get root node
root = tree.getroot()
# check if root is a node with the name ruleset
# TODO improve parsing
if root.tag != 'ruleset':
return ()
# check if rule is deactivated by default
if root.attrib.get('default_off'):
return ()
# check if rule does only work for specific platforms
if root.attrib.get('platform'):
return ()
hosts = []
rules = []
exclusions = []
# parse children from ruleset
for ruleset in root:
# this child defines a target
if ruleset.tag == 'target':
# check if required tags available
if not ruleset.attrib.get('host'):
continue
# convert host-rule to valid regex
host = ruleset.attrib.get('host')\
.replace('.', '\.').replace('*', '.*')
# append to host list
hosts.append(host)
# this child defines a rule
elif ruleset.tag == 'rule':
# check if required tags available
if not ruleset.attrib.get('from')\
or not ruleset.attrib.get('to'):
continue
# TODO hack which converts a javascript regex group
# into a valid python regex group
rule_from = ruleset.attrib['from'].replace('$', '\\')
if rule_from.endswith('\\'):
rule_from = rule_from[:-1]+'$'
rule_to = ruleset.attrib['to'].replace('$', '\\')
if rule_to.endswith('\\'):
rule_to = rule_to[:-1]+'$'
# TODO, not working yet because of the hack above,
# currently doing that in webapp.py
# rule_from_rgx = re.compile(rule_from, re.I)
# append rule
try:
rules.append((re.compile(rule_from, re.I | re.U), rule_to))
except:
# TODO log regex error
continue
# this child defines an exclusion
elif ruleset.tag == 'exclusion':
# check if required tags available
if not ruleset.attrib.get('pattern'):
continue
exclusion_rgx = re.compile(ruleset.attrib.get('pattern'))
# append exclusion
exclusions.append(exclusion_rgx)
# convert list of possible hosts to a simple regex
# TODO compress regex to improve performance
try:
target_hosts = re.compile('^(' + '|'.join(hosts) + ')', re.I | re.U)
except:
return ()
# return ruleset
return (target_hosts, rules, exclusions)
# load all https rewrite rules
def load_https_rules(rules_path):
# check if directory exists
if not isdir(rules_path):
logger.error("directory not found: '" + rules_path + "'")
return
# search all xml files which are stored in the https rule directory
xml_files = [join(rules_path, f)
for f in listdir(rules_path)
if isfile(join(rules_path, f)) and f[-4:] == '.xml']
# load xml-files
for ruleset_file in xml_files:
# calculate rewrite-rules
ruleset = load_single_https_ruleset(ruleset_file)
# skip if no ruleset returned
if not ruleset:
continue
# append ruleset
https_rules.append(ruleset)
logger.info('{n} rules loaded'.format(n=len(https_rules)))
def https_url_rewrite(result):
skip_https_rewrite = False
# check if HTTPS rewrite is possible
for target, rules, exclusions in https_rules:
# check if target regex match with url
if target.match(result['parsed_url'].netloc):
# process exclusions
for exclusion in exclusions:
# check if exclusion match with url
if exclusion.match(result['url']):
skip_https_rewrite = True
break
# skip https rewrite if required
if skip_https_rewrite:
break
# process rules
for rule in rules:
try:
new_result_url = rule[0].sub(rule[1], result['url'])
except:
break
# parse new url
new_parsed_url = urlparse(new_result_url)
# continue if nothing was rewritten
if result['url'] == new_result_url:
continue
# get domainname from result
# TODO: only works correctly for TLDs like
# asdf.com, not for asdf.com.de
# TODO: use publicsuffix instead of this rewrite rule
old_result_domainname = '.'.join(
result['parsed_url'].hostname.split('.')[-2:])
new_result_domainname = '.'.join(
new_parsed_url.hostname.split('.')[-2:])
# check if rewritten hostname is the same,
# to protect against wrong or malicious rewrite rules
if old_result_domainname == new_result_domainname:
# set new url
result['url'] = new_result_url
# target has matched, do not search over the other rules
break
return result
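A compact sketch of what one loaded ruleset boils down to and how https_url_rewrite() applies it: a compiled host regex plus (from, to) pairs fed to re.sub (example.com stands in for a real ruleset target):
# sketch: the shape of a compiled ruleset and a single rewrite step
import re

target_hosts = re.compile(r'^(example\.com|.*\.example\.com)', re.I | re.U)
rules = [(re.compile(r'^http://(www\.)?example\.com/', re.I | re.U),
          r'https://\1example.com/')]
exclusions = []

url = 'http://www.example.com/page'
if target_hosts.match('www.example.com'):
    for rule_from, rule_to in rules:
        print(rule_from.sub(rule_to, url))  # https://www.example.com/page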

View File

@@ -1,17 +0,0 @@
<!--
This directory contains web site rewriting rules for the
HTTPS Everywhere software, available from
https://www.eff.org/https-everywhere
These rules were contributed to the project by users and aim to
enable routine secure access to as many different web sites as
possible. They are automatically installed together with the
HTTPS Everywhere software. The presence of these rules does not
mean that an HTTPS Everywhere user accessed, or intended to
access, any particular web site.
For information about how to create additional HTTPS Everywhere
rewriting rules to add support for new sites, please see
https://www.eff.org/https-everywhere/rulesets
-->

View File

@@ -1,56 +0,0 @@
<!--
For other Microsoft coverage, see Microsoft.xml.
CDN buckets:
- a134.lm.akamai.net
- akam.bing.com
- *.mm.bing.net
Nonfunctional domains:
- m2.cn.bing.com
- origin.bj1.bing.com
- blogs.bing.com
Fully covered domains:
- bing.com subdomains:
- (www.)
- c.bing (tracking beacons)
- cn.bing
- h.bing
- ssl
- testfamilysafety.bing
- udc.bing
- (www.)bing
- *.mm.bing.net
- api.bing.com
-->
<ruleset name="Bing">
<target host="bing.com" />
<target host="*.bing.com" />
<target host="*.mm.bing.net" />
<securecookie host=".*\.bing\.com$" name=".+" />
<rule from="^http://((?:c|cn|h|ssl|testfamilysafety|udc|www)\.)?bing\.com/"
to="https://$1bing.com/" />
<rule from="^http://([^/:@]*)\.mm\.bing\.net/"
to="https://$1.mm.bing.com/"/>
<rule from="^http://([^/:@]*)\.api\.bing\.net/"
to="https://$1.api.bing.com/"/>
</ruleset>

View File

@@ -1,69 +0,0 @@
<!--
Nonfunctional domains:
- blog.dailymotion.com
- press.dailymotion.com (shows steaw.com, CN: www.steaw.com)
- proxy-46.dailymotion.com
- publicite.dailymotion.com
- publisher.dailymotion.com (reset)
- vid.ak.dmcdn.net (403, Akamai)
- vid2.ak.dmcdn.net (504, akamai)
Problematic domains:
- ak2.static.dailymotion.com (mismatched, CN: *.dmcdn.net)
- support.dmcloud.net (mismatched, CN: *.zendesk.com)
Partially covered domains:
- (www.)dailymotion.com
- cdn/manifest/video/\w+.mnft 403s
- crossdomain.xml breaks videos
-->
<ruleset name="Dailymotion (default off)" default_off="breaks some embedded videos">
<target host="dailymotion.com" />
<!--
* for cross-domain cookie.
-->
<target host="*.dailymotion.com" />
<!--
https://mail1.eff.org/pipermail/https-everywhere-rules/2012-July/001241.html
-->
<exclusion pattern="^http://(?:www\.)?dailymotion\.com/(?:cdn/[\w-]+/video/|crossdomain\.xml$)" />
<target host="ak2.static.dailymotion.com" />
<target host="*.dmcdn.net" />
<target host="dmcloud.net" />
<target host="*.dmcloud.net" />
<!-- Testing wrt embedded breakage.
securecookie host="^.*\.dailymotion\.com$" name=".+" /-->
<!--
Omniture tracking cookies:
-->
<securecookie host="^\.dailymotion\.com$" name="^s_\w+$" />
<securecookie host="^www\.dailymotion\.com$" name=".+" />
<rule from="^http://(erroracct\.|www\.)?dailymotion\.com/"
to="https://$1dailymotion.com/" />
<rule from="^http://(s\d|static(?:\d|s\d-ssl))\.dmcdn\.net/"
to="https://$1.dmcdn.net/" />
<rule from="^https?://ak2\.static\.dailymotion\.com/"
to="https://static1-ssl.dmcdn.net/" />
<rule from="^http://(s\.|www\.)?dmcloud\.net/"
to="https://$1dmcloud.net/" />
<rule from="^https?://support\.dmcloud\.net/"
to="https://dmcloud.zendesk.com/" />
</ruleset>

View File

@ -1,53 +0,0 @@
<!--
For problematic rules, see Deviantart-mismatches.xml.
Other deviantArt rulesets:
- Sta.sh.xml
ToDo: Find edgecast URL for /(fc|th)\d+.
Mixed content:
- Images on *.....com from e.deviantart.net *
* Secured by us
-->
<ruleset name="DeviantArt (pending)" default_off="site operator says not ready yet">
<target host="deviantart.com" />
<target host="*.deviantart.com" />
<target host="deviantart.net" />
<target host="*.deviantart.net" />
<!-- Not secured by server:
-->
<!--securecookie host="^\.deviantart\.com$" name="^userinfo$" /-->
<securecookie host="^\.deviantart\.com$" name=".*" />
<!-- Redirects from com to net, but does so successfully by itself.
-->
<rule from="^http://([aei]|fc\d\d|s[ht]|th\d\d)\.deviantart\.(com|net)/"
to="https://$1.deviantart.$2/" />
<!-- This handles everything that isn't in the first rule.
Namely, usernames, backend, fc, th, and (www.).
These domains present a cert that is only
valid for .com.
Note that .net isn't used on DA, but .net does
redirect to .com, and we shouldn't break what would
otherwise work.
Mustn't rewrite from https here, as doing so
would conflict with the first rule.
-->
<rule from="^http://([^/:@\.]+\.)?deviantart\.(?:com|net)/"
to="https://$1deviantart.com/" />
</ruleset>

View File

@ -1,38 +0,0 @@
<!--
Problematic domains:
- www.dukgo.com (mismatched, CN: dukgo.com)
Fully covered domains:
- (www.)dukgo.com (www → ^)
-->
<ruleset name="DuckDuckGo">
<target host="duckduckgo.com" />
<target host="*.duckduckgo.com" />
<target host="ddg.gg" />
<target host="duck.co" />
<target host="i.duck.co" />
<target host="dukgo.com" />
<target host="www.dukgo.com" />
<exclusion pattern="^http://(help|meme)\.duckduckgo\.com/" />
<securecookie host="^duck\.co$" name=".*"/>
<rule from="^http://duckduckgo\.com/" to="https://duckduckgo.com/"/>
<rule from="^http://([^/:@\.]+)\.duckduckgo\.com/" to="https://$1.duckduckgo.com/"/>
<!-- TODO: What does ddg.gg/foo do? Runs query foo, redirects to homepage, or error? -->
<rule from="^http://ddg\.gg/$" to="https://duckduckgo.com/" />
<rule from="^http://duck\.co/" to="https://duck.co/" />
<rule from="^http://i\.duck\.co/"
to="https://duckduckgo.com/"/>
<rule from="^http://(?:www\.)?dukgo\.com/"
to="https://dukgo.com/" />
</ruleset>

View File

@ -1,44 +0,0 @@
<!--
For other Yahoo coverage, see Yahoo.xml.
These altnames don't exist:
- www.blog.flickr.net
- www.code.flickr.net
-->
<ruleset name="Flickr">
<target host="flic.kr" />
<target host="*.flic.kr" />
<target host="flickr.com" />
<target host="*.flickr.com" />
<target host="*.flickr.net" />
<target host="*.staticflickr.com" />
<!-- Not secured by server:
-->
<!--securecookie host="^\.flic\.kr$" name="^BX$" /-->
<securecookie host="^\.flic\.kr$" name=".+" />
<securecookie host=".*\.flickr\.com$" name=".+" />
<rule from="^http://flic\.kr/"
to="https://flic.kr/" />
<rule from="^http://(api\.|www\.)?flickr\.com/"
to="https://$1flickr.com/" />
<rule from="^http://s(ecure|tatic)\.flickr\.com/"
to="https://s$1.flickr.com/" />
<rule from="^http://(c2|farm\d+)\.static(\.)?flickr\.com/"
to="https://$1.static$2flickr.com/" />
<rule from="^http://(blog|code)\.flickr\.net/"
to="https://$1.flickr.net/" />
</ruleset>

View File

@ -1,11 +0,0 @@
<!--
For other GitHub coverage, see Github.xml.
-->
<ruleset name="GitHub Pages">
<target host="*.github.io" />
<rule from="^http://([^/@:\.]+)\.github\.io/"
to="https://$1.github.io/" />
</ruleset>

View File

@ -1,94 +0,0 @@
<!--
Other GitHub rulesets:
- Github-Pages.xml
- Guag.es.xml
- Speaker_Deck.com.xml
CDN buckets:
- github-images.s3.amazonaws.com
- github.global.ssl.fastly.net
- a248.e.akamai.net/assets.github.com/
- a248.e.akamai.net/camo.github.com/
- s3.amazonaws.com/github/ | d24z2fz21y4fag.cloudfront.net
- github.myshopify.com
Fully covered domains:
- github.com subdomains:
- (www.)
- assets\d+
- assets-cdn
- bounty
- cloud
- f.cloud
- codeload
- developer
- eclipse
- enterprise
- gist
- gist-assets
- help
- identicons
- jobs
- mac
- mobile
- nodeload
- octodex
- pages
- raw
- rg3
- shop
- status
- support
- training
- try
- wiki
- windows
- collector.githubapp.com
- githubusercontent.com
-->
<ruleset name="GitHub">
<target host="github.com" />
<target host="*.github.com" />
<target host="github.io" />
<target host="*.githubusercontent.com" />
<target host="collector.githubapp.com" />
<!-- Secured by server:
-->
<!--securecookie host="^github\.com$" name="^(_gh_sess|tz|user_session)$" /-->
<!--securecookie host="^\.github\.com$" name="^(dotcom_user|logged_in)$" /-->
<!--securecookie host="^enterprise\.github\.com$" name="^(_enterprise_web|request_method)$" /-->
<!--securecookie host="^gist\.github\.com$" name="^_gist_session$" /-->
<!--securecookie host="^help\.github\.com$" name="^_help_session$" /-->
<!--
Not secured by server:
-->
<!--securecookie host="^status\.github\.com$" name="^rack\.session$" /-->
<securecookie host="^(?:.*\.)?github\.com$" name=".+" />
<rule from="^http://((?:assets\d+|assets-cdn|bounty|cloud|f\.cloud|codeload|developer|eclipse|enterprise|gist|gist-assets|help|identicons|jobs|mac|mobile|nodeload|octodex|pages|raw|rg3|shop|status|support|training|try|wiki|windows|www)\.)?github\.com/"
to="https://$1github.com/" />
<rule from="^http://collector\.githubapp\.com/"
to="https://collector.githubapp.com/" />
<rule from="^https?://github\.io/"
to="https://pages.github.com/" />
<rule from="^http://([^/@:\.]+)\.githubusercontent\.com/"
to="https://$1.githubusercontent.com/" />
</ruleset>

View File

@ -1,26 +0,0 @@
<!--
Problematic domains:
- (www.)apture.com (works, mismatched, CN: *.google.com)
-->
<ruleset name="Google (mismatches)" default_off="mismatches">
<!-- Akamai -->
<target host="js.admeld.com"/>
<target host="apture.com" />
<target host="www.apture.com" />
<target host="googleartproject.com"/>
<target host="www.googleartproject.com"/>
<rule from="^http://js\.admeld\.com/"
to="https://js.admeld.com/"/>
<rule from="^https?://(?:www\.)?apture\.com/"
to="https://apture.com/" />
<rule from="^http://(?:www\.)?googleartproject\.com/"
to="https://www.googleartproject.com/"/>
</ruleset>

View File

@ -1,14 +0,0 @@
<!--
For other Google coverage, see GoogleServices.xml.
-->
<ruleset name="Google.org">
<target host="google.org" />
<target host="www.google.org" />
<rule from="^http://(www\.)?google\.org/"
to="https://$1google.org/" />
</ruleset>

View File

@ -1,143 +0,0 @@
<!--
For other Google coverage, see GoogleServices.xml.
Nonfunctional domains:
- hosted.gmodules.com *
- img0.gmodules.com *
- p.gmodules.com *
* 404; mismatched, CN: *.googleusercontent.com
Problematic domains:
- gmodules.com (503, CN: www.google.com)
- www.gmodules.com (503, CN: *.googleusercontent.com)
- gstatic.com (404, valid cert)
- api.recaptcha.net (works; mismatched, CN: google.com)
Partially covered domains:
- (www.)gmodules.com (→ www.google.com)
- (www.)google.com
- chart.apis.google.com (→ chart.googleapis.com)
Fully covered domains:
- api.google.com
- *.clients.google.com:
- linkhelp
- ssl.google-analytics.com
- www.google-analytics.com
- googleapis.com subdomains:
- ajax
- chart
- *.commondatastorage
- fonts
- *.storage
- www
- gstatic.com subdomains:
- (www.) (^ → www)
- csi
- encrypted-tbn\d
- g0
- *.metric
- ssl
- t\d
- api.recaptcha.net (→ www.google.com)
- api-secure.recaptcha.net
- gdata.youtube.com
ssl.google-analytics.com/ga.js sets __utm\w wildcard
cookies on whichever domain it is loaded from.
-->
<ruleset name="Google APIs">
<target host="gmodules.com" />
<target host="www.gmodules.com" />
<target host="google.com" />
<target host="apis.google.com" />
<target host="*.apis.google.com" />
<target host="*.clients.google.com" />
<target host="www.google.com" />
<target host="*.google-analytics.com" />
<target host="*.googleapis.com" />
<target host="gstatic.com" />
<target host="*.gstatic.com" />
<!-- Captive portal detection redirects to this URL, and many captive
portals break TLS, so exempt this redirect URL.
See GitHub bug #368
-->
<exclusion pattern="^http://www\.gstatic\.com/generate_204" />
<target host="*.recaptcha.net" />
<target host="gdata.youtube.com" />
<exclusion pattern="^http://gdata\.youtube\.com/crossdomain\.xml" />
<securecookie host="^ssl\.google-analytics\.com$" name=".+" />
<rule from="^http://(?:www\.)?gmodules\.com/ig/images/"
to="https://www.google.com/ig/images/" />
<!-- jsapi was causing problems on some sites that embed google maps:
https://trac.torproject.org/projects/tor/ticket/2335
Apparently now fixed; thanks, Google!
-->
<rule from="^http://(?:www\.)?google\.com/(afsonline/|chart|jsapi|recaptcha/|uds)"
to="https://www.google.com/$1" />
<rule from="^http://(api|[\w-]+\.client)s\.google\.com/"
to="https://$1s.google.com/" />
<rule from="^http://chart\.apis\.google\.com/chart"
to="https://chart.googleapis.com/chart" />
<rule from="^http://(ssl|www)\.google-analytics\.com/"
to="https://$1.google-analytics.com/" />
<rule from="^http://(ajax|chart|fonts|www)\.googleapis\.com/"
to="https://$1.googleapis.com/" />
<rule from="^http://([^@:\./]+\.)?(commondata)?storage\.googleapis\.com/"
to="https://$1$2storage.googleapis.com/" />
<!-- There is an interesting question about whether we should
append &strip=1 to all cache URLs. This causes them to load
without images and styles, which is more secure but can look
worse.
Without &strip=1, the images and styles from the cached
pages still load from the original, typically unencrypted, page.
With &strip=1, the cached page will be text-only and
will come exclusively from Google's HTTPS server.
-->
<rule from="^http://(?:www\.)?gstatic\.com/"
to="https://www.gstatic.com/" />
<rule from="^http://(csi|encrypted-tbn\d|g0|[\w-]+\.metric|ssl|t\d)\.gstatic\.com/"
to="https://$1.gstatic.com/" />
<rule from="^http://api\.recaptcha\.net/"
to="https://www.google.com/recaptcha/api/" />
<rule from="^http://api-secure\.recaptcha\.net/"
to="https://api-secure.recaptcha.net/" />
<rule from="^http://gdata\.youtube\.com/"
to="https://gdata.youtube.com/" />
</ruleset>

View File

@ -1,6 +0,0 @@
<ruleset name="GoogleCanada">
<target host="google.ca" />
<target host="*.google.ca" />
<rule from="^http://([^/:@\.]+)\.google\.ca/finance" to="https://$1.google.ca/finance"/>
</ruleset>

View File

@ -1,65 +0,0 @@
<!--
For other Google coverage, see GoogleServices.xml.
Problematic domains:
- www.google.bo *
- www.google.co *
- www.google.ec *
- www.google.in *
- www.google.kr *
- www.google.com.kz **
- www.google.com.lk *
- www.google.mx **
- www.google.sg *
- www.google.sl *
- www.google.ug *
- www.google.vn *
* 404; mismatched, CN: google.com
** Works; mismatched, CN: google.com
-->
<ruleset name="Google Images">
<target host="google.*" />
<target host="www.google.*" />
<target host="google.co.*" />
<target host="www.google.co.*" />
<target host="google.com" />
<target host="images.google.com" />
<target host="google.com.*" />
<target host="www.google.com.*" />
<!--
Only handle image-related paths in this ruleset:
-->
<exclusion pattern="^http://(?:www\.)?google(?:\.com?)?\.\w{2,3}/(?!(?:advanced_image_search|imghp|.*tb(?:m=isch|s=sbi)))" />
<rule from="^http://(?:www\.)?google\.com/"
to="https://www.google.com/" />
<rule from="^http://images\.google\.com/"
to="https://images.google.com/" />
<!-- First handle problematic domains:
-->
<rule from="^http://(?:www\.)?google\.co/"
to="https://www.google.com/" />
<rule from="^http://(?:www\.)?google\.(?:co\.)?(in|kr|ug)/"
to="https://www.google.co.$1/" />
<rule from="^http://(?:www\.)?google\.(?:com\.)?(kz|lk)/"
to="https://www.google.$1/" />
<rule from="^http://(?:www\.)?google\.(?:com\.)?(bo|ec|mx|sg|sl|vn)/"
to="https://www.google.com.$1/" />
<!-- And then the rest:
-->
<rule from="^http://(?:www\.)?google\.(com?\.)?(ae|ar|at|au|bg|bh|br|ca|ch|cl|co|cr|cu|de|eg|es|fi|fr|gh|gt|hr|id|ie|il|it|jo|jp|jm|ke|kw|lb|ly|my|na|ng|nl|no|nz|om|pa|pe|pk|pl|pt|py|qa|ro|ru|rw|sa|se|sv|th|tr|uk|uy|ve|za|zw)/"
to="https://www.google.$1$2/" />
</ruleset>

View File

@ -1,78 +0,0 @@
<ruleset name="Search www.google.com">
<!--
Enabling this ruleset should cause searches to go to
https://www.google.com rather than https://encrypted.google.com. Note that
the filename is important; it must be before GoogleSearch.xml in a bash
expansion of src/chrome/content/rules/*.xml in order to take precedence.
-->
<target host="*.google.com" />
<target host="google.com" />
<target host="www.google.com.*" />
<target host="google.com.*" />
<target host="www.google.co.*" />
<target host="google.co.*" />
<target host="www.google.*" />
<target host="google.*" />
<!-- beyond clients1 these do not currently exist in the ccTLDs,
but just in case... -->
<target host="clients1.google.com.*" />
<target host="clients2.google.com.*" />
<target host="clients3.google.com.*" />
<target host="clients4.google.com.*" />
<target host="clients5.google.com.*" />
<target host="clients6.google.com.*" />
<target host="clients1.google.co.*" />
<target host="clients2.google.co.*" />
<target host="clients3.google.co.*" />
<target host="clients4.google.co.*" />
<target host="clients5.google.co.*" />
<target host="clients6.google.co.*" />
<target host="clients1.google.*" />
<target host="clients2.google.*" />
<target host="clients3.google.*" />
<target host="clients4.google.*" />
<target host="clients5.google.*" />
<target host="clients6.google.*" />
<rule from="^http://www\.google\.com/$"
to="https://www.google.com/"/>
<!-- The most basic case. -->
<rule from="^http://(?:www\.)?google\.com/search"
to="https://www.google.com/search"/>
<!-- A very annoying exception that we seem to need for the basic case -->
<exclusion pattern="^http://(?:www\.)?google\.com/search.*tbs=shop" />
<exclusion pattern="^http://clients[0-9]\.google\.com/.*client=products.*" />
<exclusion pattern="^http://suggestqueries\.google\.com/.*client=.*" />
<!-- https://trac.torproject.org/projects/tor/ticket/9713 -->
<exclusion pattern="^http://clients[0-9]\.google\.com/ocsp" />
<!-- This is necessary for image results links from web search results -->
<exclusion pattern="^http://(?:www\.)?google\.com/search.*tbm=isch.*" />
<rule from="^http://(?:www\.)?google\.com/webhp"
to="https://www.google.com/webhp"/>
<rule from="^http://(?:www\.)?google\.com/#"
to="https://www.google.com/#"/>
<rule from="^http://(?:www\.)?google\.com/$"
to="https://www.google.com/"/>
<!-- Completion urls look like this:
http://clients2.google.co.jp/complete/search?hl=ja&client=hp&expIds=17259,24660,24729,24745&q=m&cp=1 HTTP/1.1\r\n
-->
<rule from="^http://clients[0-9]\.google\.com/complete/search"
to="https://clients1.google.com/complete/search"/>
</ruleset>

View File

@ -1,67 +0,0 @@
<!--
Problematic domains:
- khms *
- khms[0-3] *
* $ 404s
Fully covered domains:
- google.com subdomains:
- khms
- khms[0-3]
-->
<ruleset name="Google Maps">
<target host="maps.google.*" />
<!--
https://trac.torproject.org/projects/tor/ticket/8627
-->
<exclusion pattern="^http://maps\.google\.com/local_url" />
<exclusion pattern="^http://maps\.google\.gr/transitathens" />
<target host="maps.google.co.*" />
<target host="khms.google.com" />
<target host="khms0.google.com" />
<target host="khms1.google.com" />
<target host="khms2.google.com" />
<target host="khms3.google.com" />
<target host="maps-api-ssl.google.com" />
<target host="mw2.google.com" />
<target host="maps.google.com.*" />
<target host="maps.googleapis.com" />
<!--
https://mail1.eff.org/pipermail/https-everywhere-rules/2012-September/001317.html
-->
<!--exclusion pattern="^http://maps\.googleapis\.com/map(files/lib/map_1_20\.swf|sapi/publicapi\?file=flashapi)" /-->
<exclusion pattern="^http://maps\.googleapis\.com/map(?:files/lib/map_\d+_\d+\.swf|sapi/publicapi\?file=flashapi)" />
<target host="maps.gstatic.com" />
<!--securecookie host="^maps\.google\.(com?\.)?(au|ca|gh|ie|in|jm|ke|lk|my|n[agz]|pk|rw|sl|sg|ug|uk|za|zw)$" name=".+" /-->
<securecookie host="^maps\.google\.[\w.]{2,6}$" name=".+" />
<securecookie host="^maps\.g(?:oogle|oogleapis|static)\.com$" name=".+" />
<securecookie host="^maps-api-ssl\.google\.com$" name=".+" />
<rule from="^http://maps\.google\.([^/]+)/"
to="https://maps.google.$1/" />
<!-- http://khms.../$ 404s:
-->
<rule from="^http://khms\d?\.google\.com/+\??$"
to="https://www.google.com/" />
<rule from="^http://(khms\d?|maps-api-ssl|mw2)\.google\.com/"
to="https://$1.google.com/" />
<rule from="^http://maps\.g(oogleapis|static)\.com/"
to="https://maps.g$1.com/" />
<rule from="^https://maps\.googleapis\.com/map(?=files/lib/map_\d+_\d+\.swf|sapi/publicapi\?file=flashapi)"
to="http://maps.googleapis.com/map" downgrade="1" />
</ruleset>

View File

@ -1,6 +0,0 @@
<ruleset name="GoogleMelange">
<target host="www.google-melange.com" />
<target host="google-melange.com" />
<rule from="^http://(www\.)?google-melange\.com/" to="https://www.google-melange.com/" />
</ruleset>

View File

@ -1,135 +0,0 @@
<ruleset name="Google Search">
<target host="google.com" />
<target host="*.google.com" />
<target host="google.com.*" />
<target host="www.google.com.*" />
<target host="google.co.*" />
<target host="www.google.co.*" />
<target host="google.*" />
<target host="www.google.*" />
<!--
Beyond clients1 these do not currently
exist in the ccTLDs, but just in case...
-->
<target host="clients1.google.com.*" />
<target host="clients2.google.com.*" />
<target host="clients3.google.com.*" />
<target host="clients4.google.com.*" />
<target host="clients5.google.com.*" />
<target host="clients6.google.com.*" />
<target host="clients1.google.co.*" />
<target host="clients2.google.co.*" />
<target host="clients3.google.co.*" />
<target host="clients4.google.co.*" />
<target host="clients5.google.co.*" />
<target host="clients6.google.co.*" />
<target host="clients1.google.*" />
<target host="clients2.google.*" />
<target host="clients3.google.*" />
<target host="clients4.google.*" />
<target host="clients5.google.*" />
<target host="clients6.google.*" />
<!-- Some Google pages can generate naive links back to the
unencrypted version of encrypted.google.com, which is
a 301 but theoretically vulnerable to SSL stripping.
-->
<rule from="^http://encrypted\.google\.com/"
to="https://encrypted.google.com/" />
<!-- The most basic case.
-->
<rule from="^http://(?:www\.)?google\.com/search"
to="https://encrypted.google.com/search" />
<!-- A very annoying exception that we
seem to need for the basic case
-->
<exclusion pattern="^http://(?:www\.)?google\.com/search.*tbs=shop" />
<exclusion pattern="^http://clients\d\.google\.com/.*client=products.*" />
<exclusion pattern="^http://suggestqueries\.google\.com/.*client=.*" />
<!-- https://trac.torproject.org/projects/tor/ticket/9713
-->
<exclusion pattern="^http://clients[0-9]\.google\.com/ocsp" />
<!-- This is necessary for image results
links from web search results
-->
<exclusion pattern="^http://(?:www\.)?google\.com/search.*tbm=isch.*" />
<rule from="^http://(?:www\.)?google\.com/about"
to="https://www.google.com/about" />
<!-- There are two distinct cases for these firefox searches -->
<rule from="^http://(?:www\.)?google(?:\.com?)?\.[a-z]{2}/firefox/?$"
to="https://encrypted.google.com/" />
<rule from="^http://(?:www\.)?google(?:\.com?)?\.[a-z]{2}/firefox"
to="https://encrypted.google.com/webhp" />
<rule from="^http://(?:www\.)?google\.com/webhp"
to="https://encrypted.google.com/webhp" />
<rule from="^http://codesearch\.google\.com/"
to="https://codesearch.google.com/" />
<rule from="^http://(?:www\.)?google\.com/codesearch"
to="https://www.google.com/codesearch" />
<rule from="^http://(?:www\.)?google\.com/#"
to="https://encrypted.google.com/#" />
<rule from="^http://(?:www\.)?google\.com/$"
to="https://encrypted.google.com/" />
<!-- Google supports IPv6 search, including
HTTPS with a valid certificate! -->
<rule from="^http://ipv6\.google\.com/"
to="https://ipv6.google.com/" />
<!-- most google international sites look like
"google.fr", some look like "google.co.jp",
and some crazy ones like "google.com.au" -->
<rule from="^http://(www\.)?google(\.com?)?\.([a-z]{2})/(search\?|#)"
to="https://$1google$2.$3/$4" />
<!-- Language preference setting -->
<rule from="^http://(www\.)?google(\.com?)?\.([a-z]{2})/setprefs"
to="https://$1google$2.$3/setprefs" />
<!-- Completion urls look like this:
http://clients2.google.co.jp/complete/search?hl=ja&client=hp&expIds=17259,24660,24729,24745&q=m&cp=1 HTTP/1.1\r\n
-->
<rule from="^http://clients\d\.google\.com/complete/search"
to="https://clients1.google.com/complete/search" />
<rule from="^http://clients\d\.google(\.com?\.[a-z]{2})/complete/search"
to="https://clients1.google.$1/complete/search" />
<rule from="^http://clients\d\.google\.([a-z]{2})/complete/search"
to="https://clients1.google.$1/complete/search" />
<rule from="^http://suggestqueries\.google\.com/complete/search"
to="https://clients1.google.com/complete/search" />
<rule from="^http://(www\.)?google\.(com?\.)?([a-z]{2})/(?:webhp)?$"
to="https://$1google.$2$3/" />
<!-- If there are URL parameters, keep them. -->
<rule from="^http://(www\.)?google\.(com?\.)?([a-z]{2})/(?:webhp)?\?"
to="https://$1google.$2$3/webhp?" />
<!-- teapot -->
<rule from="^http://(www\.)?google(\.com?)?\.([a-z]{2})/teapot"
to="https://$1google$2.$3/teapot" />
</ruleset>
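As the comments in this ruleset note, exclusion patterns carve out URLs that must not be rewritten (shopping results, OCSP requests, image-result links), and an exclusion match takes precedence over every rule in the ruleset. A hedged sketch of that precedence check; the pattern list is an illustrative subset, not the full set above:

import re

# illustrative subset of the exclusions above
exclusions = [
    r"^http://(?:www\.)?google\.com/search.*tbs=shop",
    r"^http://clients[0-9]\.google\.com/ocsp",
]

def should_rewrite(url):
    # any exclusion match wins over all rules in the ruleset
    return not any(re.search(pattern, url) for pattern in exclusions)

print(should_rewrite('http://www.google.com/search?q=searx'))        # True
print(should_rewrite('http://www.google.com/search?q=tv&tbs=shop'))  # False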

View File

@ -1,345 +0,0 @@
<!--
Other Google rulesets:
- 2mdn.net.xml
- Admeld.xml
- ChannelIntelligence.com.xml
- Doubleclick.net.xml
- FeedBurner.xml
- Google.org.xml
- GoogleAPIs.xml
- Google_App_Engine.xml
- GoogleImages.xml
- GoogleShopping.xml
- Ingress.xml
- Meebo.xml
- Orkut.xml
- Postini.xml
- WebM_Project.org.xml
Nonfunctional domains:
- feedproxy.google.com (404, valid cert)
- partnerpage.google.com *
- safebrowsing.clients.google.com (404, mismatched)
- (www.)googlesyndicatedsearch.com (404; mismatched, CN: google.com)
- buttons.googlesyndication.com *
* 404, valid cert
Nonfunctional google.com paths:
- analytics (redirects to http)
- imgres
- gadgets *
- hangouts (404)
- u/ (404)
* Redirects to http
Problematic domains:
- www.goo.gl (404; mismatched, CN: *.google.com)
- google.com subdomains:
- books (googlebooks/, images/, & intl/ 404, but works when rewritten to www)
- cbks0 ****
- earth *
- gg ($ 404s)
- knoll *
- scholar **
- trends *
- news.google.cctld **
- scholar.google.cctld **
- *-opensocial.googleusercontent.com ***
**** $ 404s
* 404, valid cert
** Redirects to http, valid cert
*** Breaks followers widget - https://trac.torproject.org/projects/tor/ticket/7294
Partially covered domains:
- google.cctld subdomains:
- scholar (→ www)
- google.com subdomains:
- (www.)
- cbks0 ($ 404s)
- gg ($ 404s)
- news (→ www)
- scholar (→ www)
- *.googleusercontent.com (*-opensocial excluded)
Fully covered domains:
- lh[3-6].ggpht.com
- (www.)goo.gl (www → ^)
- google.com subdomains:
- accounts
- adwords
- apis
- appengine
- books (→ encrypted)
- calendar
- checkout
- chrome
- clients[12]
- code
- *.corp
- developers
- dl
- docs
- docs\d
- \d.docs
- drive
- earth (→ www)
- encrypted
- encrypted-tbn[123]
- feedburner
- fiber
- finance
- glass
- groups
- health
- helpouts
- history
- hostedtalkgadget
- id
- investor
- knol
- knoll (→ knol)
- lh\d
- mail
- chatenabled.mail
- pack
- picasaweb
- pki
- play
- plus
- plusone
- productforums
- profiles
- safebrowsing-cache
- cert-test.sandbox
- plus.sandbox
- sb-ssl
- script
- security
- services
- servicessites
- sites
- spreadsheets
- spreadsheets\d
- support
- talk
- talkgadget
- tbn2 (→ encrypted-tbn2)
- tools
- trends (→ www)
- partner.googleadservices.com
- (www.)googlecode.com
- *.googlecode.com (per-project subdomains)
- googlesource.com
- *.googlesource.com
- pagead2.googlesyndication.com
- tpc.googlesyndication.com
- mail-attachment.googleusercontent.com
- webcache.googleusercontent.com
XXX: Needs more testing
-->
<ruleset name="Google Services">
<target host="*.ggpht.com" />
<target host="gmail.com" />
<target host="www.gmail.com" />
<target host="goo.gl" />
<target host="www.goo.gl" />
<target host="google.*" />
<target host="accounts.google.*" />
<target host="adwords.google.*" />
<target host="finance.google.*" />
<target host="groups.google.*" />
<target host="it.google.*" />
<target host="news.google.*" />
<exclusion pattern="^http://(?:news\.)?google\.com/(?:archivesearch|newspapers)" />
<target host="picasaweb.google.*" />
<target host="scholar.google.*" />
<target host="www.google.*" />
<target host="*.google.ca" />
<target host="google.co.*" />
<target host="accounts.google.co.*" />
<target host="adwords.google.co.*" />
<target host="finance.google.co.*" />
<target host="groups.google.co.*" />
<target host="id.google.co.*" />
<target host="news.google.co.*" />
<target host="picasaweb.google.co.*" />
<target host="scholar.google.co.*" />
<target host="www.google.co.*" />
<target host="google.com" />
<target host="*.google.com" />
<exclusion pattern="^http://(?:www\.)?google\.com/analytics/*(?:/[^/]+)?(?:\?.*)?$" />
<!--exclusion pattern="^http://books\.google\.com/(?!books/(\w+\.js|css/|javascript/)|favicon\.ico|googlebooks/|images/|intl/)" /-->
<exclusion pattern="^http://cbks0\.google\.com/(?:$|\?)" />
<exclusion pattern="^http://gg\.google\.com/(?!csi(?:$|\?))" />
<target host="google.com.*" />
<target host="accounts.google.com.*" />
<target host="adwords.google.com.*" />
<target host="groups.google.com.*" />
<target host="id.google.com.*" />
<target host="news.google.com.*" />
<target host="picasaweb.google.com.*" />
<target host="scholar.google.com.*" />
<target host="www.google.com.*" />
<target host="partner.googleadservices.com" />
<target host="googlecode.com" />
<target host="*.googlecode.com" />
<target host="googlemail.com" />
<target host="www.googlemail.com" />
<target host="googlesource.com" />
<target host="*.googlesource.com" />
<target host="*.googlesyndication.com" />
<target host="www.googletagservices.com" />
<target host="googleusercontent.com" />
<target host="*.googleusercontent.com" />
<!--
Necessary for the Followers widget:
https://trac.torproject.org/projects/tor/ticket/7294
-->
<exclusion pattern="http://[^@:\./]+-opensocial\.googleusercontent\.com" />
<!-- Can we secure any of these wildcard cookies safely?
-->
<!--securecookie host="^\.google\.com$" name="^(hl|I4SUserLocale|NID|PREF|S)$" /-->
<!--securecookie host="^\.google\.[\w.]{2,6}$" name="^(hl|I4SUserLocale|NID|PREF|S|S_awfe)$" /-->
<securecookie host="^(?:accounts|adwords|\.code|login\.corp|developers|docs|\d\.docs|fiber|mail|picasaweb|plus|\.?productforums|support)\.google\.[\w.]{2,6}$" name=".+" />
<securecookie host="^www\.google\.com$" name="^GoogleAccountsLocale_session$" />
<securecookie host="^mail-attachment\.googleusercontent\.com$" name=".+" />
<securecookie host="^gmail\.com$" name=".+" />
<securecookie host="^www\.gmail\.com$" name=".+" />
<securecookie host="^googlemail\.com$" name=".+" />
<securecookie host="^www\.googlemail\.com$" name=".+" />
<!-- - lh 3-6 exist
- All appear identical
- Identical to lh\d.googleusercontent.com
-->
<rule from="^http://lh(\d)\.ggpht\.com/"
to="https://lh$1.ggpht.com/" />
<rule from="^http://lh(\d)\.google\.ca/"
to="https://lh$1.google.ca/" />
<rule from="^http://(www\.)?g(oogle)?mail\.com/"
to="https://$1g$2mail.com/" />
<rule from="^http://(?:www\.)?goo\.gl/"
to="https://goo.gl/" />
<!-- Redirects to http when rewritten to www:
-->
<rule from="^http://books\.google\.com/"
to="https://encrypted.google.com/" />
<!-- tisp$ 404s:
-->
<rule from="^http://(?:www\.)?google\.((?:com?\.)?\w{2,3})/tisp(?=$|\?)"
to="https://www.google.$1/tisp/" />
<!-- Paths that work on all in google.*
-->
<rule from="^http://(?:www\.)?google\.((?:com?\.)?\w{2,3})/(accounts|adplanner|ads|adsense|adwords|analytics|bookmarks|chrome|contacts|coop|cse|css|culturalinstitute|doodles|earth|favicon\.ico|finance|get|goodtoknow|googleblogs|grants|green|hostednews|images|intl|js|landing|logos|mapmaker|newproducts|news|nexus|patents|policies|prdhp|profiles|products|reader|s2|settings|shopping|support|tisp|tools|transparencyreport|trends|urchin|webmasters)(?=$|[?/])"
to="https://www.google.$1/$2" />
<!-- Paths that 404 on .ccltd, but work on .com:
-->
<rule from="^http://(?:www\.)?google\.(?:com?\.)?\w{2,3}/(?=calendar|dictionary|doubleclick|help|ideas|pacman|postini|powermeter|url)"
to="https://www.google.com/" />
<rule from="^http://(?:www\.)?google\.(?:com?\.)?\w{2,3}/custom"
to="https://www.google.com/cse" />
<!-- Paths that only exist/work on .com
-->
<rule from="^http://(?:www\.)?google\.com/(\+|appsstatus|books|buzz|extern_js|glass|googlebooks|ig|insights|moderator|phone|safebrowsing|videotargetting|webfonts)(?=$|[?/])"
to="https://www.google.com/$1" />
<!-- Subdomains that work on all in google.*
-->
<rule from="^http://(accounts|adwords|finance|groups|id|picasaweb|)\.google\.((?:com?\.)?\w{2,3})/"
to="https://$1.google.$2/" />
<!-- Subdomains that only exist/work on .com
-->
<rule from="^http://(apis|appengine|books|calendar|cbks0|chat|checkout|chrome|clients[12]|code|[\w-]+\.corp|developers|dl|docs\d?|\d\.docs|drive|encrypted|encrypted-tbn[123]|feedburner|fiber|fonts|gg|glass||health|helpouts|history|(?:hosted)?talkgadget|investor|lh\d|(?:chatenabled\.)?mail|pack|pki|play|plus(?:\.sandbox)?|plusone|productforums|profiles|safebrowsing-cache|cert-test\.sandbox|sb-ssl|script|security|services|servicessites|sites|spreadsheets\d?|support|talk|tools)\.google\.com/"
to="https://$1.google.com/" />
<exclusion pattern="^http://clients[0-9]\.google\.com/ocsp"/>
<rule from="^http://earth\.google\.com/"
to="https://www.google.com/earth/" />
<rule from="^http://scholar\.google\.((?:com?\.)?\w{2,3})/intl/"
to="https://www.google.$1/intl/" />
<rule from="^http://(?:encrypted-)?tbn2\.google\.com/"
to="https://encrypted-tbn2.google.com/" />
<rule from="^http://knoll?\.google\.com/"
to="https://knol.google.com/" />
<rule from="^http://news\.google\.(?:com?\.)?\w{2,3}/(?:$|news|newshp)"
to="https://www.google.com/news" />
<rule from="^http://trends\.google\.com/"
to="https://www.google.com/trends" />
<rule from="^http://([^/:@\.]+\.)?googlecode\.com/"
to="https://$1googlecode.com/" />
<rule from="^http://([^\./]\.)?googlesource\.com/"
to="https://$1googlesource.com/" />
<rule from="^http://partner\.googleadservices\.com/"
to="https://partner.googleadservices.com/" />
<rule from="^http://(pagead2|tpc)\.googlesyndication\.com/"
to="https://$1.googlesyndication.com/" />
<!-- !www doesn't exist.
-->
<rule from="^http://www\.googletagservices\.com/tag/js/"
to="https://www.googletagservices.com/tag/js/" />
<rule from="^http://([^@:\./]+)\.googleusercontent\.com/"
to="https://$1.googleusercontent.com/" />
</ruleset>

View File

@ -1,28 +0,0 @@
<!--
For other Google coverage, see GoogleServices.xml.
-->
<ruleset name="Google Shopping">
<target host="google.*" />
<target host="www.google.*" />
<target host="google.co.*" />
<target host="www.google.co.*" />
<target host="*.google.com" />
<target host="google.com.*" />
<target host="www.google.com.*" />
<rule from="^http://encrypted\.google\.com/(prdhp|shopping)"
to="https://www.google.com/$1" />
<rule from="^http://shopping\.google\.com/"
to="https://shopping.google.com/" />
<rule from="^http://(?:encrypted|www)\.google\.com/(.*tbm=shop)"
to="https://www.google.com/$1" />
<rule from="^http://(?:www\.)?google\.((?:com?\.)?(?:ae|ar|at|au|bg|bh|bo|br|ca|ch|cl|cr|co|cu|de|ec|eg|es|fi|fr|gh|gt|hr|id|ie|il|in|it|jm|jo|jp|ke|kr|kw|kz|lb|lk|ly|mx|my|na|ng|nl|no|nz|om|pa|pe|pk|pl|pt|py|qa|ro|ru|rw|sa|sg|sl|se|sv|th|tr|ug|uk|uy|ve|vn|za|zw))/(?=prdhp|shopping)"
to="https://www.google.com/$1" />
</ruleset>

View File

@ -1,7 +0,0 @@
<ruleset name="GoogleSorry">
<target host="sorry.google.com" />
<target host="www.google.com" />
<target host="google.com" />
<rule from="^http://((sorry|www)\.)?google\.com/sorry/" to="https://sorry.google.com/sorry/" />
</ruleset>

View File

@ -1,8 +0,0 @@
<ruleset name="Google Translate (broken)" default_off="redirect loops">
<target host="translate.googleapis.com" />
<target host="translate.google.com" />
<rule from="^http://translate\.googleapis\.com/" to="https://translate.googleapis.com/"/>
<rule from="^http://translate\.google\.com/"
to="https://translate.google.com/" />
</ruleset>

View File

@ -1,83 +0,0 @@
<ruleset name="Google Videos">
<target host="*.google.com" />
<target host="google.com" />
<target host="www.google.com.*" />
<target host="google.com.*" />
<target host="www.google.co.*" />
<target host="google.co.*" />
<target host="www.google.*" />
<target host="google.*" />
<rule from="^http://encrypted\.google\.com/videohp"
to="https://encrypted.google.com/videohp" />
<!-- https://videos.google.com is currently broken; work around that... -->
<rule from="^https?://videos?\.google\.com/$"
to="https://encrypted.google.com/videohp" />
<rule from="^http://(?:www\.)?google\.com/videohp"
to="https://encrypted.google.com/videohp" />
<rule from="^http://(?:images|www|encrypted)\.google\.com/(.*tbm=isch)"
to="https://encrypted.google.com/$1" />
<rule
from="^http://(?:www\.)?google\.(?:com?\.)?(?:au|ca|gh|ie|in|jm|ke|lk|my|na|ng|nz|pk|rw|sl|sg|ug|uk|za|zw)/videohp"
to="https://encrypted.google.com/videohp" />
<rule
from="^http://(?:www\.)?google\.(?:com?\.)?(?:ar|bo|cl|co|cu|cr|ec|es|gt|mx|pa|pe|py|sv|uy|ve)/videohp$"
to="https://encrypted.google.com/videohp?hl=es" />
<rule
from="^http://(?:www\.)?google\.(?:com\.)?(?:ae|bh|eg|jo|kw|lb|ly|om|qa|sa)/videohp$"
to="https://encrypted.google.com/videohp?hl=ar" />
<rule from="^http://(?:www\.)?google\.(?:at|ch|de)/videohp$"
to="https://encrypted.google.com/videohp?hl=de" />
<rule from="^http://(?:www\.)?google\.(fr|nl|it|pl|ru|bg|pt|ro|hr|fi|no)/videohp$"
to="https://encrypted.google.com/videohp?hl=$1" />
<rule from="^http://(?:www\.)?google\.com?\.(id|th|tr)/videohp$"
to="https://encrypted.google.com/videohp?hl=$1" />
<rule from="^http://(?:www\.)?google\.com\.il/videohp$"
to="https://encrypted.google.com/videohp?hl=he" />
<rule from="^http://(?:www\.)?google\.com\.kr/videohp$"
to="https://encrypted.google.com/videohp?hl=ko" />
<rule from="^http://(?:www\.)?google\.com\.kz/videohp$"
to="https://encrypted.google.com/videohp?hl=kk" />
<rule from="^http://(?:www\.)?google\.com\.jp/videohp$"
to="https://encrypted.google.com/videohp?hl=ja" />
<rule from="^http://(?:www\.)?google\.com\.vn/videohp$"
to="https://encrypted.google.com/videohp?hl=vi" />
<rule from="^http://(?:www\.)?google\.com\.br/videohp$"
to="https://encrypted.google.com/videohp?hl=pt-BR" />
<rule from="^http://(?:www\.)?google\.se/videohp$"
to="https://encrypted.google.com/videohp?hl=sv" />
<!-- If there are URL parameters, keep them. -->
<rule
from="^http://(?:www\.)?google\.(?:com?\.)?(?:ar|bo|cl|co|cu|cr|ec|es|gt|mx|pa|pe|py|sv|uy|ve)/videohp\?"
to="https://encrypted.google.com/videohp?hl=es&#38;" />
<rule
from="^http://(?:www\.)?google\.(?:com\.)?(?:ae|bh|eg|jo|kw|lb|ly|om|qa|sa)/videohp\?"
to="https://encrypted.google.com/videohp?hl=ar&#38;" />
<rule from="^http://(?:www\.)?google\.(?:at|ch|de)/videohp\?"
to="https://encrypted.google.com/videohp?hl=de&#38;" />
<rule from="^http://(?:www\.)?google\.(fr|nl|it|pl|ru|bg|pt|ro|hr|fi|no)/videohp\?"
to="https://encrypted.google.com/videohp?hl=$1&#38;" />
<rule from="^http://(?:www\.)?google\.com?\.(id|th|tr)/videohp\?"
to="https://encrypted.google.com/videohp?hl=$1&#38;" />
<rule from="^http://(?:www\.)?google\.com\.il/videohp\?"
to="https://encrypted.google.com/videohp?hl=he&#38;" />
<rule from="^http://(?:www\.)?google\.com\.kr/videohp\?"
to="https://encrypted.google.com/videohp?hl=ko&#38;" />
<rule from="^http://(?:www\.)?google\.com\.kz/videohp\?"
to="https://encrypted.google.com/videohp?hl=kk&#38;" />
<rule from="^http://(?:www\.)?google\.com\.jp/videohp\?"
to="https://encrypted.google.com/videohp?hl=ja&#38;" />
<rule from="^http://(?:www\.)?google\.com\.vn/videohp\?"
to="https://encrypted.google.com/videohp?hl=vi&#38;" />
<rule from="^http://(?:www\.)?google\.com\.br/videohp\?"
to="https://encrypted.google.com/videohp?hl=pt-BR&#38;" />
<rule from="^http://(?:www\.)?google\.se/videohp\?"
to="https://encrypted.google.com/videohp?hl=sv&#38;" />
<rule from="^http://video\.google\.com/ThumbnailServer2"
to="https://video.google.com/ThumbnailServer2" />
</ruleset>

View File

@ -1,17 +0,0 @@
<!--
gwbhrd.appspot.com
-->
<ruleset name="GoogleWatchBlog">
<target host="googlewatchblog.de" />
<target host="*.googlewatchblog.de" />
<securecookie host="^(?:www)?\.googlewatchblog\.de$" name=".+" />
<rule from="^http://(static\.|www\.)?googlewatchblog\.de/"
to="https://$1googlewatchblog.de/" />
</ruleset>

View File

@ -1,21 +0,0 @@
<!--
For other Google coverage, see GoogleServices.xml.
-->
<ruleset name="Google App Engine">
<target host="appspot.com" />
<target host="*.appspot.com" />
<!--
Redirects to http for some reason.
-->
<exclusion pattern="^http://photomunchers\.appspot\.com/" />
<securecookie host="^.+\.appspot\.com$" name=".+" />
<rule from="^http://([^@:\./]+\.)?appspot\.com/"
to="https://$1appspot.com/" />
</ruleset>

View File

@ -1,16 +0,0 @@
<!-- This rule was automatically generated based on an HSTS
preload rule in the Chromium browser. See
https://src.chromium.org/viewvc/chrome/trunk/src/net/base/transport_security_state.cc
for the list of preloads. Sites are added to the Chromium HSTS
preload list on request from their administrators, so HTTPS should
work properly everywhere on this site.
Because Chromium and derived browsers automatically force HTTPS for
every access to this site, this rule applies only to Firefox. -->
<ruleset name="Googleplex.com (default off)" platform="firefox" default_off="Certificate error">
<target host="googleplex.com" />
<securecookie host="^googleplex\.com$" name=".+" />
<rule from="^http://googleplex\.com/" to="https://googleplex.com/" />
</ruleset>

View File

@ -1,15 +0,0 @@
<ruleset name="OpenStreetMap">
<target host="openstreetmap.org"/>
<target host="*.openstreetmap.org"/>
<rule from="^http://(?:www\.)?openstreetmap\.org/"
to="https://www.openstreetmap.org/"/>
<rule from="^http://tile\.openstreetmap\.org/"
to="https://a.tile.openstreetmap.org/"/>
<rule from="^http://(blog|help|lists|nominatim|piwik|taginfo|[abc]\.tile|trac|wiki)\.openstreetmap\.org/"
to="https://$1.openstreetmap.org/"/>
</ruleset>

View File

@ -1,14 +0,0 @@
<!--
www: cert only matches ^rawgithub.com
-->
<ruleset name="rawgithub.com">
<target host="rawgithub.com" />
<target host="www.rawgithub.com" />
<rule from="^http://(?:www\.)?rawgithub\.com/"
to="https://rawgithub.com/" />
</ruleset>

View File

@ -1,101 +0,0 @@
<!--
CDN buckets:
- akmedia-a.akamaihd.net
- soundcloud.assistly.com
- help.soundcloud.com
- cs70.wac.edgecastcdn.net
- a1.sndcdn.com
- i1.sndcdn.com
- w1.sndcdn.com
- wpc.658D.edgecastcdn.net
- m-a.sndcdn.com.edgesuite.net
- soundcloud.gettyimages.com
- scbackstage.wpengine.netdna-cdn.com
- ssl doesn't exist
- backstage.soundcloud.com
- soundcloud.wpengine.netdna-cdn.com
- -ssl doesn't exist
- blog.soundcloud.com
- gs1.wpc.v2cdn.netcdn.net
- gs1.wpc.v2cdn.net
- ec-media.soundcloud.com
Nonfunctional soundcloud.com subdomains:
- help (redirects to http, mismatched, CN: *.assistly.com)
- m (redirects to http)
- media
- status (times out)
Problematic domains:
- m-a.sndcdn.com (works, akamai)
Partially covered domains:
- backstage.soundcloud.com
Fully covered domains:
- sndcdn.com subdomains:
- a[12]
- api
- i[1-4]
- w[12]
- wis
- soundcloud.com subdomains:
- (www.)
- api
- blog
- connect
- developers
- ec-media
- eventlogger
- help-assets
- media
- visuals
- w
-->
<ruleset name="Soundcloud (partial)">
<target host="scbackstage.wpengine.netdna-cdn.com" />
<target host="soundcloud.wpengine.netdna-cdn.com" />
<target host="*.sndcdn.com" />
<target host="soundcloud.com" />
<target host="*.soundcloud.com" />
<exclusion pattern="^https?://(?:scbackstage\.wpengine\.netdna-cdn|backstage\.soundcloud)\.com/(?!wp-content/)" />
<rule from="^http://([aiw]\d|api|wis)\.sndcdn\.com/"
to="https://$1.sndcdn.com/" />
<rule from="^http://((?:api|backstage|blog|connect|developers|ec-media|eventlogger|help-assets|media|visuals|w|www)\.|)soundcloud\.com/"
to="https://$1soundcloud.com/" />
<rule from="^https?://scbackstage\.wpengine\.netdna-cdn\.com/"
to="https://backstage.soundcloud.com/" />
<rule from="^https?://soundcloud\.wpengine\.netdna-cdn\.com/"
to="https://blog.soundcloud.com/" />
</ruleset>

View File

@ -1,36 +0,0 @@
<!--
Nonfunctional:
- image.bayimg.com
- (www.)thepiratebay.sx (http reply)
For problematic rules, see ThePirateBay-mismatches.xml.
-->
<ruleset name="The Pirate Bay (partial)">
<target host="suprbay.org" />
<target host="*.suprbay.org" />
<!-- * for cross-domain cookie -->
<target host="*.forum.suprbay.org" />
<target host="thepiratebay.org"/>
<target host="*.thepiratebay.org"/>
<target host="thepiratebay.se"/>
<target host="*.thepiratebay.se"/>
<securecookie host="^.*\.suprbay\.org$" name=".*" />
<securecookie host="^(.*\.)?thepiratebay\.se$" name=".*"/>
<!-- Cert doesn't match (www.), redirects like so. -->
<rule from="^https?://(?:forum\.|www\.)?suprbay\.org/"
to="https://forum.suprbay.org/" />
<rule from="^http://(?:www\.)?thepiratebay\.(?:org|se)/"
to="https://thepiratebay.se/"/>
<rule from="^http://(rss|static|torrents)\.thepiratebay\.(?:org|se)/"
to="https://$1.thepiratebay.se/"/>
</ruleset>

View File

@ -1,18 +0,0 @@
<ruleset name="Tor Project">
<target host="torproject.org" />
<target host="*.torproject.org" />
<exclusion pattern="^http://torperf\.torproject\.org/" />
<!-- Not secured by server:
-->
<!--securecookie host="^\.blog\.torproject\.org$" name="^SESS[0-9a-f]{32}$" /-->
<securecookie host="^(?:.*\.)?torproject\.org$" name=".+" />
<rule from="^http://([^/:@\.]+\.)?torproject\.org/"
to="https://$1torproject.org/" />
</ruleset>

View File

@ -1,169 +0,0 @@
<!--
Other Twitter rulesets:
- Twitter_Community.com.xml
Nonfunctional domains:
- status.twitter.com *
- status.twitter.jp *
* Tumblr
CDN buckets:
- a1095.g.akamai.net/=/1095/134446/1d/platform.twitter.com/ | platform2.twitter.com.edgesuite.net
- platform2.twitter.com
- twitter-any.s3.amazonaws.com
- twitter-blog.s3.amazonaws.com
- d2rdfnizen5apl.cloudfront.net
- s.twimg.com
- ssl2.twitter.com.edgekey.net
- twitter.github.com
Problematic domains:
- twimg.com subdomains:
- a5 *
- s (cloudfront)
- twitter.com subdomains:
- platform[0-3] (403, akamai)
* akamai
Fully covered domains:
- (www.)t.co (www → ^)
- twimg.com subdomains:
- a[5-9] (→ si0)
- a\d
- abs
- dnt
- ea
- g
- g2
- gu
- hca
- jp
- ma
- ma[0123]
- o
- p
- pbs
- r
- s (→ d2rdfnizen5apl.cloudfront.net)
- si[0-5]
- syndication
- cdn.syndication
- tailfeather
- ton
- v
- widgets
- twitter.com subdomains:
- (www.)
- 201[012]
- about
- ads
- analytics
- api
- cdn.api
- urls.api
- blog
- business
- preview.cdn
- preview-dev.cdn
- preview-stage.cdn
- de
- dev
- en
- engineering
- es
- firefox
- fr
- it
- ja
- jp
- m
- media
- mobile
- music
- oauth
- p
- pic
- platform
- platform[0-3] (→ platform)
- widgets.platform
- search
- static
- support
- transparency
- upload
These altnames don't exist:
- i3.twimg.com
- p-dev.twimg.com
- vmtc.twimg.com
- cdn-dev.api.twitter.com
-->
<ruleset name="Twitter">
<target host="t.co" />
<target host="*.t.co" />
<target host="*.twimg.com" />
<target host="twitter.com" />
<target host="*.twitter.com" />
<!-- Secured by server:
-->
<!--securecookie host="^\.twitter\.com$" name="^_twitter_sess$" /-->
<!--securecookie host="^support\.twitter\.com$" name="^_help_center_session$" /-->
<!--
Not secured by server:
-->
<!--securecookie host="^\.t\.co$" name="^muc$" /-->
<!--securecookie host="^\.twitter\.com$" name="^guest_id$" /-->
<securecookie host="^\.t\.co$" name=".+" />
<securecookie host="^(?:.*\.)?twitter\.com$" name=".+" />
<rule from="^http://(?:www\.)?t\.co/"
to="https://t.co/" />
<rule from="^http://a[5-9]\.twimg\.com/"
to="https://si0.twimg.com/" />
<rule from="^http://(abs|a\d|dnt|ea|g[2u]?|hca|jp|ma\d?|o|p|pbs|r|si\d|(?:cdn\.)?syndication|tailfeather|ton|v|widgets)\.twimg\.com/"
to="https://$1.twimg.com/" />
<rule from="^http://s\.twimg\.com/"
to="https://d2rdfnizen5apl.cloudfront.net/" />
<rule from="^http://((?:201\d|about|ads|analytics|blog|(?:cdn\.|urls\.)?api|business|preview(?:-dev|-stage)?\.cdn|de|dev|engineering|en|es|firefox|fr|it|ja|jp|m|media|mobile|music|oauth|p|pic|platform|widgets\.platform|search|static|support|transparency|upload|www)\.)?twitter\.com/"
to="https://$1twitter.com/" />
<rule from="^http://platform\d\.twitter\.com/"
to="https://platform.twitter.com/" />
</ruleset>

View File

@ -1,75 +0,0 @@
<!--
CDN buckets:
- av.vimeo.com.edgesuite.net
- a808.g.akamai.net
- pdl.vimeocdn.com.edgesuite.net
- a1189.g.akamai.net
Problematic subdomains:
- av (pdl.../crossdomain.xml restricts to port 80)
- pdl (works, akamai)
Partially covered subdomains:
- developer (some pages redirect to http)
- pdl (→ akamai)
Fully covered subdomains:
- (www.)
- secure
Default off per https://trac.torproject.org/projects/tor/ticket/7569 -->
<ruleset name="Vimeo (default off)" default_off="breaks some video embedding">
<target host="vimeo.com" />
<target host="*.vimeo.com" />
<exclusion pattern="^http://av\.vimeo\.com/crossdomain\.xml" />
<!--exclusion pattern="^http://developer\.vimeo\.com/($|\?|(apps|guidelines|help|player)($|[?/]))" /-->
<exclusion pattern="^http://developer\.vimeo\.com/(?!apis(?:$|[?/])|favicon\.ico)" />
<target host="*.vimeocdn.com" />
<!--
Uses crossdomain.xml from s3.amazonaws.com, which sets secure="false"
https://mail1.eff.org/pipermail/https-everywhere/2012-October/001583.html
-->
<exclusion pattern="^http://a\.vimeocdn\.com/p/flash/moogaloop/" />
<!-- We cannot secure streams because crossdomain.xml
restricts to port 80 :(
-->
<exclusion pattern="^http://pdl\.vimeocdn\.com/(?!crossdomain\.xml)" />
<!-- Tracking cookies:
-->
<securecookie host="^\.(?:player\.)?vimeo\.com$" name="^__utm\w$" />
<rule from="^http://((?:developer|player|secure|www)\.)?vimeo\.com/"
to="https://$1vimeo.com/" />
<rule from="^http://av\.vimeo\.com/"
to="https://a248.e.akamai.net/f/808/9207/8m/av.vimeo.com/" />
<!-- a & b: Akamai -->
<rule from="^http://(?:secure-)?([ab])\.vimeocdn\.com/"
to="https://secure-$1.vimeocdn.com/" />
<rule from="^http://i\.vimeocdn\.com/"
to="https://i.vimeocdn.com/" />
<rule from="^http://pdl\.vimeocdn\.com/"
to="https://a248.e.akamai.net/f/1189/4415/8d/pdl.vimeocdn.com/" />
</ruleset>

View File

@ -1,13 +0,0 @@
<ruleset name="WikiLeaks">
<target host="wikileaks.org" />
<target host="*.wikileaks.org" />
<securecookie host="^(?:w*\.)?wikileaks\.org$" name=".+" />
<rule from="^http://((?:chat|search|shop|www)\.)?wikileaks\.org/"
to="https://$1wikileaks.org/" />
</ruleset>

View File

@ -1,107 +0,0 @@
<!--
Wikipedia and other Wikimedia Foundation wikis previously had no real HTTPS support, and
URLs had to be rewritten to https://secure.wikimedia.org/$wikitype/$language/ . This is no
longer the case, see https://blog.wikimedia.org/2011/10/03/native-https-support-enabled-for-all-wikimedia-foundation-wikis/ ,
so this file is a lot simpler these days.
Mixed content:
- Images, on:
- stats.wikimedia.org from upload.wikimedia.org *
- stats.wikimedia.org from wikimediafoundation.org *
* Secured by us
-->
<ruleset name="Wikimedia">
<target host="enwp.org" />
<target host="frwp.org" />
<target host="mediawiki.org" />
<target host="www.mediawiki.org" />
<target host="wikimedia.org" />
<target host="*.wikimedia.org" />
<exclusion pattern="^http://(?:apt|cs|cz|parsoid-lb\.eqiad|status|torrus|ubuntu)\.wikimedia\.org" />
<!-- https://mail1.eff.org/pipermail/https-everywhere-rules/2012-June/001189.html -->
<exclusion pattern="^http://lists\.wikimedia\.org/pipermail(?:$|/)" />
<target host="wikimediafoundation.org" />
<target host="www.wikimediafoundation.org" />
<!-- Wikimedia projects (also some wikimedia.org subdomains) -->
<target host="wikibooks.org" />
<target host="*.wikibooks.org" />
<target host="wikidata.org" />
<target host="*.wikidata.org" />
<target host="wikinews.org" />
<target host="*.wikinews.org" />
<target host="wikipedia.org" />
<target host="*.wikipedia.org" />
<target host="wikiquote.org" />
<target host="*.wikiquote.org" />
<target host="wikisource.org" />
<target host="*.wikisource.org" />
<target host="wikiversity.org" />
<target host="*.wikiversity.org" />
<target host="wikivoyage.org" />
<target host="*.wikivoyage.org" />
<target host="wiktionary.org" />
<target host="*.wiktionary.org" />
<!-- Wikimedia chapters -->
<target host="wikimedia.ca" />
<target host="www.wikimedia.ca" />
<!-- Wikimedia Tool Labs -->
<target host="tools.wmflabs.org" />
<target host="icinga.wmflabs.org" />
<target host="ganglia.wmflabs.org" />
<!-- Not secured by server:
-->
<!--securecookie host="^\.wiki(books|ipedia)\.org$" name="^GeoIP$" /-->
<securecookie host="^^\.wik(?:ibooks|idata|imedia|inews|ipedia|iquote|isource|iversity|ivoyage|tionary)\.org$" name="^GeoIP$" />
<securecookie host="^([^@:/]+\.)?wik(ibooks|idata|inews|ipedia|iquote|isource|iversity|ivoyage|tionary)\.org$" name=".*" />
<securecookie host="^(species|commons|meta|incubator|wikitech).wikimedia.org$" name=".*" />
<securecookie host="^(?:www\.)?mediawiki\.org$" name=".*" />
<securecookie host="^wikimediafoundation.org$" name=".*" />
<rule from="^http://(en|fr)wp\.org/"
to="https://$1.wikipedia.org/wiki/" />
<rule from="^http://(?:www\.)?mediawiki\.org/"
to="https://www.mediawiki.org/" />
<rule from="^https?://download\.wikipedia\.org/"
to="https://dumps.wikimedia.org/" />
<rule from="^https?://(download|dataset2|sitemap)\.wikimedia\.org/"
to="https://dumps.wikimedia.org/" />
<rule from="^https?://(labs-ns[01]|virt0)\.wikimedia\.org/"
to="https://wikitech.wikimedia.org/" />
<rule from="^https?://noboard\.chapters\.wikimedia\.org/"
to="https://noboard-chapters.wikimedia.org/" />
<rule from="^https?://wg\.en\.wikipedia\.org/"
to="https://wg-en.wikipedia.org/" />
<rule from="^https?://arbcom\.(de|en|fi|nl)\.wikipedia\.org/"
to="https://arbcom-$1.wikipedia.org/" />
<rule from="^http://([^@:/]+\.)?wik(ibooks|idata|imedia|inews|ipedia|iquote|isource|iversity|ivoyage|tionary)\.org/"
to="https://$1wik$2.org/" />
<rule from="^http://(www\.)?wikimediafoundation\.org/"
to="https://$1wikimediafoundation.org/" />
<rule from="^http://(www\.)?wikimedia\.ca/"
to="https://wikimedia.ca/" />
<rule from="^http://([^@:/]+)\.wmflabs\.org/"
to="https://$1.wmflabs.org/" />
</ruleset>

File diff suppressed because it is too large

View File

@ -1,46 +0,0 @@
<ruleset name="YouTube (partial)">
<target host="youtube.com" />
<target host="*.youtube.com" />
<exclusion pattern="^http://(?:www\.)?youtube\.com/crossdomain\.xml"/>
<exclusion pattern="^http://(?:www\.)?youtube\.com/(?:apiplayer|api_video_info)"/>
<exclusion pattern="^http://(?:[^/@:\.]+\.)?ytimg\.com/.*apiplayer[0-9]*\.swf"/>
<target host="*.ytimg.com" />
<target host="youtu.be" />
<target host="youtube-nocookie.com"/>
<target host="www.youtube-nocookie.com"/>
<target host="*.googlevideo.com"/>
<exclusion pattern="^http://([^/@:\.]+)\.googlevideo\.com/crossdomain\.xml"/>
<!-- Not secured by server:
-->
<!--securecookie host="^\.youtube\.com$" name="^(GEUP|PREF|VISITOR_INFO1_LIVE|YSC)$" /-->
<!-- observed ^. cookies:
- use_hitbox
- VISITOR_INFO1_LIVE
- recently_watched_video_id_list
- .youtube.com -->
<securecookie host="^\.youtube\.com" name=".*"/>
<rule from="^http://(www\.)?youtube\.com/"
to="https://$1youtube.com/"/>
<rule from="^http://(br|de|es|fr|il|img|insight|jp|m|nl|uk)\.youtube\.com/"
to="https://$1.youtube.com/"/>
<rule from="^http://([^/@:\.]+)\.ytimg\.com/"
to="https://$1.ytimg.com/"/>
<rule from="^http://youtu\.be/"
to="https://youtu.be/"/>
<rule from="^http://(?:www\.)?youtube-nocookie\.com/"
to="https://www.youtube-nocookie.com/"/>
<rule from="^http://([^/@:\.]+)\.googlevideo\.com/"
to="https://$1.googlevideo.com/"/>
</ruleset>

View File

@ -1,77 +0,0 @@
'''
searx is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
searx is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with searx. If not, see < http://www.gnu.org/licenses/ >.
(C) 2013- by Adam Tauber, <asciimoo@gmail.com>
'''
# list of language codes
language_codes = (
("ar_XA", "Arabic", "Arabia"),
("bg_BG", "Bulgarian", "Bulgaria"),
("cs_CZ", "Czech", "Czech Republic"),
("de_DE", "German", "Germany"),
("da_DK", "Danish", "Denmark"),
("de_AT", "German", "Austria"),
("de_CH", "German", "Switzerland"),
("el_GR", "Greek", "Greece"),
("en_AU", "English", "Australia"),
("en_CA", "English", "Canada"),
("en_GB", "English", "United Kingdom"),
("en_ID", "English", "Indonesia"),
("en_IE", "English", "Ireland"),
("en_IN", "English", "India"),
("en_MY", "English", "Malaysia"),
("en_NZ", "English", "New Zealand"),
("en_PH", "English", "Philippines"),
("en_SG", "English", "Singapore"),
("en_US", "English", "United States"),
("en_XA", "English", "Arabia"),
("en_ZA", "English", "South Africa"),
("es_AR", "Spanish", "Argentina"),
("es_CL", "Spanish", "Chile"),
("es_ES", "Spanish", "Spain"),
("es_MX", "Spanish", "Mexico"),
("es_US", "Spanish", "United States"),
("es_XL", "Spanish", "Latin America"),
("et_EE", "Estonian", "Estonia"),
("fi_FI", "Finnish", "Finland"),
("fr_BE", "French", "Belgium"),
("fr_CA", "French", "Canada"),
("fr_CH", "French", "Switzerland"),
("fr_FR", "French", "France"),
("he_IL", "Hebrew", "Israel"),
("hr_HR", "Croatian", "Croatia"),
("hu_HU", "Hungarian", "Hungary"),
("it_IT", "Italian", "Italy"),
("ja_JP", "Japanese", "Japan"),
("ko_KR", "Korean", "Korea"),
("lt_LT", "Lithuanian", "Lithuania"),
("lv_LV", "Latvian", "Latvia"),
("nb_NO", "Norwegian", "Norway"),
("nl_BE", "Dutch", "Belgium"),
("nl_NL", "Dutch", "Netherlands"),
("pl_PL", "Polish", "Poland"),
("pt_BR", "Portuguese", "Brazil"),
("pt_PT", "Portuguese", "Portugal"),
("ro_RO", "Romanian", "Romania"),
("ru_RU", "Russian", "Russia"),
("sk_SK", "Slovak", "Slovak Republic"),
("sl_SL", "Slovenian", "Slovenia"),
("sv_SE", "Swedish", "Sweden"),
("th_TH", "Thai", "Thailand"),
("tr_TR", "Turkish", "Turkey"),
("uk_UA", "Ukrainian", "Ukraine"),
("zh_CN", "Chinese", "China"),
("zh_HK", "Chinese", "Hong Kong SAR"),
("zh_TW", "Chinese", "Taiwan"))

View File

@ -1,61 +0,0 @@
import requests
the_http_adapter = requests.adapters.HTTPAdapter(pool_connections=100)
the_https_adapter = requests.adapters.HTTPAdapter(pool_connections=100)
class SessionSinglePool(requests.Session):
def __init__(self):
global the_https_adapter, the_http_adapter
super(SessionSinglePool, self).__init__()
# reuse the same adapters
self.adapters.clear()
self.mount('https://', the_https_adapter)
self.mount('http://', the_http_adapter)
def close(self):
"""Call super, but clear adapters since there are managed globaly"""
self.adapters.clear()
super(SessionSinglePool, self).close()
def request(method, url, **kwargs):
"""same as requests/requests/api.py request(...) except it use SessionSinglePool"""
session = SessionSinglePool()
response = session.request(method=method, url=url, **kwargs)
session.close()
return response
def get(url, **kwargs):
kwargs.setdefault('allow_redirects', True)
return request('get', url, **kwargs)
def options(url, **kwargs):
kwargs.setdefault('allow_redirects', True)
return request('options', url, **kwargs)
def head(url, **kwargs):
kwargs.setdefault('allow_redirects', False)
return request('head', url, **kwargs)
def post(url, data=None, **kwargs):
return request('post', url, data=data, **kwargs)
def put(url, data=None, **kwargs):
return request('put', url, data=data, **kwargs)
def patch(url, data=None, **kwargs):
return request('patch', url, data=data, **kwargs)
def delete(url, **kwargs):
return request('delete', url, **kwargs)
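The module above shares a single HTTP and a single HTTPS adapter across all sessions, so every outgoing engine request reuses the same connection pools instead of opening a new pool per search. The wrappers mirror the requests API; an illustrative usage, where the URL, parameters and timeout are arbitrary example values rather than searx defaults:

# each call builds a throw-away SessionSinglePool, but the underlying
# HTTPAdapter connection pools persist between calls
response = get('https://duckduckgo.com/html/', params={'q': 'searx'}, timeout=5.0)
print(response.status_code)  # e.g. 200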

View File

@ -1,132 +0,0 @@
#!/usr/bin/env python
'''
searx is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
searx is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with searx. If not, see < http://www.gnu.org/licenses/ >.
(C) 2014 by Thomas Pointhuber, <thomas.pointhuber@gmx.at>
'''
from searx.languages import language_codes
from searx.engines import (
categories, engines, engine_shortcuts
)
import string
import re
class Query(object):
"""parse query"""
def __init__(self, query, blocked_engines):
self.query = query
self.blocked_engines = []
if blocked_engines:
self.blocked_engines = blocked_engines
self.query_parts = []
self.engines = []
self.languages = []
self.specific = False
# parse query, if tags are set, which
    # change the search engine or search-language
def parse_query(self):
self.query_parts = []
# split query, including whitespaces
raw_query_parts = re.split(r'(\s+)', self.query)
parse_next = True
for query_part in raw_query_parts:
if not parse_next:
self.query_parts[-1] += query_part
continue
parse_next = False
            # part only contains spaces, skip
if query_part.isspace()\
or query_part == '':
parse_next = True
self.query_parts.append(query_part)
continue
            # this forces a language
if query_part[0] == ':':
lang = query_part[1:].lower()
# check if any language-code is equal with
# declared language-codes
for lc in language_codes:
lang_id, lang_name, country = map(str.lower, lc)
# if correct language-code is found
# set it as new search-language
if lang == lang_id\
or lang_id.startswith(lang)\
or lang == lang_name\
or lang.replace('_', ' ') == country:
parse_next = True
self.languages.append(lang)
break
            # this forces an engine or category
if query_part[0] == '!' or query_part[0] == '?':
prefix = query_part[1:].replace('_', ' ')
# check if prefix is equal with engine shortcut
if prefix in engine_shortcuts:
parse_next = True
self.engines.append({'category': 'none',
'name': engine_shortcuts[prefix]})
# check if prefix is equal with engine name
elif prefix in engines:
parse_next = True
self.engines.append({'category': 'none',
'name': prefix})
            # check if prefix is equal with category name
elif prefix in categories:
# using all engines for that search, which
                # are declared under that category name
parse_next = True
self.engines.extend({'category': prefix,
'name': engine.name}
for engine in categories[prefix]
if (engine.name, prefix) not in self.blocked_engines)
if query_part[0] == '!':
self.specific = True
# append query part to query_part list
self.query_parts.append(query_part)
def changeSearchQuery(self, search_query):
if len(self.query_parts):
self.query_parts[-1] = search_query
else:
self.query_parts.append(search_query)
def getSearchQuery(self):
if len(self.query_parts):
return self.query_parts[-1]
else:
return ''
def getFullQuery(self):
        # get full query including whitespaces
return string.join(self.query_parts, '')
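
A rough usage sketch of the parser above; it assumes a searx checkout whose settings.yml maps the 'ddg' shortcut to duckduckgo, and the printed outputs are indicative only.

# Illustrative only; shortcut names ('ddg') come from settings.yml at runtime.
from searx.query import Query

query = Query('!ddg :fr paris metro', blocked_engines=[])
query.parse_query()

print(query.getSearchQuery())   # 'paris metro' (the non-tag part of the query)
print(query.languages)          # ['fr']
print(query.engines)            # e.g. [{'category': 'none', 'name': 'duckduckgo'}]
print(query.specific)           # True, because a '!' bang was used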


@ -1,556 +0,0 @@
'''
searx is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
searx is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with searx. If not, see < http://www.gnu.org/licenses/ >.
(C) 2013- by Adam Tauber, <asciimoo@gmail.com>
'''
import threading
import re
import searx.poolrequests as requests_lib
from itertools import izip_longest, chain
from operator import itemgetter
from Queue import Queue
from time import time
from urlparse import urlparse, unquote
from searx.engines import (
categories, engines
)
from searx.languages import language_codes
from searx.utils import gen_useragent, get_blocked_engines
from searx.query import Query
from searx import logger
logger = logger.getChild('search')
number_of_searches = 0
def search_request_wrapper(fn, url, engine_name, **kwargs):
try:
return fn(url, **kwargs)
except:
# increase errors stats
engines[engine_name].stats['errors'] += 1
# print engine name and specific error message
logger.exception('engine crash: {0}'.format(engine_name))
return
def threaded_requests(requests):
timeout_limit = max(r[2]['timeout'] for r in requests)
search_start = time()
for fn, url, request_args, engine_name in requests:
request_args['timeout'] = timeout_limit
th = threading.Thread(
target=search_request_wrapper,
args=(fn, url, engine_name),
kwargs=request_args,
name='search_request',
)
th._engine_name = engine_name
th.start()
for th in threading.enumerate():
if th.name == 'search_request':
remaining_time = max(0.0, timeout_limit - (time() - search_start))
th.join(remaining_time)
if th.isAlive():
logger.warning('engine timeout: {0}'.format(th._engine_name))
# get default request parameters
def default_request_params():
return {
'method': 'GET',
'headers': {},
'data': {},
'url': '',
'cookies': {},
'verify': True
}
# create a callback wrapper for the search engine results
def make_callback(engine_name, results_queue, callback, params):
# creating a callback wrapper for the search engine results
def process_callback(response, **kwargs):
# check if redirect comparing to the True value,
        # because resp can be a Mock object, and any attribute name returns something.
if response.is_redirect is True:
logger.debug('{0} redirect on: {1}'.format(engine_name, response))
return
response.search_params = params
timeout_overhead = 0.2 # seconds
search_duration = time() - params['started']
timeout_limit = engines[engine_name].timeout + timeout_overhead
if search_duration > timeout_limit:
engines[engine_name].stats['page_load_time'] += timeout_limit
engines[engine_name].stats['errors'] += 1
return
# callback
search_results = callback(response)
# add results
for result in search_results:
result['engine'] = engine_name
results_queue.put_nowait((engine_name, search_results))
# update stats with current page-load-time
engines[engine_name].stats['page_load_time'] += search_duration
return process_callback
# return the meaningful length of the content for a result
def content_result_len(content):
if isinstance(content, basestring):
content = re.sub('[,;:!?\./\\\\ ()-_]', '', content)
return len(content)
else:
return 0
# score results and remove duplications
def score_results(results):
# calculate scoring parameters
flat_res = filter(
None, chain.from_iterable(izip_longest(*results.values())))
flat_len = len(flat_res)
engines_len = len(results)
results = []
# pass 1: deduplication + scoring
for i, res in enumerate(flat_res):
res['parsed_url'] = urlparse(res['url'])
res['host'] = res['parsed_url'].netloc
if res['host'].startswith('www.'):
res['host'] = res['host'].replace('www.', '', 1)
res['engines'] = [res['engine']]
weight = 1.0
        # strip multiple spaces and carriage returns from content
if res.get('content'):
res['content'] = re.sub(' +', ' ',
res['content'].strip().replace('\n', ''))
# get weight of this engine if possible
if hasattr(engines[res['engine']], 'weight'):
weight = float(engines[res['engine']].weight)
# calculate score for that engine
score = int((flat_len - i) / engines_len) * weight + 1
# check for duplicates
duplicated = False
for new_res in results:
# remove / from the end of the url if required
p1 = res['parsed_url'].path[:-1]\
if res['parsed_url'].path.endswith('/')\
else res['parsed_url'].path
p2 = new_res['parsed_url'].path[:-1]\
if new_res['parsed_url'].path.endswith('/')\
else new_res['parsed_url'].path
# check if that result is a duplicate
if res['host'] == new_res['host'] and\
unquote(p1) == unquote(p2) and\
res['parsed_url'].query == new_res['parsed_url'].query and\
res.get('template') == new_res.get('template'):
duplicated = new_res
break
# merge duplicates together
if duplicated:
# using content with more text
if content_result_len(res.get('content', '')) >\
content_result_len(duplicated.get('content', '')):
duplicated['content'] = res['content']
# increase result-score
duplicated['score'] += score
# add engine to list of result-engines
duplicated['engines'].append(res['engine'])
# using https if possible
if duplicated['parsed_url'].scheme == 'https':
continue
elif res['parsed_url'].scheme == 'https':
duplicated['url'] = res['parsed_url'].geturl()
duplicated['parsed_url'] = res['parsed_url']
# if there is no duplicate found, append result
else:
res['score'] = score
results.append(res)
results = sorted(results, key=itemgetter('score'), reverse=True)
# pass 2 : group results by category and template
gresults = []
categoryPositions = {}
for i, res in enumerate(results):
# FIXME : handle more than one category per engine
category = engines[res['engine']].categories[0] + ':' + ''\
if 'template' not in res\
else res['template']
current = None if category not in categoryPositions\
else categoryPositions[category]
# group with previous results using the same category
# if the group can accept more result and is not too far
# from the current position
if current is not None and (current['count'] > 0)\
and (len(gresults) - current['index'] < 20):
# group with the previous results using
# the same category with this one
index = current['index']
gresults.insert(index, res)
# update every index after the current one
# (including the current one)
for k in categoryPositions:
v = categoryPositions[k]['index']
if v >= index:
categoryPositions[k]['index'] = v+1
# update this category
current['count'] -= 1
else:
# same category
gresults.append(res)
# update categoryIndex
categoryPositions[category] = {'index': len(gresults), 'count': 8}
# return gresults
return gresults
def merge_two_infoboxes(infobox1, infobox2):
if 'urls' in infobox2:
urls1 = infobox1.get('urls', None)
if urls1 is None:
urls1 = []
infobox1.set('urls', urls1)
urlSet = set()
for url in infobox1.get('urls', []):
urlSet.add(url.get('url', None))
for url in infobox2.get('urls', []):
if url.get('url', None) not in urlSet:
urls1.append(url)
if 'attributes' in infobox2:
attributes1 = infobox1.get('attributes', None)
if attributes1 is None:
attributes1 = []
infobox1.set('attributes', attributes1)
attributeSet = set()
for attribute in infobox1.get('attributes', []):
if attribute.get('label', None) not in attributeSet:
attributeSet.add(attribute.get('label', None))
for attribute in infobox2.get('attributes', []):
attributes1.append(attribute)
if 'content' in infobox2:
content1 = infobox1.get('content', None)
content2 = infobox2.get('content', '')
if content1 is not None:
if content_result_len(content2) > content_result_len(content1):
infobox1['content'] = content2
else:
infobox1.set('content', content2)
def merge_infoboxes(infoboxes):
results = []
infoboxes_id = {}
for infobox in infoboxes:
add_infobox = True
infobox_id = infobox.get('id', None)
if infobox_id is not None:
existingIndex = infoboxes_id.get(infobox_id, None)
if existingIndex is not None:
merge_two_infoboxes(results[existingIndex], infobox)
add_infobox = False
if add_infobox:
results.append(infobox)
infoboxes_id[infobox_id] = len(results)-1
return results
class Search(object):
"""Search information container"""
def __init__(self, request):
# init vars
super(Search, self).__init__()
self.query = None
self.engines = []
self.categories = []
self.paging = False
self.pageno = 1
self.lang = 'all'
# set blocked engines
self.blocked_engines = get_blocked_engines(engines, request.cookies)
self.results = []
self.suggestions = []
self.answers = []
self.infoboxes = []
self.request_data = {}
# set specific language if set
if request.cookies.get('language')\
and request.cookies['language'] in (x[0] for x in language_codes):
self.lang = request.cookies['language']
# set request method
if request.method == 'POST':
self.request_data = request.form
else:
self.request_data = request.args
# TODO better exceptions
if not self.request_data.get('q'):
raise Exception('noquery')
# set pagenumber
pageno_param = self.request_data.get('pageno', '1')
if not pageno_param.isdigit() or int(pageno_param) < 1:
raise Exception('wrong pagenumber')
self.pageno = int(pageno_param)
# parse query, if tags are set, which change
        # the search engine or search-language
query_obj = Query(self.request_data['q'], self.blocked_engines)
query_obj.parse_query()
# set query
self.query = query_obj.getSearchQuery()
# get last selected language in query, if possible
        # TODO support search with multiple languages
if len(query_obj.languages):
self.lang = query_obj.languages[-1]
self.engines = query_obj.engines
self.categories = []
# if engines are calculated from query,
# set categories by using that informations
if self.engines and query_obj.specific:
self.categories = list(set(engine['category']
for engine in self.engines))
# otherwise, using defined categories to
# calculate which engines should be used
else:
# set used categories
for pd_name, pd in self.request_data.items():
if pd_name.startswith('category_'):
category = pd_name[9:]
# if category is not found in list, skip
if category not in categories:
continue
if pd != 'off':
# add category to list
self.categories.append(category)
elif category in self.categories:
# remove category from list if property is set to 'off'
self.categories.remove(category)
# if no category is specified for this search,
            # use the user-defined default configuration
            # (which is stored in a cookie)
if not self.categories:
cookie_categories = request.cookies.get('categories', '')
cookie_categories = cookie_categories.split(',')
for ccateg in cookie_categories:
if ccateg in categories:
self.categories.append(ccateg)
# if still no category is specified, using general
# as default-category
if not self.categories:
self.categories = ['general']
# using all engines for that search, which are
# declared under the specific categories
for categ in self.categories:
self.engines.extend({'category': categ,
'name': engine.name}
for engine in categories[categ]
if (engine.name, categ) not in self.blocked_engines)
# do search-request
def search(self, request):
global number_of_searches
# init vars
requests = []
results_queue = Queue()
results = {}
suggestions = set()
answers = set()
infoboxes = []
# increase number of searches
number_of_searches += 1
# set default useragent
# user_agent = request.headers.get('User-Agent', '')
user_agent = gen_useragent()
        # start search-request for all selected engines
for selected_engine in self.engines:
if selected_engine['name'] not in engines:
continue
engine = engines[selected_engine['name']]
# if paging is not supported, skip
if self.pageno > 1 and not engine.paging:
continue
# if search-language is set and engine does not
# provide language-support, skip
if self.lang != 'all' and not engine.language_support:
continue
# set default request parameters
request_params = default_request_params()
request_params['headers']['User-Agent'] = user_agent
request_params['category'] = selected_engine['category']
request_params['started'] = time()
request_params['pageno'] = self.pageno
request_params['language'] = self.lang
try:
# 0 = None, 1 = Moderate, 2 = Strict
request_params['safesearch'] = int(request.cookies.get('safesearch', 1))
except ValueError:
request_params['safesearch'] = 1
# update request parameters dependent on
# search-engine (contained in engines folder)
engine.request(self.query.encode('utf-8'), request_params)
if request_params['url'] is None:
# TODO add support of offline engines
pass
# create a callback wrapper for the search engine results
callback = make_callback(
selected_engine['name'],
results_queue,
engine.response,
request_params)
            # create dictionary which contains all
            # information about the request
request_args = dict(
headers=request_params['headers'],
hooks=dict(response=callback),
cookies=request_params['cookies'],
timeout=engine.timeout,
verify=request_params['verify']
)
# specific type of request (GET or POST)
if request_params['method'] == 'GET':
req = requests_lib.get
else:
req = requests_lib.post
request_args['data'] = request_params['data']
# ignoring empty urls
if not request_params['url']:
continue
# append request to list
requests.append((req, request_params['url'],
request_args,
selected_engine['name']))
if not requests:
return results, suggestions, answers, infoboxes
# send all search-request
threaded_requests(requests)
while not results_queue.empty():
engine_name, engine_results = results_queue.get_nowait()
# TODO type checks
[suggestions.add(x['suggestion'])
for x in list(engine_results)
if 'suggestion' in x
and engine_results.remove(x) is None]
[answers.add(x['answer'])
for x in list(engine_results)
if 'answer' in x
and engine_results.remove(x) is None]
infoboxes.extend(x for x in list(engine_results)
if 'infobox' in x
and engine_results.remove(x) is None)
results[engine_name] = engine_results
# update engine-specific stats
for engine_name, engine_results in results.items():
engines[engine_name].stats['search_count'] += 1
engines[engine_name].stats['result_count'] += len(engine_results)
# score results and remove duplications
results = score_results(results)
# merge infoboxes according to their ids
infoboxes = merge_infoboxes(infoboxes)
# update engine stats, using calculated score
for result in results:
for res_engine in result['engines']:
engines[result['engine']]\
.stats['score_count'] += result['score']
# return results, suggestions, answers and infoboxes
return results, suggestions, answers, infoboxes
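
The ranking in pass 1 of score_results() above reduces to a single position-based formula. A worked sketch with made-up numbers:

# Worked example of: score = int((flat_len - i) / engines_len) * weight + 1
flat_len = 30        # e.g. 3 engines x 10 results, interleaved
engines_len = 3      # number of engines that returned something
weight = 1.0         # per-engine weight, defaults to 1.0

for i in (0, 1, 14, 29):
    score = int((flat_len - i) / engines_len) * weight + 1
    print('position %2d -> score %.1f' % (i, score))
# position  0 -> score 11.0
# position  1 -> score 10.0
# position 14 -> score 6.0
# position 29 -> score 1.0
# Duplicates found later simply add their score to the first occurrence.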


@ -1,57 +0,0 @@
## 500px (Images)
#
# @website https://500px.com
# @provide-api yes (https://developers.500px.com/)
#
# @using-api no
# @results HTML
# @stable no (HTML can change)
# @parse url, title, thumbnail, img_src, content
#
# @todo rewrite to api
from urllib import urlencode
from urlparse import urljoin
from lxml import html
# engine dependent config
categories = ['images']
paging = True
# search-url
base_url = 'https://500px.com'
search_url = base_url+'/search?search?page={pageno}&type=photos&{query}'
# do search-request
def request(query, params):
params['url'] = search_url.format(pageno=params['pageno'],
query=urlencode({'q': query}))
return params
# get response from search-request
def response(resp):
results = []
dom = html.fromstring(resp.text)
# parse results
for result in dom.xpath('//div[@class="photo"]'):
link = result.xpath('.//a')[0]
url = urljoin(base_url, link.attrib.get('href'))
title = result.xpath('.//div[@class="title"]//text()')[0]
img_src = link.xpath('.//img')[0].attrib['src']
content = result.xpath('.//div[@class="info"]//text()')[0]
# append result
results.append({'url': url,
'title': title,
'img_src': img_src,
'content': content,
'template': 'images.html'})
# return results
return results
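
For the engine above, request() only fills in the URL template. A small sketch of what it produces for page 2; the query text is illustrative and the doubled 'search?search?' is taken verbatim from the engine.

# Python 2, matching the engine code above.
from urllib import urlencode

base_url = 'https://500px.com'
search_url = base_url + '/search?search?page={pageno}&type=photos&{query}'

url = search_url.format(pageno=2, query=urlencode({'q': 'mountain lake'}))
print(url)
# https://500px.com/search?search?page=2&type=photos&q=mountain+lake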


@ -21,12 +21,18 @@ import re
 # engine dependent config
 categories = ['images']
 paging = True
+safesearch = True
 
 # search-url
 base_url = 'https://www.bing.com/'
 search_string = 'images/search?{query}&count=10&first={offset}'
 thumb_url = "http://ts1.mm.bing.net/th?id={ihk}"
 
+# safesearch definitions
+safesearch_types = {2: 'STRICT',
+                    1: 'DEMOTE',
+                    0: 'OFF'}
+
 
 # do search-request
 def request(query, params):
@ -43,7 +49,8 @@ def request(query, params):
                                       offset=offset)
 
-    params['cookies']['SRCHHPGUSR'] = \
-        'NEWWND=0&NRSLT=-1&SRCHLANG=' + language.split('-')[0]
+    params['cookies']['SRCHHPGUSR'] = \
+        'NEWWND=0&NRSLT=-1&SRCHLANG=' + language.split('-')[0] +\
+        '&ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE')
 
     params['url'] = base_url + search_path
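
The new ADLT flag is easiest to see by expanding the cookie string for each safesearch level. A small sketch; the language value is illustrative.

# Expanded SRCHHPGUSR cookie for each level (0 = None, 1 = Moderate, 2 = Strict);
# 'en-US' stands in for the language searx would pass along.
safesearch_types = {2: 'STRICT',
                    1: 'DEMOTE',
                    0: 'OFF'}
language = 'en-US'

for level in (0, 1, 2):
    cookie = 'NEWWND=0&NRSLT=-1&SRCHLANG=' + language.split('-')[0] +\
        '&ADLT=' + safesearch_types.get(level, 'DEMOTE')
    print(cookie)
# NEWWND=0&NRSLT=-1&SRCHLANG=en&ADLT=OFF
# NEWWND=0&NRSLT=-1&SRCHLANG=en&ADLT=DEMOTE
# NEWWND=0&NRSLT=-1&SRCHLANG=en&ADLT=STRICT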


@ -13,12 +13,9 @@ def request(query, params):
     if not m:
         # wrong query
         return params
 
-    try:
-        ammount, from_currency, to_currency = m.groups()
-        ammount = float(ammount)
-    except:
-        # wrong params
-        return params
+    ammount, from_currency, to_currency = m.groups()
+    ammount = float(ammount)
 
     q = (from_currency + to_currency).upper()
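
The dropped try/except relies on the match object only existing when the groups are well-formed. The pattern below is a simplified stand-in for the engine's parser_re, which is not part of this hunk.

import re

# Simplified stand-in for parser_re; the real pattern lives outside this hunk.
parser_re = re.compile(r'^(\d+(?:\.\d+)?) ([a-z]{3}) in ([a-z]{3})$', re.I)

m = parser_re.match('100 usd in eur')
ammount, from_currency, to_currency = m.groups()   # spelling as in the engine
ammount = float(ammount)                           # cannot fail: group 1 is numeric
q = (from_currency + to_currency).upper()
print('%s %s' % (ammount, q))                      # 100.0 USDEUR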


@ -15,7 +15,7 @@
 from urllib import urlencode
 from lxml.html import fromstring
-from searx.utils import html_to_text
+from searx.engines.xpath import extract_text
 
 # engine dependent config
 categories = ['general']
@ -28,8 +28,8 @@ url = 'https://duckduckgo.com/html?{query}&s={offset}'
 
 # specific xpath variables
 result_xpath = '//div[@class="results_links results_links_deep web-result"]'  # noqa
 url_xpath = './/a[@class="large"]/@href'
-title_xpath = './/a[@class="large"]//text()'
-content_xpath = './/div[@class="snippet"]//text()'
+title_xpath = './/a[@class="large"]'
+content_xpath = './/div[@class="snippet"]'
 
 
 # do search-request
@ -64,8 +64,8 @@ def response(resp):
         if not res_url:
             continue
 
-        title = html_to_text(''.join(r.xpath(title_xpath)))
-        content = html_to_text(''.join(r.xpath(content_xpath)))
+        title = extract_text(r.xpath(title_xpath))
+        content = extract_text(r.xpath(content_xpath))
 
         # append result
         results.append({'title': title,
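
The move from html_to_text to extract_text shifts extraction from joined text nodes to whole elements. A rough illustration with plain lxml follows; the searx helpers themselves are only paraphrased here, not copied.

from lxml import html

r = html.fromstring('<div class="snippet">Result <b>snippet</b> text</div>')

# old style: select text nodes with '...//text()' and join them into a string
old_text = ''.join(r.xpath('.//text()'))

# new style: select the element itself and take its text content, which is
# roughly what extract_text() does for an element result
new_text = r.text_content()

print(old_text)   # Result snippet text
print(new_text)   # Result snippet text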


@ -25,9 +25,10 @@ def request(query, params):
 def response(resp):
-    search_res = json.loads(resp.text)
     results = []
 
+    search_res = json.loads(resp.text)
+
     content = ''
     heading = search_res.get('Heading', '')
     attributes = []
@ -68,7 +69,7 @@ def response(resp):
         results.append({'title': heading, 'url': firstURL})
 
     # related topics
-    for ddg_result in search_res.get('RelatedTopics', None):
+    for ddg_result in search_res.get('RelatedTopics', []):
         if 'FirstURL' in ddg_result:
             suggestion = result_to_text(ddg_result.get('FirstURL', None),
                                         ddg_result.get('Text', None),
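
The changed default for 'RelatedTopics' matters when the key is missing from the DuckDuckGo response. A minimal reproduction:

search_res = {'Heading': 'Example'}              # no 'RelatedTopics' key at all

for ddg_result in search_res.get('RelatedTopics', []):
    pass                                         # new default: loop simply skips

try:
    for ddg_result in search_res.get('RelatedTopics', None):
        pass
except TypeError as exc:
    print(exc)                                   # 'NoneType' object is not iterable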


@ -37,7 +37,7 @@ search_category = {'general': 'web',
 # do search-request
 def request(query, params):
-    offset = (params['pageno']-1) * number_of_results + 1
+    offset = (params['pageno'] - 1) * number_of_results + 1
     categorie = search_category.get(params['category'], 'web')
 
     if params['language'] == 'all':
@ -45,11 +45,11 @@ def request(query, params):
     else:
         language = params['language'].split('_')[0]
 
-    # skip, if language is not supported
+    # if language is not supported, put it in english
     if language != 'en' and\
        language != 'de' and\
        language != 'zh':
-        return params
+        language = 'en'
 
     params['url'] = search_url.format(offset=offset,
                                       number_of_results=number_of_results,
@ -69,12 +69,10 @@ def response(resp):
     # HTTP-Code 401: api-key is not valide
     if resp.status_code == 401:
         raise Exception("API key is not valide")
-        return []
 
     # HTTP-Code 429: rate limit exceeded
     if resp.status_code == 429:
         raise Exception("rate limit has been exceeded!")
-        return []
 
     results = []
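
The language handling now falls back instead of aborting the request. A small sketch of the new behaviour; the pick_language helper is illustrative, and the 'all' branch is assumed to default to English as in the unchanged part of the engine.

def pick_language(param_language):
    # 'all' handling is outside this hunk; English is assumed as its default.
    if param_language == 'all':
        return 'en'
    language = param_language.split('_')[0]
    if language not in ('en', 'de', 'zh'):
        language = 'en'        # new behaviour: fall back instead of skipping
    return language

print(pick_language('fr_FR'))  # en (the old code returned params unchanged here)
print(pick_language('de_DE'))  # de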


@ -1,95 +0,0 @@
#!/usr/bin/env python
# Flickr (Images)
#
# @website https://www.flickr.com
# @provide-api yes (https://secure.flickr.com/services/api/flickr.photos.search.html)
#
# @using-api no
# @results HTML
# @stable no
# @parse url, title, thumbnail, img_src
from urllib import urlencode
from json import loads
import re
categories = ['images']
url = 'https://secure.flickr.com/'
search_url = url+'search/?{query}&page={page}'
photo_url = 'https://www.flickr.com/photos/{userid}/{photoid}'
regex = re.compile(r"\"search-photos-models\",\"photos\":(.*}),\"totalItems\":", re.DOTALL)
image_sizes = ('o', 'k', 'h', 'b', 'c', 'z', 'n', 'm', 't', 'q', 's')
paging = True
def build_flickr_url(user_id, photo_id):
return photo_url.format(userid=user_id, photoid=photo_id)
def request(query, params):
params['url'] = search_url.format(query=urlencode({'text': query}),
page=params['pageno'])
return params
def response(resp):
results = []
matches = regex.search(resp.text)
if matches is None:
return results
match = matches.group(1)
search_results = loads(match)
if '_data' not in search_results:
return []
photos = search_results['_data']
for photo in photos:
# In paged configuration, the first pages' photos
# are represented by a None object
if photo is None:
continue
img_src = None
# From the biggest to the lowest format
for image_size in image_sizes:
if image_size in photo['sizes']:
img_src = photo['sizes'][image_size]['displayUrl']
break
if not img_src:
continue
if 'id' not in photo['owner']:
continue
url = build_flickr_url(photo['owner']['id'], photo['id'])
title = photo.get('title', '')
content = '<span class="photo-author">' +\
photo['owner']['username'] +\
'</span><br />'
if 'description' in photo:
content = content +\
'<span class="description">' +\
photo['description'] +\
'</span>'
# append result
results.append({'url': url,
'title': title,
'img_src': img_src,
'content': content,
'template': 'images.html'})
return results
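
The size selection above always prefers the largest available format. A minimal sketch with a made-up photo record:

image_sizes = ('o', 'k', 'h', 'b', 'c', 'z', 'n', 'm', 't', 'q', 's')

# Made-up photo record; the real one comes from Flickr's embedded JSON.
photo = {'sizes': {'z': {'displayUrl': 'https://example.org/z.jpg'},
                   'q': {'displayUrl': 'https://example.org/q.jpg'}}}

img_src = None
for image_size in image_sizes:        # ordered from biggest to smallest
    if image_size in photo['sizes']:
        img_src = photo['sizes'][image_size]['displayUrl']
        break

print(img_src)                        # https://example.org/z.jpg ('z' beats 'q')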
