Adrien Beudin 2015-12-29 03:26:27 -05:00
parent f4b734aba4
commit fbde09444f
@ -3,13 +3,13 @@ general:
safe_search : 0 # Filter results. 0: None, 1: Moderate, 2: Strict
autocomplete : "duckduckgo" # Existing autocomplete backends: "dbpedia", "duckduckgo", "google", "startpage", "wikipedia" - leave blank to turn it off by default
autocomplete : "" # Existing autocomplete backends: "dbpedia", "duckduckgo", "google", "startpage", "wikipedia" - leave blank to turn it off by default
port : 8888
bind_address : "" # address to listen on
secret_key : "ultrasecretkey" # change this!
base_url : ynhbaseurl # Set custom base_url. Possible values: False or "https://your.custom.host/location/"
base_url : False # Set custom base_url. Possible values: False or "https://your.custom.host/location/"
image_proxy : False # Proxying image results through searx
@ -274,6 +274,11 @@ engines:
engine : yahoo
shortcut : yh
- name : yandex
engine : yandex
shortcut : yn
disabled : True
- name : yahoo news
engine : yahoo_news
shortcut : yhn
@ -311,7 +316,7 @@ engines:
en : English
de : Deutsch
he : Hebrew
he : עברית
hu : Magyar
fr : Français
es : Español

@ -38,3 +38,6 @@ generally made searx better:
- Niklas Haas
- @underr
- Emmanuel Benazera
- @GreenLunar
- Noemi Vanyi
- Kang-min Liu

@ -1,3 +1,23 @@
0.8.1 2015.12.22
- More efficient result parsing
- Rewritten google engine to prevent app crashes
- Other engine fixes/tweaks
- Bing news
- Btdigg
- Gigablast
- Google images
- Startpage
New documentation page is available: https://asciimoo.github.io/searx
0.8.0 2015.09.08
@ -44,6 +64,7 @@ News
@dalf joined the maintainer "team"
0.7.0 2015.02.03

@ -60,7 +60,7 @@ locales:
@pybabel compile -d searx/translations
@rm -rf .installed.cfg .mr.developer.cfg bin parts develop-eggs \
searx.egg-info lib include .coverage coverage searx/static/themes/default/css/*.css
@rm -rf .installed.cfg .mr.developer.cfg bin parts develop-eggs eggs \
searx.egg-info lib include .coverage coverage
.PHONY: all tests robot flake8 coverage production minimal styles locales clean

@ -96,7 +96,7 @@ remember 'untested code is broken code'.
Runs robot (Selenium) tests, you must have ``firefox`` installed because
these functional tests actually run the browser and perform operations on
it. Also searx is executed with
`settings\_robot <https://github.com/asciimoo/searx/blob/master/searx/settings_robot.py>`__.
`settings\_robot <https://github.com/asciimoo/searx/blob/master/searx/settings_robot.yml>`__.
``make flake8``

@ -1,16 +1,10 @@
extends = versions.cfg
versions = versions
unzip = true
newest = false
extends = versions.cfg
versions = versions
prefer-final = true
develop = .
extensions =
eggs =

@ -18,75 +18,17 @@ The script accepts buildout command-line options, so you can
use the -c option to specify an alternate configuration file.
import os, shutil, sys, tempfile, urllib, urllib2, subprocess
import os
import shutil
import sys
import tempfile
from optparse import OptionParser
if sys.platform == 'win32':
def quote(c):
if ' ' in c:
return '"%s"' % c # work around spawn lamosity on windows
return c
quote = str
__version__ = '2015-07-01'
# See zc.buildout's changelog if this version is up to date.
# See zc.buildout.easy_install._has_broken_dash_S for motivation and comments.
stdout, stderr = subprocess.Popen(
[sys.executable, '-Sc',
' import ConfigParser\n'
'except ImportError:\n'
' print 1\n'
' print 0\n'],
stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
has_broken_dash_S = bool(int(stdout.strip()))
# In order to be more robust in the face of system Pythons, we want to
# run without site-packages loaded. This is somewhat tricky, in
# particular because Python 2.6's distutils imports site, so starting
# with the -S flag is not sufficient. However, we'll start with that:
if not has_broken_dash_S and 'site' in sys.modules:
# We will restart with python -S.
args = sys.argv[:]
args[0:0] = [sys.executable, '-S']
args = map(quote, args)
os.execv(sys.executable, args)
# Now we are running with -S. We'll get the clean sys.path, import site
# because distutils will do it later, and then reset the path and clean
# out any namespace packages from site-packages that might have been
# loaded by .pth files.
clean_path = sys.path[:]
import site # imported because of its side effects
sys.path[:] = clean_path
for k, v in sys.modules.items():
if k in ('setuptools', 'pkg_resources') or (
hasattr(v, '__path__') and
len(v.__path__) == 1 and
not os.path.exists(os.path.join(v.__path__[0], '__init__.py'))):
# This is a namespace package. Remove it.
is_jython = sys.platform.startswith('java')
setuptools_source = 'http://peak.telecommunity.com/dist/ez_setup.py'
distribute_source = 'http://python-distribute.org/distribute_setup.py'
distribute_source = 'https://bitbucket.org/pypa/setuptools/raw/f657df1f1ed46596d236376649c99a470662b4ba/distribute_setup.py'
# parsing arguments
def normalize_to_url(option, opt_str, value, parser):
if value:
if '://' not in value: # It doesn't smell like a URL.
value = 'file://%s' % (
if opt_str == '--download-base' and not value.endswith('/'):
# Download base needs a trailing slash to make the world happy.
value += '/'
value = None
name = opt_str[2:].replace('-', '_')
setattr(parser.values, name, value)
tmpeggs = tempfile.mkdtemp(prefix='bootstrap-')
usage = '''\
[DESIRED PYTHON FOR BUILDOUT] bootstrap.py [options]
@ -96,31 +38,14 @@ Bootstraps a buildout-based project.
Simply run this script in a directory containing a buildout.cfg, using the
Python that you want bin/buildout to use.
Note that by using --setup-source and --download-base to point to
local resources, you can keep this script from going over the network.
Note that by using --find-links to point to local resources, you can keep
this script from going over the network.
parser = OptionParser(usage=usage)
parser.add_option("-v", "--version", dest="version",
help="use a specific zc.buildout version")
parser.add_option("-d", "--distribute",
action="store_true", dest="use_distribute", default=False,
help="Use Distribute rather than Setuptools.")
parser.add_option("--setup-source", action="callback", dest="setup_source",
callback=normalize_to_url, nargs=1, type="string",
help=("Specify a URL or file location for the setup file. "
"If you use Setuptools, this will default to " +
setuptools_source + "; if you use Distribute, this "
"will default to " + distribute_source + "."))
parser.add_option("--download-base", action="callback", dest="download_base",
callback=normalize_to_url, nargs=1, type="string",
help=("Specify a URL or directory for downloading "
"zc.buildout and either Setuptools or Distribute. "
"Defaults to PyPI."))
help=("Specify a directory for storing eggs. Defaults to "
"a temporary directory that is deleted when the "
"bootstrap script completes."))
action="store_true", default=False,
help=("Return bootstrap.py version."))
parser.add_option("-t", "--accept-buildout-test-releases",
action="store_true", default=False,
@ -130,95 +55,117 @@ parser.add_option("-t", "--accept-buildout-test-releases",
"extensions for you. If you use this flag, "
"bootstrap and buildout will get the newest releases "
"even if they are alphas or betas."))
parser.add_option("-c", None, action="store", dest="config_file",
help=("Specify the path to the buildout configuration "
"file to be used."))
parser.add_option("-c", "--config-file",
help=("Specify the path to the buildout configuration "
"file to be used."))
parser.add_option("-f", "--find-links",
help=("Specify a URL to search for buildout releases"))
action="store_true", default=False,
help=("Let bootstrap.py use existing site packages"))
help="Use a specific zc.buildout version")
help="Use a specific setuptools version")
help=("Allow for re-use of existing directory of "
"setuptools versions"))
options, args = parser.parse_args()
if options.version:
print("bootstrap.py version %s" % __version__)
if options.eggs:
eggs_dir = os.path.abspath(os.path.expanduser(options.eggs))
eggs_dir = tempfile.mkdtemp()
if options.setup_source is None:
if options.use_distribute:
options.setup_source = distribute_source
options.setup_source = setuptools_source
if options.accept_buildout_test_releases:
args.insert(0, 'buildout:accept-buildout-test-releases=true')
# load/install setuptools
import pkg_resources
import setuptools # A flag. Sometimes pkg_resources is installed alone.
if not hasattr(pkg_resources, '_distribute'):
raise ImportError
from urllib.request import urlopen
except ImportError:
ez_code = urllib2.urlopen(
options.setup_source).read().replace('\r\n', '\n')
ez = {}
exec ez_code in ez
setup_args = dict(to_dir=eggs_dir, download_delay=0)
if options.download_base:
setup_args['download_base'] = options.download_base
if options.use_distribute:
setup_args['no_fake'] = True
if sys.version_info[:2] == (2, 4):
setup_args['version'] = '0.6.32'
if 'pkg_resources' in sys.modules:
import pkg_resources
# This does not (always?) update the default working set. We will
# do it.
for path in sys.path:
if path not in pkg_resources.working_set.entries:
from urllib2 import urlopen
cmd = [quote(sys.executable),
quote('from setuptools.command.easy_install import main; main()'),
if not has_broken_dash_S:
cmd.insert(1, '-S')
find_links = options.download_base
if not find_links:
find_links = os.environ.get('bootstrap-testing-find-links')
if not find_links and options.accept_buildout_test_releases:
find_links = 'http://downloads.buildout.org/'
if find_links:
cmd.extend(['-f', quote(find_links)])
if options.use_distribute:
setup_requirement = 'distribute'
ez = {}
if os.path.exists('ez_setup.py'):
exec(open('ez_setup.py').read(), ez)
setup_requirement = 'setuptools'
exec(urlopen('https://bootstrap.pypa.io/ez_setup.py').read(), ez)
if not options.allow_site_packages:
# ez_setup imports site, which adds site packages
# this will remove them from the path to ensure that incompatible versions
# of setuptools are not in the path
import site
# inside a virtualenv, there is no 'getsitepackages'.
# We can't remove these reliably
if hasattr(site, 'getsitepackages'):
for sitepackage_path in site.getsitepackages():
# Strip all site-packages directories from sys.path that
# are not sys.prefix; this is because on Windows
# sys.prefix is a site-package directory.
if sitepackage_path != sys.prefix:
sys.path[:] = [x for x in sys.path
if sitepackage_path not in x]
setup_args = dict(to_dir=tmpeggs, download_delay=0)
if options.setuptools_version is not None:
setup_args['version'] = options.setuptools_version
if options.setuptools_to_dir is not None:
setup_args['to_dir'] = options.setuptools_to_dir
import setuptools
import pkg_resources
# This does not (always?) update the default working set. We will
# do it.
for path in sys.path:
if path not in pkg_resources.working_set.entries:
# Install buildout
ws = pkg_resources.working_set
setup_requirement_path = ws.find(
env = dict(
setuptools_path = ws.find(
# Fix sys.path here as easy_install.pth added before PYTHONPATH
cmd = [sys.executable, '-c',
'import sys; sys.path[0:0] = [%r]; ' % setuptools_path +
'from setuptools.command.easy_install import main; main()',
'-mZqNxd', tmpeggs]
find_links = os.environ.get(
options.find_links or
if options.accept_buildout_test_releases else None)
if find_links:
cmd.extend(['-f', find_links])
requirement = 'zc.buildout'
version = options.version
version = options.buildout_version
if version is None and not options.accept_buildout_test_releases:
# Figure out the most recent final version of zc.buildout.
import setuptools.package_index
_final_parts = '*final-', '*final'
def _final_version(parsed_version):
for part in parsed_version:
if (part[:1] == '*') and (part not in _final_parts):
return False
return True
return not parsed_version.is_prerelease
except AttributeError:
# Older setuptools
for part in parsed_version:
if (part[:1] == '*') and (part not in _final_parts):
return False
return True
index = setuptools.package_index.PackageIndex(
if find_links:
req = pkg_resources.Requirement.parse(requirement)
@ -227,8 +174,6 @@ if version is None and not options.accept_buildout_test_releases:
bestv = None
for dist in index[req.project_name]:
distv = dist.parsed_version
if distv >= pkg_resources.parse_version('2dev'):
if _final_version(distv):
if bestv is None or distv > bestv:
best = [dist]
@ -238,40 +183,28 @@ if version is None and not options.accept_buildout_test_releases:
if best:
version = best[-1].version
if version:
requirement += '=='+version
requirement += '<2dev'
requirement = '=='.join((requirement, version))
if is_jython:
import subprocess
exitcode = subprocess.Popen(cmd, env=env).wait()
else: # Windows prefers this, apparently; otherwise we would prefer subprocess
exitcode = os.spawnle(*([os.P_WAIT, sys.executable] + cmd + [env]))
if exitcode != 0:
print ("An error occurred when trying to install zc.buildout. "
"Look above this message for any errors that "
"were output by easy_install.")
import subprocess
if subprocess.call(cmd) != 0:
raise Exception(
"Failed to execute command:\n%s" % repr(cmd)[1:-1])
# Import and run buildout
import zc.buildout.buildout
# If there isn't already a command in the args, add bootstrap
if not [a for a in args if '=' not in a]:
# if -c was provided, we push it back into args for buildout's main function
# if -c was provided, we push it back into args for buildout's main function
if options.config_file is not None:
args[0:0] = ['-c', options.config_file]
if not options.eggs: # clean up temporary egg directory

@ -68,7 +68,7 @@ def request(query, params):
def response(resp):
results = []
rss = etree.fromstring(resp.text)
rss = etree.fromstring(resp.content)
ns = rss.nsmap

@ -38,7 +38,7 @@ def request(query, params):
def response(resp):
results = []
dom = html.fromstring(resp.text)
dom = html.fromstring(resp.content)
search_res = dom.xpath('//div[@id="search_res"]/table/tr')

@ -1,8 +1,8 @@
Gigablast (Web)
@website http://gigablast.com
@provide-api yes (http://gigablast.com/api.html)
@website https://gigablast.com
@provide-api yes (https://gigablast.com/api.html)
@using-api yes
@results XML
@ -13,6 +13,8 @@
from urllib import urlencode
from cgi import escape
from lxml import etree
from random import randint
from time import time
# engine dependent config
categories = ['general']
@ -20,8 +22,8 @@ paging = True
number_of_results = 5
# search-url, invalid HTTPS certificate
base_url = 'http://gigablast.com/'
search_string = 'search?{query}&n={number_of_results}&s={offset}&xml=1&qh=0'
base_url = 'https://gigablast.com/'
search_string = 'search?{query}&n={number_of_results}&s={offset}&format=xml&qh=0&rxiyd={rxiyd}&rand={rand}'
# specific xpath variables
results_xpath = '//response//result'
@ -37,7 +39,9 @@ def request(query, params):
search_path = search_string.format(
query=urlencode({'q': query}),
rxiyd=randint(10000, 10000000),
params['url'] = base_url + search_path

@ -9,11 +9,14 @@
# @parse url, title, content, suggestion
import re
from cgi import escape
from urllib import urlencode
from urlparse import urlparse, parse_qsl
from lxml import html
from searx.poolrequests import get
from lxml import html, etree
from searx.engines.xpath import extract_text, extract_url
from searx.search import logger
logger = logger.getChild('google engine')
# engine dependent config
@ -87,7 +90,7 @@ url_map = 'https://www.openstreetmap.org/'\
search_path = '/search'
search_url = ('https://{hostname}' +
search_path +
# other URLs
map_hostname_start = 'maps.google.'
@ -125,27 +128,6 @@ image_img_src_xpath = './img/@src'
property_address = "Address"
property_phone = "Phone number"
# cookies
pref_cookie = ''
nid_cookie = {}
# see https://support.google.com/websearch/answer/873?hl=en
def get_google_pref_cookie():
global pref_cookie
if pref_cookie == '':
resp = get('https://www.google.com/ncr', allow_redirects=False)
pref_cookie = resp.cookies["PREF"]
return pref_cookie
def get_google_nid_cookie(google_hostname):
global nid_cookie
if google_hostname not in nid_cookie:
resp = get('https://' + google_hostname)
nid_cookie[google_hostname] = resp.cookies.get("NID", None)
return nid_cookie[google_hostname]
# remove google-specific tracking-url
def parse_url(url_string, google_hostname):
@ -167,7 +149,7 @@ def parse_url(url_string, google_hostname):
def extract_text_from_dom(result, xpath):
r = result.xpath(xpath)
if len(r) > 0:
return extract_text(r[0])
return escape(extract_text(r[0]))
return None
@ -197,9 +179,6 @@ def request(query, params):
params['headers']['Accept-Language'] = language
params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
if google_hostname == default_hostname:
params['cookies']['PREF'] = get_google_pref_cookie()
params['cookies']['NID'] = get_google_nid_cookie(google_hostname)
params['google_hostname'] = google_hostname
@ -224,8 +203,8 @@ def response(resp):
# parse results
for result in dom.xpath(results_xpath):
title = extract_text(result.xpath(title_xpath)[0])
title = extract_text(result.xpath(title_xpath)[0])
url = parse_url(extract_url(result.xpath(url_xpath), google_url), google_hostname)
parsed_url = urlparse(url, google_hostname)
@ -268,12 +247,13 @@ def response(resp):
'content': content
logger.debug('result parse error in:\n%s', etree.tostring(result, pretty_print=True))
# parse suggestion
for suggestion in dom.xpath(suggestion_xpath):
# append suggestion
results.append({'suggestion': extract_text(suggestion)})
results.append({'suggestion': escape(extract_text(suggestion))})
# return results
return results

@ -2,41 +2,42 @@
Google (Images)
@website https://www.google.com
@provide-api yes (https://developers.google.com/web-search/docs/),
@provide-api yes (https://developers.google.com/custom-search/)
@using-api yes
@results JSON
@stable yes (but deprecated)
@using-api no
@results HTML chunks with JSON inside
@stable no
@parse url, title, img_src
from urllib import urlencode, unquote
from urllib import urlencode
from urlparse import parse_qs
from json import loads
from lxml import html
# engine dependent config
categories = ['images']
paging = True
safesearch = True
# search-url
url = 'https://ajax.googleapis.com/'
search_url = url + 'ajax/services/search/images?v=1.0&start={offset}&rsz=large&safe={safesearch}&filter=off&{query}'
search_url = 'https://www.google.com/search'\
# do search-request
def request(query, params):
offset = (params['pageno'] - 1) * 8
if params['safesearch'] == 0:
safesearch = 'off'
safesearch = 'on'
offset = (params['pageno'] - 1) * 100
params['url'] = search_url.format(query=urlencode({'q': query}),
if safesearch and params['safesearch']:
params['url'] += '&' + urlencode({'safe': 'active'})
return params
@ -44,29 +45,26 @@ def request(query, params):
def response(resp):
results = []
search_res = loads(resp.text)
# return empty array if there are no results
if not search_res.get('responseData', {}).get('results'):
return []
dom = html.fromstring(resp.text)
# parse results
for result in search_res['responseData']['results']:
href = result['originalContextUrl']
title = result['title']
if 'url' not in result:
thumbnail_src = result['tbUrl']
for result in dom.xpath('//div[@data-ved]'):
data_url = result.xpath('./a/@href')[0]
data_query = {k: v[0] for k, v in parse_qs(data_url.split('?', 1)[1]).iteritems()}
metadata = loads(result.xpath('./div[@class="rg_meta"]/text()')[0])
thumbnail_src = metadata['tu']
# http to https
thumbnail_src = thumbnail_src.replace("http://", "https://")
# append result
results.append({'url': href,
'title': title,
'content': result['content'],
'thumbnail_src': thumbnail_src,
'img_src': unquote(result['url']),
results.append({'url': data_query['imgrefurl'],
'title': metadata['pt'],
'content': metadata['s'],
'thumbnail_src': metadata['tu'],
'img_src': data_query['imgurl'],
'template': 'images.html'})
# return results

@ -12,6 +12,8 @@
from lxml import html
from cgi import escape
from dateutil import parser
from datetime import datetime, timedelta
import re
from searx.engines.xpath import extract_text
@ -79,15 +81,44 @@ def response(resp):
title = escape(extract_text(link))
if result.xpath('./p[@class="desc"]'):
content = escape(extract_text(result.xpath('./p[@class="desc"]')))
if result.xpath('./p[@class="desc clk"]'):
content = escape(extract_text(result.xpath('./p[@class="desc clk"]')))
content = ''
# append result
results.append({'url': url,
'title': title,
'content': content})
published_date = None
# check if search result starts with something like: "2 Sep 2014 ... "
if re.match("^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ", content):
date_pos = content.find('...')+4
date_string = content[0:date_pos-5]
published_date = parser.parse(date_string, dayfirst=True)
# fix content string
content = content[date_pos:]
# check if search result starts with something like: "5 days ago ... "
elif re.match("^[0-9]+ days? ago \.\.\. ", content):
date_pos = content.find('...')+4
date_string = content[0:date_pos-5]
# calculate datetime
published_date = datetime.now() - timedelta(days=int(re.match(r'\d+', date_string).group()))
# fix content string
content = content[date_pos:]
if published_date:
# append result
results.append({'url': url,
'title': title,
'content': content,
'publishedDate': published_date})
# append result
results.append({'url': url,
'title': title,
'content': content})
# return results
return results

@ -0,0 +1,62 @@
Yandex (Web)
@website https://yandex.ru/
@provide-api ?
@using-api no
@results HTML (using search portal)
@stable no (HTML can change)
@parse url, title, content
from urllib import urlencode
from lxml import html
from searx.search import logger
logger = logger.getChild('yandex engine')
# engine dependent config
categories = ['general']
paging = True
language_support = True # TODO
default_tld = 'com'
language_map = {'ru': 'ru',
'ua': 'uk',
'tr': 'com.tr'}
# search-url
base_url = 'https://yandex.{tld}/'
search_url = 'search/?{query}&p={page}'
results_xpath = '//div[@class="serp-item serp-item_plain_yes clearfix i-bem"]'
url_xpath = './/h2/a/@href'
title_xpath = './/h2/a//text()'
content_xpath = './/div[@class="serp-item__text"]//text()'
def request(query, params):
    """Fill in ``params['url']`` for a Yandex search and return ``params``.

    The part of ``params['language']`` before the first underscore selects
    a TLD through ``language_map``; ``default_tld`` is used when there is
    no (truthy) mapping.  The page index sent to Yandex is
    ``params['pageno'] - 1``.
    """
    language_code = params['language'].split('_')[0]
    tld = language_map.get(language_code) or default_tld
    encoded_query = urlencode({'text': query})
    path = search_url.format(page=params['pageno']-1, query=encoded_query)
    params['url'] = base_url.format(tld=tld) + path
    return params
# get response from search-request
def response(resp):
dom = html.fromstring(resp.text)
results = []
for result in dom.xpath(results_xpath):
res = {'url': result.xpath(url_xpath)[0],
'title': ''.join(result.xpath(title_xpath)),
'content': ''.join(result.xpath(content_xpath))}
logger.exception('yandex parse crash')
return results

@ -35,10 +35,10 @@ def post_search(request, ctx):
ip = x_forwarded_for[0]
ip = request.remote_addr
elif p.match(ctx['search'].query):
ua = request.user_agent
return True

View File

@ -1,5 +1,7 @@
import requests
from itertools import cycle
from threading import RLock
from searx import settings
@ -55,9 +57,10 @@ class SessionSinglePool(requests.Session):
super(SessionSinglePool, self).__init__()
# reuse the same adapters
self.mount('https://', next(https_adapters))
self.mount('http://', next(http_adapters))
with RLock():
self.mount('https://', next(https_adapters))
self.mount('http://', next(http_adapters))
def close(self):
"""Call super, but clear adapters since there are managed globaly"""
@ -67,7 +70,6 @@ class SessionSinglePool(requests.Session):
def request(method, url, **kwargs):
"""same as requests/requests/api.py request(...) except it use SessionSinglePool and force proxies"""
global settings
session = SessionSinglePool()
kwargs['proxies'] = settings['outgoing'].get('proxies', None)
response = session.request(method=method, url=url, **kwargs)

View File

@ -0,0 +1,239 @@
import re
from collections import defaultdict
from operator import itemgetter
from threading import RLock
from urlparse import urlparse, unquote
from searx.engines import engines
# Characters that do not count towards the "meaningful" length of a result's
# content: common punctuation, slash, backslash, space, parentheses, hyphen
# and underscore.  The hyphen is escaped so that it is a literal; written
# unescaped, ")-_" is a character range (0x29-0x5F) that also matches digits
# and uppercase letters, which made real text count as ignorable.
CONTENT_LEN_IGNORED_CHARS_REGEX = re.compile(r'[,;:!?./\\ ()\-_]', re.M | re.U)
# One or more consecutive spaces, tabs or newlines.
WHITESPACE_REGEX = re.compile(r'( |\t|\n)+', re.M | re.U)
# return the meaningful length of the content for a result
def result_content_len(content):
    """Return the length of ``content`` with ignorable characters removed.

    Anything that is not a string has length 0.
    """
    if not isinstance(content, basestring):
        return 0
    meaningful = CONTENT_LEN_IGNORED_CHARS_REGEX.sub('', content)
    return len(meaningful)
def compare_urls(url_a, url_b):
    """Tell whether two parsed URLs refer to the same resource.

    Both arguments are parsed URLs exposing ``netloc``, ``query`` and
    ``path``.  They match when netloc and query are equal and the paths are
    equal after dropping a single trailing slash and percent-decoding.
    The scheme is not compared.
    """
    same_location = url_a.netloc == url_b.netloc and url_a.query == url_b.query
    if not same_location:
        return False

    def _without_trailing_slash(path):
        # only one trailing slash is removed, never a run of them
        if path.endswith('/'):
            return path[:-1]
        return path

    decoded_a = unquote(_without_trailing_slash(url_a.path))
    decoded_b = unquote(_without_trailing_slash(url_b.path))
    return decoded_a == decoded_b
def merge_two_infoboxes(infobox1, infobox2):
if 'urls' in infobox2:
urls1 = infobox1.get('urls', None)
if urls1 is None:
urls1 = []
infobox1.set('urls', urls1)
urlSet = set()
for url in infobox1.get('urls', []):
urlSet.add(url.get('url', None))
for url in infobox2.get('urls', []):
if url.get('url', None) not in urlSet:
if 'attributes' in infobox2:
attributes1 = infobox1.get('attributes', None)
if attributes1 is None:
attributes1 = []
infobox1.set('attributes', attributes1)
attributeSet = set()
for attribute in infobox1.get('attributes', []):
if attribute.get('label', None) not in attributeSet:
attributeSet.add(attribute.get('label', None))
for attribute in infobox2.get('attributes', []):
if 'content' in infobox2:
content1 = infobox1.get('content', None)
content2 = infobox2.get('content', '')
if content1 is not None:
if result_content_len(content2) > result_content_len(content1):
infobox1['content'] = content2
infobox1.set('content', content2)
def result_score(result):
    """Compute the ranking score of a merged result.

    The weights of all engines listed in ``result['engines']`` (for those
    that define a ``weight`` attribute) are multiplied together.  Each entry
    of ``result['positions']`` then contributes
    ``(number of positions * weight) / position`` to the score.
    """
    combined_weight = 1.0
    for engine_name in result['engines']:
        if not hasattr(engines[engine_name], 'weight'):
            continue
        combined_weight *= float(engines[engine_name].weight)

    positions = result['positions']
    hit_count = len(positions)
    return sum((hit_count * combined_weight) / rank for rank in positions)
class ResultContainer(object):
"""docstring for ResultContainer"""
def __init__(self):
super(ResultContainer, self).__init__()
self.results = defaultdict(list)
self._merged_results = []
self.infoboxes = []
self._infobox_ids = {}
self.suggestions = set()
self.answers = set()
def extend(self, engine_name, results):
for result in list(results):
if 'suggestion' in result:
elif 'answer' in result:
elif 'infobox' in result:
with RLock():
engines[engine_name].stats['search_count'] += 1
engines[engine_name].stats['result_count'] += len(results)
if not results:
for i, result in enumerate(results):
position = i + 1
self._merge_result(result, position)
def _merge_infobox(self, infobox):
add_infobox = True
infobox_id = infobox.get('id', None)
if infobox_id is not None:
existingIndex = self._infobox_ids.get(infobox_id, None)
if existingIndex is not None:
merge_two_infoboxes(self.infoboxes[existingIndex], infobox)
add_infobox = False
if add_infobox:
self._infobox_ids[infobox_id] = len(self.infoboxes) - 1
def _merge_result(self, result, position):
result['parsed_url'] = urlparse(result['url'])
# if the result has no scheme, use http as default
if not result['parsed_url'].scheme:
result['parsed_url'] = result['parsed_url']._replace(scheme="http")
result['host'] = result['parsed_url'].netloc
if result['host'].startswith('www.'):
result['host'] = result['host'].replace('www.', '', 1)
result['engines'] = [result['engine']]
# strip multiple spaces and carriage returns from content
if result.get('content'):
result['content'] = WHITESPACE_REGEX.sub(' ', result['content'])
# check for duplicates
duplicated = False
for merged_result in self._merged_results:
if compare_urls(result['parsed_url'], merged_result['parsed_url'])\
and result.get('template') == merged_result.get('template'):
duplicated = merged_result
# merge duplicates together
if duplicated:
# using content with more text
if result_content_len(result.get('content', '')) >\
result_content_len(duplicated.get('content', '')):
duplicated['content'] = result['content']
# add the new position
# add engine to list of result-engines
# using https if possible
if duplicated['parsed_url'].scheme != 'https' and result['parsed_url'].scheme == 'https':
duplicated['url'] = result['parsed_url'].geturl()
duplicated['parsed_url'] = result['parsed_url']
# if there is no duplicate found, append result
result['positions'] = [position]
with RLock():
def get_ordered_results(self):
for result in self._merged_results:
score = result_score(result)
result['score'] = score
with RLock():
for result_engine in result['engines']:
engines[result_engine].stats['score_count'] += score
results = sorted(self._merged_results, key=itemgetter('score'), reverse=True)
# pass 2 : group results by category and template
gresults = []
categoryPositions = {}
for i, res in enumerate(results):
# FIXME : handle more than one category per engine
category = engines[res['engine']].categories[0] + ':' + ''\
if 'template' not in res\
else res['template']
current = None if category not in categoryPositions\
else categoryPositions[category]
# group with previous results using the same category
# if the group can accept more result and is not too far
# from the current position
if current is not None and (current['count'] > 0)\
and (len(gresults) - current['index'] < 20):
# group with the previous results using
# the same category with this one
index = current['index']
gresults.insert(index, res)
# update every index after the current one
# (including the current one)
for k in categoryPositions:
v = categoryPositions[k]['index']
if v >= index:
categoryPositions[k]['index'] = v + 1
# update this category
current['count'] -= 1
# same category
# update categoryIndex
categoryPositions[category] = {'index': len(gresults), 'count': 8}
# return gresults
return gresults
def results_length(self):
return len(self._merged_results)

@ -16,13 +16,8 @@ along with searx. If not, see < http://www.gnu.org/licenses/ >.
import threading
import re
import searx.poolrequests as requests_lib
from itertools import izip_longest, chain
from operator import itemgetter
from Queue import Queue
from time import time
from urlparse import urlparse, unquote
from searx import settings
from searx.engines import (
categories, engines
@ -30,6 +25,7 @@ from searx.engines import (
from searx.languages import language_codes
from searx.utils import gen_useragent, get_blocked_engines
from searx.query import Query
from searx.results import ResultContainer
from searx import logger
logger = logger.getChild('search')
@ -42,7 +38,8 @@ def search_request_wrapper(fn, url, engine_name, **kwargs):
return fn(url, **kwargs)
# increase errors stats
engines[engine_name].stats['errors'] += 1
with threading.RLock():
engines[engine_name].stats['errors'] += 1
# print engine name and specific error message
logger.exception('engine crash: {0}'.format(engine_name))
@ -84,7 +81,7 @@ def default_request_params():
# create a callback wrapper for the search engine results
def make_callback(engine_name, results_queue, callback, params):
def make_callback(engine_name, callback, params, result_container):
# creating a callback wrapper for the search engine results
def process_callback(response, **kwargs):
@ -96,12 +93,17 @@ def make_callback(engine_name, results_queue, callback, params):
response.search_params = params
timeout_overhead = 0.2 # seconds
search_duration = time() - params['started']
# update stats with current page-load-time
with threading.RLock():
engines[engine_name].stats['page_load_time'] += search_duration
timeout_overhead = 0.2 # seconds
timeout_limit = engines[engine_name].timeout + timeout_overhead
if search_duration > timeout_limit:
engines[engine_name].stats['page_load_time'] += timeout_limit
engines[engine_name].stats['errors'] += 1
with threading.RLock():
engines[engine_name].stats['errors'] += 1
# callback
@ -111,211 +113,11 @@ def make_callback(engine_name, results_queue, callback, params):
for result in search_results:
result['engine'] = engine_name
results_queue.put_nowait((engine_name, search_results))
# update stats with current page-load-time
engines[engine_name].stats['page_load_time'] += search_duration
result_container.extend(engine_name, search_results)
return process_callback
# return the meaningful length of the content for a result
def content_result_len(content):
if isinstance(content, basestring):
content = re.sub('[,;:!?\./\\\\ ()-_]', '', content)
return len(content)
return 0
# score results and remove duplications
def score_results(results):
# calculate scoring parameters
flat_res = filter(
None, chain.from_iterable(izip_longest(*results.values())))
flat_len = len(flat_res)
engines_len = len(results)
results = []
# pass 1: deduplication + scoring
for i, res in enumerate(flat_res):
res['parsed_url'] = urlparse(res['url'])
res['host'] = res['parsed_url'].netloc
if res['host'].startswith('www.'):
res['host'] = res['host'].replace('www.', '', 1)
res['engines'] = [res['engine']]
weight = 1.0
# strip multiple spaces and cariage returns from content
if res.get('content'):
res['content'] = re.sub(' +', ' ',
res['content'].strip().replace('\n', ''))
# get weight of this engine if possible
if hasattr(engines[res['engine']], 'weight'):
weight = float(engines[res['engine']].weight)
# calculate score for that engine
score = int((flat_len - i) / engines_len) * weight + 1
# check for duplicates
duplicated = False
for new_res in results:
# remove / from the end of the url if required
p1 = res['parsed_url'].path[:-1]\
if res['parsed_url'].path.endswith('/')\
else res['parsed_url'].path
p2 = new_res['parsed_url'].path[:-1]\
if new_res['parsed_url'].path.endswith('/')\
else new_res['parsed_url'].path
# check if that result is a duplicate
if res['host'] == new_res['host'] and\
unquote(p1) == unquote(p2) and\
res['parsed_url'].query == new_res['parsed_url'].query and\
res.get('template') == new_res.get('template'):
duplicated = new_res
# merge duplicates together
if duplicated:
# using content with more text
if content_result_len(res.get('content', '')) >\
content_result_len(duplicated.get('content', '')):
duplicated['content'] = res['content']
# increase result-score
duplicated['score'] += score
# add engine to list of result-engines
# using https if possible
if duplicated['parsed_url'].scheme == 'https':
elif res['parsed_url'].scheme == 'https':
duplicated['url'] = res['parsed_url'].geturl()
duplicated['parsed_url'] = res['parsed_url']
# if there is no duplicate found, append result
res['score'] = score
# if the result has no scheme, use http as default
if res['parsed_url'].scheme == '':
res['parsed_url'] = res['parsed_url']._replace(scheme="http")
results = sorted(results, key=itemgetter('score'), reverse=True)
# pass 2 : group results by category and template
gresults = []
categoryPositions = {}
for i, res in enumerate(results):
# FIXME : handle more than one category per engine
category = engines[res['engine']].categories[0] + ':' + ''\
if 'template' not in res\
else res['template']
current = None if category not in categoryPositions\
else categoryPositions[category]
# group with previous results using the same category
# if the group can accept more result and is not too far
# from the current position
if current is not None and (current['count'] > 0)\
and (len(gresults) - current['index'] < 20):
# group with the previous results using
# the same category with this one
index = current['index']
gresults.insert(index, res)
# update every index after the current one
# (including the current one)
for k in categoryPositions:
v = categoryPositions[k]['index']
if v >= index:
categoryPositions[k]['index'] = v + 1
# update this category
current['count'] -= 1
# same category
# update categoryIndex
categoryPositions[category] = {'index': len(gresults), 'count': 8}
# return gresults
return gresults
def merge_two_infoboxes(infobox1, infobox2):
if 'urls' in infobox2:
urls1 = infobox1.get('urls', None)
if urls1 is None:
urls1 = []
infobox1.set('urls', urls1)
urlSet = set()
for url in infobox1.get('urls', []):
urlSet.add(url.get('url', None))
for url in infobox2.get('urls', []):
if url.get('url', None) not in urlSet:
if 'attributes' in infobox2:
attributes1 = infobox1.get('attributes', None)
if attributes1 is None:
attributes1 = []
infobox1.set('attributes', attributes1)
attributeSet = set()
for attribute in infobox1.get('attributes', []):
if attribute.get('label', None) not in attributeSet:
attributeSet.add(attribute.get('label', None))
for attribute in infobox2.get('attributes', []):
if 'content' in infobox2:
content1 = infobox1.get('content', None)
content2 = infobox2.get('content', '')
if content1 is not None:
if content_result_len(content2) > content_result_len(content1):
infobox1['content'] = content2
infobox1.set('content', content2)
def merge_infoboxes(infoboxes):
results = []
infoboxes_id = {}
for infobox in infoboxes:
add_infobox = True
infobox_id = infobox.get('id', None)
if infobox_id is not None:
existingIndex = infoboxes_id.get(infobox_id, None)
if existingIndex is not None:
merge_two_infoboxes(results[existingIndex], infobox)
add_infobox = False
if add_infobox:
infoboxes_id[infobox_id] = len(results) - 1
return results
class Search(object):
"""Search information container"""
@ -333,10 +135,7 @@ class Search(object):
# set blocked engines
self.blocked_engines = get_blocked_engines(engines, request.cookies)
self.results = []
self.suggestions = set()
self.answers = set()
self.infoboxes = []
self.result_container = ResultContainer()
self.request_data = {}
# set specific language if set
@ -357,7 +156,7 @@ class Search(object):
# set pagenumber
pageno_param = self.request_data.get('pageno', '1')
if not pageno_param.isdigit() or int(pageno_param) < 1:
raise Exception('wrong pagenumber')
pageno_param = 1
self.pageno = int(pageno_param)
@ -448,8 +247,6 @@ class Search(object):
# init vars
requests = []
results_queue = Queue()
results = {}
# increase number of searches
number_of_searches += 1
@ -503,9 +300,9 @@ class Search(object):
# create a callback wrapper for the search engine results
callback = make_callback(
# create dictionary which contain all
# informations about the request
@ -538,42 +335,5 @@ class Search(object):
# send all search-request
while not results_queue.empty():
engine_name, engine_results = results_queue.get_nowait()
# TODO type checks
for x in list(engine_results)
if 'suggestion' in x
and engine_results.remove(x) is None]
for x in list(engine_results)
if 'answer' in x
and engine_results.remove(x) is None]
self.infoboxes.extend(x for x in list(engine_results)
if 'infobox' in x
and engine_results.remove(x) is None)
results[engine_name] = engine_results
# update engine-specific stats
for engine_name, engine_results in results.items():
engines[engine_name].stats['search_count'] += 1
engines[engine_name].stats['result_count'] += len(engine_results)
# score results and remove duplications
self.results = score_results(results)
# merge infoboxes according to their ids
self.infoboxes = merge_infoboxes(self.infoboxes)
# update engine stats, using calculated score
for result in self.results:
for res_engine in result['engines']:
.stats['score_count'] += result['score']
# return results, suggestions, answers and infoboxes
return self

View File

@ -274,6 +274,11 @@ engines:
engine : yahoo
shortcut : yh
- name : yandex
engine : yandex
shortcut : yn
disabled : True
- name : yahoo news
engine : yahoo_news
shortcut : yhn
@ -311,7 +316,7 @@ engines:
en : English
de : Deutsch
he : Hebrew
he : עברית
hu : Magyar
fr : Français
es : Español

@ -1 +1,88 @@
html{position:relative;min-height:100%}body{margin-bottom:80px}.footer{position:absolute;bottom:0;width:100%;height:60px}input[type=checkbox]:checked+.label_hide_if_checked,input[type=checkbox]:checked+.label_hide_if_not_checked+.label_hide_if_checked{display:none}input[type=checkbox]:not(:checked)+.label_hide_if_not_checked,input[type=checkbox]:not(:checked)+.label_hide_if_checked+.label_hide_if_not_checked{display:none}.result_header{margin-bottom:5px;margin-top:20px}.result_header .favicon{margin-bottom:-3px}.result_header a{vertical-align:bottom}.result_header a .highlight{font-weight:bold}.result-content{margin-top:5px;word-wrap:break-word}.result-content .highlight{font-weight:bold}.result-default{clear:both}.result-images{float:left !important}.img-thumbnail{margin:5px;max-height:128px;min-height:128px}.result-videos{clear:both}.result-torrents{clear:both}.result-map{clear:both}.result-code{clear:both}.suggestion_item{margin:2px 5px}.result_download{margin-right:5px}#pagination{margin-top:30px;padding-bottom:50px}.infobox .infobox_part{margin-bottom:20px;word-wrap:break-word}.infobox .infobox_part:last-child{margin-bottom:0}.search_categories{margin:10px 0;text-transform:capitalize}.cursor-text{cursor:text !important}.cursor-pointer{cursor:pointer !important}.highlight .hll{background-color:#ffc}.highlight{background:#f8f8f8}.highlight .c{color:#408080;font-style:italic}.highlight .err{border:1px solid #f00}.highlight .k{color:#008000;font-weight:bold}.highlight .o{color:#666}.highlight .cm{color:#408080;font-style:italic}.highlight .cp{color:#bc7a00}.highlight .c1{color:#408080;font-style:italic}.highlight .cs{color:#408080;font-style:italic}.highlight .gd{color:#a00000}.highlight .ge{font-style:italic}.highlight .gr{color:#f00}.highlight .gh{color:#000080;font-weight:bold}.highlight .gi{color:#00a000}.highlight .go{color:#888}.highlight .gp{color:#000080;font-weight:bold}.highlight .gs{font-weight:bold}.highlight 
.gu{color:#800080;font-weight:bold}.highlight .gt{color:#04d}.highlight .kc{color:#008000;font-weight:bold}.highlight .kd{color:#008000;font-weight:bold}.highlight .kn{color:#008000;font-weight:bold}.highlight .kp{color:#008000}.highlight .kr{color:#008000;font-weight:bold}.highlight .kt{color:#b00040}.highlight .m{color:#666}.highlight .s{color:#ba2121}.highlight .na{color:#7d9029}.highlight .nb{color:#008000}.highlight .nc{color:#00f;font-weight:bold}.highlight .no{color:#800}.highlight .nd{color:#a2f}.highlight .ni{color:#999;font-weight:bold}.highlight .ne{color:#d2413a;font-weight:bold}.highlight .nf{color:#00f}.highlight .nl{color:#a0a000}.highlight .nn{color:#00f;font-weight:bold}.highlight .nt{color:#008000;font-weight:bold}.highlight .nv{color:#19177c}.highlight .ow{color:#a2f;font-weight:bold}.highlight .w{color:#bbb}.highlight .mf{color:#666}.highlight .mh{color:#666}.highlight .mi{color:#666}.highlight .mo{color:#666}.highlight .sb{color:#ba2121}.highlight .sc{color:#ba2121}.highlight .sd{color:#ba2121;font-style:italic}.highlight .s2{color:#ba2121}.highlight .se{color:#b62;font-weight:bold}.highlight .sh{color:#ba2121}.highlight .si{color:#b68;font-weight:bold}.highlight .sx{color:#008000}.highlight .sr{color:#b68}.highlight .s1{color:#ba2121}.highlight .ss{color:#19177c}.highlight .bp{color:#008000}.highlight .vc{color:#19177c}.highlight .vg{color:#19177c}.highlight .vi{color:#19177c}.highlight .il{color:#666}.highlight .lineno{-webkit-touch-callout:none;-webkit-user-select:none;-khtml-user-select:none;-moz-user-select:none;-ms-user-select:none;user-select:none;cursor:default}.highlight .lineno::selection{background:transparent}.highlight .lineno::-moz-selection{background:transparent}
.result_header{margin-bottom:5px;margin-top:20px}.result_header .favicon{margin-bottom:-3px}
.result_header a{vertical-align:bottom}.result_header a .highlight{font-weight:bold}
.result-content{margin-top:5px;word-wrap:break-word}.result-content .highlight{font-weight:bold}
.result-images{float:left !important}
.suggestion_item{margin:2px 5px}
.infobox .infobox_part{margin-bottom:20px;word-wrap:break-word}
.infobox .infobox_part:last-child{margin-bottom:0}
.search_categories{margin:10px 0;text-transform:capitalize}
.cursor-text{cursor:text !important}
.cursor-pointer{cursor:pointer !important}
.highlight .hll{background-color:#ffc}
.highlight .c{color:#408080;font-style:italic}
.highlight .err{border:1px solid #f00}
.highlight .k{color:#008000;font-weight:bold}
.highlight .o{color:#666}
.highlight .cm{color:#408080;font-style:italic}
.highlight .cp{color:#bc7a00}
.highlight .c1{color:#408080;font-style:italic}
.highlight .cs{color:#408080;font-style:italic}
.highlight .gd{color:#a00000}
.highlight .ge{font-style:italic}
.highlight .gr{color:#f00}
.highlight .gh{color:#000080;font-weight:bold}
.highlight .gi{color:#00a000}
.highlight .go{color:#888}
.highlight .gp{color:#000080;font-weight:bold}
.highlight .gs{font-weight:bold}
.highlight .gu{color:#800080;font-weight:bold}
.highlight .gt{color:#04d}
.highlight .kc{color:#008000;font-weight:bold}
.highlight .kd{color:#008000;font-weight:bold}
.highlight .kn{color:#008000;font-weight:bold}
.highlight .kp{color:#008000}
.highlight .kr{color:#008000;font-weight:bold}
.highlight .kt{color:#b00040}
.highlight .m{color:#666}
.highlight .s{color:#ba2121}
.highlight .na{color:#7d9029}
.highlight .nb{color:#008000}
.highlight .nc{color:#00f;font-weight:bold}
.highlight .no{color:#800}
.highlight .nd{color:#a2f}
.highlight .ni{color:#999;font-weight:bold}
.highlight .ne{color:#d2413a;font-weight:bold}
.highlight .nf{color:#00f}
.highlight .nl{color:#a0a000}
.highlight .nn{color:#00f;font-weight:bold}
.highlight .nt{color:#008000;font-weight:bold}
.highlight .nv{color:#19177c}
.highlight .ow{color:#a2f;font-weight:bold}
.highlight .w{color:#bbb}
.highlight .mf{color:#666}
.highlight .mh{color:#666}
.highlight .mi{color:#666}
.highlight .mo{color:#666}
.highlight .sb{color:#ba2121}
.highlight .sc{color:#ba2121}
.highlight .sd{color:#ba2121;font-style:italic}
.highlight .s2{color:#ba2121}
.highlight .se{color:#b62;font-weight:bold}
.highlight .sh{color:#ba2121}
.highlight .si{color:#b68;font-weight:bold}
.highlight .sx{color:#008000}
.highlight .sr{color:#b68}
.highlight .s1{color:#ba2121}
.highlight .ss{color:#19177c}
.highlight .bp{color:#008000}
.highlight .vc{color:#19177c}
.highlight .vg{color:#19177c}
.highlight .vi{color:#19177c}
.highlight .il{color:#666}
.highlight .lineno{-webkit-touch-callout:none;-webkit-user-select:none;-khtml-user-select:none;-moz-user-select:none;-ms-user-select:none;user-select:none;cursor:default}.highlight .lineno::selection{background:transparent;}
.highlight .lineno::-moz-selection{background:transparent;}

@ -76,3 +76,8 @@
margin-top: 30px;
padding-bottom: 50px;
.label-default {
color: #AAA;
background: #FFF;

@ -1,7 +1,7 @@
<?xml version="1.0" encoding="utf-8"?>
<OpenSearchDescription xmlns="http://a9.com/-/spec/opensearch/1.1/">
<Description>Search searx</Description>
<Description>a privacy-respecting, hackable metasearch engine</Description>
<Image>{{ host }}{{ url_for('static', filename='img/favicon.png') | replace("/", "", 1) }}</Image>
<LongName>searx metasearch</LongName>

@ -1,7 +1,7 @@
<?xml version="1.0" encoding="utf-8"?>
<OpenSearchDescription xmlns="http://a9.com/-/spec/opensearch/1.1/">
<Description>Search searx</Description>
<Description>a privacy-respecting, hackable metasearch engine</Description>
<Image>{{ host }}{{ url_for('static', filename='img/favicon.png') | replace("/", "", 1) }}</Image>
<LongName>searx metasearch</LongName>

@ -25,7 +25,11 @@
<!-- Draw result footer -->
{% macro result_footer(result) -%}
<div class="clearfix"></div>
<span class="label label-default pull-right">{{ result.engine }}</span>
<div class="pull-right">
{% for engine in result.engines %}
<span class="label label-default">{{ engine }}</span>
{% endfor %}
<p class="text-muted">{{ result.pretty_url }}</p>
{%- endmacro %}

@ -1,7 +1,7 @@
<?xml version="1.0" encoding="utf-8"?>
<OpenSearchDescription xmlns="http://a9.com/-/spec/opensearch/1.1/">
<Description>Search searx</Description>
<Description>a privacy-respecting, hackable metasearch engine</Description>
<Image>{{ host }}{{ url_for('static', filename='img/favicon.png') | replace("/", "", 1) }}</Image>
<LongName>searx metasearch</LongName>

@ -53,8 +53,8 @@
<th>{{ _('Engine name') }}</th>
<th>{{ _('Allow') }} / {{ _('Block') }}</th>
{% for (categ,search_engines) in categs %}
{% for search_engine in search_engines %}
{% for categ in all_categories %}
{% for search_engine in engines_by_category[categ] %}
{% if not search_engine.private %}

@ -28,10 +28,10 @@ class TestBingNewsEngine(SearxTestCase):
self.assertRaises(AttributeError, bing_news.response, '')
self.assertRaises(AttributeError, bing_news.response, '[]')
response = mock.Mock(text='<html></html>')
response = mock.Mock(content='<html></html>')
self.assertEqual(bing_news.response(response), [])
response = mock.Mock(text='<html></html>')
response = mock.Mock(content='<html></html>')
self.assertEqual(bing_news.response(response), [])
html = """<?xml version="1.0" encoding="utf-8" ?>
@ -66,7 +66,7 @@ class TestBingNewsEngine(SearxTestCase):
</rss>""" # noqa
response = mock.Mock(text=html)
response = mock.Mock(content=html)
results = bing_news.response(response)
self.assertEqual(type(results), list)
self.assertEqual(len(results), 2)
@ -105,7 +105,7 @@ class TestBingNewsEngine(SearxTestCase):
</rss>""" # noqa
response = mock.Mock(text=html)
response = mock.Mock(content=html)
results = bing_news.response(response)
self.assertEqual(type(results), list)
self.assertEqual(len(results), 1)
@ -128,11 +128,11 @@ class TestBingNewsEngine(SearxTestCase):
</rss>""" # noqa
response = mock.Mock(text=html)
response = mock.Mock(content=html)
results = bing_news.response(response)
self.assertEqual(type(results), list)
self.assertEqual(len(results), 0)
html = """<?xml version="1.0" encoding="utf-8" ?>gabarge"""
response = mock.Mock(text=html)
response = mock.Mock(content=html)
self.assertRaises(lxml.etree.XMLSyntaxError, bing_news.response, response)

@ -22,7 +22,7 @@ class TestBtdiggEngine(SearxTestCase):
self.assertRaises(AttributeError, btdigg.response, '')
self.assertRaises(AttributeError, btdigg.response, '[]')
response = mock.Mock(text='<html></html>')
response = mock.Mock(content='<html></html>')
self.assertEqual(btdigg.response(response), [])
html = """
@ -82,7 +82,7 @@ class TestBtdiggEngine(SearxTestCase):
response = mock.Mock(text=html)
response = mock.Mock(content=html)
results = btdigg.response(response)
self.assertEqual(type(results), list)
self.assertEqual(len(results), 1)
@ -101,7 +101,7 @@ class TestBtdiggEngine(SearxTestCase):
response = mock.Mock(text=html)
response = mock.Mock(content=html)
results = btdigg.response(response)
self.assertEqual(type(results), list)
self.assertEqual(len(results), 0)
@ -367,7 +367,7 @@ class TestBtdiggEngine(SearxTestCase):
response = mock.Mock(text=html)
response = mock.Mock(content=html)
results = btdigg.response(response)
self.assertEqual(type(results), list)
self.assertEqual(len(results), 5)

@ -9,7 +9,7 @@ from searx.testing import SearxTestCase
class TestGoogleEngine(SearxTestCase):
def mock_response(self, text):
response = mock.Mock(text=text, url='https://www.google.com/search?q=test&start=0&gbv=1')
response = mock.Mock(text=text, url='https://www.google.com/search?q=test&start=0&gbv=1&gws_rd=cr')
response.search_params = mock.Mock()
response.search_params.get = mock.Mock(return_value='www.google.com')
return response
@ -23,16 +23,12 @@ class TestGoogleEngine(SearxTestCase):
self.assertIn('url', params)
self.assertIn(query, params['url'])
self.assertIn('google.fr', params['url'])
self.assertNotIn('PREF', params['cookies'])
self.assertIn('NID', params['cookies'])
self.assertIn('fr', params['headers']['Accept-Language'])
dicto['language'] = 'all'
params = google.request(query, dicto)
self.assertIn('google.com', params['url'])
self.assertIn('en', params['headers']['Accept-Language'])
self.assertIn('PREF', params['cookies'])
self.assertIn('NID', params['cookies'])
def test_response(self):
self.assertRaises(AttributeError, google.response, None)

@ -10,15 +10,15 @@ class TestGoogleImagesEngine(SearxTestCase):
query = 'test_query'
dicto = defaultdict(dict)
dicto['pageno'] = 1
dicto['safesearch'] = 1
params = google_images.request(query, dicto)
self.assertIn('url', params)
self.assertIn(query, params['url'])
self.assertIn('googleapis.com', params['url'])
self.assertIn('safe=on', params['url'])
self.assertIn('safe=active', params['url'])
dicto['safesearch'] = 0
params = google_images.request(query, dicto)
self.assertIn('safe=off', params['url'])
self.assertNotIn('safe', params['url'])
def test_response(self):
self.assertRaises(AttributeError, google_images.response, None)
@ -26,88 +26,33 @@ class TestGoogleImagesEngine(SearxTestCase):
self.assertRaises(AttributeError, google_images.response, '')
self.assertRaises(AttributeError, google_images.response, '[]')
response = mock.Mock(text='{}')
response = mock.Mock(text='<div></div>')
self.assertEqual(google_images.response(response), [])
response = mock.Mock(text='{"data": []}')
self.assertEqual(google_images.response(response), [])
json = """
"responseData": {
"results": [
"GsearchResultClass": "GimageSearch",
"width": "400",
"height": "400",
"imageId": "ANd9GcQbYb9FJuAbG_hT4i8FeC0O0x-P--EHdzgRIF9ao97nHLl7C2mREn6qTQ",
"tbWidth": "124",
"tbHeight": "124",
"unescapedUrl": "http://unescaped.url.jpg",
"url": "http://image.url.jpg",
"visibleUrl": "insolitebuzz.fr",
"title": "This is the title",
"titleNoFormatting": "Petit test sympa qui rend fou tout le monde ! A faire",
"originalContextUrl": "http://this.is.the.url",
"content": "<b>test</b>",
"contentNoFormatting": "test",
"tbUrl": "http://thumbnail.url"
"responseDetails": null,
"responseStatus": 200
response = mock.Mock(text=json)
html = """
<div style="display:none">
<div eid="fWhnVq4Shqpp3pWo4AM" id="isr_scm_1" style="display:none"></div>
<div data-cei="fWhnVq4Shqpp3pWo4AM" class="rg_add_chunk"><!--m-->
<div class="rg_di rg_el ivg-i" data-ved="0ahUKEwjuxPWQts3JAhUGVRoKHd4KCjwQMwgDKAAwAA">
<a href="/imgres?imgurl=http://www.clker.com/cliparts/H/X/l/b/0/0/south-arrow-hi.png&amp;imgrefurl=http://www.clker.com/clipart-south-arrow.html&amp;h=598&amp;w=504&amp;tbnid=bQWQ9wz9loJmjM:&amp;docid=vlONkeBtERfDuM&amp;ei=fWhnVq4Shqpp3pWo4AM&amp;tbm=isch" jsaction="fire.ivg_o;mouseover:str.hmov;mouseout:str.hmou" class="rg_l"><img data-src="https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRsxy3gKnEX0lrwwpRxdPWyLJ8iZ--PXZ-ThbBA2_xXDG_bdQutMQ" data-sz="f" name="bQWQ9wz9loJmjM:" class="rg_i" alt="Image result for south" jsaction="load:str.tbn" onload="google.aft&&google.aft(this)">
<div class="_aOd rg_ilm">
<div class="rg_ilmbg"><span class="rg_ilmn"> 504&nbsp;&#215;&nbsp;598 - clker.com </span>
<div class="rg_meta">
{"id":"bQWQ9wz9loJmjM:","isu":"clker.com","ity":"png","md":"/search?tbs\u003dsbi:AMhZZit7u1mHyop9pQisu-5idR-8W_1Itvwc3afChmsjQYPx_1yYMzBvUZgtkcGoojqekKZ-6n_1rjX9ySH0OWA_1eO5OijFY6BBDw_1GApr6xxb1bXJcBcj-DiguMoXWW7cZSG7MRQbwnI5SoDZNXcv_1xGszy886I7NVb_1oRKSliTHtzqbXAxhvYreM","msu":"/search?q\u003dsouth\u0026biw\u003d1364\u0026bih\u003d235\u0026tbm\u003disch\u0026tbs\u003dsimg:CAQSEgltBZD3DP2WgiG-U42R4G0RFw","oh":598,"os":"13KB","ow":504,"pt":"South Arrow Clip Art at Clker.com - vector clip art online ...","rid":"vlONkeBtERfDuM","s":"Download this image as:","sc":1,"si":"/search?q\u003dsouth\u0026biw\u003d1364\u0026bih\u003d235\u0026tbm\u003disch\u0026tbs\u003dsimg:CAESEgltBZD3DP2WgiG-U42R4G0RFw","th":245,"tu":"https://thumbnail.url/","tw":206}
""" # noqa
response = mock.Mock(text=html)
results = google_images.response(response)
self.assertEqual(type(results), list)
self.assertEqual(len(results), 1)
self.assertEqual(results[0]['title'], 'This is the title')
self.assertEqual(results[0]['url'], 'http://this.is.the.url')
self.assertEqual(results[0]['thumbnail_src'], 'https://thumbnail.url')
self.assertEqual(results[0]['img_src'], 'http://image.url.jpg')
self.assertEqual(results[0]['content'], '<b>test</b>')
json = """
"responseData": {
"results": [
"GsearchResultClass": "GimageSearch",
"width": "400",
"height": "400",
"imageId": "ANd9GcQbYb9FJuAbG_hT4i8FeC0O0x-P--EHdzgRIF9ao97nHLl7C2mREn6qTQ",
"tbWidth": "124",
"tbHeight": "124",
"unescapedUrl": "http://unescaped.url.jpg",
"visibleUrl": "insolitebuzz.fr",
"title": "This is the title",
"titleNoFormatting": "Petit test sympa qui rend fou tout le monde ! A faire",
"originalContextUrl": "http://this.is.the.url",
"content": "<b>test</b>",
"contentNoFormatting": "test",
"tbUrl": "http://thumbnail.url"
"responseDetails": null,
"responseStatus": 200
response = mock.Mock(text=json)
results = google_images.response(response)
self.assertEqual(type(results), list)
self.assertEqual(len(results), 0)
json = """
"responseData": {},
"responseDetails": null,
"responseStatus": 200
response = mock.Mock(text=json)
results = google_images.response(response)
self.assertEqual(type(results), list)
self.assertEqual(len(results), 0)
self.assertEqual(results[0]['title'], u'South Arrow Clip Art at Clker.com - vector clip art online ...')
self.assertEqual(results[0]['url'], 'http://www.clker.com/clipart-south-arrow.html')
self.assertEqual(results[0]['thumbnail_src'], 'https://thumbnail.url/')
self.assertEqual(results[0]['img_src'], 'http://www.clker.com/cliparts/H/X/l/b/0/0/south-arrow-hi.png')
self.assertEqual(results[0]['content'], 'Download this image as:')

@ -42,7 +42,7 @@ class TestStartpageEngine(SearxTestCase):
<span id='title_stars_2' name='title_stars_2'> </span>
<p class='desc'>
<p class='desc clk'>
This should be the content.
@ -78,7 +78,7 @@ class TestStartpageEngine(SearxTestCase):
<span id='title_stars_2' name='title_stars_2'> </span>
<p class='desc'>
<p class='desc clk'>
This should be the content.
@ -101,7 +101,7 @@ class TestStartpageEngine(SearxTestCase):
<span id='title_stars_2' name='title_stars_2'> </span>
<p class='desc'>
<p class='desc clk'>
This should be the content.

@ -5,6 +5,12 @@ from searx import plugins
from mock import Mock
def get_search_mock(query, **kwargs):
return {'search': Mock(query=query,
class PluginStoreTest(SearxTestCase):
def test_PluginStore_init(self):
@ -46,23 +52,23 @@ class SelfIPTest(SearxTestCase):
request = Mock(user_plugins=store.plugins,
request.headers.getlist.return_value = []
ctx = {'search': Mock(answers=set(),
ctx = get_search_mock(query='ip')
store.call('post_search', request, ctx)
self.assertTrue('' in ctx['search'].answers)
self.assertTrue('' in ctx['search'].result_container.answers)
# User agent test
request = Mock(user_plugins=store.plugins,
request.headers.getlist.return_value = []
ctx = {'search': Mock(answers=set(),
ctx = get_search_mock(query='user-agent')
store.call('post_search', request, ctx)
self.assertTrue('Mock' in ctx['search'].answers)
ctx = {'search': Mock(answers=set(),
query='user agent')}
self.assertTrue('Mock' in ctx['search'].result_container.answers)
ctx = get_search_mock(query='user-agent')
store.call('post_search', request, ctx)
self.assertTrue('Mock' in ctx['search'].answers)
ctx = {'search': Mock(answers=set(),
query='What is my User-Agent?')}
self.assertTrue('Mock' in ctx['search'].result_container.answers)
ctx = get_search_mock(query='What is my User-Agent?')
store.call('post_search', request, ctx)
self.assertTrue('Mock' in ctx['search'].result_container.answers)

@ -0,0 +1,41 @@
# -*- coding: utf-8 -*-
from searx.results import ResultContainer
from searx.testing import SearxTestCase
def fake_result(url='https://aa.bb/cc?dd=ee#ff',
engine='wikipedia', **kwargs):
result = {'url': url,
'title': title,
'content': content,
'engine': engine}
return result
class ResultContainerTestCase(SearxTestCase):
def test_empty(self):
c = ResultContainer()
self.assertEqual(c.get_ordered_results(), [])
def test_one_result(self):
c = ResultContainer()
c.extend('wikipedia', [fake_result()])
self.assertEqual(c.results_length(), 1)
def test_one_suggestion(self):
c = ResultContainer()
c.extend('wikipedia', [fake_result(suggestion=True)])
self.assertEqual(len(c.suggestions), 1)
self.assertEqual(c.results_length(), 0)
def test_result_merge(self):
c = ResultContainer()
c.extend('wikipedia', [fake_result()])
c.extend('wikidata', [fake_result(), fake_result(url='https://example.com/')])
self.assertEqual(c.results_length(), 2)

@ -1,25 +1,10 @@
# -*- coding: utf-8 -*-
from searx.search import score_results
from searx.testing import SearxTestCase
def fake_result(url='https://aa.bb/cc?dd=ee#ff',
return {'url': url,
'title': title,
'content': content,
'engine': engine}
class SearchTestCase(SearxTestCase):
class ScoreResultsTestCase(SearxTestCase):
def test_empty(self):
self.assertEqual(score_results(dict()), [])
def test_urlparse(self):
results = score_results(dict(a=[fake_result(url='https://aa.bb/cc?dd=ee#ff')]))
parsed_url = results[0]['parsed_url']
self.assertEqual(parsed_url.query, 'dd=ee')
def test_(self):

@ -1,6 +1,7 @@
# -*- coding: utf-8 -*-
import json
from mock import Mock
from urlparse import ParseResult
from searx import webapp
from searx.testing import SearxTestCase
@ -33,7 +34,12 @@ class ViewsTestCase(SearxTestCase):
def search_mock(search_self, *args):
search_self.results = self.test_results
search_self.result_container = Mock(get_ordered_results=lambda: self.test_results,
results_length=lambda: len(self.test_results))
webapp.Search.search = search_mock
@ -138,7 +144,7 @@ class ViewsTestCase(SearxTestCase):
def test_opensearch_xml(self):
result = self.app.get('/opensearch.xml')
self.assertEqual(result.status_code, 200)
self.assertIn('<Description>Search searx</Description>', result.data)
self.assertIn('<Description>a privacy-respecting, hackable metasearch engine</Description>', result.data)
def test_favicon(self):
result = self.app.get('/favicon.ico')

@ -19,7 +19,7 @@ along with searx. If not, see < http://www.gnu.org/licenses/ >.
# version of searx
VERSION_STRING = "{0}.{1}.{2}".format(VERSION_MAJOR,

View File

@ -42,7 +42,7 @@ except:
from datetime import datetime, timedelta
from urllib import urlencode
from urlparse import urlparse
from urlparse import urlparse, urljoin
from werkzeug.contrib.fixers import ProxyFix
from flask import (
Flask, request, render_template, url_for, Response, make_response,
@ -383,7 +383,7 @@ def index():
plugins.call('post_search', request, locals())
for result in search.results:
for result in search.result_container.get_ordered_results():
plugins.call('on_result', request, locals())
if not search.paging and engines[result['engine']].paging:
@ -411,7 +411,7 @@ def index():
minutes = int((timedifference.seconds / 60) % 60)
hours = int(timedifference.seconds / 60 / 60)
if hours == 0:
result['publishedDate'] = gettext(u'{minutes} minute(s) ago').format(minutes=minutes) # noqa
result['publishedDate'] = gettext(u'{minutes} minute(s) ago').format(minutes=minutes)
result['publishedDate'] = gettext(u'{hours} hour(s), {minutes} minute(s) ago').format(hours=hours, minutes=minutes) # noqa
@ -419,17 +419,16 @@ def index():
if search.request_data.get('format') == 'json':
return Response(json.dumps({'query': search.query,
'results': search.results}),
'results': search.result_container.get_ordered_results()}),
elif search.request_data.get('format') == 'csv':
csv = UnicodeWriter(cStringIO.StringIO())
keys = ('title', 'url', 'content', 'host', 'engine', 'score')
if search.results:
for row in search.results:
row['host'] = row['parsed_url'].netloc
csv.writerow([row.get(key, '') for key in keys])
for row in search.result_container.get_ordered_results():
row['host'] = row['parsed_url'].netloc
csv.writerow([row.get(key, '') for key in keys])
response = Response(csv.stream.read(), mimetype='application/csv')
cont_disp = 'attachment;Filename=searx_-_{0}.csv'.format(search.query)
response.headers.add('Content-Disposition', cont_disp)
@ -437,24 +436,24 @@ def index():
elif search.request_data.get('format') == 'rss':
response_rss = render(
return Response(response_rss, mimetype='text/xml')
return render(
@ -532,7 +531,7 @@ def preferences():
blocked_engines = []
resp = make_response(redirect(url_for('index')))
resp = make_response(redirect(urljoin(settings['server']['base_url'], url_for('index'))))
if request.method == 'GET':
blocked_engines = get_blocked_engines(engines, request.cookies)
@ -767,7 +766,7 @@ def favicon():
def clear_cookies():
resp = make_response(redirect(url_for('index')))
resp = make_response(redirect(urljoin(settings['server']['base_url'], url_for('index'))))
for cookie_name in request.cookies:
return resp

View File

@ -8,7 +8,6 @@ Pygments = 2.0.2
WebOb = 1.4.1
WebTest = 2.0.18
Werkzeug = 0.10.4
buildout-versions = 1.7
collective.recipe.omelette = 0.16
coverage = 3.7.1
decorator = 3.4.2
@ -38,7 +37,6 @@ pyasn1 = 0.1.8
pyasn1-modules = 0.0.6
certifi = 2015.04.28
cffi = 1.1.2
cryptography = 0.9.1