diff --git a/conf/settings.yml b/conf/settings.yml
index 3707cf0..eac7593 100644
--- a/conf/settings.yml
+++ b/conf/settings.yml
@@ -92,10 +92,10 @@ engines:
     shortcut : sp

 # +30% page load time
-# - name : ixquick
-#   engine : startpage
-#   base_url : 'https://www.ixquick.com/'
-#   search_url : 'https://www.ixquick.com/do/search'
+#  - name : ixquick
+#    engine : startpage
+#    base_url : 'https://www.ixquick.com/'
+#    search_url : 'https://www.ixquick.com/do/search'

   - name : twitter
     engine : twitter
@@ -103,20 +103,20 @@ engines:
     shortcut : tw

 # maybe in a fun category
-# - name : uncyclopedia
-#   engine : mediawiki
-#   categories : general
-#   shortcut : unc
-#   url : https://uncyclopedia.wikia.com/
+#  - name : uncyclopedia
+#    engine : mediawiki
+#    categories : general
+#    shortcut : unc
+#    url : https://uncyclopedia.wikia.com/

 # tmp suspended - too slow, too many errors
-# - name : urbandictionary
-#   engine : xpath
-#   search_url : http://www.urbandictionary.com/define.php?term={query}
-#   url_xpath : //div[@class="word"]//a/@href
-#   title_xpath : //div[@class="word"]//a
-#   content_xpath : //div[@class="definition"]
-#   shortcut : ud
+#  - name : urbandictionary
+#    engine : xpath
+#    search_url : http://www.urbandictionary.com/define.php?term={query}
+#    url_xpath : //div[@class="word"]//a/@href
+#    title_xpath : //div[@class="word"]//a
+#    content_xpath : //div[@class="definition"]
+#    shortcut : ud

   - name : yahoo
     engine : yahoo
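Note: the two conf/settings.yml hunks above are whitespace-only; they re-indent the commented-out ixquick, uncyclopedia and urbandictionary blocks so they line up with the active engine entries. A quick way to confirm that such an edit keeps the file parseable is to load it, as in this minimal sketch (not part of the patch; assumes PyYAML is installed and the repository root is the working directory):

    # Sanity check for a whitespace-only YAML edit.
    import yaml  # PyYAML

    with open('conf/settings.yml') as f:
        settings = yaml.safe_load(f)

    # Commented-out engines are invisible to the parser,
    # so only the active entries are counted here.
    print('active engines: %d' % len(settings['engines']))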
diff --git a/conf/settings.yml-noroot b/conf/settings.yml-noroot
new file mode 100644
index 0000000..b773839
--- /dev/null
+++ b/conf/settings.yml-noroot
@@ -0,0 +1,156 @@
+server:
+    port : 8888
+    secret_key : "ultrasecretkey" # change this!
+    debug : True
+    request_timeout : 2.0 # seconds
+    base_url : True
+
+engines:
+  - name : wikipedia
+    engine : wikipedia
+    number_of_results : 1
+    paging : False
+    shortcut : wp
+
+  - name : bing
+    engine : bing
+    locale : en-US
+    shortcut : bi
+
+  - name : bing news
+    engine : bing_news
+    locale : en-US
+    shortcut : bin
+
+  - name : currency
+    engine : currency_convert
+    categories : general
+    shortcut : cc
+
+  - name : deviantart
+    engine : deviantart
+    categories : images
+    shortcut : da
+    timeout: 3.0
+
+  - name : ddg definitions
+    engine : duckduckgo_definitions
+    shortcut : ddd
+
+  - name : duckduckgo
+    engine : duckduckgo
+    locale : en-us
+    shortcut : ddg
+
+  - name : filecrop
+    engine : filecrop
+    categories : files
+    shortcut : fc
+
+  - name : flickr
+    engine : flickr
+    categories : images
+    shortcut : fl
+    timeout: 3.0
+
+  - name : github
+    engine : github
+    categories : it
+    shortcut : gh
+
+  - name : google
+    engine : google
+    shortcut : go
+
+  - name : google images
+    engine : google_images
+    shortcut : goi
+
+  - name : google news
+    engine : google_news
+    shortcut : gon
+
+  - name : piratebay
+    engine : piratebay
+    categories : videos, music, files
+    shortcut : tpb
+
+  - name : soundcloud
+    engine : soundcloud
+    categories : music
+    shortcut : sc
+
+  - name : stackoverflow
+    engine : stackoverflow
+    categories : it
+    shortcut : st
+
+  - name : startpage
+    engine : startpage
+    base_url : 'https://startpage.com/'
+    search_url : 'https://startpage.com/do/search'
+    shortcut : sp
+
+# +30% page load time
+#  - name : ixquick
+#    engine : startpage
+#    base_url : 'https://www.ixquick.com/'
+#    search_url : 'https://www.ixquick.com/do/search'
+
+  - name : twitter
+    engine : twitter
+    categories : social media
+    shortcut : tw
+
+# maybe in a fun category
+#  - name : uncyclopedia
+#    engine : mediawiki
+#    categories : general
+#    shortcut : unc
+#    url : https://uncyclopedia.wikia.com/
+
+# tmp suspended - too slow, too many errors
+#  - name : urbandictionary
+#    engine : xpath
+#    search_url : http://www.urbandictionary.com/define.php?term={query}
+#    url_xpath : //div[@class="word"]//a/@href
+#    title_xpath : //div[@class="word"]//a
+#    content_xpath : //div[@class="definition"]
+#    shortcut : ud
+
+  - name : yahoo
+    engine : yahoo
+    shortcut : yh
+
+  - name : yahoo news
+    engine : yahoo_news
+    shortcut : yhn
+
+  - name : youtube
+    engine : youtube
+    categories : videos
+    shortcut : yt
+
+  - name : dailymotion
+    engine : dailymotion
+    locale : en_US
+    categories : videos
+    shortcut : dm
+
+  - name : vimeo
+    engine : vimeo
+    categories : videos
+    results_xpath : //div[@id="browse_content"]/ol/li
+    url_xpath : ./a/@href
+    title_xpath : ./a/div[@class="data"]/p[@class="title"]/text()
+    content_xpath : ./a/img/@src
+    shortcut : vm
+
+locales:
+    en : English
+    de : Deutsch
+    hu : Magyar
+    fr : Français
+    es : Español
+    it : Italiano
+    nl : Nederlands
diff --git a/manifest.json b/manifest.json
index 9ae1fa2..17fb1dc 100644
--- a/manifest.json
+++ b/manifest.json
@@ -26,7 +26,16 @@
             },
             "example": "/searx",
             "default": "/searx"
+        },
+        {
+            "name": "public_site",
+            "ask": {
+                "en": "Is it a public Searx ?"
+            },
+            "choices": ["Yes", "No"],
+            "default": "Yes"
         }
+    ]
     }
 }
diff --git a/scripts/install b/scripts/install
index 9d5c9d7..60f67a5 100644
--- a/scripts/install
+++ b/scripts/install
@@ -41,7 +41,12 @@ then
 fi

 #Configuration Searx
-sudo cp ../conf/settings.yml /opt/searx/searx/
+if [ $path != "/" ];
+then
+  sudo cp ../conf/settings.yml-noroot /opt/searx/searx/settings.yml
+else
+  sudo cp ../conf/settings.yml /opt/searx/searx/
+fi
 sudo sed -i -e "s/ultrasecretkey/`openssl rand -hex 16`/g" /opt/searx/searx/settings.yml

 # Set permissions to searx directory
@@ -62,13 +67,15 @@ else
 fi

 # Fix permission
-#sudo chmod 755 /etc/searx/
-#sudo find /opt/yunohost/searx/ -type d -exec chmod 2755 {} \;
-#sudo find /opt/yunohost/searx/ -type f -exec chmod g+r,o+r {} \;
-#sudo chmod 644 /etc/searx/*
+#sudo find /opt/searx/ -type d -exec chmod 2755 {} \;
+#sudo find /opt/searx/ -type f -exec chmod g+r,o+r {} \;

 ## Reload Nginx and regenerate SSOwat conf
 sudo service nginx reload
 sudo service uwsgi restart
-#sudo yunohost app setting searx skipped_uris -v "/"
+
+if [ $is_public = "Yes" ];
+then
+sudo yunohost app setting searx skipped_uris -v "/"
+fi
 sudo yunohost app ssowatconf
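The manifest question and the install script work together: YunoHost records the public_site answer, the install script reads it (as the is_public variable) and a public instance gets its URIs skipped by SSOwat so unauthenticated visitors can search. A structural check of the new question, as a sketch (not part of the patch; assumes the standard arguments/install layout of a YunoHost manifest):

    # Verify the new install question is well-formed.
    import json

    with open('manifest.json') as f:
        manifest = json.load(f)

    questions = manifest['arguments']['install']
    public = next(q for q in questions if q['name'] == 'public_site')
    assert public['choices'] == ['Yes', 'No']
    assert public['default'] == 'Yes'
    print('public_site question OK')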
diff --git a/sources/AUTHORS.rst b/sources/AUTHORS.rst
new file mode 100644
index 0000000..d00d542
--- /dev/null
+++ b/sources/AUTHORS.rst
@@ -0,0 +1,26 @@
+Searx was created and is maintained by Adam Tauber.
+
+Major contributing authors:
+
+- Adam Tauber `@asciimoo <https://github.com/asciimoo>`_
+- Matej Cotman
+- Thomas Pointhuber
+- Alexandre Flament
+
+People who have submitted patches/translates, reported bugs, consulted features or
+generally made searx better:
+
+- Laszlo Hammerl
+- Stefan Marsiske
+- Gabor Nagy
+- @pw3t
+- @rhapsodhy
+- András Veres-Szentkirályi
+- Benjamin Sonntag
+- @HLFH
+- @TheRadialActive
+- @Okhin
+- André Koot
+- Alejandro León Aznar
+- rike
+- dp
diff --git a/sources/searx/engines/__init__.py b/sources/searx/engines/__init__.py
index 72e5374..31e2821 100644
--- a/sources/searx/engines/__init__.py
+++ b/sources/searx/engines/__init__.py
@@ -154,16 +154,24 @@ def score_results(results):
     # deduplication + scoring
     for i, res in enumerate(flat_res):
         res['parsed_url'] = urlparse(res['url'])
+        res['host'] = res['parsed_url'].netloc
+
+        if res['host'].startswith('www.'):
+            res['host'] = res['host'].replace('www.', '', 1)
+
         res['engines'] = [res['engine']]
         weight = 1.0
+
         if hasattr(engines[res['engine']], 'weight'):
             weight = float(engines[res['engine']].weight)
+
         score = int((flat_len - i) / engines_len) * weight + 1
         duplicated = False
+
         for new_res in results:
             p1 = res['parsed_url'].path[:-1] if res['parsed_url'].path.endswith('/') else res['parsed_url'].path  # noqa
             p2 = new_res['parsed_url'].path[:-1] if new_res['parsed_url'].path.endswith('/') else new_res['parsed_url'].path  # noqa
-            if res['parsed_url'].netloc == new_res['parsed_url'].netloc and\
+            if res['host'] == new_res['host'] and\
                p1 == p2 and\
                res['parsed_url'].query == new_res['parsed_url'].query and\
                res.get('template') == new_res.get('template'):
diff --git a/sources/searx/engines/flickr.py b/sources/searx/engines/flickr.py
index 265c59a..4ec2841 100644
--- a/sources/searx/engines/flickr.py
+++ b/sources/searx/engines/flickr.py
@@ -1,35 +1,52 @@
 #!/usr/bin/env python

 from urllib import urlencode
-from lxml import html
+#from json import loads
 from urlparse import urljoin
+from lxml import html
+from time import time

 categories = ['images']

 url = 'https://secure.flickr.com/'
 search_url = url+'search/?{query}&page={page}'
-results_xpath = '//div[@id="thumbnails"]//a[@class="rapidnofollow photo-click" and @data-track="photo-click"]'  # noqa
+results_xpath = '//div[@class="view display-item-tile"]/figure/div'

 paging = True


 def request(query, params):
-    params['url'] = search_url.format(query=urlencode({'q': query}),
+    params['url'] = search_url.format(query=urlencode({'text': query}),
                                       page=params['pageno'])
+
+    time_string = str(int(time())-3)
+
+    params['cookies']['BX'] = '3oqjr6d9nmpgl&b=3&s=dh'
+    params['cookies']['xb'] = '421409'
+    params['cookies']['localization'] = 'en-us'
+    params['cookies']['flrbp'] = time_string +\
+        '-3a8cdb85a427a33efda421fbda347b2eaf765a54'
+    params['cookies']['flrbs'] = time_string +\
+        '-ed142ae8765ee62c9ec92a9513665e0ee1ba6776'
+    params['cookies']['flrb'] = '9'
+
     return params


 def response(resp):
-    global base_url
     results = []
+
     dom = html.fromstring(resp.text)
+
     for result in dom.xpath(results_xpath):
-        href = urljoin(url, result.attrib.get('href'))
-        img = result.xpath('.//img')[0]
-        title = img.attrib.get('alt', '')
-        img_src = img.attrib.get('data-defer-src')
+        img = result.xpath('.//img')
+
+        if not img:
+            continue
+
+        img = img[0]
+        img_src = 'https:'+img.attrib.get('src')
+
         if not img_src:
             continue
+
+        href = urljoin(url, result.xpath('.//a')[0].attrib.get('href'))
+        title = img.attrib.get('alt', '')
+
         results.append({'url': href,
                         'title': title,
                         'img_src': img_src,
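In score_results the deduplication key is now a normalised host instead of the raw netloc, so the same page served with and without a www. prefix is detected as a duplicate. The flickr engine is rewritten against Flickr's changed markup and appears to emulate a browser session through cookies; the flrbp/flrbs values are derived from a timestamp, which is why request() computes time_string. The host normalisation in isolation, as a small illustration (Python 2, matching the vendored code):

    # Mirror of the patched host normalisation used for deduplication.
    from urlparse import urlparse  # Python 2 stdlib, as in searx

    def normalised_host(url):
        host = urlparse(url).netloc
        # Strip exactly one leading 'www.' so www/non-www URLs compare equal.
        if host.startswith('www.'):
            host = host.replace('www.', '', 1)
        return host

    assert normalised_host('http://www.example.org/x') == \
        normalised_host('http://example.org/x')
    print('hosts match')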
diff --git a/sources/searx/engines/piratebay.py b/sources/searx/engines/piratebay.py
index ca10851..bb48868 100644
--- a/sources/searx/engines/piratebay.py
+++ b/sources/searx/engines/piratebay.py
@@ -2,6 +2,7 @@ from urlparse import urljoin
 from cgi import escape
 from urllib import quote
 from lxml import html
+from operator import itemgetter

 categories = ['videos', 'music']

@@ -29,14 +30,27 @@ def response(resp):
     results = []
     dom = html.fromstring(resp.text)
     search_res = dom.xpath('//table[@id="searchResult"]//tr')
+
     if not search_res:
         return results
+
     for result in search_res[1:]:
         link = result.xpath('.//div[@class="detName"]//a')[0]
         href = urljoin(url, link.attrib.get('href'))
         title = ' '.join(link.xpath('.//text()'))
         content = escape(' '.join(result.xpath(content_xpath)))
         seed, leech = result.xpath('.//td[@align="right"]/text()')[:2]
+
+        if seed.isdigit():
+            seed = int(seed)
+        else:
+            seed = 0
+
+        if leech.isdigit():
+            leech = int(leech)
+        else:
+            leech = 0
+
         magnetlink = result.xpath(magnet_xpath)[0]
         results.append({'url': href,
                         'title': title,
@@ -45,4 +59,5 @@ def response(resp):
                         'leech': leech,
                         'magnetlink': magnetlink.attrib['href'],
                         'template': 'torrent.html'})
-    return results
+
+    return sorted(results, key=itemgetter('seed'), reverse=True)
diff --git a/sources/searx/tests/test_webapp.py b/sources/searx/tests/test_webapp.py
index 1d12b3a..bb608ab 100644
--- a/sources/searx/tests/test_webapp.py
+++ b/sources/searx/tests/test_webapp.py
@@ -51,7 +51,7 @@ class ViewsTestCase(SearxTestCase):
             result.data
         )
         self.assertIn(
-            '<p class="content">first <span class="highlight">test</span> content<br /></p>',
+            '<p class="content">first <span class="highlight">test</span> content<br /></p>',  # noqa
             result.data
         )
diff --git a/sources/searx/utils.py b/sources/searx/utils.py
index e881a8b..b8c7a8c 100644
--- a/sources/searx/utils.py
+++ b/sources/searx/utils.py
@@ -7,7 +7,9 @@ import re
 from random import choice

 ua_versions = ('26.0', '27.0', '28.0')
-ua_os = ('Windows NT 6.3; WOW64', 'X11; Linux x86_64; rv:26.0')
+ua_os = ('Windows NT 6.3; WOW64',
+         'X11; Linux x86_64',
+         'X11; Linux x86')

 ua = "Mozilla/5.0 ({os}) Gecko/20100101 Firefox/{version}"

@@ -28,7 +30,8 @@ def highlight_content(content, query):
     query = query.decode('utf-8')
     if content.lower().find(query.lower()) > -1:
         query_regex = u'({0})'.format(re.escape(query))
-        content = re.sub(query_regex, '<span class="highlight">\\1</span>', content, flags=re.I | re.U)
+        content = re.sub(query_regex, '<span class="highlight">\\1</span>',
+                         content, flags=re.I | re.U)
     else:
         regex_parts = []
         for chunk in query.split():
@@ -37,7 +40,8 @@ def highlight_content(content, query):
             if len(chunk) == 1:
                 regex_parts.append(u'\\W+{0}\\W+'.format(re.escape(chunk)))
             else:
                 regex_parts.append(u'{0}'.format(re.escape(chunk)))
         query_regex = u'({0})'.format('|'.join(regex_parts))
-        content = re.sub(query_regex, '<span class="highlight">\\1</span>', content, flags=re.I | re.U)
+        content = re.sub(query_regex, '<span class="highlight">\\1</span>',
+                         content, flags=re.I | re.U)
     return content
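To recap the last three files: the piratebay engine now coerces seed/leech counts to integers (non-numeric cells become 0) and returns results sorted by seeders; the test_webapp.py hunk appears to only add a # noqa marker to a long assertion line; and in utils.py the user-agent pool drops the stray rv:26.0 token from the Linux platform string, gains an x86 variant, and the two re.sub calls in highlight_content are rewrapped to satisfy the line-length limit. What that call does for a simple query, as a rough sketch (the exact highlight markup is assumed, taken from the replacement string above):

    # Illustration of highlight_content's exact-match branch.
    import re

    def highlight(content, query):
        query_regex = u'({0})'.format(re.escape(query))
        return re.sub(query_regex, '<span class="highlight">\\1</span>',
                      content, flags=re.I | re.U)

    print(highlight(u'first test content', u'test'))
    # -> first <span class="highlight">test</span> content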