Add support for non-root URL

Adrien Beudin 2014-05-27 12:13:54 +02:00
parent 5a0f8811d5
commit fc207de1ab
10 changed files with 278 additions and 36 deletions

View File

@@ -92,10 +92,10 @@ engines:
    shortcut : sp

# +30% page load time
#  - name : ixquick
#    engine : startpage
#    base_url : 'https://www.ixquick.com/'
#    search_url : 'https://www.ixquick.com/do/search'
  - name : twitter
    engine : twitter
@@ -103,20 +103,20 @@ engines:
    shortcut : tw

# maybe in a fun category
#  - name : uncyclopedia
#    engine : mediawiki
#    categories : general
#    shortcut : unc
#    url : https://uncyclopedia.wikia.com/

# tmp suspended - too slow, too many errors
#  - name : urbandictionary
#    engine : xpath
#    search_url : http://www.urbandictionary.com/define.php?term={query}
#    url_xpath : //div[@class="word"]//a/@href
#    title_xpath : //div[@class="word"]//a
#    content_xpath : //div[@class="definition"]
#    shortcut : ud
  - name : yahoo
    engine : yahoo

conf/settings.yml-noroot (new file, +156 lines)

@@ -0,0 +1,156 @@
server:
    port : 8888
    secret_key : "ultrasecretkey" # change this!
    debug : True
    request_timeout : 2.0 # seconds
    base_url : True

engines:
  - name : wikipedia
    engine : wikipedia
    number_of_results : 1
    paging : False
    shortcut : wp

  - name : bing
    engine : bing
    locale : en-US
    shortcut : bi

  - name : bing news
    engine : bing_news
    locale : en-US
    shortcut : bin

  - name : currency
    engine : currency_convert
    categories : general
    shortcut : cc

  - name : deviantart
    engine : deviantart
    categories : images
    shortcut : da
    timeout: 3.0

  - name : ddg definitions
    engine : duckduckgo_definitions
    shortcut : ddd

  - name : duckduckgo
    engine : duckduckgo
    locale : en-us
    shortcut : ddg

  - name : filecrop
    engine : filecrop
    categories : files
    shortcut : fc

  - name : flickr
    engine : flickr
    categories : images
    shortcut : fl
    timeout: 3.0

  - name : github
    engine : github
    categories : it
    shortcut : gh

  - name : google
    engine : google
    shortcut : go

  - name : google images
    engine : google_images
    shortcut : goi

  - name : google news
    engine : google_news
    shortcut : gon

  - name : piratebay
    engine : piratebay
    categories : videos, music, files
    shortcut : tpb

  - name : soundcloud
    engine : soundcloud
    categories : music
    shortcut : sc

  - name : stackoverflow
    engine : stackoverflow
    categories : it
    shortcut : st

  - name : startpage
    engine : startpage
    base_url : 'https://startpage.com/'
    search_url : 'https://startpage.com/do/search'
    shortcut : sp

# +30% page load time
#  - name : ixquick
#    engine : startpage
#    base_url : 'https://www.ixquick.com/'
#    search_url : 'https://www.ixquick.com/do/search'

  - name : twitter
    engine : twitter
    categories : social media
    shortcut : tw

# maybe in a fun category
#  - name : uncyclopedia
#    engine : mediawiki
#    categories : general
#    shortcut : unc
#    url : https://uncyclopedia.wikia.com/

# tmp suspended - too slow, too many errors
#  - name : urbandictionary
#    engine : xpath
#    search_url : http://www.urbandictionary.com/define.php?term={query}
#    url_xpath : //div[@class="word"]//a/@href
#    title_xpath : //div[@class="word"]//a
#    content_xpath : //div[@class="definition"]
#    shortcut : ud

  - name : yahoo
    engine : yahoo
    shortcut : yh

  - name : yahoo news
    engine : yahoo_news
    shortcut : yhn

  - name : youtube
    engine : youtube
    categories : videos
    shortcut : yt

  - name : dailymotion
    engine : dailymotion
    locale : en_US
    categories : videos
    shortcut : dm

  - name : vimeo
    engine : vimeo
    categories : videos
    results_xpath : //div[@id="browse_content"]/ol/li
    url_xpath : ./a/@href
    title_xpath : ./a/div[@class="data"]/p[@class="title"]/text()
    content_xpath : ./a/img/@src
    shortcut : vm

locales:
    en : English
    de : Deutsch
    hu : Magyar
    fr : Français
    es : Español
    it : Italiano
    nl : Nederlands
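For context, base_url is what this noroot variant exists for: when searx is served under a sub-path rather than the web root, every generated link must carry that prefix. A standalone sketch of the effect (not part of the commit; example.org and the /searx/ prefix are hypothetical), in the Python 2 idiom the bundled sources use:

from urlparse import urljoin

# Hypothetical value an installer could substitute for base_url when
# the app lives under /searx/ instead of the web root.
base_url = 'https://example.org/searx/'

# Joining route names onto base_url keeps the sub-path prefix intact.
print urljoin(base_url, 'search')  # https://example.org/searx/search
print urljoin(base_url, 'about')   # https://example.org/searx/about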

View File

@@ -26,7 +26,16 @@
                },
                "example": "/searx",
                "default": "/searx"
            },
            {
                "name": "public_site",
                "ask": {
                    "en": "Is it a public Searx?"
                },
                "choices": ["Yes", "No"],
                "default": "Yes"
            }
        ]
    }
}

View File

@@ -41,7 +41,12 @@ then
fi

# Configuration Searx
if [ "$path" != "/" ];
then
    sudo cp ../conf/settings.yml-noroot /opt/searx/searx/settings.yml
else
    sudo cp ../conf/settings.yml /opt/searx/searx/
fi
sudo sed -i -e "s/ultrasecretkey/`openssl rand -hex 16`/g" /opt/searx/searx/settings.yml
# Set permissions to searx directory
@@ -62,13 +67,15 @@ else
fi

# Fix permission
#sudo chmod 755 /etc/searx/
#sudo chmod 644 /etc/searx/*
#sudo find /opt/searx/ -type d -exec chmod 2755 {} \;
#sudo find /opt/searx/ -type f -exec chmod g+r,o+r {} \;

## Reload Nginx and regenerate SSOwat conf
sudo service nginx reload
sudo service uwsgi restart

#sudo yunohost app setting searx skipped_uris -v "/"
if [ "$is_public" = "Yes" ];
then
    sudo yunohost app setting searx skipped_uris -v "/"
fi
sudo yunohost app ssowatconf

sources/AUTHORS.rst (new file, +26 lines)

@@ -0,0 +1,26 @@
Searx was created and is maintained by Adam Tauber.

Major contributing authors:

- Adam Tauber <asciimoo@gmail.com> `@asciimoo <https://github.com/asciimoo>`_
- Matej Cotman
- Thomas Pointhuber
- Alexandre Flament

People who have submitted patches or translations, reported bugs, suggested
features, or generally made searx better:

- Laszlo Hammerl
- Stefan Marsiske
- Gabor Nagy
- @pw3t
- @rhapsodhy
- András Veres-Szentkirályi
- Benjamin Sonntag
- @HLFH
- @TheRadialActive
- @Okhin
- André Koot
- Alejandro León Aznar
- rike
- dp

View File

@@ -154,16 +154,24 @@ def score_results(results):
    # deduplication + scoring
    for i, res in enumerate(flat_res):
        res['parsed_url'] = urlparse(res['url'])
        res['host'] = res['parsed_url'].netloc

        if res['host'].startswith('www.'):
            res['host'] = res['host'].replace('www.', '', 1)

        res['engines'] = [res['engine']]
        weight = 1.0
        if hasattr(engines[res['engine']], 'weight'):
            weight = float(engines[res['engine']].weight)
        score = int((flat_len - i) / engines_len) * weight + 1
        duplicated = False
        for new_res in results:
            p1 = res['parsed_url'].path[:-1] if res['parsed_url'].path.endswith('/') else res['parsed_url'].path  # noqa
            p2 = new_res['parsed_url'].path[:-1] if new_res['parsed_url'].path.endswith('/') else new_res['parsed_url'].path  # noqa
            if res['host'] == new_res['host'] and\
               p1 == p2 and\
               res['parsed_url'].query == new_res['parsed_url'].query and\
               res.get('template') == new_res.get('template'):
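The net effect of the new host field is that www and non-www results collapse into one entry. A self-contained sketch of the same normalization (the helper name is ours, not searx's):

from urlparse import urlparse

def dedup_key(url):
    # Mirror the rules above: strip one leading 'www.' from the host
    # and one trailing slash from the path before comparing.
    parsed = urlparse(url)
    host = parsed.netloc
    if host.startswith('www.'):
        host = host.replace('www.', '', 1)
    path = parsed.path[:-1] if parsed.path.endswith('/') else parsed.path
    return (host, path, parsed.query)

# Both normalize to ('example.org', '/page', ''), so the second result
# would now be merged instead of listed twice.
print dedup_key('https://www.example.org/page/')
print dedup_key('https://example.org/page')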

View File

@@ -1,35 +1,52 @@
#!/usr/bin/env python
from urllib import urlencode
#from json import loads
from urlparse import urljoin
from lxml import html
from time import time

categories = ['images']

url = 'https://secure.flickr.com/'
search_url = url+'search/?{query}&page={page}'
results_xpath = '//div[@class="view display-item-tile"]/figure/div'

paging = True


def request(query, params):
    params['url'] = search_url.format(query=urlencode({'text': query}),
                                      page=params['pageno'])
    time_string = str(int(time())-3)
    params['cookies']['BX'] = '3oqjr6d9nmpgl&b=3&s=dh'
    params['cookies']['xb'] = '421409'
    params['cookies']['localization'] = 'en-us'
    params['cookies']['flrbp'] = time_string +\
        '-3a8cdb85a427a33efda421fbda347b2eaf765a54'
    params['cookies']['flrbs'] = time_string +\
        '-ed142ae8765ee62c9ec92a9513665e0ee1ba6776'
    params['cookies']['flrb'] = '9'
    return params


def response(resp):
    global base_url
    results = []
    dom = html.fromstring(resp.text)
    for result in dom.xpath(results_xpath):
        img = result.xpath('.//img')
        if not img:
            continue
        img = img[0]
        img_src = 'https:'+img.attrib.get('src')
        if not img_src:
            continue
        href = urljoin(url, result.xpath('.//a')[0].attrib.get('href'))
        title = img.attrib.get('alt', '')
        results.append({'url': href,
                        'title': title,
                        'img_src': img_src,
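To see what request() now sends, a quick usage sketch; it assumes the caller seeds params with the empty cookies dict and page number that searx normally supplies to engines:

params = request('searx', {'cookies': {}, 'pageno': 1})
print params['url']
# https://secure.flickr.com/search/?text=searx&page=1  (text=, not q=)
print params['cookies']['flrbp']
# e.g. 1401185634-3a8cdb85a427a33efda421fbda347b2eaf765a54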

View File

@@ -2,6 +2,7 @@ from urlparse import urljoin
from cgi import escape
from urllib import quote
from lxml import html
from operator import itemgetter

categories = ['videos', 'music']
@@ -29,14 +30,27 @@ def response(resp):
    results = []
    dom = html.fromstring(resp.text)
    search_res = dom.xpath('//table[@id="searchResult"]//tr')

    if not search_res:
        return results

    for result in search_res[1:]:
        link = result.xpath('.//div[@class="detName"]//a')[0]
        href = urljoin(url, link.attrib.get('href'))
        title = ' '.join(link.xpath('.//text()'))
        content = escape(' '.join(result.xpath(content_xpath)))
        seed, leech = result.xpath('.//td[@align="right"]/text()')[:2]

        if seed.isdigit():
            seed = int(seed)
        else:
            seed = 0

        if leech.isdigit():
            leech = int(leech)
        else:
            leech = 0

        magnetlink = result.xpath(magnet_xpath)[0]
        results.append({'url': href,
                        'title': title,
@@ -45,4 +59,5 @@ def response(resp):
                        'leech': leech,
                        'magnetlink': magnetlink.attrib['href'],
                        'template': 'torrent.html'})

    return sorted(results, key=itemgetter('seed'), reverse=True)
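The changed return statement is the user-visible part: results are now ordered by seeders instead of page order. The one-liner in isolation, with toy data:

from operator import itemgetter

results = [{'title': 'a', 'seed': 3}, {'title': 'b', 'seed': 42},
           {'title': 'c', 'seed': 0}]
# Highest seed count first: 'b' (42) leads, 'c' (0) comes last.
print sorted(results, key=itemgetter('seed'), reverse=True)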

View File

@@ -51,7 +51,7 @@ class ViewsTestCase(SearxTestCase):
            result.data
        )
        self.assertIn(
            '<p class="content">first <span class="highlight">test</span> content<br /></p>',  # noqa
            result.data
        )

View File

@@ -7,7 +7,9 @@ import re
from random import choice

ua_versions = ('26.0', '27.0', '28.0')
ua_os = ('Windows NT 6.3; WOW64',
         'X11; Linux x86_64',
         'X11; Linux x86')
ua = "Mozilla/5.0 ({os}) Gecko/20100101 Firefox/{version}"
@@ -28,7 +30,8 @@ def highlight_content(content, query):
    query = query.decode('utf-8')
    if content.lower().find(query.lower()) > -1:
        query_regex = u'({0})'.format(re.escape(query))
        content = re.sub(query_regex, '<span class="highlight">\\1</span>',
                         content, flags=re.I | re.U)
    else:
        regex_parts = []
        for chunk in query.split():
@@ -37,7 +40,8 @@ def highlight_content(content, query):
            else:
                regex_parts.append(u'{0}'.format(re.escape(chunk)))
        query_regex = u'({0})'.format('|'.join(regex_parts))
        content = re.sub(query_regex, '<span class="highlight">\\1</span>',
                         content, flags=re.I | re.U)
    return content
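Wrapping the re.sub call changes nothing functionally; for reference, the highlighting it performs, as a standalone Python 2 example:

import re

content = u'first test content'
query = u'test'
query_regex = u'({0})'.format(re.escape(query))
print re.sub(query_regex, '<span class="highlight">\\1</span>',
             content, flags=re.I | re.U)
# first <span class="highlight">test</span> content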