From 3ebc0ad4d3c2fe0b0951a334642b769bd521a799 Mon Sep 17 00:00:00 2001 From: Eugen Rochko Date: Fri, 9 Feb 2018 23:04:47 +0100 Subject: [PATCH] Full-text search for authorized statuses (#6423) * Add full-text search for authorized statuses - Search API will return statuses that match the query - Only for logged in users - Only if you are author of the status, - Or you were mentioned in it - Or you favourited or reblogged it - Configuration over `ES_ENABLED`, `ES_HOST`, `ES_PORT`, `ES_PREFIX` - Run `rails chewy:deploy` to create & populate index Fix #5880 Fix #4293 Fix #1152 * Add commented out docker-compose configuration for ES container * Optimize index import, filter search results * Add basic normalization to the index * Add better stemming and normalization to the index * Skip webfinger request if search query includes both @ and a space * Fix code style * Visually separate search result sections * Fix code style issues --- .env.production.sample | 4 ++ Gemfile | 1 + Gemfile.lock | 22 +++++++ app/chewy/statuses_index.rb | 61 +++++++++++++++++++ .../compose/components/search_results.js | 6 ++ .../styles/mastodon/components.scss | 39 +++++++++++- app/lib/status_filter.rb | 1 + app/models/favourite.rb | 2 + app/models/status.rb | 18 ++++++ app/services/search_service.rb | 43 +++++++++++-- config/initializers/chewy.rb | 22 +++++++ docker-compose.yml | 12 ++++ spec/spec_helper.rb | 4 ++ 13 files changed, 230 insertions(+), 5 deletions(-) create mode 100644 app/chewy/statuses_index.rb create mode 100644 config/initializers/chewy.rb diff --git a/.env.production.sample b/.env.production.sample index a4b689a31f..38f7326f0c 100644 --- a/.env.production.sample +++ b/.env.production.sample @@ -9,6 +9,10 @@ DB_USER=postgres DB_NAME=postgres DB_PASS= DB_PORT=5432 +# Optional ElasticSearch configuration +# ES_ENABLED=true +# ES_HOST=localhost +# ES_PORT=9200 # Federation # Note: Changing LOCAL_DOMAIN at a later time will cause unwanted side effects, including breaking all 
existing federation. diff --git a/Gemfile b/Gemfile index 3b39f39462..d1c00b498e 100644 --- a/Gemfile +++ b/Gemfile @@ -27,6 +27,7 @@ gem 'bootsnap' gem 'browser' gem 'charlock_holmes', '~> 0.7.5' gem 'iso-639' +gem 'chewy', '~> 0.10', git: 'https://github.com/toptal/chewy.git' gem 'cld3', '~> 3.2.0' gem 'devise', '~> 4.4' gem 'devise-two-factor', '~> 3.0' diff --git a/Gemfile.lock b/Gemfile.lock index c357bfbd1c..b82fc49a67 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,3 +1,12 @@ +GIT + remote: https://github.com/toptal/chewy.git + revision: a7d21eb4b0bd7415533ef134bb6d31b2df309701 + specs: + chewy (0.10.1) + activesupport (>= 4.0) + elasticsearch (>= 2.0.0) + elasticsearch-dsl + GEM remote: https://rubygems.org/ specs: @@ -154,6 +163,15 @@ GEM json thread thread_safe + elasticsearch (6.0.1) + elasticsearch-api (= 6.0.1) + elasticsearch-transport (= 6.0.1) + elasticsearch-api (6.0.1) + multi_json + elasticsearch-dsl (0.1.5) + elasticsearch-transport (6.0.1) + faraday + multi_json encryptor (3.0.0) erubi (1.7.0) et-orbi (1.0.8) @@ -163,6 +181,8 @@ GEM fabrication (2.18.0) faker (1.8.4) i18n (~> 0.5) + faraday (0.14.0) + multipart-post (>= 1.2, < 3) fast_blank (1.0.0) ffi (1.9.18) fog-core (1.45.0) @@ -291,6 +311,7 @@ GEM minitest (5.11.3) msgpack (1.1.0) multi_json (1.12.2) + multipart-post (2.0.0) net-scp (1.2.1) net-ssh (>= 2.6.5) net-ssh (4.2.0) @@ -583,6 +604,7 @@ DEPENDENCIES capistrano-yarn (~> 2.0) capybara (~> 2.15) charlock_holmes (~> 0.7.5) + chewy (~> 0.10)! 
cld3 (~> 3.2.0) climate_control (~> 0.2) devise (~> 4.4) diff --git a/app/chewy/statuses_index.rb b/app/chewy/statuses_index.rb new file mode 100644 index 0000000000..8bf5b4af7c --- /dev/null +++ b/app/chewy/statuses_index.rb @@ -0,0 +1,61 @@ +# frozen_string_literal: true + +class StatusesIndex < Chewy::Index + settings index: { refresh_interval: '15m' }, analysis: { + filter: { + english_stop: { + type: 'stop', + stopwords: '_english_', + }, + english_stemmer: { + type: 'stemmer', + language: 'english', + }, + english_possessive_stemmer: { + type: 'stemmer', + language: 'possessive_english', + }, + }, + analyzer: { + content: { + tokenizer: 'uax_url_email', + filter: %w( + english_possessive_stemmer + lowercase + asciifolding + cjk_width + english_stop + english_stemmer + ), + }, + }, + } + + define_type ::Status.without_reblogs do + crutch :mentions do |collection| + data = ::Mention.where(status_id: collection.map(&:id)).pluck(:status_id, :account_id) + data.each.with_object({}) { |(id, name), result| (result[id] ||= []).push(name) } + end + + crutch :favourites do |collection| + data = ::Favourite.where(status_id: collection.map(&:id)).pluck(:status_id, :account_id) + data.each.with_object({}) { |(id, name), result| (result[id] ||= []).push(name) } + end + + crutch :reblogs do |collection| + data = ::Status.where(reblog_of_id: collection.map(&:id)).pluck(:reblog_of_id, :account_id) + data.each.with_object({}) { |(id, name), result| (result[id] ||= []).push(name) } + end + + root date_detection: false do + field :account_id, type: 'long' + + field :text, type: 'text', value: ->(status) { [status.spoiler_text, Formatter.instance.plaintext(status)].join("\n\n") } do + field :stemmed, type: 'text', analyzer: 'content' + end + + field :searchable_by, type: 'long', value: ->(status, crutches) { status.searchable_by(crutches) } + field :created_at, type: 'date' + end + end +end diff --git a/app/javascript/mastodon/features/compose/components/search_results.js 
b/app/javascript/mastodon/features/compose/components/search_results.js index d16f7fce76..84455563c4 100644 --- a/app/javascript/mastodon/features/compose/components/search_results.js +++ b/app/javascript/mastodon/features/compose/components/search_results.js @@ -22,6 +22,8 @@ export default class SearchResults extends ImmutablePureComponent { count += results.get('accounts').size; accounts = (
+
+ {results.get('accounts').map(accountId => )}
); @@ -31,6 +33,8 @@ export default class SearchResults extends ImmutablePureComponent { count += results.get('statuses').size; statuses = (
+
+ {results.get('statuses').map(statusId => )}
); @@ -40,6 +44,8 @@ export default class SearchResults extends ImmutablePureComponent { count += results.get('hashtags').size; hashtags = (
+
+ {results.get('hashtags').map(hashtag => ( #{hashtag} diff --git a/app/javascript/styles/mastodon/components.scss b/app/javascript/styles/mastodon/components.scss index c2c9a040fc..fe895809a4 100644 --- a/app/javascript/styles/mastodon/components.scss +++ b/app/javascript/styles/mastodon/components.scss @@ -1786,7 +1786,7 @@ flex: 1; min-height: 47px; - > img { + > img { display: block; object-fit: contain; object-position: bottom left; @@ -3229,6 +3229,43 @@ font-weight: 500; } +.search-results__section { + margin-bottom: 20px; + + h5 { + position: relative; + + &::before { + content: ""; + display: block; + position: absolute; + left: 0; + right: 0; + top: 50%; + width: 100%; + height: 0; + border-top: 1px solid lighten($ui-base-color, 8%); + } + + span { + display: inline-block; + background: $ui-base-color; + color: $ui-primary-color; + font-size: 14px; + font-weight: 500; + padding: 10px; + position: relative; + z-index: 1; + cursor: default; + } + } + + .account:last-child, + & > div:last-child .status { + border-bottom: 0; + } +} + .search-results__hashtag { display: block; padding: 10px; diff --git a/app/lib/status_filter.rb b/app/lib/status_filter.rb index a6a050ce1d..41d4381e56 100644 --- a/app/lib/status_filter.rb +++ b/app/lib/status_filter.rb @@ -9,6 +9,7 @@ class StatusFilter end def filtered? + return false if !account.nil? && account.id == status.account_id blocked_by_policy? || (account_present? && filtered_status?) || silenced_account? end diff --git a/app/models/favourite.rb b/app/models/favourite.rb index 2b1271f31d..fa1884b866 100644 --- a/app/models/favourite.rb +++ b/app/models/favourite.rb @@ -13,6 +13,8 @@ class Favourite < ApplicationRecord include Paginable + update_index('statuses#status', :status) if Chewy.enabled? 
+ belongs_to :account, inverse_of: :favourites belongs_to :status, inverse_of: :favourites, counter_cache: true diff --git a/app/models/status.rb b/app/models/status.rb index 26ff40bf7a..0de89ad4e1 100644 --- a/app/models/status.rb +++ b/app/models/status.rb @@ -31,6 +31,8 @@ class Status < ApplicationRecord include Cacheable include StatusThreadingConcern + update_index('statuses#status', :proper) if Chewy.enabled? + enum visibility: [:public, :unlisted, :private, :direct], _suffix: :visibility belongs_to :application, class_name: 'Doorkeeper::Application', optional: true @@ -78,6 +80,22 @@ class Status < ApplicationRecord delegate :domain, to: :account, prefix: true + def searchable_by(preloaded = nil) + ids = [account_id] + + if preloaded.nil? + ids += mentions.pluck(:account_id) + ids += favourites.pluck(:account_id) + ids += reblogs.pluck(:account_id) + else + ids += preloaded.mentions[id] || [] + ids += preloaded.favourites[id] || [] + ids += preloaded.reblogs[id] || [] + end + + ids.uniq + end + def reply? !in_reply_to_id.nil? || attributes['reply'] end diff --git a/app/services/search_service.rb b/app/services/search_service.rb index 5f763b8f77..fe98566866 100644 --- a/app/services/search_service.rb +++ b/app/services/search_service.rb @@ -1,21 +1,43 @@ # frozen_string_literal: true class SearchService < BaseService - attr_accessor :query + attr_accessor :query, :account, :limit, :resolve def call(query, limit, resolve = false, account = nil) - @query = query + @query = query + @account = account + @limit = limit + @resolve = resolve default_results.tap do |results| if url_query? results.merge!(url_resource_results) unless url_resource.nil? elsif query.present? - results[:accounts] = AccountSearchService.new.call(query, limit, account, resolve: resolve) - results[:hashtags] = Tag.search_for(query.gsub(/\A#/, ''), limit) unless query.start_with?('@') + results[:accounts] = perform_accounts_search! if account_searchable? 
+ results[:statuses] = perform_statuses_search! if full_text_searchable? + results[:hashtags] = perform_hashtags_search! if hashtag_searchable? end end end + private + + def perform_accounts_search! + AccountSearchService.new.call(query, limit, account, resolve: resolve) + end + + def perform_statuses_search! + statuses = StatusesIndex.filter(term: { searchable_by: account.id }) + .query(multi_match: { type: 'most_fields', query: query, operator: 'and', fields: %w(text text.stemmed) }) + .limit(limit).objects + + statuses.reject { |status| StatusFilter.new(status, account).filtered? } + end + + def perform_hashtags_search! + Tag.search_for(query.gsub(/\A#/, ''), limit) + end + def default_results { accounts: [], hashtags: [], statuses: [] } end @@ -35,4 +57,17 @@ class SearchService < BaseService def url_resource_symbol url_resource.class.name.downcase.pluralize.to_sym end + + def full_text_searchable? + return false unless Chewy.enabled? + !account.nil? && !((query.start_with?('#') || query.include?('@')) && !query.include?(' ')) + end + + def account_searchable? + !(query.include?('@') && query.include?(' ')) + end + + def hashtag_searchable? + !query.include?('@') + end end diff --git a/config/initializers/chewy.rb b/config/initializers/chewy.rb new file mode 100644 index 0000000000..bef2746ec7 --- /dev/null +++ b/config/initializers/chewy.rb @@ -0,0 +1,22 @@ +enabled = ENV['ES_ENABLED'] == 'true' +host = ENV.fetch('ES_HOST') { 'localhost' } +port = ENV.fetch('ES_PORT') { 9200 } +fallback_prefix = ENV.fetch('REDIS_NAMESPACE') { nil } +prefix = ENV.fetch('ES_PREFIX') { fallback_prefix } + +Chewy.settings = { + host: "#{host}:#{port}", + prefix: prefix, + enabled: enabled, + journal: false, +} + +Chewy.root_strategy = enabled ? :sidekiq : :bypass + +module Chewy + class << self + def enabled? 
+ settings[:enabled] + end + end +end diff --git a/docker-compose.yml b/docker-compose.yml index aaa3a44782..55b419e984 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -19,6 +19,17 @@ services: # volumes: # - ./redis:/data +# es: +# restart: always +# image: docker.elastic.co/elasticsearch/elasticsearch-oss:6.1.3 +# environment: +# - "ES_JAVA_OPTS=-Xms512m -Xmx512m" +# networks: +# - internal_network +#### Uncomment to enable ES persistence +## volumes: +## - ./elasticsearch:/usr/share/elasticsearch/data + web: build: . image: gargron/mastodon @@ -33,6 +44,7 @@ services: depends_on: - db - redis +# - es volumes: - ./public/assets:/mastodon/public/assets - ./public/packs:/mastodon/public/packs diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb index eecaec4acf..a0466dd4bf 100644 --- a/spec/spec_helper.rb +++ b/spec/spec_helper.rb @@ -25,6 +25,10 @@ RSpec.configure do |config| end end + config.before :suite do + Chewy.strategy(:bypass) + end + config.after :suite do gc_counter = 0 FileUtils.rm_rf(Dir["#{Rails.root}/spec/test_files/"])