From e7aa2be828f6a632dadd5c41e2364cea91ddbb2c Mon Sep 17 00:00:00 2001 From: Eugen Rochko Date: Wed, 13 Jul 2022 15:03:28 +0200 Subject: [PATCH] Change how hashtags are normalized (#18795) * Change how hashtags are normalized * Fix tests --- app/controllers/admin/tags_controller.rb | 4 ++- .../api/v1/featured_tags_controller.rb | 4 +-- .../settings/featured_tags_controller.rb | 1 - app/javascript/mastodon/actions/compose.js | 15 ++++++++- app/lib/ascii_folding.rb | 10 ++++++ app/lib/hashtag_normalizer.rb | 25 +++++++++++++++ app/models/account.rb | 2 +- app/models/custom_filter.rb | 6 ++-- app/models/custom_filter_keyword.rb | 4 +-- app/models/featured_tag.rb | 31 ++++++++++++------ app/models/tag.rb | 20 +++++++++--- .../activitypub/hashtag_serializer.rb | 4 +-- .../rest/featured_tag_serializer.rb | 4 +++ app/serializers/rest/tag_serializer.rb | 4 +++ app/views/accounts/show.html.haml | 2 +- app/views/accounts/show.rss.ruby | 2 +- app/views/admin/tags/show.html.haml | 4 +-- app/views/admin/trends/tags/_tag.html.haml | 2 +- .../admin_mailer/_new_trending_tags.text.erb | 4 +-- .../settings/featured_tags/index.html.haml | 2 +- app/views/tags/_og.html.haml | 4 +-- app/views/tags/show.html.haml | 6 ++-- app/views/tags/show.rss.ruby | 6 ++-- config/initializers/inflections.rb | 1 + ...20220710102457_add_display_name_to_tags.rb | 5 +++ db/schema.rb | 3 +- spec/lib/hashtag_normalizer_spec.rb | 29 +++++++++++++++++ spec/models/tag_spec.rb | 8 ++--- streaming/index.js | 32 +++++++++++++++++-- 29 files changed, 193 insertions(+), 51 deletions(-) create mode 100644 app/lib/ascii_folding.rb create mode 100644 app/lib/hashtag_normalizer.rb create mode 100644 db/migrate/20220710102457_add_display_name_to_tags.rb create mode 100644 spec/lib/hashtag_normalizer_spec.rb diff --git a/app/controllers/admin/tags_controller.rb b/app/controllers/admin/tags_controller.rb index 749e2f144d..4f727c398a 100644 --- a/app/controllers/admin/tags_controller.rb +++ b/app/controllers/admin/tags_controller.rb @@ -16,6 +16,8 @@ module Admin if @tag.update(tag_params.merge(reviewed_at: Time.now.utc)) redirect_to admin_tag_path(@tag.id), notice: I18n.t('admin.tags.updated_msg') else + @time_period = (6.days.ago.to_date...Time.now.utc.to_date) + render :show end end @@ -27,7 +29,7 @@ module Admin end def tag_params - params.require(:tag).permit(:name, :trendable, :usable, :listable) + params.require(:tag).permit(:name, :display_name, :trendable, :usable, :listable) end end end diff --git a/app/controllers/api/v1/featured_tags_controller.rb b/app/controllers/api/v1/featured_tags_controller.rb index e4e836c971..c1ead4f540 100644 --- a/app/controllers/api/v1/featured_tags_controller.rb +++ b/app/controllers/api/v1/featured_tags_controller.rb @@ -13,9 +13,7 @@ class Api::V1::FeaturedTagsController < Api::BaseController end def create - @featured_tag = current_account.featured_tags.new(featured_tag_params) - @featured_tag.reset_data - @featured_tag.save! + @featured_tag = current_account.featured_tags.create!(featured_tag_params) render json: @featured_tag, serializer: REST::FeaturedTagSerializer end diff --git a/app/controllers/settings/featured_tags_controller.rb b/app/controllers/settings/featured_tags_controller.rb index e805527d07..aadff7c835 100644 --- a/app/controllers/settings/featured_tags_controller.rb +++ b/app/controllers/settings/featured_tags_controller.rb @@ -11,7 +11,6 @@ class Settings::FeaturedTagsController < Settings::BaseController def create @featured_tag = current_account.featured_tags.new(featured_tag_params) - @featured_tag.reset_data if @featured_tag.save redirect_to settings_featured_tags_path diff --git a/app/javascript/mastodon/actions/compose.js b/app/javascript/mastodon/actions/compose.js index bd4c1d0024..32d8c229e4 100644 --- a/app/javascript/mastodon/actions/compose.js +++ b/app/javascript/mastodon/actions/compose.js @@ -606,7 +606,20 @@ function insertIntoTagHistory(recognizedTags, text) { const state = getState(); const oldHistory = state.getIn(['compose', 'tagHistory']); const me = state.getIn(['meta', 'me']); - const names = recognizedTags.map(tag => text.match(new RegExp(`#${tag.name}`, 'i'))[0].slice(1)); + + // FIXME: Matching input hashtags with recognized hashtags has become more + // complicated because of new normalization rules, it's no longer just + // a case sensitivity issue + const names = recognizedTags.map(tag => { + const matches = text.match(new RegExp(`#${tag.name}`, 'i')); + + if (matches && matches.length > 0) { + return matches[0].slice(1); + } else { + return tag.name; + } + }); + const intersectedOldHistory = oldHistory.filter(name => names.findIndex(newName => newName.toLowerCase() === name.toLowerCase()) === -1); names.push(...intersectedOldHistory.toJS()); diff --git a/app/lib/ascii_folding.rb b/app/lib/ascii_folding.rb new file mode 100644 index 0000000000..1798d3d0e6 --- /dev/null +++ b/app/lib/ascii_folding.rb @@ -0,0 +1,10 @@ +# frozen_string_literal: true + +class ASCIIFolding + NON_ASCII_CHARS = 'ÀÁÂÃÄÅàáâãäåĀāĂ㥹ÇçĆćĈĉĊċČčÐðĎďĐđÈÉÊËèéêëĒēĔĕĖėĘęĚěĜĝĞğĠġĢģĤĥĦħÌÍÎÏìíîïĨĩĪīĬĭĮįİıĴĵĶķĸĹĺĻļĽľĿŀŁłÑñŃńŅņŇňʼnŊŋÒÓÔÕÖØòóôõöøŌōŎŏŐőŔŕŖŗŘřŚśŜŝŞşŠšſŢţŤťŦŧÙÚÛÜùúûüŨũŪūŬŭŮůŰűŲųŴŵÝýÿŶŷŸŹźŻżŽž' + EQUIVALENT_ASCII_CHARS = 'AAAAAAaaaaaaAaAaAaCcCcCcCcCcDdDdDdEEEEeeeeEeEeEeEeEeGgGgGgGgHhHhIIIIiiiiIiIiIiIiIiJjKkkLlLlLlLlLlNnNnNnNnnNnOOOOOOooooooOoOoOoRrRrRrSsSsSsSssTtTtTtUUUUuuuuUuUuUuUuUuUuWwYyyYyYZzZzZz' + + def fold(str) + str.tr(NON_ASCII_CHARS, EQUIVALENT_ASCII_CHARS) + end +end diff --git a/app/lib/hashtag_normalizer.rb b/app/lib/hashtag_normalizer.rb new file mode 100644 index 0000000000..c1f99e1638 --- /dev/null +++ b/app/lib/hashtag_normalizer.rb @@ -0,0 +1,25 @@ +# frozen_string_literal: true + +class HashtagNormalizer + def normalize(str) + remove_invalid_characters(ascii_folding(lowercase(cjk_width(str)))) + end + + private + + def remove_invalid_characters(str) + str.gsub(/[^[:alnum:]#{Tag::HASHTAG_SEPARATORS}]/, '') + end + + def ascii_folding(str) + ASCIIFolding.new.fold(str) + end + + def lowercase(str) + str.mb_chars.downcase.to_s + end + + def cjk_width(str) + str.unicode_normalize(:nfkc) + end +end diff --git a/app/models/account.rb b/app/models/account.rb index 628692d22e..fe77eaec45 100644 --- a/app/models/account.rb +++ b/app/models/account.rb @@ -62,7 +62,7 @@ class Account < ApplicationRecord ) USERNAME_RE = /[a-z0-9_]+([a-z0-9_\.-]+[a-z0-9_]+)?/i - MENTION_RE = /(?<=^|[^\/[:word:]])@((#{USERNAME_RE})(?:@[[:word:]\.\-]+[[:word:]]+)?)/i + MENTION_RE = /(?<=^|[^\/[:word:]])@((#{USERNAME_RE})(?:@[[:alnum:]\.\-]+[[:alnum:]]+)?)/i URL_PREFIX_RE = /\Ahttp(s?):\/\/[^\/]+/ include Attachmentable diff --git a/app/models/custom_filter.rb b/app/models/custom_filter.rb index e98ed7df9f..985eab1254 100644 --- a/app/models/custom_filter.rb +++ b/app/models/custom_filter.rb @@ -3,14 +3,14 @@ # # Table name: custom_filters # -# id :bigint not null, primary key -# account_id :bigint +# id :bigint(8) not null, primary key +# account_id :bigint(8) # expires_at :datetime # phrase :text default(""), not null # context :string default([]), not null, is an Array # created_at :datetime not null # updated_at :datetime not null -# action :integer default(0), not null +# action :integer default("warn"), not null # class CustomFilter < ApplicationRecord diff --git a/app/models/custom_filter_keyword.rb b/app/models/custom_filter_keyword.rb index bf5c557469..e0d0289ae1 100644 --- a/app/models/custom_filter_keyword.rb +++ b/app/models/custom_filter_keyword.rb @@ -3,8 +3,8 @@ # # Table name: custom_filter_keywords # -# id :bigint not null, primary key -# custom_filter_id :bigint not null +# id :bigint(8) not null, primary key +# custom_filter_id :bigint(8) not null # keyword :text default(""), not null # whole_word :boolean default(TRUE), not null # created_at :datetime not null diff --git a/app/models/featured_tag.rb b/app/models/featured_tag.rb index 74d62e7778..c9c285bfa1 100644 --- a/app/models/featured_tag.rb +++ b/app/models/featured_tag.rb @@ -13,17 +13,19 @@ # class FeaturedTag < ApplicationRecord - belongs_to :account, inverse_of: :featured_tags, required: true - belongs_to :tag, inverse_of: :featured_tags, required: true + belongs_to :account, inverse_of: :featured_tags + belongs_to :tag, inverse_of: :featured_tags, optional: true # Set after validation - delegate :name, to: :tag, allow_nil: true - - validates_associated :tag, on: :create - validates :name, presence: true, on: :create + validate :validate_tag_name, on: :create validate :validate_featured_tags_limit, on: :create - def name=(str) - self.tag = Tag.find_or_create_by_names(str.strip)&.first + before_create :set_tag + before_create :reset_data + + attr_writer :name + + def name + tag_id.present? ? tag.name : @name end def increment(timestamp) @@ -34,14 +36,23 @@ class FeaturedTag < ApplicationRecord update(statuses_count: [0, statuses_count - 1].max, last_status_at: account.statuses.where(visibility: %i(public unlisted)).tagged_with(tag).where.not(id: deleted_status_id).select(:created_at).first&.created_at) end + private + + def set_tag + self.tag = Tag.find_or_create_by_names(@name)&.first + end + def reset_data self.statuses_count = account.statuses.where(visibility: %i(public unlisted)).tagged_with(tag).count self.last_status_at = account.statuses.where(visibility: %i(public unlisted)).tagged_with(tag).select(:created_at).first&.created_at end - private - def validate_featured_tags_limit errors.add(:base, I18n.t('featured_tags.errors.limit')) if account.featured_tags.count >= 10 end + + def validate_tag_name + errors.add(:name, :blank) if @name.blank? + errors.add(:name, :invalid) unless @name.match?(/\A(#{Tag::HASHTAG_NAME_RE})\z/i) + end end diff --git a/app/models/tag.rb b/app/models/tag.rb index a640426149..f078007f24 100644 --- a/app/models/tag.rb +++ b/app/models/tag.rb @@ -15,6 +15,7 @@ # last_status_at :datetime # max_score :float # max_score_at :datetime +# display_name :string # class Tag < ApplicationRecord @@ -24,11 +25,12 @@ class Tag < ApplicationRecord has_many :featured_tags, dependent: :destroy, inverse_of: :tag HASHTAG_SEPARATORS = "_\u00B7\u200c" - HASHTAG_NAME_RE = "([[:word:]_][[:word:]#{HASHTAG_SEPARATORS}]*[[:alpha:]#{HASHTAG_SEPARATORS}][[:word:]#{HASHTAG_SEPARATORS}]*[[:word:]_])|([[:word:]_]*[[:alpha:]][[:word:]_]*)" + HASHTAG_NAME_RE = "([[:alnum:]_][[:alnum:]#{HASHTAG_SEPARATORS}]*[[:alpha:]#{HASHTAG_SEPARATORS}][[:alnum:]#{HASHTAG_SEPARATORS}]*[[:alnum:]_])|([[:alnum:]_]*[[:alpha:]][[:alnum:]_]*)" HASHTAG_RE = /(?:^|[^\/\)\w])#(#{HASHTAG_NAME_RE})/i validates :name, presence: true, format: { with: /\A(#{HASHTAG_NAME_RE})\z/i } validate :validate_name_change, if: -> { !new_record? && name_changed? } + validate :validate_display_name_change, if: -> { !new_record? && display_name_changed? } scope :reviewed, -> { where.not(reviewed_at: nil) } scope :unreviewed, -> { where(reviewed_at: nil) } @@ -46,6 +48,10 @@ class Tag < ApplicationRecord name end + def display_name + attributes['display_name'] || name + end + def usable boolean_with_default('usable', true) end @@ -90,8 +96,10 @@ class Tag < ApplicationRecord class << self def find_or_create_by_names(name_or_names) - Array(name_or_names).map(&method(:normalize)).uniq { |str| str.mb_chars.downcase.to_s }.map do |normalized_name| - tag = matching_name(normalized_name).first || create(name: normalized_name) + names = Array(name_or_names).map { |str| [normalize(str), str] }.uniq(&:first) + + names.map do |(normalized_name, display_name)| + tag = matching_name(normalized_name).first || create(name: normalized_name, display_name: display_name) yield tag if block_given? @@ -129,7 +137,7 @@ class Tag < ApplicationRecord end def normalize(str) - str.gsub(/\A#/, '') + HashtagNormalizer.new.normalize(str) end end @@ -138,4 +146,8 @@ class Tag < ApplicationRecord def validate_name_change errors.add(:name, I18n.t('tags.does_not_match_previous_name')) unless name_was.mb_chars.casecmp(name.mb_chars).zero? end + + def validate_display_name_change + errors.add(:display_name, I18n.t('tags.does_not_match_previous_name')) unless HashtagNormalizer.new.normalize(display_name).casecmp(name.mb_chars).zero? + end end diff --git a/app/serializers/activitypub/hashtag_serializer.rb b/app/serializers/activitypub/hashtag_serializer.rb index 1a56e4dfe4..90929c57f9 100644 --- a/app/serializers/activitypub/hashtag_serializer.rb +++ b/app/serializers/activitypub/hashtag_serializer.rb @@ -10,11 +10,11 @@ class ActivityPub::HashtagSerializer < ActivityPub::Serializer end def name - "##{object.name}" + "##{object.display_name}" end def href - if object.class.name == 'FeaturedTag' + if object.instance_of?(FeaturedTag) short_account_tag_url(object.account, object.tag) else tag_url(object) diff --git a/app/serializers/rest/featured_tag_serializer.rb b/app/serializers/rest/featured_tag_serializer.rb index 96adcc7d09..8abcd9b90f 100644 --- a/app/serializers/rest/featured_tag_serializer.rb +++ b/app/serializers/rest/featured_tag_serializer.rb @@ -12,4 +12,8 @@ class REST::FeaturedTagSerializer < ActiveModel::Serializer def url short_account_tag_url(object.account, object.tag) end + + def name + object.display_name + end end diff --git a/app/serializers/rest/tag_serializer.rb b/app/serializers/rest/tag_serializer.rb index 74aa571a4c..52bfaa4ce4 100644 --- a/app/serializers/rest/tag_serializer.rb +++ b/app/serializers/rest/tag_serializer.rb @@ -8,4 +8,8 @@ class REST::TagSerializer < ActiveModel::Serializer def url tag_url(object) end + + def name + object.display_name + end end diff --git a/app/views/accounts/show.html.haml b/app/views/accounts/show.html.haml index 72e9c66111..7fa688bd35 100644 --- a/app/views/accounts/show.html.haml +++ b/app/views/accounts/show.html.haml @@ -75,7 +75,7 @@ = link_to short_account_tag_path(@account, featured_tag.tag) do %h4 = fa_icon 'hashtag' - = featured_tag.name + = featured_tag.display_name %small - if featured_tag.last_status_at.nil? = t('accounts.nothing_here') diff --git a/app/views/accounts/show.rss.ruby b/app/views/accounts/show.rss.ruby index fd45a8b2b0..34e29d483f 100644 --- a/app/views/accounts/show.rss.ruby +++ b/app/views/accounts/show.rss.ruby @@ -28,7 +28,7 @@ RSS::Builder.build do |doc| end status.tags.each do |tag| - item.category(tag.name) + item.category(tag.display_name) end end end diff --git a/app/views/admin/tags/show.html.haml b/app/views/admin/tags/show.html.haml index fd9acce4a3..104190b588 100644 --- a/app/views/admin/tags/show.html.haml +++ b/app/views/admin/tags/show.html.haml @@ -2,7 +2,7 @@ = javascript_pack_tag 'admin', async: true, crossorigin: 'anonymous' - content_for :page_title do - = "##{@tag.name}" + = "##{@tag.display_name}" - if current_user.can?(:view_dashboard) - content_for :heading_actions do @@ -53,7 +53,7 @@ = render 'shared/error_messages', object: @tag .fields-group - = f.input :name, wrapper: :with_block_label + = f.input :display_name, wrapper: :with_block_label .fields-group = f.input :usable, as: :boolean, wrapper: :with_label diff --git a/app/views/admin/trends/tags/_tag.html.haml b/app/views/admin/trends/tags/_tag.html.haml index 7bb99b1580..a30666a08b 100644 --- a/app/views/admin/trends/tags/_tag.html.haml +++ b/app/views/admin/trends/tags/_tag.html.haml @@ -6,7 +6,7 @@ .pending-account__header = link_to admin_tag_path(tag.id) do = fa_icon 'hashtag' - = tag.name + = tag.display_name %br/ diff --git a/app/views/admin_mailer/_new_trending_tags.text.erb b/app/views/admin_mailer/_new_trending_tags.text.erb index cde5af4e4e..363df369d5 100644 --- a/app/views/admin_mailer/_new_trending_tags.text.erb +++ b/app/views/admin_mailer/_new_trending_tags.text.erb @@ -1,12 +1,12 @@ <%= raw t('admin_mailer.new_trends.new_trending_tags.title') %> <% @tags.each do |tag| %> -- #<%= tag.name %> +- #<%= tag.display_name %> <%= raw t('admin.trends.tags.usage_comparison', today: tag.history.get(Time.now.utc).accounts, yesterday: tag.history.get(Time.now.utc - 1.day).accounts) %> • <%= t('admin.trends.tags.current_score', score: Trends.tags.score(tag.id).round(2)) %> <% end %> <% if @lowest_trending_tag %> -<%= raw t('admin_mailer.new_trends.new_trending_tags.requirements', lowest_tag_name: @lowest_trending_tag.name, lowest_tag_score: Trends.tags.score(@lowest_trending_tag.id).round(2), rank: Trends.tags.options[:review_threshold]) %> +<%= raw t('admin_mailer.new_trends.new_trending_tags.requirements', lowest_tag_name: @lowest_trending_tag.display_name, lowest_tag_score: Trends.tags.score(@lowest_trending_tag.id).round(2), rank: Trends.tags.options[:review_threshold]) %> <% else %> <%= raw t('admin_mailer.new_trends.new_trending_tags.no_approved_tags') %> <% end %> diff --git a/app/views/settings/featured_tags/index.html.haml b/app/views/settings/featured_tags/index.html.haml index 65de7f8f30..5d87e2862d 100644 --- a/app/views/settings/featured_tags/index.html.haml +++ b/app/views/settings/featured_tags/index.html.haml @@ -9,7 +9,7 @@ = render 'shared/error_messages', object: @featured_tag .fields-group - = f.input :name, wrapper: :with_block_label, hint: safe_join([t('simple_form.hints.featured_tag.name'), safe_join(@recently_used_tags.map { |tag| link_to("##{tag.name}", settings_featured_tags_path(featured_tag: { name: tag.name }), method: :post) }, ', ')], ' ') + = f.input :name, wrapper: :with_block_label, hint: safe_join([t('simple_form.hints.featured_tag.name'), safe_join(@recently_used_tags.map { |tag| link_to("##{tag.display_name}", settings_featured_tags_path(featured_tag: { name: tag.name }), method: :post) }, ', ')], ' ') .actions = f.button :button, t('featured_tags.add_new'), type: :submit diff --git a/app/views/tags/_og.html.haml b/app/views/tags/_og.html.haml index a7c289bcb0..37f644cf2f 100644 --- a/app/views/tags/_og.html.haml +++ b/app/views/tags/_og.html.haml @@ -1,6 +1,6 @@ = opengraph 'og:site_name', t('about.hosted_on', domain: site_hostname) = opengraph 'og:url', tag_url(@tag) = opengraph 'og:type', 'website' -= opengraph 'og:title', "##{@tag.name}" -= opengraph 'og:description', strip_tags(t('about.about_hashtag_html', hashtag: @tag.name)) += opengraph 'og:title', "##{@tag.display_name}" += opengraph 'og:description', strip_tags(t('about.about_hashtag_html', hashtag: @tag.display_name)) = opengraph 'twitter:card', 'summary' diff --git a/app/views/tags/show.html.haml b/app/views/tags/show.html.haml index 5cd513b320..6dfb4f9b34 100644 --- a/app/views/tags/show.html.haml +++ b/app/views/tags/show.html.haml @@ -1,5 +1,5 @@ - content_for :page_title do - = "##{@tag.name}" + = "##{@tag.display_name}" - content_for :header_tags do %meta{ name: 'robots', content: 'noindex' }/ @@ -9,8 +9,8 @@ = render 'og' .page-header - %h1= "##{@tag.name}" - %p= t('about.about_hashtag_html', hashtag: @tag.name) + %h1= "##{@tag.display_name}" + %p= t('about.about_hashtag_html', hashtag: @tag.display_name) #mastodon-timeline{ data: { props: Oj.dump(default_props.merge(hashtag: @tag.name, local: @local)) }} .notranslate#modal-container diff --git a/app/views/tags/show.rss.ruby b/app/views/tags/show.rss.ruby index 9ce71be74c..8e0c2327b5 100644 --- a/app/views/tags/show.rss.ruby +++ b/app/views/tags/show.rss.ruby @@ -1,6 +1,6 @@ RSS::Builder.build do |doc| - doc.title("##{@tag.name}") - doc.description(I18n.t('rss.descriptions.tag', hashtag: @tag.name)) + doc.title("##{@tag.display_name}") + doc.description(I18n.t('rss.descriptions.tag', hashtag: @tag.display_name)) doc.link(tag_url(@tag)) doc.last_build_date(@statuses.first.created_at) if @statuses.any? doc.generator("Mastodon v#{Mastodon::Version.to_s}") @@ -26,7 +26,7 @@ RSS::Builder.build do |doc| end status.tags.each do |tag| - item.category(tag.name) + item.category(tag.display_name) end end end diff --git a/config/initializers/inflections.rb b/config/initializers/inflections.rb index 9bc9a54b2d..3e5a556176 100644 --- a/config/initializers/inflections.rb +++ b/config/initializers/inflections.rb @@ -24,6 +24,7 @@ ActiveSupport::Inflector.inflections(:en) do |inflect| inflect.acronym 'RSS' inflect.acronym 'REST' inflect.acronym 'URL' + inflect.acronym 'ASCII' inflect.singular 'data', 'data' end diff --git a/db/migrate/20220710102457_add_display_name_to_tags.rb b/db/migrate/20220710102457_add_display_name_to_tags.rb new file mode 100644 index 0000000000..aa78676459 --- /dev/null +++ b/db/migrate/20220710102457_add_display_name_to_tags.rb @@ -0,0 +1,5 @@ +class AddDisplayNameToTags < ActiveRecord::Migration[6.1] + def change + add_column :tags, :display_name, :string + end +end diff --git a/db/schema.rb b/db/schema.rb index 54966ef64b..9b465b674f 100644 --- a/db/schema.rb +++ b/db/schema.rb @@ -10,7 +10,7 @@ # # It's strongly recommended that you check this file into your version control system. -ActiveRecord::Schema.define(version: 2022_07_04_024901) do +ActiveRecord::Schema.define(version: 2022_07_10_102457) do # These are extensions that must be enabled in order to support this database enable_extension "plpgsql" @@ -940,6 +940,7 @@ ActiveRecord::Schema.define(version: 2022_07_04_024901) do t.datetime "last_status_at" t.float "max_score" t.datetime "max_score_at" + t.string "display_name" t.index "lower((name)::text) text_pattern_ops", name: "index_tags_on_name_lower_btree", unique: true end diff --git a/spec/lib/hashtag_normalizer_spec.rb b/spec/lib/hashtag_normalizer_spec.rb new file mode 100644 index 0000000000..fbb9f37c07 --- /dev/null +++ b/spec/lib/hashtag_normalizer_spec.rb @@ -0,0 +1,29 @@ +# frozen_string_literal: true + +require 'rails_helper' + +describe HashtagNormalizer do + subject { described_class.new } + + describe '#normalize' do + it 'converts full-width Latin characters into basic Latin characters' do + expect(subject.normalize('Synthwave')).to eq 'synthwave' + end + + it 'converts half-width Katakana into Kana characters' do + expect(subject.normalize('シーサイドライナー')).to eq 'シーサイドライナー' + end + + it 'converts modified Latin characters into basic Latin characters' do + expect(subject.normalize('BLÅHAJ')).to eq 'blahaj' + end + + it 'strips out invalid characters' do + expect(subject.normalize('#foo')).to eq 'foo' + end + + it 'keeps valid characters' do + expect(subject.normalize('a·b')).to eq 'a·b' + end + end +end diff --git a/spec/models/tag_spec.rb b/spec/models/tag_spec.rb index 3949dbce54..b16f99a799 100644 --- a/spec/models/tag_spec.rb +++ b/spec/models/tag_spec.rb @@ -91,7 +91,7 @@ RSpec.describe Tag, type: :model do upcase_string = 'abcABCabcABCやゆよ' downcase_string = 'abcabcabcabcやゆよ'; - tag = Fabricate(:tag, name: downcase_string) + tag = Fabricate(:tag, name: HashtagNormalizer.new.normalize(downcase_string)) expect(Tag.find_normalized(upcase_string)).to eq tag end end @@ -101,12 +101,12 @@ RSpec.describe Tag, type: :model do upcase_string = 'abcABCabcABCやゆよ' downcase_string = 'abcabcabcabcやゆよ'; - tag = Fabricate(:tag, name: downcase_string) + tag = Fabricate(:tag, name: HashtagNormalizer.new.normalize(downcase_string)) expect(Tag.matches_name(upcase_string)).to eq [tag] end it 'uses the LIKE operator' do - expect(Tag.matches_name('100%abc').to_sql).to eq %q[SELECT "tags".* FROM "tags" WHERE LOWER("tags"."name") LIKE LOWER('100\\%abc%')] + expect(Tag.matches_name('100%abc').to_sql).to eq %q[SELECT "tags".* FROM "tags" WHERE LOWER("tags"."name") LIKE LOWER('100abc%')] end end @@ -115,7 +115,7 @@ RSpec.describe Tag, type: :model do upcase_string = 'abcABCabcABCやゆよ' downcase_string = 'abcabcabcabcやゆよ'; - tag = Fabricate(:tag, name: downcase_string) + tag = Fabricate(:tag, name: HashtagNormalizer.new.normalize(downcase_string)) expect(Tag.matching_name(upcase_string)).to eq [tag] end end diff --git a/streaming/index.js b/streaming/index.js index 792ec5a445..a55181bad2 100644 --- a/streaming/index.js +++ b/streaming/index.js @@ -892,6 +892,34 @@ const startWorker = async (workerId) => { return arr; }; + /** + * See app/lib/ascii_folder.rb for the canon definitions + * of these constants + */ + const NON_ASCII_CHARS = 'ÀÁÂÃÄÅàáâãäåĀāĂ㥹ÇçĆćĈĉĊċČčÐðĎďĐđÈÉÊËèéêëĒēĔĕĖėĘęĚěĜĝĞğĠġĢģĤĥĦħÌÍÎÏìíîïĨĩĪīĬĭĮįİıĴĵĶķĸĹĺĻļĽľĿŀŁłÑñŃńŅņŇňʼnŊŋÒÓÔÕÖØòóôõöøŌōŎŏŐőŔŕŖŗŘřŚśŜŝŞşŠšſŢţŤťŦŧÙÚÛÜùúûüŨũŪūŬŭŮůŰűŲųŴŵÝýÿŶŷŸŹźŻżŽž'; + const EQUIVALENT_ASCII_CHARS = 'AAAAAAaaaaaaAaAaAaCcCcCcCcCcDdDdDdEEEEeeeeEeEeEeEeEeGgGgGgGgHhHhIIIIiiiiIiIiIiIiIiJjKkkLlLlLlLlLlNnNnNnNnnNnOOOOOOooooooOoOoOoRrRrRrSsSsSsSssTtTtTtUUUUuuuuUuUuUuUuUuUuWwYyyYyYZzZzZz'; + + /** + * @param {string} str + * @return {string} + */ + const foldToASCII = str => { + const regex = new RegExp(NON_ASCII_CHARS.split('').join('|'), 'g'); + + return str.replace(regex, match => { + const index = NON_ASCII_CHARS.indexOf(match); + return EQUIVALENT_ASCII_CHARS[index]; + }); + }; + + /** + * @param {string} str + * @return {string} + */ + const normalizeHashtag = str => { + return foldToASCII(str.normalize('NFKC').toLowerCase()).replace(/[^\p{L}\p{N}_\u00b7\u200c]/gu, ''); + }; + /** * @param {any} req * @param {string} name @@ -968,7 +996,7 @@ const startWorker = async (workerId) => { reject('No tag for stream provided'); } else { resolve({ - channelIds: [`timeline:hashtag:${params.tag.toLowerCase()}`], + channelIds: [`timeline:hashtag:${normalizeHashtag(params.tag)}`], options: { needsFiltering: true }, }); } @@ -979,7 +1007,7 @@ const startWorker = async (workerId) => { reject('No tag for stream provided'); } else { resolve({ - channelIds: [`timeline:hashtag:${params.tag.toLowerCase()}:local`], + channelIds: [`timeline:hashtag:${normalizeHashtag(params.tag)}:local`], options: { needsFiltering: true }, }); }