diff --git a/app/controllers/quick_search/search_controller.rb b/app/controllers/quick_search/search_controller.rb index 879f972..20b9e29 100644 --- a/app/controllers/quick_search/search_controller.rb +++ b/app/controllers/quick_search/search_controller.rb @@ -168,6 +168,9 @@ def http_search(endpoint = 'defaults', page_to_render = :index) @query = params_q_scrubbed @search_in_params = true search_all_in_threads(endpoint) + if @spell_check.blank? + @spell_check = spell_check_from_searchers(@query, searchers_for_spell_check(endpoint)) + end #log_search(@query, page_to_render) render page_to_render else @@ -176,6 +179,16 @@ def http_search(endpoint = 'defaults', page_to_render = :index) end end + def searchers_for_spell_check(endpoint) + searcher_names = if endpoint == 'defaults' + QuickSearch::Engine::APP_CONFIG['searchers'] + else + [endpoint] + end + + searcher_names.filter_map { |searcher_name| instance_variable_get("@#{searcher_name}") } + end + def page if page_in_params? page = params[:page].to_i diff --git a/app/helpers/quick_search/search_helper.rb b/app/helpers/quick_search/search_helper.rb index e00aa0c..a5d4e53 100644 --- a/app/helpers/quick_search/search_helper.rb +++ b/app/helpers/quick_search/search_helper.rb @@ -7,12 +7,15 @@ def spell_check(params_q_scrubbed) if !params_q_scrubbed return [] end + canonical_name = best_name_phrase_suggestion(params_q_scrubbed) + return [canonical_name] if canonical_name.present? + query_clean = params_q_scrubbed.downcase if DICTIONARY.include?(query_clean) corrections = [] else query_list = query_clean.split(' ') - corrections = SPELL_CHECKER.correct(query_clean) + corrections = SPELL_CHECKER.check?(query_clean) ? [] : SPELL_CHECKER.suggest(query_clean) multiwordcheck = (query_clean.length - corrections.join(" ").length).abs() > 2 multiwordcheck = multiwordcheck ? multiwordcheck : !query_list.any? { |word| corrections.join(" ").include?(word) } @@ -21,7 +24,7 @@ def spell_check(params_q_scrubbed) correction = [] query_list.each do | q | if !DICTIONARY.include?(q) && q.length > 2 - checker = SPELL_CHECKER.correct(q).first + checker = SPELL_CHECKER.check?(q) ? q : preferred_suggestion(q) check = checker.present? ? (q.length - checker.length).abs() : 3 check2 = checker.present? ? (checker.chars - q.chars).concat(q.chars - checker.chars).uniq.count : 4 if check > 2 || check2 > 3 @@ -38,4 +41,209 @@ def spell_check(params_q_scrubbed) end return corrections end + + def spell_check_from_searchers(params_q_scrubbed, searchers) + return [] if params_q_scrubbed.blank? + + phrase_candidates, token_candidates = search_result_name_candidates(searchers) + candidates = params_q_scrubbed.to_s.split.size > 1 ? phrase_candidates : token_candidates + normalized_query = normalize_for_matching(params_q_scrubbed) + filtered_candidates = candidates.reject do |candidate| + normalize_for_matching(candidate) == normalized_query + end + suggestion = best_name_suggestion(params_q_scrubbed, name_candidates_for(params_q_scrubbed, filtered_candidates)) + + suggestion.present? ? [suggestion] : [] + end + + private + + # Pick the suggestion with the smallest Levenshtein distance to the original word. + # This produces better results than taking Hunspell's first suggestion, which is + # ranked by Hunspell's internal phonetic scoring rather than edit distance. + def best_hunspell_suggestion(word, suggestions) + return nil if suggestions.empty? + suggestions.min_by { |s| levenshtein_distance(word, s) } + end + + def preferred_suggestion(word) + name_suggestion = best_name_token_suggestion(word) + hunspell_suggestion = best_hunspell_suggestion(word, SPELL_CHECKER.suggest(word)) + + return name_suggestion if prefer_name_suggestion?(word, name_suggestion, hunspell_suggestion) + + hunspell_suggestion + end + + def prefer_name_suggestion?(word, name_suggestion, hunspell_suggestion) + return false if name_suggestion.blank? + return true if hunspell_suggestion.blank? + + levenshtein_distance(normalize_for_matching(word), normalize_for_matching(name_suggestion)) <= + levenshtein_distance(normalize_for_matching(word), normalize_for_matching(hunspell_suggestion)) + end + + def best_name_phrase_suggestion(query) + return nil if query.blank? + + candidates = name_candidates_for(query, names_dictionary) + best_name_suggestion(query, candidates) + end + + def best_name_token_suggestion(word) + return nil if word.blank? + + candidates = name_candidates_for(word, names_dictionary.flat_map { |name| name.split(/\s+/) }.uniq) + best_name_suggestion(word, candidates) + end + + def best_name_suggestion(term, candidates) + return nil if candidates.empty? + + candidate = candidates.min_by do |value| + levenshtein_distance(normalize_for_matching(term), normalize_for_matching(value)) + end + + acceptable_name_match?(term, candidate) ? candidate : nil + end + + def name_candidates_for(term, candidates) + term_token_count = term.to_s.split.size + return candidates if term_token_count <= 1 + + matching_candidates = candidates.select { |candidate| candidate.to_s.split.size == term_token_count } + matching_candidates.presence || candidates + end + + def acceptable_name_match?(term, candidate) + return false if candidate.blank? + + normalized_term = normalize_for_matching(term) + normalized_candidate = normalize_for_matching(candidate) + return false if normalized_term == normalized_candidate + + distance = levenshtein_distance(normalized_term, normalized_candidate) + max_distance = [1, normalized_term.length / 4].max + + distance <= max_distance + end + + def normalize_for_matching(value) + value.to_s.downcase.gsub(/[^a-z0-9]/, '') + end + + def names_dictionary + @names_dictionary ||= begin + (cached_webnodes_names_dictionary + configured_names_dictionary).uniq + end + end + + def cached_webnodes_names_dictionary + return [] if webnodes_solr_url.blank? + + Rails.cache.fetch('quick_search/spell_check/webnodes_names', expires_in: 12.hours) do + webnodes_names_dictionary + end + rescue StandardError + [] + end + + def webnodes_names_dictionary + solr = RSolr.connect url: webnodes_solr_url + response = solr.get 'select', params: { + 'q' => 'type:staff', + 'fl' => 'firstname,lastname', + 'rows' => 2000, + 'sort' => 'lastname asc, firstname asc' + } + + Array(response.dig('response', 'docs')).filter_map do |doc| + first_name = Array(doc['firstname']).first.to_s.strip + last_name = Array(doc['lastname']).first.to_s.strip + next if first_name.blank? || last_name.blank? + + "#{first_name} #{last_name}" + end.uniq + end + + def webnodes_solr_url + QuickSearch::Engine::WEBNODES_CONFIG['solr_url'] + rescue NameError, NoMethodError + nil + end + + def configured_names_dictionary + config_path = File.join(Rails.root, 'config', 'names.yml') + return [] unless File.exist?(config_path) + + config = YAML.load_file(config_path) + values = if config.is_a?(Hash) + config['names'] || config[:names] || [] + elsif config.is_a?(Array) + config + else + [] + end + + values.map(&:to_s).map(&:strip).reject(&:blank?).uniq + end + + def search_result_name_candidates(searchers) + phrase_candidates = [] + token_candidates = [] + + Array(searchers).each do |searcher| + next if searcher.blank? || searcher.is_a?(StandardError) || searcher.results.blank? + + searcher.results.first(5).each do |result| + phrase_candidates.concat(author_phrase_candidates(result)) + token_candidates.concat(author_token_candidates(result)) + end + end + + [phrase_candidates.uniq, token_candidates.uniq] + end + + def author_phrase_candidates(result) + author_values(result).filter_map do |author_value| + normalize_author_phrase(author_value) + end + end + + def author_token_candidates(result) + author_values(result).flat_map do |author_value| + normalize_author_phrase(author_value).to_s.scan(/\b[A-Z][a-z]{2,}\b/) + end.uniq + end + + def author_values(result) + result_hash = result.respond_to?(:to_h) ? result.to_h : {} + author = result_hash['author'] || result_hash[:author] + author.present? ? author.to_s.split(';').map(&:strip).reject(&:blank?) : [] + end + + def normalize_author_phrase(author_value) + if author_value.include?(',') + parts = author_value.split(',').map(&:strip).reject(&:blank?) + return nil if parts.empty? + + ([parts[1..].join(' '), parts.first].reject(&:blank?).join(' ')).squeeze(' ') + else + author_value.squeeze(' ') + end + end + + def levenshtein_distance(s, t) + m, n = s.length, t.length + d = Array.new(m + 1) { Array.new(n + 1, 0) } + (0..m).each { |i| d[i][0] = i } + (0..n).each { |j| d[0][j] = j } + (1..n).each do |j| + (1..m).each do |i| + cost = s[i-1] == t[j-1] ? 0 : 1 + d[i][j] = [d[i-1][j] + 1, d[i][j-1] + 1, d[i-1][j-1] + cost].min + end + end + d[m][n] + end end diff --git a/app/views/quick_search/search/index.html.erb b/app/views/quick_search/search/index.html.erb index b3016cd..f17f11a 100644 --- a/app/views/quick_search/search/index.html.erb +++ b/app/views/quick_search/search/index.html.erb @@ -1,121 +1,119 @@ <% content_for :head do %> - - - - + + + + <% end %> <% title %> <%= render partial: 'layouts/quick_search/page_title', locals: { page_title: @page_title } %> <%= render partial: 'layouts/quick_search/search_form' %>