From 9322315cdc30280997b0467d6c89c245ed8a5963 Mon Sep 17 00:00:00 2001 From: Ian Iyengar Date: Wed, 29 Apr 2026 15:59:48 +0000 Subject: [PATCH 1/2] Replace Summon spelling correction with ffi-hunspell - Add ffi-hunspell as a runtime dependency in gemspec - Replace DidYouMean::SpellChecker with FFI::Hunspell.dict('en_US') in the words initializer - Update spell_check helper to use Hunspell's check?/suggest API - Add Levenshtein-based reranking of suggestions for better accuracy - Fix index view to use @spell_check (controller-generated) instead of @summon.spelling_suggestion; remove stale Summon error guard --- app/helpers/quick_search/search_helper.rb | 28 ++++++++++++++++++-- app/views/quick_search/search/index.html.erb | 10 +++---- config/initializers/words.rb | 3 ++- quick_search-core.gemspec | 1 + 4 files changed, 33 insertions(+), 9 deletions(-) diff --git a/app/helpers/quick_search/search_helper.rb b/app/helpers/quick_search/search_helper.rb index e00aa0c..552a68d 100644 --- a/app/helpers/quick_search/search_helper.rb +++ b/app/helpers/quick_search/search_helper.rb @@ -12,7 +12,7 @@ def spell_check(params_q_scrubbed) corrections = [] else query_list = query_clean.split(' ') - corrections = SPELL_CHECKER.correct(query_clean) + corrections = SPELL_CHECKER.check?(query_clean) ? [] : SPELL_CHECKER.suggest(query_clean) multiwordcheck = (query_clean.length - corrections.join(" ").length).abs() > 2 multiwordcheck = multiwordcheck ? multiwordcheck : !query_list.any? { |word| corrections.join(" ").include?(word) } @@ -21,7 +21,7 @@ def spell_check(params_q_scrubbed) correction = [] query_list.each do | q | if !DICTIONARY.include?(q) && q.length > 2 - checker = SPELL_CHECKER.correct(q).first + checker = SPELL_CHECKER.check?(q) ? q : best_hunspell_suggestion(q, SPELL_CHECKER.suggest(q)) check = checker.present? ? (q.length - checker.length).abs() : 3 check2 = checker.present? ? (checker.chars - q.chars).concat(q.chars - checker.chars).uniq.count : 4 if check > 2 || check2 > 3 @@ -38,4 +38,28 @@ def spell_check(params_q_scrubbed) end return corrections end + + private + + # Pick the suggestion with the smallest Levenshtein distance to the original word. + # This produces better results than taking Hunspell's first suggestion, which is + # ranked by Hunspell's internal phonetic scoring rather than edit distance. + def best_hunspell_suggestion(word, suggestions) + return nil if suggestions.empty? + suggestions.min_by { |s| levenshtein_distance(word, s) } + end + + def levenshtein_distance(s, t) + m, n = s.length, t.length + d = Array.new(m + 1) { Array.new(n + 1, 0) } + (0..m).each { |i| d[i][0] = i } + (0..n).each { |j| d[0][j] = j } + (1..n).each do |j| + (1..m).each do |i| + cost = s[i-1] == t[j-1] ? 0 : 1 + d[i][j] = [d[i-1][j] + 1, d[i][j-1] + 1, d[i-1][j-1] + cost].min + end + end + d[m][n] + end end diff --git a/app/views/quick_search/search/index.html.erb b/app/views/quick_search/search/index.html.erb index b3016cd..3ac6320 100644 --- a/app/views/quick_search/search/index.html.erb +++ b/app/views/quick_search/search/index.html.erb @@ -28,12 +28,10 @@ <% thirdbento = all_good_bets.length > 0 ? true : false %> <%# FIXME: This logic could stand a second look %> -<% unless @summon.is_a? StandardError %> - <% unless @best_bets.is_a? StandardError %> - <% if @best_bets.results.blank? %> - <% unless @summon.spelling_suggestion.blank? %> - <%= render partial: 'layouts/quick_search/spelling_suggestion', locals: { spelling_suggestion: @summon.spelling_suggestion} %> - <% end %> +<% unless @best_bets.is_a? StandardError %> + <% if @best_bets.results.blank? %> + <% unless @spell_check.blank? %> + <%= render partial: 'layouts/quick_search/spelling_suggestion', locals: { spelling_suggestion: @spell_check.first} %> <% end %> <% end %> <% else %> diff --git a/config/initializers/words.rb b/config/initializers/words.rb index 670d125..1ac6774 100644 --- a/config/initializers/words.rb +++ b/config/initializers/words.rb @@ -1,3 +1,4 @@ words = File.join Rails.root, "/config/words.yml" DICTIONARY = YAML.load_file(words) -SPELL_CHECKER = DidYouMean::SpellChecker.new(dictionary: DICTIONARY) \ No newline at end of file + +SPELL_CHECKER = FFI::Hunspell.dict('en_US') \ No newline at end of file diff --git a/quick_search-core.gemspec b/quick_search-core.gemspec index cf63ce2..bd4c321 100644 --- a/quick_search-core.gemspec +++ b/quick_search-core.gemspec @@ -26,6 +26,7 @@ Gem::Specification.new do |s| s.add_dependency "mysql2" s.add_dependency "modernizr-rails" s.add_dependency "httpclient" + s.add_dependency "ffi-hunspell" end From be0d5aa56992bb195aecc7fc4e490abfdc9207e8 Mon Sep 17 00:00:00 2001 From: Ian Iyengar <194038725+iriyenga_ncstate@users.noreply.github.com> Date: Tue, 19 May 2026 12:39:29 -0400 Subject: [PATCH 2/2] feat: Add dynamic name-aware spell checking for person names MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implements spell correction that leverages both dynamic and static name sources: - Query webnodes Solr staff index for firstname/lastname fields with 12-hour cache - Add YAML fallback config (config/names.yml) for manual canonical name overrides - Mine author names from loaded search results as final post-search fallback - Filter typo candidates to prevent incorrect matches (e.g., 'Jonathn' vs 'Jonathan') - Remove UI gate that suppressed suggestions when best_bets had network errors Key improvements: - Solves 'Jonathn' → 'Jonathan' correction case via result-derived author names - Combines Levenshtein distance matching with candidate filtering - Handles 'Last, First' author format conversion to 'First Last' - Extends SearchHelper with result mining and name extraction methods - Adds regression tests for typo filtering and fallback corpus scenarios - Maintains backward compatibility with Hunspell word-level corrections Closes issue: Person names not corrected in spell suggestions --- .../quick_search/search_controller.rb | 13 ++ app/helpers/quick_search/search_helper.rb | 186 ++++++++++++++++- app/views/quick_search/search/index.html.erb | 192 +++++++++--------- test/helpers/search_helper_test.rb | 84 ++++++++ 4 files changed, 378 insertions(+), 97 deletions(-) diff --git a/app/controllers/quick_search/search_controller.rb b/app/controllers/quick_search/search_controller.rb index 879f972..20b9e29 100644 --- a/app/controllers/quick_search/search_controller.rb +++ b/app/controllers/quick_search/search_controller.rb @@ -168,6 +168,9 @@ def http_search(endpoint = 'defaults', page_to_render = :index) @query = params_q_scrubbed @search_in_params = true search_all_in_threads(endpoint) + if @spell_check.blank? + @spell_check = spell_check_from_searchers(@query, searchers_for_spell_check(endpoint)) + end #log_search(@query, page_to_render) render page_to_render else @@ -176,6 +179,16 @@ def http_search(endpoint = 'defaults', page_to_render = :index) end end + def searchers_for_spell_check(endpoint) + searcher_names = if endpoint == 'defaults' + QuickSearch::Engine::APP_CONFIG['searchers'] + else + [endpoint] + end + + searcher_names.filter_map { |searcher_name| instance_variable_get("@#{searcher_name}") } + end + def page if page_in_params? page = params[:page].to_i diff --git a/app/helpers/quick_search/search_helper.rb b/app/helpers/quick_search/search_helper.rb index 552a68d..a5d4e53 100644 --- a/app/helpers/quick_search/search_helper.rb +++ b/app/helpers/quick_search/search_helper.rb @@ -7,6 +7,9 @@ def spell_check(params_q_scrubbed) if !params_q_scrubbed return [] end + canonical_name = best_name_phrase_suggestion(params_q_scrubbed) + return [canonical_name] if canonical_name.present? + query_clean = params_q_scrubbed.downcase if DICTIONARY.include?(query_clean) corrections = [] @@ -21,7 +24,7 @@ def spell_check(params_q_scrubbed) correction = [] query_list.each do | q | if !DICTIONARY.include?(q) && q.length > 2 - checker = SPELL_CHECKER.check?(q) ? q : best_hunspell_suggestion(q, SPELL_CHECKER.suggest(q)) + checker = SPELL_CHECKER.check?(q) ? q : preferred_suggestion(q) check = checker.present? ? (q.length - checker.length).abs() : 3 check2 = checker.present? ? (checker.chars - q.chars).concat(q.chars - checker.chars).uniq.count : 4 if check > 2 || check2 > 3 @@ -39,6 +42,20 @@ def spell_check(params_q_scrubbed) return corrections end + def spell_check_from_searchers(params_q_scrubbed, searchers) + return [] if params_q_scrubbed.blank? + + phrase_candidates, token_candidates = search_result_name_candidates(searchers) + candidates = params_q_scrubbed.to_s.split.size > 1 ? phrase_candidates : token_candidates + normalized_query = normalize_for_matching(params_q_scrubbed) + filtered_candidates = candidates.reject do |candidate| + normalize_for_matching(candidate) == normalized_query + end + suggestion = best_name_suggestion(params_q_scrubbed, name_candidates_for(params_q_scrubbed, filtered_candidates)) + + suggestion.present? ? [suggestion] : [] + end + private # Pick the suggestion with the smallest Levenshtein distance to the original word. @@ -49,6 +66,173 @@ def best_hunspell_suggestion(word, suggestions) suggestions.min_by { |s| levenshtein_distance(word, s) } end + def preferred_suggestion(word) + name_suggestion = best_name_token_suggestion(word) + hunspell_suggestion = best_hunspell_suggestion(word, SPELL_CHECKER.suggest(word)) + + return name_suggestion if prefer_name_suggestion?(word, name_suggestion, hunspell_suggestion) + + hunspell_suggestion + end + + def prefer_name_suggestion?(word, name_suggestion, hunspell_suggestion) + return false if name_suggestion.blank? + return true if hunspell_suggestion.blank? + + levenshtein_distance(normalize_for_matching(word), normalize_for_matching(name_suggestion)) <= + levenshtein_distance(normalize_for_matching(word), normalize_for_matching(hunspell_suggestion)) + end + + def best_name_phrase_suggestion(query) + return nil if query.blank? + + candidates = name_candidates_for(query, names_dictionary) + best_name_suggestion(query, candidates) + end + + def best_name_token_suggestion(word) + return nil if word.blank? + + candidates = name_candidates_for(word, names_dictionary.flat_map { |name| name.split(/\s+/) }.uniq) + best_name_suggestion(word, candidates) + end + + def best_name_suggestion(term, candidates) + return nil if candidates.empty? + + candidate = candidates.min_by do |value| + levenshtein_distance(normalize_for_matching(term), normalize_for_matching(value)) + end + + acceptable_name_match?(term, candidate) ? candidate : nil + end + + def name_candidates_for(term, candidates) + term_token_count = term.to_s.split.size + return candidates if term_token_count <= 1 + + matching_candidates = candidates.select { |candidate| candidate.to_s.split.size == term_token_count } + matching_candidates.presence || candidates + end + + def acceptable_name_match?(term, candidate) + return false if candidate.blank? + + normalized_term = normalize_for_matching(term) + normalized_candidate = normalize_for_matching(candidate) + return false if normalized_term == normalized_candidate + + distance = levenshtein_distance(normalized_term, normalized_candidate) + max_distance = [1, normalized_term.length / 4].max + + distance <= max_distance + end + + def normalize_for_matching(value) + value.to_s.downcase.gsub(/[^a-z0-9]/, '') + end + + def names_dictionary + @names_dictionary ||= begin + (cached_webnodes_names_dictionary + configured_names_dictionary).uniq + end + end + + def cached_webnodes_names_dictionary + return [] if webnodes_solr_url.blank? + + Rails.cache.fetch('quick_search/spell_check/webnodes_names', expires_in: 12.hours) do + webnodes_names_dictionary + end + rescue StandardError + [] + end + + def webnodes_names_dictionary + solr = RSolr.connect url: webnodes_solr_url + response = solr.get 'select', params: { + 'q' => 'type:staff', + 'fl' => 'firstname,lastname', + 'rows' => 2000, + 'sort' => 'lastname asc, firstname asc' + } + + Array(response.dig('response', 'docs')).filter_map do |doc| + first_name = Array(doc['firstname']).first.to_s.strip + last_name = Array(doc['lastname']).first.to_s.strip + next if first_name.blank? || last_name.blank? + + "#{first_name} #{last_name}" + end.uniq + end + + def webnodes_solr_url + QuickSearch::Engine::WEBNODES_CONFIG['solr_url'] + rescue NameError, NoMethodError + nil + end + + def configured_names_dictionary + config_path = File.join(Rails.root, 'config', 'names.yml') + return [] unless File.exist?(config_path) + + config = YAML.load_file(config_path) + values = if config.is_a?(Hash) + config['names'] || config[:names] || [] + elsif config.is_a?(Array) + config + else + [] + end + + values.map(&:to_s).map(&:strip).reject(&:blank?).uniq + end + + def search_result_name_candidates(searchers) + phrase_candidates = [] + token_candidates = [] + + Array(searchers).each do |searcher| + next if searcher.blank? || searcher.is_a?(StandardError) || searcher.results.blank? + + searcher.results.first(5).each do |result| + phrase_candidates.concat(author_phrase_candidates(result)) + token_candidates.concat(author_token_candidates(result)) + end + end + + [phrase_candidates.uniq, token_candidates.uniq] + end + + def author_phrase_candidates(result) + author_values(result).filter_map do |author_value| + normalize_author_phrase(author_value) + end + end + + def author_token_candidates(result) + author_values(result).flat_map do |author_value| + normalize_author_phrase(author_value).to_s.scan(/\b[A-Z][a-z]{2,}\b/) + end.uniq + end + + def author_values(result) + result_hash = result.respond_to?(:to_h) ? result.to_h : {} + author = result_hash['author'] || result_hash[:author] + author.present? ? author.to_s.split(';').map(&:strip).reject(&:blank?) : [] + end + + def normalize_author_phrase(author_value) + if author_value.include?(',') + parts = author_value.split(',').map(&:strip).reject(&:blank?) + return nil if parts.empty? + + ([parts[1..].join(' '), parts.first].reject(&:blank?).join(' ')).squeeze(' ') + else + author_value.squeeze(' ') + end + end + def levenshtein_distance(s, t) m, n = s.length, t.length d = Array.new(m + 1) { Array.new(n + 1, 0) } diff --git a/app/views/quick_search/search/index.html.erb b/app/views/quick_search/search/index.html.erb index 3ac6320..f17f11a 100644 --- a/app/views/quick_search/search/index.html.erb +++ b/app/views/quick_search/search/index.html.erb @@ -1,119 +1,119 @@ <% content_for :head do %> - - - - + + + + <% end %> <% title %> <%= render partial: 'layouts/quick_search/page_title', locals: { page_title: @page_title } %> <%= render partial: 'layouts/quick_search/search_form' %>
-
    +
      -
      -
      - <%= render partial: 'layouts/quick_search/found_types' %> -
      +
      +
      + <%= render partial: 'layouts/quick_search/found_types' %>
      +
      <% thirdbento = all_good_bets.length > 0 ? true : false %> <%# FIXME: This logic could stand a second look %> -<% unless @best_bets.is_a? StandardError %> - <% if @best_bets.results.blank? %> - <% unless @spell_check.blank? %> - <%= render partial: 'layouts/quick_search/spelling_suggestion', locals: { spelling_suggestion: @spell_check.first} %> - <% end %> - <% end %> +<% unless @spell_check.blank? %> +<%= render partial: 'layouts/quick_search/spelling_suggestion', locals: { spelling_suggestion: @spell_check.first} %> <% else %> - + <% end %>
      -
      - <% if @best_bets.is_a? StandardError %> -
      - <% else %> - <% unless @best_bets.results.blank? %> - <%= render partial: 'layouts/quick_search/best_bets', locals: { best_bets: get_best_bets_data(@best_bets)} %> - <% end %> - <% end %> -
      +
      + <% if @best_bets.is_a? StandardError %> +
      + <% else %> + <% unless @best_bets.results.blank? %> + <%= render partial: 'layouts/quick_search/best_bets', locals: { best_bets: get_best_bets_data(@best_bets)} %> + <% end %> + <% end %> +
      -
      -
      - <%= render_module(@summon, 'summon') %> -
      -
      - <%= render_module(@catalog, 'catalog') %> -
      - <% if thirdbento %> -
      - <%= render partial: 'layouts/quick_search/good_bets', locals: { good_bets: all_good_bets[0..4] } %> -
      - <% end %> -
      +
      +
      + <%= render_module(@summon, 'summon') %>
      -
      -

      Other Ways to Find Articles

      -
      - <%= render_module(@ematrix_journal, 'ematrix_journal') %> -
      -
      - <%= render_module(@ematrix_database, 'ematrix_database') %> -
      -
      - <% if @smart_subjects.is_a? StandardError %> -
      - <% else %> - <% unless @smart_subjects.results.blank? %> - <%= render_module(@smart_subjects, 'smart_subjects') %> - <% end %> - <% end %> -
      -
      +
      + <%= render_module(@catalog, 'catalog') %>
      -
      -

      Information About the Libraries

      -
      - <% if @faq.is_a? StandardError %> -
      - <% else %> - <% unless @faq.results.blank? %> - <%= render_module(@faq, 'faq') %> - <% end %> - <% end %> -
      -
      - <%= render_module(@website, 'website') %> -
      + <% if thirdbento %> +
      + <%= render partial: 'layouts/quick_search/good_bets', locals: { good_bets: all_good_bets[0..4] } %> +
      + <% end %> +
      +
      +
      +
      +

      Other Ways to Find Articles

      +
      +
      + <%= render_module(@ematrix_journal, 'ematrix_journal') %> +
      +
      + <%= render_module(@ematrix_database, 'ematrix_database') %> +
      +
      + <% if @smart_subjects.is_a? StandardError %> +
      + <% else %> + <% unless @smart_subjects.results.blank? %> + <%= render_module(@smart_subjects, 'smart_subjects') %> + <% end %> + <% end %> +
      +
      +
      +
      +
      +

      Information About the Libraries

      +
      + <% if @faq.is_a? StandardError %> +
      + <% else %> + <% unless @faq.results.blank? %> + <%= render_module(@faq, 'faq') %> + <% end %> + <% end %> +
      +
      + <%= render_module(@website, 'website') %> +
      +
      -
      -
      -
      - <% unless @summon.is_a? StandardError %> - <% unless @summon.related_topics.blank? %> - - <% end %> - <% else %> - - <% end %> -
      - <%= render partial: 'layouts/quick_search/more_options' %> -
      -
      +
      +
      +
      + <% unless @summon.is_a? StandardError %> + <% unless @summon.related_topics.blank? %> + + <% end %> + <% else %> + + <% end %> +
      + <%= render partial: 'layouts/quick_search/more_options' %> +
      +
      -
      +
      +
      \ No newline at end of file diff --git a/test/helpers/search_helper_test.rb b/test/helpers/search_helper_test.rb index 3034163..141a891 100644 --- a/test/helpers/search_helper_test.rb +++ b/test/helpers/search_helper_test.rb @@ -1,4 +1,88 @@ require 'test_helper' class SearchHelperTest < ActionView::TestCase + test 'prefers canonical name for misspelled full name' do + with_stubbed_names(['John Smith']) do + assert_equal ['John Smith'], spell_check('jon smth') + end + end + + test 'prefers canonical name token over regular-word hunspell suggestion' do + with_stubbed_names(['John Smith']) do + SPELL_CHECKER.stub(:check?, false) do + SPELL_CHECKER.stub(:suggest, ['join']) do + assert_equal ['John'], spell_check('jon') + end + end + end + end + + test 'still uses hunspell for regular misspellings when no name matches' do + with_stubbed_names([]) do + SPELL_CHECKER.stub(:check?, false) do + SPELL_CHECKER.stub(:suggest, ['spelling']) do + assert_equal ['spelling'], spell_check('speling') + end + end + end + end + + test 'combines webnodes names with configured names' do + with_stubbed_name_sources(webnodes: ['John Smith'], configured: ['Jane Doe']) do + assert_equal ['John Smith', 'Jane Doe'], names_dictionary + end + end + + test 'uses author names from search results as a fallback corpus' do + with_stubbed_name_sources(webnodes: [], configured: []) do + searcher = Struct.new(:results).new([ + OpenStruct.new(author: 'Olson, Jonathan R; Welsh, Janet A') + ]) + + assert_equal ['Jonathan'], spell_check_from_searchers('Jonathn', [searcher]) + end + end + + test 'filters out exact typo token from fallback candidates' do + with_stubbed_name_sources(webnodes: [], configured: []) do + searcher = Struct.new(:results).new([ + OpenStruct.new(author: 'Jonathn Logan; Olson, Jonathan R') + ]) + + assert_equal ['Jonathan'], spell_check_from_searchers('Jonathn', [searcher]) + end + end + + private + + def with_stubbed_names(names) + with_stubbed_name_sources(webnodes: names, configured: []) do + yield + end + end + + def with_stubbed_name_sources(webnodes:, configured:) + names_dictionary_backup = @names_dictionary if instance_variable_defined?(:@names_dictionary) + remove_instance_variable(:@names_dictionary) if instance_variable_defined?(:@names_dictionary) + + stubbed_webnodes = webnodes + stubbed_configured = configured + singleton_class.class_eval do + define_method(:cached_webnodes_names_dictionary) { stubbed_webnodes } + define_method(:configured_names_dictionary) { stubbed_configured } + end + + yield + ensure + singleton_class.class_eval do + remove_method(:cached_webnodes_names_dictionary) if method_defined?(:cached_webnodes_names_dictionary) + remove_method(:configured_names_dictionary) if method_defined?(:configured_names_dictionary) + end + + if names_dictionary_backup + @names_dictionary = names_dictionary_backup + elsif instance_variable_defined?(:@names_dictionary) + remove_instance_variable(:@names_dictionary) + end + end end