// corporateSuffixes lists the legal-form suffixes that stripCorporateSuffix can
// remove from the end of a company name. They are only tried after a search
// with the whitespace-normalized name has failed, so names like "REPSOL, S.A."
// collapse to a query Wikidata can match.
//
// Matching is case-sensitive, which is why both " plc" and " PLC" are listed.
// Entry order is not significant: stripCorporateSuffix always removes the
// longest entry that matches, so " Co., Ltd." wins over " Ltd." and " & Co."
// wins over " Co.".
var corporateSuffixes = []string{
	", S.A.", " S.A.", ", S.A", " S.A",
	", Inc.", " Inc.", ", Inc", " Inc",
	", N.V.", " N.V.", ", NV", " NV",
	" Corporation", " Corp.", " Corp",
	" Limited", " Ltd.", " Ltd",
	" plc", " PLC", " p.l.c.",
	" GmbH", " AG", " SE", " S.E.",
	" Co., Ltd.", " Co.", " & Co.",
}

// normalizeWhitespace trims leading and trailing whitespace from s and
// collapses every internal run of whitespace (spaces, tabs, newlines) into a
// single space.
func normalizeWhitespace(s string) string {
	return strings.Join(strings.Fields(s), " ")
}

// stripCorporateSuffix removes one trailing corporate suffix from name and
// returns the remainder with surrounding whitespace and any dangling trailing
// comma removed. If no entry of corporateSuffixes matches, name is returned
// unchanged.
//
// When several entries match, the longest one is removed. A first-match scan
// would turn "Samsung Electronics Co., Ltd." into "Samsung Electronics Co.,"
// and "Tiffany & Co." into "Tiffany &", because the shorter " Ltd." and " Co."
// also match those names.
func stripCorporateSuffix(name string) string {
	longest := ""
	for _, suffix := range corporateSuffixes {
		if len(suffix) > len(longest) && strings.HasSuffix(name, suffix) {
			longest = suffix
		}
	}
	if longest == "" {
		return name
	}

	base := strings.TrimSpace(strings.TrimSuffix(name, longest))
	// Only space-prefixed forms of some suffixes are listed (e.g. " Ltd."), so
	// a name such as "Acme, Ltd." would otherwise keep its separator comma.
	return strings.TrimSpace(strings.TrimRight(base, ","))
}

// searchEntity tries the whitespace-normalized name first; if that search
// returns an error, it falls back to a suffix-stripped variant. Two queries
// worst case — cheap against Wikidata's free API and dramatically improves
// coverage for names pulled from Yahoo (which often arrive as "REPSOL, S.A."
// or "Diageo plc").
//
// NOTE(review): the fallback is taken on any error from the first search, not
// only on "no hits", and the underlying error is not included in the returned
// message — confirm that masking request/decoding failures is intended.
func (w *WikidataProvider) searchEntity(ctx context.Context, name string) (string, error) { + normalized := normalizeWhitespace(name) + if id, err := w.searchOnce(ctx, normalized); err == nil { + return id, nil + } + + stripped := stripCorporateSuffix(normalized) + if stripped == "" || stripped == normalized { + return "", fmt.Errorf("no wikidata entity for %q", normalized) + } + + id, err := w.searchOnce(ctx, stripped) + if err != nil { + return "", fmt.Errorf("no wikidata entity for %q or %q", normalized, stripped) + } + return id, nil +} + +func (w *WikidataProvider) searchOnce(ctx context.Context, query string) (string, error) { q := url.Values{ "action": {"wbsearchentities"}, - "search": {name}, + "search": {query}, "language": {"en"}, "limit": {"1"}, "format": {"json"}, @@ -114,7 +164,7 @@ func (w *WikidataProvider) searchEntity(ctx context.Context, name string) (strin return "", fmt.Errorf("decoding search response: %w", err) } if len(resp.Search) == 0 { - return "", fmt.Errorf("no wikidata entity for %q", name) + return "", fmt.Errorf("no entity") } return resp.Search[0].ID, nil } diff --git a/internal/provider/wikidata_provider_test.go b/internal/provider/wikidata_provider_test.go index 4c7e9ea..4ec7d6d 100644 --- a/internal/provider/wikidata_provider_test.go +++ b/internal/provider/wikidata_provider_test.go @@ -150,3 +150,129 @@ func TestWikidataProvider_BulkImport_NotSupported(t *testing.T) { t.Fatal("expected error from BulkImport") } } + +func TestNormalizeWhitespace(t *testing.T) { + cases := map[string]string{ + "REPSOL, S.A.": "REPSOL, S.A.", + " Diageo plc ": "Diageo plc", + "Apple\tInc.": "Apple Inc.", + "Multi\n Line\tCo.": "Multi Line Co.", + } + for input, want := range cases { + if got := normalizeWhitespace(input); got != want { + t.Errorf("normalizeWhitespace(%q) = %q, want %q", input, got, want) + } + } +} + +func TestStripCorporateSuffix(t *testing.T) { + cases := map[string]string{ + "Repsol, S.A.": "Repsol", + "Diageo plc": "Diageo", + 
"Apple Inc.": "Apple", + "Iberdrola, S.A.": "Iberdrola", + "Toyota Motor Corp": "Toyota Motor", + "Just a Name": "Just a Name", // no suffix → unchanged + } + for input, want := range cases { + if got := stripCorporateSuffix(input); got != want { + t.Errorf("stripCorporateSuffix(%q) = %q, want %q", input, got, want) + } + } +} + +// Whitespace-only difference: the as-is search succeeds, no fallback needed. +func TestWikidataProvider_GetLogo_RecoversFromDoubleSpace(t *testing.T) { + calls := 0 + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + q := r.URL.Query() + switch q.Get("action") { + case "wbsearchentities": + calls++ + search := q.Get("search") + // Whitespace-collapsed input must be the FIRST query — and we expect + // it to hit on the first try (single query). + if search != "REPSOL, S.A." { + t.Errorf("call %d: expected first search to be whitespace-collapsed, got %q", calls, search) + } + _, _ = io.WriteString(w, `{"search":[{"id":"Q174747"}]}`) + case "wbgetclaims": + _, _ = io.WriteString(w, `{"claims":{"P154":[{"mainsnak":{"datavalue":{"value":"Repsol logo.svg"}}}]}}`) + default: + _, _ = io.WriteString(w, "PNGDATA") + } + })) + defer srv.Close() + + p := newWikidataProviderWithServer(srv) + result, err := p.GetLogo(context.Background(), "REP.MC", "REPSOL, S.A.") + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if result.Source != "wikidata:Q174747" { + t.Errorf("Source = %q", result.Source) + } + if calls != 1 { + t.Errorf("expected exactly 1 search call (whitespace-only fix), got %d", calls) + } +} + +// First (normalized) search misses, suffix-stripped retry hits. 
+func TestWikidataProvider_GetLogo_RecoversBySuffixStrip(t *testing.T) { + searches := []string{} + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + q := r.URL.Query() + switch q.Get("action") { + case "wbsearchentities": + search := q.Get("search") + searches = append(searches, search) + // First search: as-is (still fails). Second: stripped, hits. + if len(searches) == 1 { + _, _ = io.WriteString(w, `{"search":[]}`) + } else { + _, _ = io.WriteString(w, `{"search":[{"id":"Q200147"}]}`) + } + case "wbgetclaims": + _, _ = io.WriteString(w, `{"claims":{"P154":[{"mainsnak":{"datavalue":{"value":"Some logo.svg"}}}]}}`) + default: + _, _ = io.WriteString(w, "PNGDATA") + } + })) + defer srv.Close() + + p := newWikidataProviderWithServer(srv) + result, err := p.GetLogo(context.Background(), "FOO", "ObscureName Inc.") + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if result.Source != "wikidata:Q200147" { + t.Errorf("Source = %q", result.Source) + } + if len(searches) != 2 { + t.Fatalf("expected 2 search calls (normalized + stripped), got %d: %v", len(searches), searches) + } + if !strings.Contains(searches[0], "Inc.") { + t.Errorf("first search should be the as-is normalized form: %q", searches[0]) + } + if strings.Contains(searches[1], "Inc.") { + t.Errorf("second search should have suffix stripped: %q", searches[1]) + } +} + +// Both attempts miss — error should mention both variants. +func TestWikidataProvider_GetLogo_NoMatchAfterStrip(t *testing.T) { + srv := stubWikidata(t, + `{"search":[]}`, + ``, ``, http.StatusOK, + ) + defer srv.Close() + + p := newWikidataProviderWithServer(srv) + _, err := p.GetLogo(context.Background(), "XYZ", "Made Up Co., Inc.") + if err == nil { + t.Fatal("expected error when both searches miss") + } + if !strings.Contains(err.Error(), "Made Up Co., Inc.") { + t.Errorf("error should mention the normalized name: %v", err) + } +}