Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 52 additions & 2 deletions internal/provider/wikidata_provider.go
Original file line number Diff line number Diff line change
Expand Up @@ -96,10 +96,60 @@ type wbSearchResponse struct {
} `json:"search"`
}

// corporateSuffixes lists common legal-form suffixes. They are only tried
// *after* a normalized search fails, so names like "REPSOL, S.A." (caps,
// double whitespace, suffix) collapse to a query Wikidata can match.
//
// Ordering invariant: an entry that is the tail of a longer entry (" S.A." of
// ", S.A.", " Ltd." of " Co., Ltd.", " Co." of " & Co.") is listed after that
// longer entry. A scan that stops at the first match therefore always removes
// the whole suffix instead of leaving a fragment such as "Co.," or "&" behind.
// Matching is case-sensitive, hence both " plc" and " PLC".
var corporateSuffixes = []string{
	", S.A.", " S.A.", ", S.A", " S.A",
	", Inc.", " Inc.", ", Inc", " Inc",
	", N.V.", " N.V.", ", NV", " NV",
	" Corporation", " Corp.", " Corp",
	" Co., Ltd.", " Limited", " Ltd.", " Ltd",
	" plc", " PLC", " p.l.c.",
	" GmbH", " AG", " SE", " S.E.",
	" & Co.", " Co.",
}

// normalizeWhitespace trims leading and trailing whitespace from s and
// collapses every internal run of whitespace (spaces, tabs, newlines) into a
// single space. Empty or whitespace-only input yields "".
func normalizeWhitespace(s string) string {
	words := strings.Fields(s)
	if len(words) == 0 {
		return ""
	}

	var b strings.Builder
	b.Grow(len(s)) // the result is never longer than the input
	b.WriteString(words[0])
	for _, word := range words[1:] {
		b.WriteByte(' ')
		b.WriteString(word)
	}
	return b.String()
}

// stripCorporateSuffix removes one trailing legal-form suffix (see
// corporateSuffixes) from name and returns the remaining base name. When no
// suffix matches, name is returned unchanged.
//
// The longest matching suffix is removed, so the result does not depend on the
// order of corporateSuffixes: "Foo Co., Ltd." yields "Foo", not "Foo Co.,".
// Matching is case-sensitive.
func stripCorporateSuffix(name string) string {
	longest := ""
	for _, suffix := range corporateSuffixes {
		if len(suffix) > len(longest) && strings.HasSuffix(name, suffix) {
			longest = suffix
		}
	}
	if longest == "" {
		return name
	}

	base := strings.TrimSpace(strings.TrimSuffix(name, longest))
	// Space-only suffixes such as " Ltd." leave the separator comma of
	// "Acme, Ltd." behind; drop it so the query is just "Acme".
	return strings.TrimSpace(strings.TrimRight(base, ","))
}

// searchEntity resolves a company name to a Wikidata entity ID.
//
// It queries the whitespace-normalized name first; if that lookup fails, it
// falls back to a suffix-stripped variant. Two queries worst case — cheap
// against Wikidata's free API and dramatically improves coverage for names
// pulled from Yahoo (which often arrive as "REPSOL, S.A." or "Diageo plc").
//
// No query is sent for an empty name, and the fallback is skipped once ctx is
// cancelled or past its deadline. The error of the last attempt is wrapped so
// that transport and decoding failures are not reported as a plain miss.
func (w *WikidataProvider) searchEntity(ctx context.Context, name string) (string, error) {
	normalized := normalizeWhitespace(name)
	if normalized == "" {
		return "", fmt.Errorf("no wikidata entity for %q", name)
	}

	id, err := w.searchOnce(ctx, normalized)
	if err == nil {
		return id, nil
	}
	// A second request cannot succeed on a dead context; report that instead
	// of a misleading "no entity".
	if ctxErr := ctx.Err(); ctxErr != nil {
		return "", fmt.Errorf("searching wikidata for %q: %w", normalized, ctxErr)
	}

	stripped := stripCorporateSuffix(normalized)
	if stripped == "" || stripped == normalized {
		// Nothing to strip (or nothing left after stripping): no fallback query.
		return "", fmt.Errorf("no wikidata entity for %q: %w", normalized, err)
	}

	id, err = w.searchOnce(ctx, stripped)
	if err != nil {
		return "", fmt.Errorf("no wikidata entity for %q or %q: %w", normalized, stripped, err)
	}
	return id, nil
}

func (w *WikidataProvider) searchOnce(ctx context.Context, query string) (string, error) {
q := url.Values{
"action": {"wbsearchentities"},
"search": {name},
"search": {query},
"language": {"en"},
"limit": {"1"},
"format": {"json"},
Expand All @@ -114,7 +164,7 @@ func (w *WikidataProvider) searchEntity(ctx context.Context, name string) (strin
return "", fmt.Errorf("decoding search response: %w", err)
}
if len(resp.Search) == 0 {
return "", fmt.Errorf("no wikidata entity for %q", name)
return "", fmt.Errorf("no entity")
}
return resp.Search[0].ID, nil
}
Expand Down
126 changes: 126 additions & 0 deletions internal/provider/wikidata_provider_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -150,3 +150,129 @@ func TestWikidataProvider_BulkImport_NotSupported(t *testing.T) {
t.Fatal("expected error from BulkImport")
}
}

// TestNormalizeWhitespace checks that normalizeWhitespace trims the ends of a
// name and collapses internal whitespace runs to a single space, including the
// double-space form and empty / whitespace-only input.
func TestNormalizeWhitespace(t *testing.T) {
	cases := []struct {
		input string
		want  string
	}{
		// Already normalized: must come back unchanged.
		{input: "REPSOL, S.A.", want: "REPSOL, S.A."},
		// Run of plain spaces inside the name.
		{input: "REPSOL,  S.A.", want: "REPSOL, S.A."},
		{input: " Diageo plc ", want: "Diageo plc"},
		{input: "Apple\tInc.", want: "Apple Inc."},
		{input: "Multi\n Line\tCo.", want: "Multi Line Co."},
		// Degenerate input collapses to the empty string.
		{input: "", want: ""},
		{input: " \t\n ", want: ""},
	}
	for _, tc := range cases {
		if got := normalizeWhitespace(tc.input); got != tc.want {
			t.Errorf("normalizeWhitespace(%q) = %q, want %q", tc.input, got, tc.want)
		}
	}
}

// TestStripCorporateSuffix checks that a trailing legal-form suffix is removed
// from a company name and that a name without one is returned untouched.
func TestStripCorporateSuffix(t *testing.T) {
	table := []struct {
		name     string
		expected string
	}{
		{name: "Repsol, S.A.", expected: "Repsol"},
		{name: "Diageo plc", expected: "Diageo"},
		{name: "Apple Inc.", expected: "Apple"},
		{name: "Iberdrola, S.A.", expected: "Iberdrola"},
		{name: "Toyota Motor Corp", expected: "Toyota Motor"},
		// no suffix → unchanged
		{name: "Just a Name", expected: "Just a Name"},
	}
	for _, row := range table {
		actual := stripCorporateSuffix(row.name)
		if actual == row.expected {
			continue
		}
		t.Errorf("stripCorporateSuffix(%q) = %q, want %q", row.name, actual, row.expected)
	}
}

// Whitespace-only difference: the raw name contains a double space, the
// whitespace-collapsed search succeeds, and no suffix-stripped fallback is
// needed.
func TestWikidataProvider_GetLogo_RecoversFromDoubleSpace(t *testing.T) {
	calls := 0
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		q := r.URL.Query()
		switch q.Get("action") {
		case "wbsearchentities":
			calls++
			search := q.Get("search")
			// Whitespace-collapsed input must be the FIRST query — and we expect
			// it to hit on the first try (single query).
			if search != "REPSOL, S.A." {
				t.Errorf("call %d: expected first search to be whitespace-collapsed, got %q", calls, search)
			}
			_, _ = io.WriteString(w, `{"search":[{"id":"Q174747"}]}`)
		case "wbgetclaims":
			_, _ = io.WriteString(w, `{"claims":{"P154":[{"mainsnak":{"datavalue":{"value":"Repsol logo.svg"}}}]}}`)
		default:
			// Any other request (presumably the logo download) gets a dummy payload.
			_, _ = io.WriteString(w, "PNGDATA")
		}
	}))
	defer srv.Close()

	p := newWikidataProviderWithServer(srv)
	// Note the two spaces after the comma: this is the input the test is about.
	result, err := p.GetLogo(context.Background(), "REP.MC", "REPSOL,  S.A.")
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if want := "wikidata:Q174747"; result.Source != want {
		t.Errorf("Source = %q, want %q", result.Source, want)
	}
	if calls != 1 {
		t.Errorf("expected exactly 1 search call (whitespace-only fix), got %d", calls)
	}
}

// First (normalized) search misses, suffix-stripped retry hits. Both queries
// are asserted exactly so an empty or mangled fallback query cannot pass.
func TestWikidataProvider_GetLogo_RecoversBySuffixStrip(t *testing.T) {
	const (
		fullName     = "ObscureName Inc."
		strippedName = "ObscureName"
	)

	var searches []string
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		q := r.URL.Query()
		switch q.Get("action") {
		case "wbsearchentities":
			searches = append(searches, q.Get("search"))
			// First search: as-is (still fails). Second: stripped, hits.
			if len(searches) == 1 {
				_, _ = io.WriteString(w, `{"search":[]}`)
				return
			}
			_, _ = io.WriteString(w, `{"search":[{"id":"Q200147"}]}`)
		case "wbgetclaims":
			_, _ = io.WriteString(w, `{"claims":{"P154":[{"mainsnak":{"datavalue":{"value":"Some logo.svg"}}}]}}`)
		default:
			// Any other request (presumably the logo download) gets a dummy payload.
			_, _ = io.WriteString(w, "PNGDATA")
		}
	}))
	defer srv.Close()

	p := newWikidataProviderWithServer(srv)
	result, err := p.GetLogo(context.Background(), "FOO", fullName)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if want := "wikidata:Q200147"; result.Source != want {
		t.Errorf("Source = %q, want %q", result.Source, want)
	}
	if len(searches) != 2 {
		t.Fatalf("expected 2 search calls (normalized + stripped), got %d: %v", len(searches), searches)
	}
	if searches[0] != fullName {
		t.Errorf("first search should be the as-is normalized form %q, got %q", fullName, searches[0])
	}
	if searches[1] != strippedName {
		t.Errorf("second search should have suffix stripped to %q, got %q", strippedName, searches[1])
	}
}

// Both attempts miss — error should mention both variants: the normalized name
// and the suffix-stripped name that was tried as a fallback.
func TestWikidataProvider_GetLogo_NoMatchAfterStrip(t *testing.T) {
	srv := stubWikidata(t,
		`{"search":[]}`,
		``, ``, http.StatusOK,
	)
	defer srv.Close()

	p := newWikidataProviderWithServer(srv)
	_, err := p.GetLogo(context.Background(), "XYZ", "Made Up Co., Inc.")
	if err == nil {
		t.Fatal("expected error when both searches miss")
	}
	if !strings.Contains(err.Error(), "Made Up Co., Inc.") {
		t.Errorf("error should mention the normalized name: %v", err)
	}
	// The stripped name is a prefix of the normalized one, so match it together
	// with its surrounding quotes to prove it was reported separately.
	if !strings.Contains(err.Error(), `"Made Up Co."`) {
		t.Errorf("error should mention the suffix-stripped name: %v", err)
	}
}
Loading