Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
106 changes: 72 additions & 34 deletions internal/provider/wikidata_provider.go
Original file line number Diff line number Diff line change
Expand Up @@ -105,54 +105,92 @@ type wbSearchResponse struct {
} `json:"search"`
}

// Common corporate suffixes — we try these *after* a normalized search fails, so
// names like "REPSOL, S.A." (caps, double whitespace, suffix) collapse to a
// query Wikidata can match. Order matters: longer suffixes first so we don't
// accidentally peel off "Inc" when the real suffix is "Inc.".
var corporateSuffixes = []string{
", S.A.", " S.A.", ", S.A", " S.A",
", Inc.", " Inc.", ", Inc", " Inc",
", N.V.", " N.V.", ", NV", " NV",
" Corporation", " Corp.", " Corp",
" Limited", " Ltd.", " Ltd",
" plc", " PLC", " p.l.c.",
" GmbH", " AG", " SE", " S.E.",
" Co., Ltd.", " Co.", " & Co.",
// Lowercased corporate-form tokens we look for inside the company name. Yahoo
// often hands us not just the company name but the full security descriptor
// ("DIAGEO PLC ORD 28 101/108P"), so we can't rely on the form being a trailing
// suffix — we find it as a word and truncate everything after it.
var corporateForms = []string{
"plc", "p.l.c.",
"inc", "inc.",
"sa", "s.a", "s.a.",
"ltd", "ltd.", "limited",
"corp", "corp.", "corporation",
"ag",
"gmbh",
"nv", "n.v", "n.v.",
"se", "s.e.",
"co", "co.",
}

// normalizeWhitespace collapses every run of whitespace in s into a single
// space and removes leading and trailing whitespace.
func normalizeWhitespace(s string) string {
	fields := strings.Fields(s)
	return strings.Join(fields, " ")
}

func stripCorporateSuffix(name string) string {
for _, suffix := range corporateSuffixes {
if strings.HasSuffix(name, suffix) {
return strings.TrimSpace(strings.TrimSuffix(name, suffix))
// companyNameVariants returns search-friendly variants of a raw company name
// in decreasing order of fidelity, e.g. for "DIAGEO PLC ORD 28 101/108P":
//
// ["DIAGEO PLC ORD 28 101/108P", "DIAGEO PLC", "DIAGEO"]
//
// Wikidata's fuzzy search picks up the right entity from the bare name even
// when the descriptive cruft drowns it out in the as-is form.
//
// The first element is always the whitespace-normalized input; the later
// elements are only added when a corporate-form word is found in the name.
func companyNameVariants(name string) []string {
normalized := normalizeWhitespace(name)
variants := []string{normalized}

words := strings.Fields(normalized)
for i, word := range words {
// Lower-case the word and strip trailing "." / "," before the lookup.
token := strings.ToLower(strings.TrimRight(word, ".,"))
if !isCorporateForm(token) {
continue
}

// "<name…> <CorpForm>" — drop trailing security-descriptor cruft.
withForm := strings.Join(words[:i+1], " ")
if withForm != normalized {
variants = append(variants, withForm)
}

// "<name…>" — drop the corporate form too. Also trim trailing
// punctuation that often sits between the name and the form,
// e.g. "REPSOL," → "REPSOL".
// A form in the very first position leaves no name in front of it, so
// no bare variant is produced in that case.
if i > 0 {
bare := strings.TrimRight(strings.Join(words[:i], " "), ",")
if bare != "" && bare != withForm {
variants = append(variants, bare)
}
}
break // first match wins; further tokens are inside the descriptor
}
return name

return variants
}

// searchEntity tries the normalized name first; if Wikidata returns no hits,
// it falls back to a suffix-stripped variant. Two queries worst case — cheap
// against Wikidata's free API and dramatically improves coverage for names
// pulled from Yahoo (which often arrive as "REPSOL, S.A." or "Diageo plc").
func (w *WikidataProvider) searchEntity(ctx context.Context, name string) (string, error) {
normalized := normalizeWhitespace(name)
if id, err := w.searchOnce(ctx, normalized); err == nil {
return id, nil
// isCorporateForm reports whether token is one of the known corporate-form
// tokens listed in corporateForms.
//
// The comparison ignores trailing "." and "," on both the token and the list
// entry. Callers strip that punctuation from a word before the lookup, so a
// word such as "p.l.c." arrives here as "p.l.c"; an exact comparison would
// never match list entries that themselves end in a dot ("p.l.c.", "s.e.").
// Trimming both sides makes dotted and undotted spellings equivalent.
//
// An empty token (for example a word that was only punctuation) never
// matches.
func isCorporateForm(token string) bool {
	token = strings.TrimRight(token, ".,")
	if token == "" {
		return false
	}
	for _, form := range corporateForms {
		if token == strings.TrimRight(form, ".,") {
			return true
		}
	}
	return false
}

stripped := stripCorporateSuffix(normalized)
if stripped == "" || stripped == normalized {
return "", fmt.Errorf("no wikidata entity for %q", normalized)
}
// searchEntity tries each variant of the company name in order, stopping at
// the first hit. At most 3 queries per call (raw → with-form → bare), all
// against Wikidata's free API.
//
// Any error from searchOnce is treated as a miss for that variant and the
// next variant is tried; the individual errors are not reported.
func (w *WikidataProvider) searchEntity(ctx context.Context, name string) (string, error) {
variants := companyNameVariants(name)

id, err := w.searchOnce(ctx, stripped)
if err != nil {
return "", fmt.Errorf("no wikidata entity for %q or %q", normalized, stripped)
for _, v := range variants {
// The normalized name is empty when the input was blank; skip it.
if v == "" {
continue
}
if id, err := w.searchOnce(ctx, v); err == nil {
return id, nil
}
}
return id, nil

// variants[0] is the whitespace-normalized input name.
return "", fmt.Errorf("no wikidata entity for any variant of %q (tried %d: %v)",
variants[0], len(variants), variants)
}

func (w *WikidataProvider) searchOnce(ctx context.Context, query string) (string, error) {
Expand Down
92 changes: 76 additions & 16 deletions internal/provider/wikidata_provider_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -168,22 +168,42 @@ func TestNormalizeWhitespace(t *testing.T) {
}
}

func TestStripCorporateSuffix(t *testing.T) {
cases := map[string]string{
"Repsol, S.A.": "Repsol",
"Diageo plc": "Diageo",
"Apple Inc.": "Apple",
"Iberdrola, S.A.": "Iberdrola",
"Toyota Motor Corp": "Toyota Motor",
"Just a Name": "Just a Name", // no suffix → unchanged
func TestCompanyNameVariants(t *testing.T) {
cases := map[string][]string{
// Whitespace + trailing corp form: variants = original, then bare name.
"REPSOL, S.A.": {"REPSOL, S.A.", "REPSOL"},
// Trailing plc: original then bare.
"Diageo plc": {"Diageo plc", "Diageo"},
// Yahoo descriptor noise after the corp form — must truncate to "with form"
// AND "bare".
"DIAGEO PLC ORD 28 101/108P": {"DIAGEO PLC ORD 28 101/108P", "DIAGEO PLC", "DIAGEO"},
// Multi-word name with trailing form: same shape.
"Toyota Motor Corporation": {"Toyota Motor Corporation", "Toyota Motor"},
// Inc.
"Apple Inc.": {"Apple Inc.", "Apple"},
// No corp form anywhere: just the normalized name.
"Berkshire Hathaway": {"Berkshire Hathaway"},
}
for input, want := range cases {
if got := stripCorporateSuffix(input); got != want {
t.Errorf("stripCorporateSuffix(%q) = %q, want %q", input, got, want)
got := companyNameVariants(input)
if !equalSlices(got, want) {
t.Errorf("companyNameVariants(%q) = %v, want %v", input, got, want)
}
}
}

// equalSlices reports whether a and b contain the same strings in the same
// order. A nil slice and an empty slice compare equal.
func equalSlices(a, b []string) bool {
	if len(a) != len(b) {
		return false
	}
	for idx, want := range b {
		if got := a[idx]; got != want {
			return false
		}
	}
	return true
}

// Whitespace-only difference: the as-is search succeeds, no fallback needed.
func TestWikidataProvider_GetLogo_RecoversFromDoubleSpace(t *testing.T) {
calls := 0
Expand Down Expand Up @@ -220,16 +240,16 @@ func TestWikidataProvider_GetLogo_RecoversFromDoubleSpace(t *testing.T) {
}
}

// First (normalized) search misses, suffix-stripped retry hits.
func TestWikidataProvider_GetLogo_RecoversBySuffixStrip(t *testing.T) {
// First (normalized) search misses, bare-name retry hits.
func TestWikidataProvider_GetLogo_RecoversByStrippingCorpForm(t *testing.T) {
searches := []string{}
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
q := r.URL.Query()
switch q.Get("action") {
case "wbsearchentities":
search := q.Get("search")
searches = append(searches, search)
// First search: as-is (still fails). Second: stripped, hits.
// First search (as-is): miss. Second (bare): hit.
if len(searches) == 1 {
_, _ = io.WriteString(w, `{"search":[]}`)
} else {
Expand All @@ -252,13 +272,53 @@ func TestWikidataProvider_GetLogo_RecoversBySuffixStrip(t *testing.T) {
t.Errorf("Source = %q", result.Source)
}
if len(searches) != 2 {
t.Fatalf("expected 2 search calls (normalized + stripped), got %d: %v", len(searches), searches)
t.Fatalf("expected 2 search calls (as-is + bare), got %d: %v", len(searches), searches)
}
if !strings.Contains(searches[0], "Inc.") {
t.Errorf("first search should be the as-is normalized form: %q", searches[0])
t.Errorf("first search should keep the corp form: %q", searches[0])
}
if strings.Contains(searches[1], "Inc.") {
t.Errorf("second search should have suffix stripped: %q", searches[1])
t.Errorf("second search should drop the corp form: %q", searches[1])
}
}

// Yahoo-style descriptor noise — variants must include the truncated form so
// Wikidata can find Diageo when handed "DIAGEO PLC ORD 28 101/108P".
//
// The fake server answers the first two entity searches with an empty result
// and the third with a hit, so the lookup only succeeds if the provider walks
// through all three variants. The recorded queries are then checked one by
// one, so a missing or reordered variant is reported precisely.
func TestWikidataProvider_GetLogo_HandlesYahooSecurityDescriptor(t *testing.T) {
	var searches []string
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		q := r.URL.Query()
		switch q.Get("action") {
		case "wbsearchentities":
			searches = append(searches, q.Get("search"))
			// First two miss (raw + truncated-with-form), third hits ("DIAGEO").
			if len(searches) <= 2 {
				_, _ = io.WriteString(w, `{"search":[]}`)
			} else {
				_, _ = io.WriteString(w, `{"search":[{"id":"Q161140"}]}`)
			}
		case "wbgetclaims":
			_, _ = io.WriteString(w, `{"claims":{"P154":[{"mainsnak":{"datavalue":{"value":"Diageo logo.svg"}}}]}}`)
		default:
			// Anything else is the logo download itself.
			_, _ = io.WriteString(w, "PNGDATA")
		}
	}))
	defer srv.Close()

	p := newWikidataProviderWithServer(srv)
	result, err := p.GetLogo(context.Background(), "DGE.L", "DIAGEO PLC ORD 28 101/108P")
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if result.Source != "wikidata:Q161140" {
		t.Errorf("Source = %q", result.Source)
	}

	// Check the whole sequence, not just its length and last element, so the
	// intermediate "with form" variant is verified too.
	want := []string{"DIAGEO PLC ORD 28 101/108P", "DIAGEO PLC", "DIAGEO"}
	if len(searches) != len(want) {
		t.Fatalf("expected 3 search calls (raw → with-form → bare), got %d: %v", len(searches), searches)
	}
	for i := range want {
		if searches[i] != want[i] {
			t.Errorf("search #%d = %q, want %q", i+1, searches[i], want[i])
		}
	}
}

Expand Down
Loading