From 38e8c7df6cb2723e41f27a2b6d02aa4cd30687e4 Mon Sep 17 00:00:00 2001 From: Francesc Leveque Date: Sun, 17 May 2026 01:31:06 +0200 Subject: [PATCH] Truncate Yahoo's security descriptor before Wikidata search MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Yesterday's normalization fix (#20) helped REPSOL, but DGE.L's company_name from Yahoo arrived as the full security descriptor: "DIAGEO PLC ORD 28 101/108P" (Diageo plc, Ordinary shares, par value 28 101/108 pence.) My previous suffix-strip only handled trailing corporate forms ("plc" at the END), so it left this string unchanged and Wikidata couldn't match. Replace the old two-step lookup (normalizeWhitespace, then stripCorporateSuffix) with a single companyNameVariants function that scans for the corporate form as a *word inside* the name and returns up to three search variants in decreasing fidelity: "DIAGEO PLC ORD 28 101/108P" ─► ["DIAGEO PLC ORD 28 101/108P", "DIAGEO PLC", "DIAGEO"] "REPSOL, S.A." ─► ["REPSOL, S.A.", "REPSOL"] "Apple Inc." ─► ["Apple Inc.", "Apple"] "Berkshire Hathaway" ─► ["Berkshire Hathaway"] Search now tries each variant in order, stopping at the first hit. Three Wikidata queries worst case — still cheap against their free API. Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/provider/wikidata_provider.go | 106 +++++++++++++------- internal/provider/wikidata_provider_test.go | 92 ++++++++++++++--- 2 files changed, 148 insertions(+), 50 deletions(-) diff --git a/internal/provider/wikidata_provider.go b/internal/provider/wikidata_provider.go index d3f7187..f1f50c8 100644 --- a/internal/provider/wikidata_provider.go +++ b/internal/provider/wikidata_provider.go @@ -105,54 +105,92 @@ type wbSearchResponse struct { } `json:"search"` } -// Common corporate suffixes — we try these *after* a normalized search fails, so -// names like "REPSOL, S.A." (caps, double whitespace, suffix) collapse to a -// query Wikidata can match. 
Order matters: longer suffixes first so we don't -// accidentally peel off "Inc" when the real suffix is "Inc.". -var corporateSuffixes = []string{ - ", S.A.", " S.A.", ", S.A", " S.A", - ", Inc.", " Inc.", ", Inc", " Inc", - ", N.V.", " N.V.", ", NV", " NV", - " Corporation", " Corp.", " Corp", - " Limited", " Ltd.", " Ltd", - " plc", " PLC", " p.l.c.", - " GmbH", " AG", " SE", " S.E.", - " Co., Ltd.", " Co.", " & Co.", +// Lowercased corporate-form tokens we look for inside the company name. Yahoo +// often hands us not just the company name but the full security descriptor +// ("DIAGEO PLC ORD 28 101/108P"), so we can't rely on the form being a trailing +// suffix — we find it as a word and truncate everything after it. +var corporateForms = []string{ + "plc", "p.l.c.", + "inc", "inc.", + "sa", "s.a", "s.a.", + "ltd", "ltd.", "limited", + "corp", "corp.", "corporation", + "ag", + "gmbh", + "nv", "n.v", "n.v.", + "se", "s.e.", + "co", "co.", } func normalizeWhitespace(s string) string { return strings.Join(strings.Fields(s), " ") } -func stripCorporateSuffix(name string) string { - for _, suffix := range corporateSuffixes { - if strings.HasSuffix(name, suffix) { - return strings.TrimSpace(strings.TrimSuffix(name, suffix)) +// companyNameVariants returns search-friendly variants of a raw company name +// in decreasing order of fidelity, e.g. for "DIAGEO PLC ORD 28 101/108P": +// +// ["DIAGEO PLC ORD 28 101/108P", "DIAGEO PLC", "DIAGEO"] +// +// Wikidata's fuzzy search picks up the right entity from the bare name even +// when the descriptive cruft drowns it out in the as-is form. +func companyNameVariants(name string) []string { + normalized := normalizeWhitespace(name) + variants := []string{normalized} + + words := strings.Fields(normalized) + for i, word := range words { + token := strings.ToLower(strings.TrimRight(word, ".,")) + if !isCorporateForm(token) { + continue + } + + // "NAME FORM" — drop trailing security-descriptor cruft. 
+ withForm := strings.Join(words[:i+1], " ") + if withForm != normalized { + variants = append(variants, withForm) + } + + // "NAME" — drop the corporate form too. Also trim trailing + // punctuation that often sits between the name and the form, + // e.g. "REPSOL," → "REPSOL". + if i > 0 { + bare := strings.TrimRight(strings.Join(words[:i], " "), ",") + if bare != "" && bare != withForm { + variants = append(variants, bare) + } } + break // first match wins; further tokens are inside the descriptor } - return name + + return variants } -// searchEntity tries the normalized name first; if Wikidata returns no hits, -// it falls back to a suffix-stripped variant. Two queries worst case — cheap -// against Wikidata's free API and dramatically improves coverage for names -// pulled from Yahoo (which often arrive as "REPSOL, S.A." or "Diageo plc"). -func (w *WikidataProvider) searchEntity(ctx context.Context, name string) (string, error) { - normalized := normalizeWhitespace(name) - if id, err := w.searchOnce(ctx, normalized); err == nil { - return id, nil +func isCorporateForm(token string) bool { + for _, f := range corporateForms { + if token == f { + return true + } } + return false +} - stripped := stripCorporateSuffix(normalized) - if stripped == "" || stripped == normalized { - return "", fmt.Errorf("no wikidata entity for %q", normalized) - } +// searchEntity tries each variant of the company name in order, stopping at +// the first hit. At most 3 queries per call (raw → with-form → bare), all +// against Wikidata's free API. 
+func (w *WikidataProvider) searchEntity(ctx context.Context, name string) (string, error) { + variants := companyNameVariants(name) - id, err := w.searchOnce(ctx, stripped) - if err != nil { - return "", fmt.Errorf("no wikidata entity for %q or %q", normalized, stripped) + for _, v := range variants { + if v == "" { + continue + } + if id, err := w.searchOnce(ctx, v); err == nil { + return id, nil + } } - return id, nil + + return "", fmt.Errorf("no wikidata entity for any variant of %q (tried %d: %v)", + variants[0], len(variants), variants) } func (w *WikidataProvider) searchOnce(ctx context.Context, query string) (string, error) { diff --git a/internal/provider/wikidata_provider_test.go b/internal/provider/wikidata_provider_test.go index 023abd3..048bf99 100644 --- a/internal/provider/wikidata_provider_test.go +++ b/internal/provider/wikidata_provider_test.go @@ -168,22 +168,42 @@ func TestNormalizeWhitespace(t *testing.T) { } } -func TestStripCorporateSuffix(t *testing.T) { - cases := map[string]string{ - "Repsol, S.A.": "Repsol", - "Diageo plc": "Diageo", - "Apple Inc.": "Apple", - "Iberdrola, S.A.": "Iberdrola", - "Toyota Motor Corp": "Toyota Motor", - "Just a Name": "Just a Name", // no suffix → unchanged +func TestCompanyNameVariants(t *testing.T) { + cases := map[string][]string{ + // Whitespace + trailing corp form: variants = original, then bare name. + "REPSOL, S.A.": {"REPSOL, S.A.", "REPSOL"}, + // Trailing plc: original then bare. + "Diageo plc": {"Diageo plc", "Diageo"}, + // Yahoo descriptor noise after the corp form — must truncate to "with form" + // AND "bare". + "DIAGEO PLC ORD 28 101/108P": {"DIAGEO PLC ORD 28 101/108P", "DIAGEO PLC", "DIAGEO"}, + // Multi-word name with trailing form: same shape. + "Toyota Motor Corporation": {"Toyota Motor Corporation", "Toyota Motor"}, + // Inc. + "Apple Inc.": {"Apple Inc.", "Apple"}, + // No corp form anywhere: just the normalized name. 
+ "Berkshire Hathaway": {"Berkshire Hathaway"}, } for input, want := range cases { - if got := stripCorporateSuffix(input); got != want { - t.Errorf("stripCorporateSuffix(%q) = %q, want %q", input, got, want) + got := companyNameVariants(input) + if !equalSlices(got, want) { + t.Errorf("companyNameVariants(%q) = %v, want %v", input, got, want) } } } +func equalSlices(a, b []string) bool { + if len(a) != len(b) { + return false + } + for i := range a { + if a[i] != b[i] { + return false + } + } + return true +} + // Whitespace-only difference: the as-is search succeeds, no fallback needed. func TestWikidataProvider_GetLogo_RecoversFromDoubleSpace(t *testing.T) { calls := 0 @@ -220,8 +240,8 @@ func TestWikidataProvider_GetLogo_RecoversFromDoubleSpace(t *testing.T) { } } -// First (normalized) search misses, suffix-stripped retry hits. -func TestWikidataProvider_GetLogo_RecoversBySuffixStrip(t *testing.T) { +// First (normalized) search misses, bare-name retry hits. +func TestWikidataProvider_GetLogo_RecoversByStrippingCorpForm(t *testing.T) { searches := []string{} srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { q := r.URL.Query() @@ -229,7 +249,7 @@ func TestWikidataProvider_GetLogo_RecoversBySuffixStrip(t *testing.T) { case "wbsearchentities": search := q.Get("search") searches = append(searches, search) - // First search: as-is (still fails). Second: stripped, hits. + // First search (as-is): miss. Second (bare): hit. 
if len(searches) == 1 { _, _ = io.WriteString(w, `{"search":[]}`) } else { @@ -252,13 +272,53 @@ func TestWikidataProvider_GetLogo_RecoversBySuffixStrip(t *testing.T) { t.Errorf("Source = %q", result.Source) } if len(searches) != 2 { - t.Fatalf("expected 2 search calls (normalized + stripped), got %d: %v", len(searches), searches) + t.Fatalf("expected 2 search calls (as-is + bare), got %d: %v", len(searches), searches) } if !strings.Contains(searches[0], "Inc.") { - t.Errorf("first search should be the as-is normalized form: %q", searches[0]) + t.Errorf("first search should keep the corp form: %q", searches[0]) } if strings.Contains(searches[1], "Inc.") { - t.Errorf("second search should have suffix stripped: %q", searches[1]) + t.Errorf("second search should drop the corp form: %q", searches[1]) + } +} + +// Yahoo-style descriptor noise — variants must include the truncated form so +// Wikidata can find Diageo when handed "DIAGEO PLC ORD 28 101/108P". +func TestWikidataProvider_GetLogo_HandlesYahooSecurityDescriptor(t *testing.T) { + searches := []string{} + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + q := r.URL.Query() + switch q.Get("action") { + case "wbsearchentities": + search := q.Get("search") + searches = append(searches, search) + // First two miss (raw + truncated-with-form), third hits ("DIAGEO"). 
+ if len(searches) <= 2 { + _, _ = io.WriteString(w, `{"search":[]}`) + } else { + _, _ = io.WriteString(w, `{"search":[{"id":"Q161140"}]}`) + } + case "wbgetclaims": + _, _ = io.WriteString(w, `{"claims":{"P154":[{"mainsnak":{"datavalue":{"value":"Diageo logo.svg"}}}]}}`) + default: + _, _ = io.WriteString(w, "PNGDATA") + } + })) + defer srv.Close() + + p := newWikidataProviderWithServer(srv) + result, err := p.GetLogo(context.Background(), "DGE.L", "DIAGEO PLC ORD 28 101/108P") + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if result.Source != "wikidata:Q161140" { + t.Errorf("Source = %q", result.Source) + } + if len(searches) != 3 { + t.Fatalf("expected 3 search calls (raw → with-form → bare), got %d: %v", len(searches), searches) + } + if searches[len(searches)-1] != "DIAGEO" { + t.Errorf("final search should be the bare name, got %q", searches[len(searches)-1]) } }