|
| 1 | +# Brand and Official Account Heuristics |
| 2 | +# Used to filter out brand, media, official, and organizational accounts |
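| 3 | +# from the influencer pool. Rules are evaluated as OR (any match triggers the flag). NOTE(review): confidence_weights below defines a weighted 0.7 score threshold instead; confirm which mechanism applies. |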
| 4 | + |
| 5 | +version: "2.0.0" |
| 6 | +updated: "2025-11-14" |
| 7 | + |
| 8 | +# Keywords in name or username (case-insensitive, word boundary match) |
| 9 | +name_keywords: |
| 10 | + official_indicators: |
| 11 | + - "official" |
| 12 | + - "team" # NOTE: May FP on gaming/esports teams ("Team Liquid player"). Recommend context check (gaming domain indicators). |
| 13 | + - "support" |
| 14 | + - "help" |
| 15 | + - "press" |
| 16 | + - "pr" |
| 17 | + - "media" |
| 18 | + - "news" |
| 19 | + - "newsroom" |
| 20 | + |
| 21 | + corporate_indicators: |
| 22 | + - "corp" |
| 23 | + - "inc" # NOTE: "Inc" as personal nickname (e.g., "John Doe Inc") may cause FP; use context/verified status to override |
| 24 | + - "ltd" |
| 25 | + - "llc" |
| 26 | + - "gmbh" |
| 27 | + - "company" |
| 28 | + - "enterprises" |
| 29 | + |
| 30 | + brand_commerce: |
| 31 | + - "store" |
| 32 | + - "shop" # NOTE: Would FP on longer words (e.g. "Shopify") under substring matching; the word-boundary match specified for name_keywords must be enforced in the implementation. |
| 33 | + - "shopping" |
| 34 | + - "deals" |
| 35 | + - "sales" |
| 36 | + - "promo" |
| 37 | + - "coupon" |
| 38 | + |
| 39 | + # Major tech companies and platforms (corporate accounts) |
| 40 | + tech_corporations: |
| 41 | + - "amazon web services" |
| 42 | + - "aws" |
| 43 | + - "microsoft azure" |
| 44 | + - "azure" |
| 45 | + - "google cloud" |
| 46 | + - "google ai" |
| 47 | + - "google deepmind" |
| 48 | + - "tensor flow" |
| 49 | + - "kaggle" |
| 50 | + - "docker" |
| 51 | + - "figma" |
| 52 | + - "mongodb" |
| 53 | + - "stripe" |
| 54 | + - "next.js" |
| 55 | + - "nextjs" |
| 56 | + - "vercel" |
| 57 | + - "netlify" |
| 58 | + - "cloudflare" |
| 59 | + - "tailwind css" |
| 60 | + - "tailwindcss" |
| 61 | + - "react" |
| 62 | + - "reactjs" |
| 63 | + - "vue.js" |
| 64 | + - "vuejs" |
| 65 | + - "angular" |
| 66 | + - "nvidia" |
| 67 | + - "github" |
| 68 | + - "gitlab" |
| 69 | + - "openai" |
| 70 | + - "anthropic" |
| 71 | + - "hugging face" |
| 72 | + - "huggingface" |
| 73 | + - "pytorch" |
| 74 | + - "tensorflow" |
| 75 | + - "def con" |
| 76 | + - "defcon" |
| 77 | + - "supabase" |
| 78 | + - "linear" |
| 79 | + - "remix" |
| 80 | + - "digitalocean" |
| 81 | + - "heroku" |
| 82 | + |
| 83 | + tech_frameworks: |
| 84 | + - "platform" |
| 85 | + - "framework" |
| 86 | + - "ecosystem" |
| 87 | + - "sdk" |
| 88 | + - "api" |
| 89 | + - "developer tools" |
| 90 | + - "open source" |
| 91 | + - "library" |
| 92 | + - "runtime" |
| 93 | + - "database" |
| 94 | + - "devtools" |
| 95 | + |
| 96 | + conference_events: |
| 97 | + - "conference" |
| 98 | + - "summit" |
| 99 | + - "keynote" |
| 100 | + - "event" |
| 101 | + - "festival" |
| 102 | + - "convention" |
| 103 | + - "meetup" |
| 104 | + |
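| 105 | +# Chinese official-account patterns (newly added Chinese official-account detection). NOTE(review): Chinese text has no word boundaries, so these likely need substring matching; confirm in implementation. |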
| 106 | + chinese_official: |
| 107 | + - "官号" |
| 108 | + - "官方" |
| 109 | + - "公司" |
| 110 | + - "企业" |
| 111 | + - "组织" |
| 112 | + - "机构" |
| 113 | + - "平台" |
| 114 | + - "官方账号" |
| 115 | + - "官方推特" |
| 116 | + - "客服" |
| 117 | + |
| 118 | +# Keywords in bio/description (case-insensitive, substring match) |
| 119 | +bio_keywords: |
| 120 | + organizational: |
| 121 | + - "official account" |
| 122 | + - "official twitter" |
| 123 | + - "official page" |
| 124 | + - "managed by" |
| 125 | + - "run by our team" |
| 126 | + - "corporate account" |
| 127 | + - "company news" |
| 128 | + - "press releases" |
| 129 | + - "media inquiries" |
| 130 | + - "for support" |
| 131 | + - "customer service" |
| 132 | + |
| 133 | + media_publishers: |
| 134 | + - "news outlet" |
| 135 | + - "news organization" |
| 136 | + - "media company" |
| 137 | + - "news network" |
| 138 | + - "publishing" |
| 139 | + - "journalist at" |
| 140 | + - "reporter for" |
| 141 | + - "editor at" |
| 142 | + |
| 143 | + aggregators_bots: |
| 144 | + - "automated" |
| 145 | + - "bot account" |
| 146 | + - "news aggregator" |
| 147 | + - "auto-tweet" |
| 148 | + - "rss feed" |
| 149 | + |
| 150 | +# Domain patterns in profile URL or bio links (regex) |
| 151 | +domain_patterns: |
| 152 | + - pattern: ".*\\.(gov|edu|org)$" |
| 153 | + reason: "institutional_domain" |
| 154 | + exceptions: ["github.org", "huggingface.co"] |
| 155 | + |
| 156 | + - pattern: ".*(shop|store|deals|buy|cart|checkout).*" |
| 157 | + reason: "ecommerce_domain" |
| 158 | + |
| 159 | + - pattern: ".*(news|press|media|journal|times|post|tribune).*" |
| 160 | + reason: "media_domain" |
| 161 | + |
| 162 | +# Verification status rules |
| 163 | +verification_rules: |
| 164 | + # X Blue (verified=blue) is personal; keep unless other heuristics match |
| 165 | + # Organization verification (verified=org) is always flagged |
| 166 | + flag_org_verification: true |
| 167 | + |
| 168 | + # Legacy verified (verified=legacy) before 2023: mixed (keep unless other heuristics match) |
| 169 | + # Gold verified (not in current schema): future-proof placeholder |
| 170 | + |
| 171 | +# Follower/following ratio heuristics (optional, low weight) |
| 172 | +ratio_heuristics: |
| 173 | + # If followers/following > 100 AND bio matches corporate, likely brand |
| 174 | + high_ratio_threshold: 100 |
| 175 | + # If following = 0, likely automated/official (but not sufficient alone) |
| 176 | + zero_following_flag: false |
| 177 | + |
| 178 | +# Known organization handles (AUTO-FILTER - these are always removed) |
| 179 | +# Format: handle (without @) - any match = immediate removal |
| 180 | +org_handles_blacklist: |
| 181 | + - "github" |
| 182 | + - "nvidia" |
| 183 | + - "awscloud" |
| 184 | + - "azure" |
| 185 | + - "reactjs" |
| 186 | + - "code" |
| 187 | + - "huggingface" |
| 188 | + - "googlecloud" |
| 189 | + - "docker" |
| 190 | + - "figma" |
| 191 | + - "mongodb" |
| 192 | + - "vuejs" |
| 193 | + - "nextjs" |
| 194 | + - "golang" |
| 195 | + - "netlify" |
| 196 | + - "cloudflare" |
| 197 | + - "supabase" |
| 198 | + - "linear" |
| 199 | + - "remix" |
| 200 | + - "gitlab" |
| 201 | + - "openai" |
| 202 | + - "anthropic" |
| 203 | + - "pytorch" |
| 204 | + - "tensorflow" |
| 205 | + - "vercel" |
| 206 | + - "digitalocean" |
| 207 | + - "heroku" |
| 208 | + - "stripe" |
| 209 | + - "tailwindcss" |
| 210 | + - "defcon" |
| 211 | + |
| 212 | +# Exceptions (handles that match heuristics but are known individuals) |
| 213 | +# Format: handle (without @) |
| 214 | +exceptions: |
| 215 | + - "example_personal_account" # Placeholder |
| 216 | + # Crypto/Web3 individual influencers (high-follower accounts that were incorrectly flagged as org) |
| 217 | + - "100trillionUSD" |
| 218 | + - "CredibleCrypto" |
| 219 | + - "CryptoGodJohn" |
| 220 | + - "TheCryptoDog" |
| 221 | + - "DocumentingBTC" |
| 222 | + - "PlanB" |
| 223 | + - "Loomdart" |
| 224 | + - "WhalePanda" |
| 225 | + - "Winklevoss" |
| 226 | + # Gaming/creator and other well-known individual accounts |
| 227 | + - "Ninja" |
| 228 | + - "DrDisrespect" |
| 229 | + - "MarkRuffalo" |
| 230 | + - "JackMa" |
| 231 | + - "JeffBezos" |
| 232 | + # Add known false positives here as they're discovered |
| 233 | + |
| 234 | +# Scoring adjustments (for fine-tuning, not hard filters) |
| 235 | +# These adjust the is_org / is_official confidence score |
| 236 | +confidence_weights: |
| 237 | + name_keyword_match: 0.6 |
| 238 | + bio_keyword_match: 0.4 |
| 239 | + domain_match: 0.8 |
| 240 | + org_verification: 1.0 |
| 241 | + |
| 242 | + # Threshold for flagging (sum of weights) |
| 243 | + flag_threshold: 0.7 # ≥0.7 → is_org or is_official = true |
0 commit comments