From 995810e97138da2afffb8cd6d83a7248a53155db Mon Sep 17 00:00:00 2001
From: mashbean
Date: Tue, 16 Jun 2026 18:15:11 +0800
Subject: [PATCH 1/2] feat(comment-spam): 3-tier moderation alerting to admin Telegram (notify-only)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A high spam score alone can't separate true spam from false positives: on
matters_prod (7-day, >=0.94 band) precision is only ~60% — escort ads (0.996)
score the same as 中文 creative writing (0.992) and short genuine replies.
Account age doesn't separate them either (an escort account was 818d old /
883 articles).

What cleanly partitions them (ZERO false positives on the real high-score
set) is a compound gate:

  Tier A (auto):   score>=threshold AND contact-channel AND solicitation-keyword
                   → escort / paid-services / account-selling / betting promo.
  Tier B (ring):   author repeats near-identical content across comments.
  Tier C (review): high score but neither → surface to humans, never auto-act
                   (creative writing / opinions / replies land here).

This wires the gate into detectSpam and surfaces all three tiers to the admin
Telegram chat by reusing the existing report-alert SQS → reportTelegramAlert
pipeline (new source 'spam_detection'). NOTIFY-ONLY: it never hides a comment —
auto-action stays behind the separate, still-off commentSpamAutoCollapse flag —
so we validate the gate's precision in production before enabling enforcement.
Gated by MATTERS_COMMENT_SPAM_ALERT (default off).

Signal logic lives in a pure, fully unit-tested module (commentSpamSignals.ts);
the ring check is one bounded read of the author's recent comments, run only
for the rare high-score comments.

Co-Authored-By: Claude Opus 4.8 (1M context) --- src/common/environment.ts | 5 + src/common/notifications/reportAlert.ts | 2 +- .../__test__/commentService.test.ts | 105 +++++++++++ .../__test__/commentSpamSignals.test.ts | 174 ++++++++++++++++++ src/connectors/commentService.ts | 96 ++++++++++ src/connectors/commentSpamSignals.ts | 142 ++++++++++++++ src/handlers/reportTelegramAlert.ts | 9 +- 7 files changed, 531 insertions(+), 2 deletions(-) create mode 100644 src/connectors/__test__/commentSpamSignals.test.ts create mode 100644 src/connectors/commentSpamSignals.ts diff --git a/src/common/environment.ts b/src/common/environment.ts index ea24848aa..fbc14c4a4 100644 --- a/src/common/environment.ts +++ b/src/common/environment.ts @@ -194,6 +194,11 @@ export const environment = { // 看見"). Default off so scoring stays observe-only until ops opts in. commentSpamAutoCollapse: process.env.MATTERS_COMMENT_SPAM_AUTO_COLLAPSE === 'true', + // When true, a comment whose spam score reaches the system spam threshold is + // classified (Tier A auto / B ring / C review) and surfaced to the admin + // Telegram chat for moderation. Notify-only: this flag NEVER hides a comment + // (auto-action stays behind commentSpamAutoCollapse). Default off. + commentSpamAlert: process.env.MATTERS_COMMENT_SPAM_ALERT === 'true', channelClassificationApiUrl: process.env.MATTERS_CHANNEL_CLASSIFICATION_API_URL || '', languageDetectionApiUrl: process.env.MATTERS_LANGUAGE_DETECTION_API_URL || '', diff --git a/src/common/notifications/reportAlert.ts b/src/common/notifications/reportAlert.ts index 771263da8..f2d387c86 100644 --- a/src/common/notifications/reportAlert.ts +++ b/src/common/notifications/reportAlert.ts @@ -14,7 +14,7 @@ const logger = getLogger('report-alert') * is the integration boundary between the two. 
*/ export type ReportAlertRequested = { - source: 'direct' | 'community_watch' + source: 'direct' | 'community_watch' | 'spam_detection' /** Aggregation key — same key within 24h edits the same Telegram message. */ dedupeKey: string /** Human-readable description of what was reported. */ diff --git a/src/connectors/__test__/commentService.test.ts b/src/connectors/__test__/commentService.test.ts index 89814fb3c..4651f5aac 100644 --- a/src/connectors/__test__/commentService.test.ts +++ b/src/connectors/__test__/commentService.test.ts @@ -6,10 +6,12 @@ import { COMMENT_TYPE, FEATURE_FLAG, FEATURE_NAME, + QUEUE_URL, USER_FEATURE_FLAG_TYPE, USER_STATE, } from '#common/enums/index.js' import { environment } from '#common/environment.js' +import { aws } from '#connectors/aws/index.js' import { PublicationService } from '../article/publicationService.js' import { AtomService } from '../atomService.js' @@ -865,3 +867,106 @@ describe('auto-collapse spam comments', () => { }) }) }) + +describe('spam telegram alert (notify-only tiering)', () => { + let targetTypeId: string + const originalQueue = QUEUE_URL.reportAlert + const originalSqsSend = aws.sqsSendMessage + let sent: Array> + + beforeAll(async () => { + const entityType = await atomService.findFirst({ + table: 'entity_type', + where: { table: 'article' }, + }) + targetTypeId = entityType.id + await systemService.setFeatureFlag({ + name: FEATURE_NAME.spam_detection, + flag: FEATURE_FLAG.on, + value: 0.8, + }) + ;(QUEUE_URL as { reportAlert: string }).reportAlert = + 'https://sqs.test/report-alert' + ;(aws as { sqsSendMessage: typeof aws.sqsSendMessage }).sqsSendMessage = + (async (params) => { + sent.push(params.messageBody as Record) + }) as typeof aws.sqsSendMessage + }) + + afterAll(() => { + ;(QUEUE_URL as { reportAlert: string }).reportAlert = + originalQueue as string + ;(aws as { sqsSendMessage: typeof aws.sqsSendMessage }).sqsSendMessage = + originalSqsSend + }) + + beforeEach(() => { + sent = [] + }) + + const 
createComment = async (content: string, authorId = '1') => + atomService.create({ + table: 'comment', + data: { + type: COMMENT_TYPE.article, + targetId: '1', + targetTypeId, + state: COMMENT_STATE.active, + uuid: uuidv4(), + authorId, + content, + }, + }) + + // _alertSpamIfHighScore is private; reach it directly for a focused unit test. + const alert = (id: string, score: number, content: string) => + ( + commentService as unknown as { + _alertSpamIfHighScore: ( + id: string, + score: number, + content: string + ) => Promise + } + )._alertSpamIfHighScore(id, score, content) + + test('emits Tier A (spam_auto) for contact + solicitation', async () => { + const content = '

賴 sk3826 台灣外送茶 約妹服務 官網 www.ppp8669.com

' + const comment = await createComment(content) + await alert(comment.id, 0.98, content) + expect(sent).toHaveLength(1) + expect(sent[0]).toMatchObject({ + source: 'spam_detection', + reason: 'spam_auto', + dedupeKey: `comment:${comment.id}`, + }) + }) + + test('emits Tier C (spam_review) for high-score benign-looking content', async () => { + const content = '

紀子璇作為楊羽棠管家的最後一天定在夏天的尾聲。

' + const comment = await createComment(content) + await alert(comment.id, 0.99, content) + expect(sent).toHaveLength(1) + expect(sent[0]).toMatchObject({ reason: 'spam_review' }) + }) + + test('emits Tier B (spam_ring) when the author repeats near-identical content', async () => { + const ringAuthor = '2' + const tmpl = (tag: string) => + `

加賴 ${tag} 全套服務到府 官網 www.x${tag}.com 約妹首選快來

` + await createComment(tmpl('aaa'), ringAuthor) + await createComment(tmpl('bbb'), ringAuthor) + await createComment(tmpl('ccc'), ringAuthor) + const latest = await createComment(tmpl('ddd'), ringAuthor) + await alert(latest.id, 0.99, tmpl('ddd')) + expect(sent).toHaveLength(1) + expect(sent[0]).toMatchObject({ reason: 'spam_ring' }) + }) + + test('stays silent when the score is below the system threshold', async () => { + const content = '

賴 台灣外送茶 約妹 www.ppp8669.com

' + const comment = await createComment(content) + await alert(comment.id, 0.5, content) + expect(sent).toHaveLength(0) + }) +}) diff --git a/src/connectors/__test__/commentSpamSignals.test.ts b/src/connectors/__test__/commentSpamSignals.test.ts new file mode 100644 index 000000000..e29e15254 --- /dev/null +++ b/src/connectors/__test__/commentSpamSignals.test.ts @@ -0,0 +1,174 @@ +import { + classifyContentTier, + hasContact, + hasSolicit, + jaccard, + nearDuplicate, + normalizeForDup, + shingles, + stripHtml, + TIER_REASON, +} from '../commentSpamSignals.js' + +// Real high-score examples from matters_prod (anonymized snippets) drive these +// cases: the >= 0.94 band mixes blatant spam with creative writing / replies, +// and only the contact+solicitation conjunction separates them cleanly. + +describe('stripHtml', () => { + test('removes tags, entities, and collapses whitespace', () => { + expect(stripHtml('

hi

 there now')).toBe('hi there now') + }) +}) + +describe('hasContact', () => { + test.each([ + 'Contact us ➤Telegram: @smmbigmarket', + 'Call Us – 8447779280 Call Girls', + '官網 www.ppp8669.com', + 'visit mtmis.org.pk for details', + 'add my line id: abc123', + ])('detects contact channel: %s', (text) => { + expect(hasContact(text)).toBe(true) + }) + + test.each([ + '問世間情為何物。孟婆的癡情令人心疼。', + '感謝你的分享,我很喜歡這個觀點。', + ])('does not fire on plain prose: %s', (text) => { + expect(hasContact(text)).toBe(false) + }) +}) + +describe('hasSolicit', () => { + test.each([ + '台灣外送茶推薦 約妹服務', + 'Where to Buy Verified Paxum Accounts', + 'football betting predictions and odds', + '全套服務 莞式', + ])('detects solicitation: %s', (text) => { + expect(hasSolicit(text)).toBe(true) + }) + + test('does not fire on benign creative writing', () => { + expect(hasSolicit('紀子璇作為楊羽棠管家的最後一天定在夏天的尾聲')).toBe(false) + }) +}) + +describe('classifyContentTier', () => { + const threshold = 0.94 + + test('returns null when there is no threshold', () => { + expect( + classifyContentTier({ score: 0.99, content: 'x', threshold: null }) + ).toBeNull() + }) + + test('returns null when the score is below the threshold', () => { + expect( + classifyContentTier({ score: 0.5, content: 'x', threshold }) + ).toBeNull() + }) + + test('Tier A (auto) for contact + solicitation', () => { + expect( + classifyContentTier({ + score: 0.984, + content: + '

賴/大G:sk3826 台灣外送茶推薦 約妹服務 官網 www.ppp8669.com

', + threshold, + }) + ).toBe('auto') + }) + + test('Tier A for English escort ad with phone', () => { + expect( + classifyContentTier({ + score: 0.996, + content: 'Call Us – 8447779280 Call Girls In Chanakyapuri Escorts', + threshold, + }) + ).toBe('auto') + }) + + test('Tier C (review) for high-score creative writing (no contact/solicit)', () => { + expect( + classifyContentTier({ + score: 0.992, + content: + '

紀子璇作為楊羽棠管家的最後一天定在夏天的尾聲,她還沒有找到工作。

', + threshold, + }) + ).toBe('review') + }) + + test('Tier C for a genuine reply that only mentions a keyword (no contact)', () => { + // "下注" alone (discussing own betting losses) must NOT be auto-acted. + expect( + classifyContentTier({ + score: 0.957, + content: '隔夜裂口 大勝後盲目下注 星期一及星期五共損手6萬', + threshold, + }) + ).toBe('review') + }) + + test('Tier C for an @mention reply (contact-pattern but no solicitation)', () => { + expect( + classifyContentTier({ + score: 0.992, + content: '@VietJoe333 請AI處理的,我懶得打馬賽克', + threshold, + }) + ).toBe('review') + }) +}) + +describe('near-duplicate ring helpers', () => { + test('shingles produces trigrams and handles short strings', () => { + expect(shingles('abcd')).toEqual(new Set(['abc', 'bcd'])) + expect(shingles('ab')).toEqual(new Set(['ab'])) + expect(shingles('')).toEqual(new Set()) + }) + + test('jaccard of identical sets is 1, disjoint is 0', () => { + expect(jaccard(new Set(['a']), new Set(['a']))).toBe(1) + expect(jaccard(new Set(['a']), new Set(['b']))).toBe(0) + expect(jaccard(new Set(), new Set())).toBe(0) + }) + + test('normalizeForDup masks urls, handles, digits, punctuation', () => { + expect(normalizeForDup('

Hi @bob! call 0912-345 http://x.co

')).toBe( + 'hicall' + ) + }) + + test('nearDuplicate matches the same template with rotated contact info', () => { + const a = '加賴 abc123 全套服務到府 官網 www.aaa.com 約妹首選' + const b = '加賴 xyz789 全套服務到府 官網 www.bbb.net 約妹首選' + expect(nearDuplicate(a, b)).toBe(true) + }) + + test('nearDuplicate does not merge genuinely different texts', () => { + expect( + nearDuplicate( + '今天天氣很好我去公園散步看到很多花', + '股市今天大跌我虧了很多錢心情很差' + ) + ).toBe(false) + }) + + test('nearDuplicate falls back to exact match for too-short content', () => { + expect(nearDuplicate('hi', 'hi')).toBe(true) + expect(nearDuplicate('hi', 'yo')).toBe(false) + }) +}) + +describe('TIER_REASON', () => { + test('maps every tier to a worker reason key', () => { + expect(TIER_REASON).toEqual({ + auto: 'spam_auto', + ring: 'spam_ring', + review: 'spam_review', + }) + }) +}) diff --git a/src/connectors/commentService.ts b/src/connectors/commentService.ts index 6cee1a968..9b2c069eb 100644 --- a/src/connectors/commentService.ts +++ b/src/connectors/commentService.ts @@ -30,9 +30,19 @@ import { ForbiddenError, UserInputError, } from '#common/errors.js' +import { enqueueReportAlert } from '#common/notifications/reportAlert.js' import { v4 } from 'uuid' import { BaseService } from './baseService.js' +import { + classifyContentTier, + nearDuplicate, + normalizeForDup, + stripHtml, + RING_MIN_FAMILY, + TIER_REASON, + type CommentSpamTier, +} from './commentSpamSignals.js' import { NotificationService } from './notification/notificationService.js' import { PaymentService } from './paymentService.js' import { SpamDetector } from './spamDetector.js' @@ -952,6 +962,9 @@ export class CommentService extends BaseService { where: { id }, data: { spamScore: score }, }) + if (environment.commentSpamAlert) { + await this._alertSpamIfHighScore(id, score, content) + } if (environment.commentSpamAutoCollapse) { await this._autoCollapseIfSpam(id, score) } @@ -959,6 +972,89 @@ export class CommentService extends BaseService { return score } + /** + 
* Classify a high-scoring comment into a moderation tier and surface it to the + * admin Telegram chat. NOTIFY-ONLY — this never hides a comment (auto-action + * lives in `_autoCollapseIfSpam`, gated separately). Gated by + * `commentSpamAlert`; a no-op when the env flag is off. + * + * Tier A (auto): contact + solicitation → blatant porn/escort/commercial. + * Tier B (ring): author repeats near-identical content >= RING_MIN_FAMILY. + * Tier C (review): high score but neither → human confirms (likely benign). + * + * See commentSpamSignals.ts for why score alone is insufficient. + */ + private _alertSpamIfHighScore = async ( + id: string, + score: number, + content: string + ) => { + const systemService = new SystemService(this.connections) + const threshold = await systemService.getSpamThreshold() + const contentTier = classifyContentTier({ score, content, threshold }) + if (!contentTier) { + return + } + + const comment = await this.models.commentIdLoader.load(id) + if (!comment) { + return + } + + // Tier B takes precedence: a confirmed ring is acted on regardless of the + // content-only tier (rings are what per-comment content scoring misses). + const isRing = await this._isAuthorRepeating(comment.authorId, content, id) + const tier: CommentSpamTier = isRing ? 'ring' : contentTier + + const author = await this.models.userIdLoader.load(comment.authorId) + const snippet = stripHtml(content).slice(0, 80) + await enqueueReportAlert({ + source: 'spam_detection', + dedupeKey: `comment:${id}`, + subject: `留言 @${author?.userName ?? comment.authorId}(${score.toFixed( + 2 + )}):${snippet}`, + reason: TIER_REASON[tier], + }) + } + + /** + * Tier B signal: does this author have >= RING_MIN_FAMILY other recent + * comments whose content is near-identical to this one? Bounded to the + * author's last 100 comments in 30 days so the per-comment cost stays small + * (only runs for the rare high-score comments). 
+ */ + private _isAuthorRepeating = async ( + authorId: string, + content: string, + excludeId: string + ): Promise => { + if (normalizeForDup(content).length < 8) { + return false + } + const rows = await this.knexRO('comment') + .select('content') + .where('author_id', authorId) + .whereNot('id', excludeId) + .andWhere( + 'created_at', + '>', + this.knexRO.raw("now() - interval '30 days'") + ) + .orderBy('id', 'desc') + .limit(100) + let similar = 0 + for (const row of rows) { + if (nearDuplicate(content, row.content || '')) { + similar++ + if (similar >= RING_MIN_FAMILY) { + return true + } + } + } + return false + } + /** * Collapse an active comment whose spam score reaches the system spam * threshold. Collapse (not deletion) keeps the comment foldable in-thread — diff --git a/src/connectors/commentSpamSignals.ts b/src/connectors/commentSpamSignals.ts new file mode 100644 index 000000000..e71f05aa0 --- /dev/null +++ b/src/connectors/commentSpamSignals.ts @@ -0,0 +1,142 @@ +/** + * Comment-spam classification signals. + * + * The comment-spam model gives a single score, but a high score alone is not + * enough to act: on real data (matters_prod, 7-day window) the >= 0.94 band is + * only ~60% precision — escort ads (0.996) score the same as 中文 creative + * writing (0.992) and short genuine replies. Score cannot separate them. + * + * What DOES separate "true spam/abuse/porn" from false positives is a compound + * gate. On the real high-score set this partition had ZERO false positives: + * + * Tier A (auto): score >= threshold AND has-contact AND has-solicitation + * → escort / paid-services / account-selling / betting promo. + * Tier B (ring): score >= threshold AND the author repeats near-identical + * content across comments → templated link-builder spam. + * Tier C (review): score >= threshold but neither A nor B → surface to humans, + * NEVER auto-act (this is where creative writing / opinions / + * short replies land; a human confirms they are not spam). 
+ * + * This module is pure (no I/O) so the gate is unit-testable in isolation; the + * ring check (Tier B) needs the author's recent comments and is performed by + * CommentService, which calls `nearDuplicate` from here. + */ + +/** char-3gram Jaccard threshold for counting two comments as near-duplicates. */ +export const RING_SIM_THRESHOLD = 0.8 +/** how many near-identical sibling comments make a confirmed ring. */ +export const RING_MIN_FAMILY = 3 + +/** + * Contact / solicitation channel signals: phone numbers, messaging handles, + * external links and domains. Deliberately broad — on its own it over-matches + * (a bare @mention to another user trips `@handle`), which is why Tier A + * requires BOTH this and a solicitation keyword. The conjunction is what makes + * it precise. + */ +export const CONTACT_RE = + /(telegram|what'?s\s?app|wechat|微信|line\s*(id|帳號)|賴|skype|t\.me\/|wa\.me\/|@[a-z0-9_]{4,}|\+?\d[\d\s().-]{7,}\d|https?:\/\/|www\.|\.(com|net|org|cn|pk|xyz|top|vip|me|info|biz|club|shop)\b)/i + +/** + * Solicitation keywords across the categories we want to catch: porn / escort, + * betting / gambling, account-selling and paid-service spam. Matching one of + * these is necessary-but-not-sufficient for Tier A (it also fires on a user + * *discussing* betting / crypto — hence the AND with CONTACT_RE). + */ +export const SOLICIT_RE = + /(escort|call\s*girls?|外送茶|約妹|約炮|叫小姐|上門服務|全套服務|莞式|spa\s*服務|按摩\s*(服務|到府)|betting|odds|predictions?|賠率|下注|博彩|彩票|百家樂|paxum|verified\s+accounts?|buy\s+[a-z ]*accounts?|usdt|代開|代辦|刷單|兼職日結|日結)/i + +export type CommentSpamTier = 'auto' | 'ring' | 'review' + +/** Reason enum value sent to the telegram alert worker per tier. */ +export const TIER_REASON: Record = { + auto: 'spam_auto', + ring: 'spam_ring', + review: 'spam_review', +} + +/** Strip HTML tags and collapse whitespace to plain text. 
*/ +export const stripHtml = (html: string): string => + html + .replace(/<[^>]+>/g, ' ') + .replace(/ /g, ' ') + .replace(/\s+/g, ' ') + .trim() + +export const hasContact = (text: string): boolean => CONTACT_RE.test(text) +export const hasSolicit = (text: string): boolean => SOLICIT_RE.test(text) + +/** + * Decide the content-only tier for a scored comment. + * - returns `null` when there is no threshold or the score is below it + * (nothing to surface); + * - returns `'auto'` for the high-precision contact+solicitation conjunction; + * - returns `'review'` otherwise (human-in-the-loop). + * Tier B (`'ring'`) is decided by the caller, which has the author's history. + */ +export const classifyContentTier = ({ + score, + content, + threshold, +}: { + score: number + content: string + threshold: number | null +}): 'auto' | 'review' | null => { + if (!threshold || score < threshold) { + return null + } + const text = stripHtml(content) + return hasContact(text) && hasSolicit(text) ? 'auto' : 'review' +} + +/** + * Canonicalize a comment for near-duplicate comparison: drop HTML, lowercase, + * mask the volatile bits a spammer rotates (urls, @handles, digits) and strip + * punctuation/spacing so only the stable template skeleton remains. + */ +export const normalizeForDup = (content: string): string => + stripHtml(content) + .toLowerCase() + .replace(/https?:\/\/\S+|www\.\S+/g, ' ') + .replace(/@[a-z0-9_]+/g, ' ') + .replace(/\d+/g, ' ') + .replace(/[^\p{Letter}\p{Number}]+/gu, '') + +/** Set of character n-grams (default trigrams) of a normalized string. 
*/ +export const shingles = (s: string, n = 3): Set => { + const out = new Set() + if (s.length < n) { + if (s) out.add(s) + return out + } + for (let i = 0; i + n <= s.length; i++) { + out.add(s.slice(i, i + n)) + } + return out +} + +export const jaccard = (a: Set, b: Set): number => { + if (a.size === 0 && b.size === 0) return 0 + let inter = 0 + for (const x of a) if (b.has(x)) inter++ + const union = a.size + b.size - inter + return union === 0 ? 0 : inter / union +} + +/** + * True when two raw comment contents are near-duplicates after canonicalization + * (char-3gram Jaccard >= `threshold`). Robust to rotated urls/@/digits and small + * edits without merging genuinely different texts. + */ +export const nearDuplicate = ( + a: string, + b: string, + threshold = RING_SIM_THRESHOLD +): boolean => { + const na = normalizeForDup(a) + const nb = normalizeForDup(b) + // too-short normalized text is unreliable for ring matching + if (na.length < 8 || nb.length < 8) return na === nb && na.length > 0 + return jaccard(shingles(na), shingles(nb)) >= threshold +} diff --git a/src/handlers/reportTelegramAlert.ts b/src/handlers/reportTelegramAlert.ts index 341a6dd7e..144ae82b7 100644 --- a/src/handlers/reportTelegramAlert.ts +++ b/src/handlers/reportTelegramAlert.ts @@ -22,6 +22,7 @@ const TELEGRAM_API_TIMEOUT_MS = 5000 const SOURCE_LABELS: Record = { direct: '🚨 站內檢舉', community_watch: '🛡️ 守望相助', + spam_detection: '🤖 留言垃圾偵測', } /** @@ -37,6 +38,10 @@ const REASON_LABELS: Record = { other: '其他', porn_ad: '色情/成人廣告', spam_ad: '濫發廣告', + // comment-spam detection tiers (source: spam_detection) + spam_auto: '高信度垃圾(色情/招攬/博弈)— 建議處置', + spam_ring: '重複貼文 ring — 建議處置', + spam_review: '高分待人工確認', } type DedupRecord = { @@ -79,7 +84,9 @@ const isValidPayload = (raw: unknown): raw is ReportAlertRequested => { if (!raw || typeof raw !== 'object') return false const v = raw as Record return ( - (v.source === 'direct' || v.source === 'community_watch') && + (v.source === 'direct' || + 
v.source === 'community_watch' || + v.source === 'spam_detection') && typeof v.dedupeKey === 'string' && v.dedupeKey.length > 0 && typeof v.subject === 'string' && From e1879a05a1292f467b2ca12e48e9e66b6194fe4d Mon Sep 17 00:00:00 2001 From: mashbean Date: Tue, 16 Jun 2026 18:23:23 +0800 Subject: [PATCH 2/2] fix(comment-spam): mask rotated contact IDs in ring normalization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ring near-duplicate check only stripped bare digits, so a rotated contact token (sk3826, abc123) left a letter remnant (sk, abc) and otherwise-identical spam templates failed to match. Drop whole alphanumeric tokens containing a digit instead — the IDs/phone numbers spammers rotate — while keeping pure-letter words so English templates still ring-match. Fixes the two failing ring tests. Co-Authored-By: Claude Opus 4.8 (1M context) --- src/connectors/__test__/commentService.test.ts | 15 ++++++++------- src/connectors/commentSpamSignals.ts | 6 +++++- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/src/connectors/__test__/commentService.test.ts b/src/connectors/__test__/commentService.test.ts index 4651f5aac..7fc228319 100644 --- a/src/connectors/__test__/commentService.test.ts +++ b/src/connectors/__test__/commentService.test.ts @@ -952,13 +952,14 @@ describe('spam telegram alert (notify-only tiering)', () => { test('emits Tier B (spam_ring) when the author repeats near-identical content', async () => { const ringAuthor = '2' - const tmpl = (tag: string) => - `

加賴 ${tag} 全套服務到府 官網 www.x${tag}.com 約妹首選快來

` - await createComment(tmpl('aaa'), ringAuthor) - await createComment(tmpl('bbb'), ringAuthor) - await createComment(tmpl('ccc'), ringAuthor) - const latest = await createComment(tmpl('ddd'), ringAuthor) - await alert(latest.id, 0.99, tmpl('ddd')) + // rotating contact IDs (with digits) between an otherwise-identical template + const tmpl = (n: number) => + `

加賴 vip${n}888 全套服務到府 官網 www.x${n}.com 約妹首選快來

` + await createComment(tmpl(1), ringAuthor) + await createComment(tmpl(2), ringAuthor) + await createComment(tmpl(3), ringAuthor) + const latest = await createComment(tmpl(4), ringAuthor) + await alert(latest.id, 0.99, tmpl(4)) expect(sent).toHaveLength(1) expect(sent[0]).toMatchObject({ reason: 'spam_ring' }) }) diff --git a/src/connectors/commentSpamSignals.ts b/src/connectors/commentSpamSignals.ts index e71f05aa0..82b54a08a 100644 --- a/src/connectors/commentSpamSignals.ts +++ b/src/connectors/commentSpamSignals.ts @@ -100,7 +100,11 @@ export const normalizeForDup = (content: string): string => .toLowerCase() .replace(/https?:\/\/\S+|www\.\S+/g, ' ') .replace(/@[a-z0-9_]+/g, ' ') - .replace(/\d+/g, ' ') + // drop whole alphanumeric tokens that contain a digit — these are the + // contact IDs / phone numbers a spammer rotates between otherwise-identical + // posts (sk3826, vip888, 0912-345...). Pure-letter words are kept so English + // templates still ring-match. + .replace(/[a-z0-9]*\d[a-z0-9]*/gi, ' ') .replace(/[^\p{Letter}\p{Number}]+/gu, '') /** Set of character n-grams (default trigrams) of a normalized string. */