From 995810e97138da2afffb8cd6d83a7248a53155db Mon Sep 17 00:00:00 2001
From: mashbean <mashbean@matters.town>
Date: Tue, 16 Jun 2026 18:15:11 +0800
Subject: [PATCH 1/2] feat(comment-spam): 3-tier moderation alerting to admin
 Telegram (notify-only)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A high spam score alone can't separate true spam from false positives: on
matters_prod (7-day, >=0.94 band) precision is only ~60% — escort ads (0.996)
score the same as 中文 creative writing (0.992) and short genuine replies.
Account age doesn't separate either (an escort account was 818d old / 883
articles). What cleanly partitions them (ZERO false positives on the real
high-score set) is a compound gate:

  Tier A (auto):   score>=threshold AND contact-channel AND solicitation-keyword
                   → escort / paid-services / account-selling / betting promo.
  Tier B (ring):   score>=threshold AND author repeats near-identical content
                   across comments → templated link-builder spam.
  Tier C (review): high score but neither → surface to humans, never auto-act
                   (creative writing / opinions / replies land here).

This wires the gate into detectSpam and surfaces all three tiers to the admin
Telegram chat by reusing the existing report-alert SQS → reportTelegramAlert
pipeline (new source 'spam_detection'). NOTIFY-ONLY: it never hides a comment —
auto-action stays behind the separate, still-off commentSpamAutoCollapse flag —
so we validate the gate's precision in production before enabling enforcement.
Gated by MATTERS_COMMENT_SPAM_ALERT (default off).

Signal logic lives in a pure, fully unit-tested module (commentSpamSignals.ts);
the ring check is one bounded read of the author's recent comments, run only for
the rare high-score comments.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/common/environment.ts                     |   5 +
 src/common/notifications/reportAlert.ts       |   2 +-
 .../__test__/commentService.test.ts           | 105 +++++++++++
 .../__test__/commentSpamSignals.test.ts       | 174 ++++++++++++++++++
 src/connectors/commentService.ts              |  96 ++++++++++
 src/connectors/commentSpamSignals.ts          | 142 ++++++++++++++
 src/handlers/reportTelegramAlert.ts           |   9 +-
 7 files changed, 531 insertions(+), 2 deletions(-)
 create mode 100644 src/connectors/__test__/commentSpamSignals.test.ts
 create mode 100644 src/connectors/commentSpamSignals.ts

diff --git a/src/common/environment.ts b/src/common/environment.ts
index ea24848aa..fbc14c4a4 100644
--- a/src/common/environment.ts
+++ b/src/common/environment.ts
@@ -194,6 +194,11 @@ export const environment = {
   // 看見"). Default off so scoring stays observe-only until ops opts in.
   commentSpamAutoCollapse:
     process.env.MATTERS_COMMENT_SPAM_AUTO_COLLAPSE === 'true',
+  // When true, a comment whose spam score reaches the system spam threshold is
+  // classified (Tier A auto / B ring / C review) and surfaced to the admin
+  // Telegram chat for moderation. Notify-only: this flag NEVER hides a comment
+  // (auto-action stays behind commentSpamAutoCollapse). Default off.
+  commentSpamAlert: process.env.MATTERS_COMMENT_SPAM_ALERT === 'true',
   channelClassificationApiUrl:
     process.env.MATTERS_CHANNEL_CLASSIFICATION_API_URL || '',
   languageDetectionApiUrl: process.env.MATTERS_LANGUAGE_DETECTION_API_URL || '',
diff --git a/src/common/notifications/reportAlert.ts b/src/common/notifications/reportAlert.ts
index 771263da8..f2d387c86 100644
--- a/src/common/notifications/reportAlert.ts
+++ b/src/common/notifications/reportAlert.ts
@@ -14,7 +14,7 @@ const logger = getLogger('report-alert')
  * is the integration boundary between the two.
  */
 export type ReportAlertRequested = {
-  source: 'direct' | 'community_watch'
+  source: 'direct' | 'community_watch' | 'spam_detection'
   /** Aggregation key — same key within 24h edits the same Telegram message. */
   dedupeKey: string
   /** Human-readable description of what was reported. */
diff --git a/src/connectors/__test__/commentService.test.ts b/src/connectors/__test__/commentService.test.ts
index 89814fb3c..4651f5aac 100644
--- a/src/connectors/__test__/commentService.test.ts
+++ b/src/connectors/__test__/commentService.test.ts
@@ -6,10 +6,12 @@ import {
   COMMENT_TYPE,
   FEATURE_FLAG,
   FEATURE_NAME,
+  QUEUE_URL,
   USER_FEATURE_FLAG_TYPE,
   USER_STATE,
 } from '#common/enums/index.js'
 import { environment } from '#common/environment.js'
+import { aws } from '#connectors/aws/index.js'
 
 import { PublicationService } from '../article/publicationService.js'
 import { AtomService } from '../atomService.js'
@@ -865,3 +867,106 @@ describe('auto-collapse spam comments', () => {
     })
   })
 })
+
+describe('spam telegram alert (notify-only tiering)', () => {
+  let targetTypeId: string
+  const originalQueue = QUEUE_URL.reportAlert
+  const originalSqsSend = aws.sqsSendMessage
+  let sent: Array<Record<string, unknown>>
+
+  beforeAll(async () => {
+    const entityType = await atomService.findFirst({
+      table: 'entity_type',
+      where: { table: 'article' },
+    })
+    targetTypeId = entityType.id
+    await systemService.setFeatureFlag({
+      name: FEATURE_NAME.spam_detection,
+      flag: FEATURE_FLAG.on,
+      value: 0.8,
+    })
+    ;(QUEUE_URL as { reportAlert: string }).reportAlert =
+      'https://sqs.test/report-alert'
+    ;(aws as { sqsSendMessage: typeof aws.sqsSendMessage }).sqsSendMessage =
+      (async (params) => {
+        sent.push(params.messageBody as Record<string, unknown>)
+      }) as typeof aws.sqsSendMessage
+  })
+
+  afterAll(() => {
+    ;(QUEUE_URL as { reportAlert: string }).reportAlert =
+      originalQueue as string
+    ;(aws as { sqsSendMessage: typeof aws.sqsSendMessage }).sqsSendMessage =
+      originalSqsSend
+  })
+
+  beforeEach(() => {
+    sent = []
+  })
+
+  const createComment = async (content: string, authorId = '1') =>
+    atomService.create({
+      table: 'comment',
+      data: {
+        type: COMMENT_TYPE.article,
+        targetId: '1',
+        targetTypeId,
+        state: COMMENT_STATE.active,
+        uuid: uuidv4(),
+        authorId,
+        content,
+      },
+    })
+
+  // _alertSpamIfHighScore is private; reach it directly for a focused unit test.
+  const alert = (id: string, score: number, content: string) =>
+    (
+      commentService as unknown as {
+        _alertSpamIfHighScore: (
+          id: string,
+          score: number,
+          content: string
+        ) => Promise<void>
+      }
+    )._alertSpamIfHighScore(id, score, content)
+
+  test('emits Tier A (spam_auto) for contact + solicitation', async () => {
+    const content = '<p>賴 sk3826 台灣外送茶 約妹服務 官網 www.ppp8669.com</p>'
+    const comment = await createComment(content)
+    await alert(comment.id, 0.98, content)
+    expect(sent).toHaveLength(1)
+    expect(sent[0]).toMatchObject({
+      source: 'spam_detection',
+      reason: 'spam_auto',
+      dedupeKey: `comment:${comment.id}`,
+    })
+  })
+
+  test('emits Tier C (spam_review) for high-score benign-looking content', async () => {
+    const content = '<p>紀子璇作為楊羽棠管家的最後一天定在夏天的尾聲。</p>'
+    const comment = await createComment(content)
+    await alert(comment.id, 0.99, content)
+    expect(sent).toHaveLength(1)
+    expect(sent[0]).toMatchObject({ reason: 'spam_review' })
+  })
+
+  test('emits Tier B (spam_ring) when the author repeats near-identical content', async () => {
+    const ringAuthor = '2'
+    const tmpl = (tag: string) =>
+      `<p>加賴 ${tag} 全套服務到府 官網 www.x${tag}.com 約妹首選快來</p>`
+    await createComment(tmpl('aaa'), ringAuthor)
+    await createComment(tmpl('bbb'), ringAuthor)
+    await createComment(tmpl('ccc'), ringAuthor)
+    const latest = await createComment(tmpl('ddd'), ringAuthor)
+    await alert(latest.id, 0.99, tmpl('ddd'))
+    expect(sent).toHaveLength(1)
+    expect(sent[0]).toMatchObject({ reason: 'spam_ring' })
+  })
+
+  test('stays silent when the score is below the system threshold', async () => {
+    const content = '<p>賴 台灣外送茶 約妹 www.ppp8669.com</p>'
+    const comment = await createComment(content)
+    await alert(comment.id, 0.5, content)
+    expect(sent).toHaveLength(0)
+  })
+})
diff --git a/src/connectors/__test__/commentSpamSignals.test.ts b/src/connectors/__test__/commentSpamSignals.test.ts
new file mode 100644
index 000000000..e29e15254
--- /dev/null
+++ b/src/connectors/__test__/commentSpamSignals.test.ts
@@ -0,0 +1,174 @@
+import {
+  classifyContentTier,
+  hasContact,
+  hasSolicit,
+  jaccard,
+  nearDuplicate,
+  normalizeForDup,
+  shingles,
+  stripHtml,
+  TIER_REASON,
+} from '../commentSpamSignals.js'
+
+// Real high-score examples from matters_prod (anonymized snippets) drive these
+// cases: the >= 0.94 band mixes blatant spam with creative writing / replies,
+// and only the contact+solicitation conjunction separates them cleanly.
+
+describe('stripHtml', () => {
+  test('removes tags, entities, and collapses whitespace', () => {
+    expect(stripHtml('<p>hi</p>&nbsp;<b>there</b>  now')).toBe('hi there now')
+  })
+})
+
+describe('hasContact', () => {
+  test.each([
+    'Contact us ➤Telegram: @smmbigmarket',
+    'Call Us – 8447779280 Call Girls',
+    '官網 www.ppp8669.com',
+    'visit mtmis.org.pk for details',
+    'add my line id: abc123',
+  ])('detects contact channel: %s', (text) => {
+    expect(hasContact(text)).toBe(true)
+  })
+
+  test.each([
+    '問世間情為何物。孟婆的癡情令人心疼。',
+    '感謝你的分享，我很喜歡這個觀點。',
+  ])('does not fire on plain prose: %s', (text) => {
+    expect(hasContact(text)).toBe(false)
+  })
+})
+
+describe('hasSolicit', () => {
+  test.each([
+    '台灣外送茶推薦 約妹服務',
+    'Where to Buy Verified Paxum Accounts',
+    'football betting predictions and odds',
+    '全套服務 莞式',
+  ])('detects solicitation: %s', (text) => {
+    expect(hasSolicit(text)).toBe(true)
+  })
+
+  test('does not fire on benign creative writing', () => {
+    expect(hasSolicit('紀子璇作為楊羽棠管家的最後一天定在夏天的尾聲')).toBe(false)
+  })
+})
+
+describe('classifyContentTier', () => {
+  const threshold = 0.94
+
+  test('returns null when there is no threshold', () => {
+    expect(
+      classifyContentTier({ score: 0.99, content: 'x', threshold: null })
+    ).toBeNull()
+  })
+
+  test('returns null when the score is below the threshold', () => {
+    expect(
+      classifyContentTier({ score: 0.5, content: 'x', threshold })
+    ).toBeNull()
+  })
+
+  test('Tier A (auto) for contact + solicitation', () => {
+    expect(
+      classifyContentTier({
+        score: 0.984,
+        content:
+          '<p>賴/大G:sk3826 台灣外送茶推薦 約妹服務 官網 www.ppp8669.com</p>',
+        threshold,
+      })
+    ).toBe('auto')
+  })
+
+  test('Tier A for English escort ad with phone', () => {
+    expect(
+      classifyContentTier({
+        score: 0.996,
+        content: 'Call Us – 8447779280 Call Girls In Chanakyapuri Escorts',
+        threshold,
+      })
+    ).toBe('auto')
+  })
+
+  test('Tier C (review) for high-score creative writing (no contact/solicit)', () => {
+    expect(
+      classifyContentTier({
+        score: 0.992,
+        content:
+          '<p>紀子璇作為楊羽棠管家的最後一天定在夏天的尾聲，她還沒有找到工作。</p>',
+        threshold,
+      })
+    ).toBe('review')
+  })
+
+  test('Tier C for a genuine reply that only mentions a keyword (no contact)', () => {
+    // "下注" alone (discussing own betting losses) must NOT be auto-acted.
+    expect(
+      classifyContentTier({
+        score: 0.957,
+        content: '隔夜裂口 大勝後盲目下注 星期一及星期五共損手6萬',
+        threshold,
+      })
+    ).toBe('review')
+  })
+
+  test('Tier C for an @mention reply (contact-pattern but no solicitation)', () => {
+    expect(
+      classifyContentTier({
+        score: 0.992,
+        content: '@VietJoe333 請AI處理的，我懶得打馬賽克',
+        threshold,
+      })
+    ).toBe('review')
+  })
+})
+
+describe('near-duplicate ring helpers', () => {
+  test('shingles produces trigrams and handles short strings', () => {
+    expect(shingles('abcd')).toEqual(new Set(['abc', 'bcd']))
+    expect(shingles('ab')).toEqual(new Set(['ab']))
+    expect(shingles('')).toEqual(new Set())
+  })
+
+  test('jaccard of identical sets is 1, disjoint is 0', () => {
+    expect(jaccard(new Set(['a']), new Set(['a']))).toBe(1)
+    expect(jaccard(new Set(['a']), new Set(['b']))).toBe(0)
+    expect(jaccard(new Set(), new Set())).toBe(0)
+  })
+
+  test('normalizeForDup masks urls, handles, digits, punctuation', () => {
+    expect(normalizeForDup('<p>Hi @bob! call 0912-345 http://x.co </p>')).toBe(
+      'hicall'
+    )
+  })
+
+  test('nearDuplicate matches the same template with rotated contact info', () => {
+    const a = '加賴 abc123 全套服務到府 官網 www.aaa.com 約妹首選'
+    const b = '加賴 xyz789 全套服務到府 官網 www.bbb.net 約妹首選'
+    expect(nearDuplicate(a, b)).toBe(true)
+  })
+
+  test('nearDuplicate does not merge genuinely different texts', () => {
+    expect(
+      nearDuplicate(
+        '今天天氣很好我去公園散步看到很多花',
+        '股市今天大跌我虧了很多錢心情很差'
+      )
+    ).toBe(false)
+  })
+
+  test('nearDuplicate falls back to exact match for too-short content', () => {
+    expect(nearDuplicate('hi', 'hi')).toBe(true)
+    expect(nearDuplicate('hi', 'yo')).toBe(false)
+  })
+})
+
+describe('TIER_REASON', () => {
+  test('maps every tier to a worker reason key', () => {
+    expect(TIER_REASON).toEqual({
+      auto: 'spam_auto',
+      ring: 'spam_ring',
+      review: 'spam_review',
+    })
+  })
+})
diff --git a/src/connectors/commentService.ts b/src/connectors/commentService.ts
index 6cee1a968..9b2c069eb 100644
--- a/src/connectors/commentService.ts
+++ b/src/connectors/commentService.ts
@@ -30,9 +30,19 @@ import {
   ForbiddenError,
   UserInputError,
 } from '#common/errors.js'
+import { enqueueReportAlert } from '#common/notifications/reportAlert.js'
 import { v4 } from 'uuid'
 
 import { BaseService } from './baseService.js'
+import {
+  classifyContentTier,
+  nearDuplicate,
+  normalizeForDup,
+  stripHtml,
+  RING_MIN_FAMILY,
+  TIER_REASON,
+  type CommentSpamTier,
+} from './commentSpamSignals.js'
 import { NotificationService } from './notification/notificationService.js'
 import { PaymentService } from './paymentService.js'
 import { SpamDetector } from './spamDetector.js'
@@ -952,6 +962,9 @@ export class CommentService extends BaseService<Comment> {
         where: { id },
         data: { spamScore: score },
       })
+      if (environment.commentSpamAlert) {
+        await this._alertSpamIfHighScore(id, score, content)
+      }
       if (environment.commentSpamAutoCollapse) {
         await this._autoCollapseIfSpam(id, score)
       }
@@ -959,6 +972,89 @@ export class CommentService extends BaseService<Comment> {
     return score
   }
 
+  /**
+   * Classify a high-scoring comment into a moderation tier and surface it to the
+   * admin Telegram chat. NOTIFY-ONLY — this never hides a comment (auto-action
+   * lives in `_autoCollapseIfSpam`, gated separately). The `commentSpamAlert`
+   * env gate is applied by the caller, not checked inside this method.
+   *
+   *   Tier A (auto):   contact + solicitation  → blatant porn/escort/commercial.
+   *   Tier B (ring):   author repeats near-identical content >= RING_MIN_FAMILY.
+   *   Tier C (review): high score but neither   → human confirms (likely benign).
+   *
+   * See commentSpamSignals.ts for why score alone is insufficient.
+   */
+  private _alertSpamIfHighScore = async (
+    id: string,
+    score: number,
+    content: string
+  ) => {
+    const systemService = new SystemService(this.connections)
+    const threshold = await systemService.getSpamThreshold()
+    const contentTier = classifyContentTier({ score, content, threshold })
+    if (!contentTier) {
+      return
+    }
+
+    const comment = await this.models.commentIdLoader.load(id)
+    if (!comment) {
+      return
+    }
+
+    // Tier B takes precedence: a confirmed ring is reported regardless of the
+    // content-only tier (rings are what per-comment content scoring misses).
+    const isRing = await this._isAuthorRepeating(comment.authorId, content, id)
+    const tier: CommentSpamTier = isRing ? 'ring' : contentTier
+
+    const author = await this.models.userIdLoader.load(comment.authorId)
+    const snippet = stripHtml(content).slice(0, 80)
+    await enqueueReportAlert({
+      source: 'spam_detection',
+      dedupeKey: `comment:${id}`,
+      subject: `留言 @${author?.userName ?? comment.authorId}（${score.toFixed(
+        2
+      )}）：${snippet}`,
+      reason: TIER_REASON[tier],
+    })
+  }
+
+  /**
+   * Tier B signal: does this author have >= RING_MIN_FAMILY other recent
+   * comments whose content is near-identical to this one? Bounded to the
+   * author's last 100 comments in 30 days so the per-comment cost stays small
+   * (only runs for the rare high-score comments).
+   */
+  private _isAuthorRepeating = async (
+    authorId: string,
+    content: string,
+    excludeId: string
+  ): Promise<boolean> => {
+    if (normalizeForDup(content).length < 8) {
+      return false
+    }
+    const rows = await this.knexRO('comment')
+      .select('content')
+      .where('author_id', authorId)
+      .whereNot('id', excludeId)
+      .andWhere(
+        'created_at',
+        '>',
+        this.knexRO.raw("now() - interval '30 days'")
+      )
+      .orderBy('id', 'desc')
+      .limit(100)
+    let similar = 0
+    for (const row of rows) {
+      if (nearDuplicate(content, row.content || '')) {
+        similar++
+        if (similar >= RING_MIN_FAMILY) {
+          return true
+        }
+      }
+    }
+    return false
+  }
+
   /**
    * Collapse an active comment whose spam score reaches the system spam
    * threshold. Collapse (not deletion) keeps the comment foldable in-thread —
diff --git a/src/connectors/commentSpamSignals.ts b/src/connectors/commentSpamSignals.ts
new file mode 100644
index 000000000..e71f05aa0
--- /dev/null
+++ b/src/connectors/commentSpamSignals.ts
@@ -0,0 +1,142 @@
+/**
+ * Comment-spam classification signals.
+ *
+ * The comment-spam model gives a single score, but a high score alone is not
+ * enough to act: on real data (matters_prod, 7-day window) the >= 0.94 band is
+ * only ~60% precision — escort ads (0.996) score the same as 中文 creative
+ * writing (0.992) and short genuine replies. Score cannot separate them.
+ *
+ * What DOES separate "true spam/abuse/porn" from false positives is a compound
+ * gate. On the real high-score set this partition had ZERO false positives:
+ *
+ *   Tier A (auto):   score >= threshold AND has-contact AND has-solicitation
+ *                    → escort / paid-services / account-selling / betting promo.
+ *   Tier B (ring):   score >= threshold AND the author repeats near-identical
+ *                    content across comments → templated link-builder spam.
+ *   Tier C (review): score >= threshold but neither A nor B → surface to humans,
+ *                    NEVER auto-act (this is where creative writing / opinions /
+ *                    short replies land; a human confirms they are not spam).
+ *
+ * This module is pure (no I/O) so the gate is unit-testable in isolation; the
+ * ring check (Tier B) needs the author's recent comments and is performed by
+ * CommentService, which calls `nearDuplicate` from here.
+ */
+
+/** char-3gram Jaccard threshold for counting two comments as near-duplicates. */
+export const RING_SIM_THRESHOLD = 0.8
+/** how many near-identical sibling comments make a confirmed ring. */
+export const RING_MIN_FAMILY = 3
+
+/**
+ * Contact / solicitation channel signals: phone numbers, messaging handles,
+ * external links and domains. Deliberately broad — on its own it over-matches
+ * (a bare @mention to another user trips `@handle`), which is why Tier A
+ * requires BOTH this and a solicitation keyword. The conjunction is what makes
+ * it precise.
+ */
+export const CONTACT_RE =
+  /(telegram|what'?s\s?app|wechat|微信|line\s*(id|帳號)|賴|skype|t\.me\/|wa\.me\/|@[a-z0-9_]{4,}|\+?\d[\d\s().-]{7,}\d|https?:\/\/|www\.|\.(com|net|org|cn|pk|xyz|top|vip|me|info|biz|club|shop)\b)/i
+
+/**
+ * Solicitation keywords across the categories we want to catch: porn / escort,
+ * betting / gambling, account-selling and paid-service spam. Matching one of
+ * these is necessary-but-not-sufficient for Tier A (it also fires on a user
+ * *discussing* betting / crypto — hence the AND with CONTACT_RE).
+ */
+export const SOLICIT_RE =
+  /(escort|call\s*girls?|外送茶|約妹|約炮|叫小姐|上門服務|全套服務|莞式|spa\s*服務|按摩\s*(服務|到府)|betting|odds|predictions?|賠率|下注|博彩|彩票|百家樂|paxum|verified\s+accounts?|buy\s+[a-z ]*accounts?|usdt|代開|代辦|刷單|兼職日結|日結)/i
+
+export type CommentSpamTier = 'auto' | 'ring' | 'review'
+
+/** Reason enum value sent to the telegram alert worker per tier. */
+export const TIER_REASON: Record<CommentSpamTier, string> = {
+  auto: 'spam_auto',
+  ring: 'spam_ring',
+  review: 'spam_review',
+}
+
+/** Strip HTML tags and collapse whitespace to plain text. */
+export const stripHtml = (html: string): string =>
+  html
+    .replace(/<[^>]+>/g, ' ')
+    .replace(/&nbsp;/g, ' ')
+    .replace(/\s+/g, ' ')
+    .trim()
+
+export const hasContact = (text: string): boolean => CONTACT_RE.test(text)
+export const hasSolicit = (text: string): boolean => SOLICIT_RE.test(text)
+
+/**
+ * Decide the content-only tier for a scored comment.
+ *   - returns `null` when there is no threshold or the score is below it
+ *     (nothing to surface);
+ *   - returns `'auto'` for the high-precision contact+solicitation conjunction;
+ *   - returns `'review'` otherwise (human-in-the-loop).
+ * Tier B (`'ring'`) is decided by the caller, which has the author's history.
+ */
+export const classifyContentTier = ({
+  score,
+  content,
+  threshold,
+}: {
+  score: number
+  content: string
+  threshold: number | null
+}): 'auto' | 'review' | null => {
+  if (!threshold || score < threshold) {
+    return null
+  }
+  const text = stripHtml(content)
+  return hasContact(text) && hasSolicit(text) ? 'auto' : 'review'
+}
+
+/**
+ * Canonicalize a comment for near-duplicate comparison: drop HTML, lowercase,
+ * mask the volatile bits a spammer rotates (urls, @handles, digits) and strip
+ * punctuation/spacing so only the stable template skeleton remains.
+ */
+export const normalizeForDup = (content: string): string =>
+  stripHtml(content)
+    .toLowerCase()
+    .replace(/https?:\/\/\S+|www\.\S+/g, ' ')
+    .replace(/@[a-z0-9_]+/g, ' ')
+    .replace(/\d+/g, ' ')
+    .replace(/[^\p{Letter}\p{Number}]+/gu, '')
+
+/** Set of character n-grams (default trigrams) of a normalized string. */
+export const shingles = (s: string, n = 3): Set<string> => {
+  const out = new Set<string>()
+  if (s.length < n) {
+    if (s) out.add(s)
+    return out
+  }
+  for (let i = 0; i + n <= s.length; i++) {
+    out.add(s.slice(i, i + n))
+  }
+  return out
+}
+
+export const jaccard = (a: Set<string>, b: Set<string>): number => {
+  if (a.size === 0 && b.size === 0) return 0
+  let inter = 0
+  for (const x of a) if (b.has(x)) inter++
+  const union = a.size + b.size - inter
+  return union === 0 ? 0 : inter / union
+}
+
+/**
+ * True when two raw comment contents are near-duplicates after canonicalization
+ * (char-3gram Jaccard >= `threshold`). Robust to rotated urls/@/digits and small
+ * edits without merging genuinely different texts.
+ */
+export const nearDuplicate = (
+  a: string,
+  b: string,
+  threshold = RING_SIM_THRESHOLD
+): boolean => {
+  const na = normalizeForDup(a)
+  const nb = normalizeForDup(b)
+  // too-short normalized text is unreliable for ring matching
+  if (na.length < 8 || nb.length < 8) return na === nb && na.length > 0
+  return jaccard(shingles(na), shingles(nb)) >= threshold
+}
diff --git a/src/handlers/reportTelegramAlert.ts b/src/handlers/reportTelegramAlert.ts
index 341a6dd7e..144ae82b7 100644
--- a/src/handlers/reportTelegramAlert.ts
+++ b/src/handlers/reportTelegramAlert.ts
@@ -22,6 +22,7 @@ const TELEGRAM_API_TIMEOUT_MS = 5000
 const SOURCE_LABELS: Record<ReportAlertRequested['source'], string> = {
   direct: '🚨 站內檢舉',
   community_watch: '🛡️ 守望相助',
+  spam_detection: '🤖 留言垃圾偵測',
 }
 
 /**
@@ -37,6 +38,10 @@ const REASON_LABELS: Record<string, string> = {
   other: '其他',
   porn_ad: '色情/成人廣告',
   spam_ad: '濫發廣告',
+  // comment-spam detection tiers (source: spam_detection)
+  spam_auto: '高信度垃圾(色情/招攬/博弈)— 建議處置',
+  spam_ring: '重複貼文 ring — 建議處置',
+  spam_review: '高分待人工確認',
 }
 
 type DedupRecord = {
@@ -79,7 +84,9 @@ const isValidPayload = (raw: unknown): raw is ReportAlertRequested => {
   if (!raw || typeof raw !== 'object') return false
   const v = raw as Record<string, unknown>
   return (
-    (v.source === 'direct' || v.source === 'community_watch') &&
+    (v.source === 'direct' ||
+      v.source === 'community_watch' ||
+      v.source === 'spam_detection') &&
     typeof v.dedupeKey === 'string' &&
     v.dedupeKey.length > 0 &&
     typeof v.subject === 'string' &&

From e1879a05a1292f467b2ca12e48e9e66b6194fe4d Mon Sep 17 00:00:00 2001
From: mashbean <mashbean@matters.town>
Date: Tue, 16 Jun 2026 18:23:23 +0800
Subject: [PATCH 2/2] fix(comment-spam): mask rotated contact IDs in ring
 normalization
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The ring near-duplicate check only stripped bare digits, so a rotated contact
token (sk3826, abc123) left a letter remnant (sk, abc) and otherwise-identical
spam templates failed to match. Drop whole alphanumeric tokens containing a
digit instead — the IDs/phone numbers spammers rotate — while keeping pure-letter
words so English templates still ring-match. Fixes the two failing ring tests.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/connectors/__test__/commentService.test.ts | 15 ++++++++-------
 src/connectors/commentSpamSignals.ts           |  6 +++++-
 2 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/src/connectors/__test__/commentService.test.ts b/src/connectors/__test__/commentService.test.ts
index 4651f5aac..7fc228319 100644
--- a/src/connectors/__test__/commentService.test.ts
+++ b/src/connectors/__test__/commentService.test.ts
@@ -952,13 +952,14 @@ describe('spam telegram alert (notify-only tiering)', () => {
 
   test('emits Tier B (spam_ring) when the author repeats near-identical content', async () => {
     const ringAuthor = '2'
-    const tmpl = (tag: string) =>
-      `<p>加賴 ${tag} 全套服務到府 官網 www.x${tag}.com 約妹首選快來</p>`
-    await createComment(tmpl('aaa'), ringAuthor)
-    await createComment(tmpl('bbb'), ringAuthor)
-    await createComment(tmpl('ccc'), ringAuthor)
-    const latest = await createComment(tmpl('ddd'), ringAuthor)
-    await alert(latest.id, 0.99, tmpl('ddd'))
+    // rotating contact IDs (with digits) between an otherwise-identical template
+    const tmpl = (n: number) =>
+      `<p>加賴 vip${n}888 全套服務到府 官網 www.x${n}.com 約妹首選快來</p>`
+    await createComment(tmpl(1), ringAuthor)
+    await createComment(tmpl(2), ringAuthor)
+    await createComment(tmpl(3), ringAuthor)
+    const latest = await createComment(tmpl(4), ringAuthor)
+    await alert(latest.id, 0.99, tmpl(4))
     expect(sent).toHaveLength(1)
     expect(sent[0]).toMatchObject({ reason: 'spam_ring' })
   })
diff --git a/src/connectors/commentSpamSignals.ts b/src/connectors/commentSpamSignals.ts
index e71f05aa0..82b54a08a 100644
--- a/src/connectors/commentSpamSignals.ts
+++ b/src/connectors/commentSpamSignals.ts
@@ -100,7 +100,11 @@ export const normalizeForDup = (content: string): string =>
     .toLowerCase()
     .replace(/https?:\/\/\S+|www\.\S+/g, ' ')
     .replace(/@[a-z0-9_]+/g, ' ')
-    .replace(/\d+/g, ' ')
+    // drop whole alphanumeric tokens that contain a digit — these are the
+    // contact IDs / phone numbers a spammer rotates between otherwise-identical
+    // posts (sk3826, vip888, 0912-345...). Pure-letter words are kept so English
+    // templates still ring-match.
+    .replace(/[a-z0-9]*\d[a-z0-9]*/gi, ' ')
     .replace(/[^\p{Letter}\p{Number}]+/gu, '')
 
 /** Set of character n-grams (default trigrams) of a normalized string. */