Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions src/common/environment.ts
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,11 @@ export const environment = {
// 看見"). Default off so scoring stays observe-only until ops opts in.
commentSpamAutoCollapse:
process.env.MATTERS_COMMENT_SPAM_AUTO_COLLAPSE === 'true',
// When true, a comment whose spam score reaches the system spam threshold is
// classified (Tier A auto / B ring / C review) and surfaced to the admin
// Telegram chat for moderation. Notify-only: this flag NEVER hides a comment
// (auto-action stays behind commentSpamAutoCollapse). Default off.
commentSpamAlert: process.env.MATTERS_COMMENT_SPAM_ALERT === 'true',
channelClassificationApiUrl:
process.env.MATTERS_CHANNEL_CLASSIFICATION_API_URL || '',
languageDetectionApiUrl: process.env.MATTERS_LANGUAGE_DETECTION_API_URL || '',
Expand Down
2 changes: 1 addition & 1 deletion src/common/notifications/reportAlert.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ const logger = getLogger('report-alert')
* is the integration boundary between the two.
*/
export type ReportAlertRequested = {
source: 'direct' | 'community_watch'
source: 'direct' | 'community_watch' | 'spam_detection'
/** Aggregation key — same key within 24h edits the same Telegram message. */
dedupeKey: string
/** Human-readable description of what was reported. */
Expand Down
106 changes: 106 additions & 0 deletions src/connectors/__test__/commentService.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,12 @@ import {
COMMENT_TYPE,
FEATURE_FLAG,
FEATURE_NAME,
QUEUE_URL,
USER_FEATURE_FLAG_TYPE,
USER_STATE,
} from '#common/enums/index.js'
import { environment } from '#common/environment.js'
import { aws } from '#connectors/aws/index.js'

import { PublicationService } from '../article/publicationService.js'
import { AtomService } from '../atomService.js'
Expand Down Expand Up @@ -865,3 +867,107 @@ describe('auto-collapse spam comments', () => {
})
})
})

// Suite for the notify-only spam alert path. It swaps the report-alert queue
// URL and the SQS sender for in-memory stand-ins, creates real comment rows,
// and checks which `reason` ends up on the captured message for each kind of
// content.
describe('spam telegram alert (notify-only tiering)', () => {
let targetTypeId: string
// Kept so afterAll can put the real queue URL and sender back.
const originalQueue = QUEUE_URL.reportAlert
const originalSqsSend = aws.sqsSendMessage
// Message bodies captured by the stubbed sender; reset before every test.
let sent: Array<Record<string, unknown>>

beforeAll(async () => {
// The entity_type row for the article table supplies targetTypeId for the
// comments created below.
const entityType = await atomService.findFirst({
table: 'entity_type',
where: { table: 'article' },
})
targetTypeId = entityType.id
// Turn spam detection on with value 0.8 — presumably the system threshold the
// alert compares scores against; confirm against the system service.
await systemService.setFeatureFlag({
name: FEATURE_NAME.spam_detection,
flag: FEATURE_FLAG.on,
value: 0.8,
})
// Overwrite the queue URL and the sender in place (via casts) so nothing is
// sent for real: the stub only records each message body in `sent`.
;(QUEUE_URL as { reportAlert: string }).reportAlert =
'https://sqs.test/report-alert'
;(aws as { sqsSendMessage: typeof aws.sqsSendMessage }).sqsSendMessage =
(async (params) => {
sent.push(params.messageBody as Record<string, unknown>)
}) as typeof aws.sqsSendMessage
})

// Restore the queue URL and the sender that beforeAll replaced.
// NOTE(review): the spam_detection flag set in beforeAll is not reset here —
// confirm that no later suite depends on its previous value.
afterAll(() => {
;(QUEUE_URL as { reportAlert: string }).reportAlert =
originalQueue as string
;(aws as { sqsSendMessage: typeof aws.sqsSendMessage }).sqsSendMessage =
originalSqsSend
})

// Start every test with an empty capture list.
beforeEach(() => {
sent = []
})

// Inserts an active article comment on target '1' with the given content;
// the author defaults to '1'.
const createComment = async (content: string, authorId = '1') =>
atomService.create({
table: 'comment',
data: {
type: COMMENT_TYPE.article,
targetId: '1',
targetTypeId,
state: COMMENT_STATE.active,
uuid: uuidv4(),
authorId,
content,
},
})

// _alertSpamIfHighScore is private; reach it directly for a focused unit test.
// The cast only describes the method's signature to the type checker.
const alert = (id: string, score: number, content: string) =>
(
commentService as unknown as {
_alertSpamIfHighScore: (
id: string,
score: number,
content: string
) => Promise<void>
}
)._alertSpamIfHighScore(id, score, content)

// Contact details plus solicitation wording: exactly one message is expected,
// tagged spam_auto and keyed by the comment id.
test('emits Tier A (spam_auto) for contact + solicitation', async () => {
const content = '<p>賴 sk3826 台灣外送茶 約妹服務 官網 www.ppp8669.com</p>'
const comment = await createComment(content)
await alert(comment.id, 0.98, content)
expect(sent).toHaveLength(1)
expect(sent[0]).toMatchObject({
source: 'spam_detection',
reason: 'spam_auto',
dedupeKey: `comment:${comment.id}`,
})
})

// High score on ordinary narrative text: one message, tagged spam_review.
test('emits Tier C (spam_review) for high-score benign-looking content', async () => {
const content = '<p>紀子璇作為楊羽棠管家的最後一天定在夏天的尾聲。</p>'
const comment = await createComment(content)
await alert(comment.id, 0.99, content)
expect(sent).toHaveLength(1)
expect(sent[0]).toMatchObject({ reason: 'spam_review' })
})

// Three earlier comments by the same author share the template; only the
// fourth is passed to the alert, and it is expected to be tagged spam_ring.
test('emits Tier B (spam_ring) when the author repeats near-identical content', async () => {
const ringAuthor = '2'
// rotating contact IDs (with digits) between an otherwise-identical template
const tmpl = (n: number) =>
`<p>加賴 vip${n}888 全套服務到府 官網 www.x${n}.com 約妹首選快來</p>`
await createComment(tmpl(1), ringAuthor)
await createComment(tmpl(2), ringAuthor)
await createComment(tmpl(3), ringAuthor)
const latest = await createComment(tmpl(4), ringAuthor)
await alert(latest.id, 0.99, tmpl(4))
expect(sent).toHaveLength(1)
expect(sent[0]).toMatchObject({ reason: 'spam_ring' })
})

// Score 0.5 is under the 0.8 configured in beforeAll, so nothing is captured.
test('stays silent when the score is below the system threshold', async () => {
const content = '<p>賴 台灣外送茶 約妹 www.ppp8669.com</p>'
const comment = await createComment(content)
await alert(comment.id, 0.5, content)
expect(sent).toHaveLength(0)
})
})
174 changes: 174 additions & 0 deletions src/connectors/__test__/commentSpamSignals.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
import {
classifyContentTier,
hasContact,
hasSolicit,
jaccard,
nearDuplicate,
normalizeForDup,
shingles,
stripHtml,
TIER_REASON,
} from '../commentSpamSignals.js'

// Real high-score examples from matters_prod (anonymized snippets) drive these
// cases: the >= 0.94 band mixes blatant spam with creative writing / replies,
// and only the contact+solicitation conjunction separates them cleanly.

// stripHtml: the expected output has no tags or entities, and a single space
// between words.
describe('stripHtml', () => {
  test('removes tags, entities, and collapses whitespace', () => {
    const markup = '<p>hi</p>&nbsp;<b>there</b> now'
    const plain = stripHtml(markup)
    expect(plain).toBe('hi there now')
  })
})

// hasContact: must flag samples carrying a contact channel (handle, phone
// number, URL, bare domain, messaging id) and stay quiet on ordinary prose.
describe('hasContact', () => {
  const samplesWithContact = [
    'Contact us ➤Telegram: @smmbigmarket',
    'Call Us – 8447779280 Call Girls',
    '官網 www.ppp8669.com',
    'visit mtmis.org.pk for details',
    'add my line id: abc123',
  ]

  const plainProseSamples = [
    '問世間情為何物。孟婆的癡情令人心疼。',
    '感謝你的分享,我很喜歡這個觀點。',
  ]

  test.each(samplesWithContact)('detects contact channel: %s', (sample) => {
    const flagged = hasContact(sample)
    expect(flagged).toBe(true)
  })

  test.each(plainProseSamples)('does not fire on plain prose: %s', (sample) => {
    const flagged = hasContact(sample)
    expect(flagged).toBe(false)
  })
})

// hasSolicit: must flag solicitation wording (escort, account-selling and
// betting samples) and must not flag a line of narrative fiction.
describe('hasSolicit', () => {
  const solicitationSamples = [
    '台灣外送茶推薦 約妹服務',
    'Where to Buy Verified Paxum Accounts',
    'football betting predictions and odds',
    '全套服務 莞式',
  ]

  test.each(solicitationSamples)('detects solicitation: %s', (sample) => {
    const flagged = hasSolicit(sample)
    expect(flagged).toBe(true)
  })

  test('does not fire on benign creative writing', () => {
    const fiction = '紀子璇作為楊羽棠管家的最後一天定在夏天的尾聲'
    const flagged = hasSolicit(fiction)
    expect(flagged).toBe(false)
  })
})

// classifyContentTier: null without a threshold or below it; 'auto' when the
// content has both a contact channel and solicitation; otherwise 'review'.
describe('classifyContentTier', () => {
  const threshold = 0.94

  // Classifies against the shared threshold above.
  const classify = (score: number, content: string) =>
    classifyContentTier({ score, content, threshold })

  test('returns null when there is no threshold', () => {
    const tier = classifyContentTier({
      score: 0.99,
      content: 'x',
      threshold: null,
    })
    expect(tier).toBeNull()
  })

  test('returns null when the score is below the threshold', () => {
    const tier = classify(0.5, 'x')
    expect(tier).toBeNull()
  })

  test('Tier A (auto) for contact + solicitation', () => {
    const ad =
      '<p>賴/大G:sk3826 台灣外送茶推薦 約妹服務 官網 www.ppp8669.com</p>'
    const tier = classify(0.984, ad)
    expect(tier).toBe('auto')
  })

  test('Tier A for English escort ad with phone', () => {
    const ad = 'Call Us – 8447779280 Call Girls In Chanakyapuri Escorts'
    const tier = classify(0.996, ad)
    expect(tier).toBe('auto')
  })

  test('Tier C (review) for high-score creative writing (no contact/solicit)', () => {
    const fiction =
      '<p>紀子璇作為楊羽棠管家的最後一天定在夏天的尾聲,她還沒有找到工作。</p>'
    const tier = classify(0.992, fiction)
    expect(tier).toBe('review')
  })

  test('Tier C for a genuine reply that only mentions a keyword (no contact)', () => {
    // A lone "下注" (the author describing their own betting losses) must not
    // be auto-acted on.
    const reply = '隔夜裂口 大勝後盲目下注 星期一及星期五共損手6萬'
    const tier = classify(0.957, reply)
    expect(tier).toBe('review')
  })

  test('Tier C for an @mention reply (contact-pattern but no solicitation)', () => {
    const reply = '@VietJoe333 請AI處理的,我懶得打馬賽克'
    const tier = classify(0.992, reply)
    expect(tier).toBe('review')
  })
})

// Helpers behind ring detection: shingling, Jaccard similarity, normalization
// for duplicate comparison, and the combined nearDuplicate check.
describe('near-duplicate ring helpers', () => {
  test('shingles produces trigrams and handles short strings', () => {
    const fromFourChars = shingles('abcd')
    const fromTwoChars = shingles('ab')
    const fromEmpty = shingles('')

    expect(fromFourChars).toEqual(new Set(['abc', 'bcd']))
    expect(fromTwoChars).toEqual(new Set(['ab']))
    expect(fromEmpty).toEqual(new Set())
  })

  test('jaccard of identical sets is 1, disjoint is 0', () => {
    const identical = jaccard(new Set(['a']), new Set(['a']))
    const disjoint = jaccard(new Set(['a']), new Set(['b']))
    const bothEmpty = jaccard(new Set(), new Set())

    expect(identical).toBe(1)
    expect(disjoint).toBe(0)
    expect(bothEmpty).toBe(0)
  })

  test('normalizeForDup masks urls, handles, digits, punctuation', () => {
    const raw = '<p>Hi @bob! call 0912-345 http://x.co </p>'
    const normalized = normalizeForDup(raw)
    expect(normalized).toBe('hicall')
  })

  test('nearDuplicate matches the same template with rotated contact info', () => {
    // Same template; only the contact id and the domain differ.
    const first = '加賴 abc123 全套服務到府 官網 www.aaa.com 約妹首選'
    const second = '加賴 xyz789 全套服務到府 官網 www.bbb.net 約妹首選'
    const matched = nearDuplicate(first, second)
    expect(matched).toBe(true)
  })

  test('nearDuplicate does not merge genuinely different texts', () => {
    const aboutThePark = '今天天氣很好我去公園散步看到很多花'
    const aboutTheMarket = '股市今天大跌我虧了很多錢心情很差'
    const matched = nearDuplicate(aboutThePark, aboutTheMarket)
    expect(matched).toBe(false)
  })

  test('nearDuplicate falls back to exact match for too-short content', () => {
    const sameShort = nearDuplicate('hi', 'hi')
    const differentShort = nearDuplicate('hi', 'yo')

    expect(sameShort).toBe(true)
    expect(differentShort).toBe(false)
  })
})

// TIER_REASON: pins the exact tier -> reason-key mapping.
describe('TIER_REASON', () => {
  test('maps every tier to a worker reason key', () => {
    const expectedMapping = {
      auto: 'spam_auto',
      ring: 'spam_ring',
      review: 'spam_review',
    }
    expect(TIER_REASON).toEqual(expectedMapping)
  })
})
Loading
Loading