Skip to content
2 changes: 2 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,8 @@ MATTERS_SPAM_DETECTION_API_URL=
MATTERS_SHORT_CONTENT_SPAM_DETECTION_API_URL=
MATTERS_COMMENT_SPAM_DETECTION_API_URL=
MATTERS_COMMENT_SPAM_AUTO_COLLAPSE=false
MATTERS_AWS_SPAM_SAMPLE_QUEUE_URL=
MATTERS_SPAM_SAMPLE_HASH_SALT=
MATTERS_CHANNEL_CLASSIFICATION_API_URL=
MATTERS_LANGUAGE_DETECTION_API_URL=
MATTERS_FEDERATION_EXPORT_TRIGGER_MODE=off
Expand Down
19 changes: 19 additions & 0 deletions codecov.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Codecov configuration.
#
# patch: strict — every PR's changed lines must stay covered vs base (this is
# the real "test your new code" gate; default target auto, no slack).
# project: tolerate a 1% drop in total repo coverage. Total coverage on this
# repo fluctuates run-to-run because the suite is sharded across heavy
# integration tests, and codecov compares against the nearest ancestor that
# has a coverage upload (develop merge commits don't publish one), so a PR can
# show a small spurious project drop even when its own diff is fully covered.
# The 1% threshold absorbs that noise without weakening the patch gate.
coverage:
status:
project:
default:
target: auto
threshold: 1%
patch:
default:
target: auto
3 changes: 3 additions & 0 deletions src/common/enums/sqs.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@ export const QUEUE_URL = {
// report alert (Telegram side-channel)
reportAlert: environment?.awsReportAlertQueueUrl,

// spam training-sample capture (axis-2 L2)
spamSample: environment?.awsSpamSampleQueueUrl,

// likecoin
likecoinLike: environment?.awsLikecoinLikeUrl,
likecoinSendPV: environment?.awsLikecoinSendPVUrl,
Expand Down
5 changes: 5 additions & 0 deletions src/common/environment.ts
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,11 @@ export const environment = {
awsArchiveUserQueueUrl: process.env.MATTERS_AWS_ARCHIVE_USER_QUEUE_URL || '',
awsReportAlertQueueUrl:
process.env.MATTERS_AWS_REPORT_ALERT_QUEUE_URL || '',
// Spam training-sample capture (axis-2 L2): de-identified moderation events
// for the spam-model training corpus. Best-effort; off when unset.
awsSpamSampleQueueUrl:
process.env.MATTERS_AWS_SPAM_SAMPLE_QUEUE_URL || '',
spamSampleHashSalt: process.env.MATTERS_SPAM_SAMPLE_HASH_SALT || '',
awsLikecoinLikeUrl: process.env.MATTERS_AWS_LIKECOIN_LIKE_QUEUE_URL || '',
awsLikecoinSendPVUrl:
process.env.MATTERS_AWS_LIKECOIN_SEND_PV_QUEUE_URL || '',
Expand Down
87 changes: 87 additions & 0 deletions src/common/notifications/spamSample.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
import { QUEUE_URL } from '#common/enums/index.js'
import { environment } from '#common/environment.js'
import { getLogger } from '#common/logger.js'
import { aws } from '#connectors/aws/index.js'
import * as Sentry from '@sentry/node'
import { createHmac } from 'node:crypto'

const logger = getLogger('spam-sample')

/**
* Shape of the SQS message emitted when a moderation event yields a labeled
* sample worth keeping for the spam-model training corpus (axis-2 L2).
*
* The point of L2 is to capture content at the moment of moderation so it
* survives later deletion: `clearCommunityWatchOriginalContent` nulls the
* snapshot, and account archival/ban can purge content — both would erase the
* training signal that L1's passive DB extraction relies on. A separate Lambda
* worker consumes this queue and appends de-identified rows to the S3 training
* bucket.
*
* De-identification happens HERE so no raw user/content ids ever enter the
* queue: ids are replaced with HMAC-SHA256(salt, id) (stable for dedup, not
* reversible). Only the text the model needs to learn from is carried verbatim.
*/
export type SpamSampleCaptured = {
/** 1 = spam (confirmed/blocked), 0 = ham (false-positive / reversed). */
label: 0 | 1
/** The content to train on (the only field carried verbatim). */
text: string
/** Where this label came from, e.g. 'community_watch_remove:porn_ad'. */
labelSource: string
/** Model spam score at capture time, if known. */
score?: number | null
/** HMAC of the comment id (dedup key, non-reversible). */
commentHash: string
/** HMAC of the author id (non-reversible). */
authorHash: string
/** ISO-8601 capture time, stamped by the producer. */
occurredAt: string
}

const hash = (value: string): string =>
createHmac('sha256', environment.spamSampleHashSalt)
.update(String(value))
.digest('hex')

/**
* Emit a `SpamSampleCaptured` event to SQS. Mirrors `enqueueReportAlert`:
* - Best-effort: NEVER throws — a queue/crypto issue must not fail the
* moderation mutation that triggered it.
* - No-op when the queue or salt is unconfigured (local/dev), so the salt is
* never optional-but-empty in a way that would weaken the hash silently.
*/
export const enqueueSpamSample = async (input: {
label: 0 | 1
text: string
labelSource: string
commentId: string
authorId: string | null
score?: number | null
}): Promise<void> => {
if (!QUEUE_URL.spamSample || !environment.spamSampleHashSalt) {
return
}
if (!input.text || !input.text.trim()) {
return
}

try {
const message: SpamSampleCaptured = {
label: input.label,
text: input.text,
labelSource: input.labelSource,
score: input.score ?? null,
commentHash: hash(input.commentId),
authorHash: input.authorId ? hash(input.authorId) : '',
occurredAt: new Date().toISOString(),
}
await aws.sqsSendMessage({
messageBody: message,
queueUrl: QUEUE_URL.spamSample,
})
} catch (err) {
logger.error(err, 'failed to enqueue spam sample')
Sentry.captureException(err)
}
}
69 changes: 69 additions & 0 deletions src/common/utils/__test__/communityWatchRemoveComment.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,13 @@ import {
COMMENT_TYPE,
NODE_TYPES,
OFFICIAL_NOTICE_EXTEND_TYPE,
QUEUE_URL,
USER_FEATURE_FLAG_TYPE,
USER_STATE,
} from '#common/enums/index.js'
import { environment } from '#common/environment.js'
import { toGlobalId } from '#common/utils/index.js'
import { aws } from '#connectors/aws/index.js'
import communityWatchRemoveComment from '#mutations/comment/communityWatchRemoveComment.js'

const mutation = communityWatchRemoveComment as NonNullable<
Expand Down Expand Up @@ -163,6 +166,39 @@ const removeComment = (
{} as any
)

// Exercise the spam-training-sample capture (axis-2 L2) so spamSample.ts's
// enqueue body runs under coverage. The notifications dir has no CI test script,
// so this — the mutation path that calls enqueueSpamSample — is its coverage.
const originalSpamQueue = QUEUE_URL.spamSample
const originalSpamSalt = environment.spamSampleHashSalt
const originalSqsSend = aws.sqsSendMessage
let sentSpamSamples: Array<{ messageBody: Record<string, unknown> }>

beforeAll(() => {
;(aws as { sqsSendMessage: typeof aws.sqsSendMessage }).sqsSendMessage =
(async (params) => {
sentSpamSamples.push({
messageBody: params.messageBody as Record<string, unknown>,
})
}) as typeof aws.sqsSendMessage
;(QUEUE_URL as { spamSample: string }).spamSample = 'https://sqs.test/spam'
;(environment as { spamSampleHashSalt: string }).spamSampleHashSalt =
'test-salt'
})

afterAll(() => {
;(aws as { sqsSendMessage: typeof aws.sqsSendMessage }).sqsSendMessage =
originalSqsSend
;(QUEUE_URL as { spamSample: string }).spamSample =
originalSpamQueue as string
;(environment as { spamSampleHashSalt: string }).spamSampleHashSalt =
originalSpamSalt
})

beforeEach(() => {
sentSpamSamples = []
})

describe('communityWatchRemoveComment', () => {
test('removes an article comment and writes an audit action', async () => {
const { context, insertedActions, insertedReports, commentUpdates } =
Expand Down Expand Up @@ -214,6 +250,39 @@ describe('communityWatchRemoveComment', () => {
recipientId: baseComment.authorId,
})
)

// a de-identified spam training sample is captured (axis-2 L2)
expect(sentSpamSamples).toHaveLength(1)
expect(sentSpamSamples[0].messageBody).toMatchObject({
label: 1,
labelSource: 'community_watch_remove:porn_ad',
})
expect(sentSpamSamples[0].messageBody.commentHash).toMatch(/^[0-9a-f]{64}$/)
})

test('still removes the comment when spam-sample enqueue throws', async () => {
const { context } = createContext()
const prev = aws.sqsSendMessage
;(aws as { sqsSendMessage: typeof aws.sqsSendMessage }).sqsSendMessage =
(async () => {
throw new Error('SQS down')
}) as typeof aws.sqsSendMessage
try {
const result = await removeComment(context)
expect(result.state).toBe(COMMENT_STATE.banned)
} finally {
;(aws as { sqsSendMessage: typeof aws.sqsSendMessage }).sqsSendMessage =
prev
}
})

test('skips the spam sample when the removed comment has no text', async () => {
const { context } = createContext({
comment: { ...baseComment, content: ' ' },
})
const result = await removeComment(context)
expect(result.state).toBe(COMMENT_STATE.banned)
expect(sentSpamSamples).toHaveLength(0)
})

test('removes a moment comment without an article title', async () => {
Expand Down
6 changes: 6 additions & 0 deletions src/common/utils/__test__/communityWatchStaffReview.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -457,6 +457,12 @@ describe('community watch staff review mutations', () => {
.mockResolvedValue({ ...baseAction, originalContent: null })
const context = createMutationContext({
commentService: {
// the clear mutation snapshots the action for the spam-training
// capture (axis-2 L2) before clearing; enqueueSpamSample itself
// no-ops without the queue/salt env, so nothing is actually sent.
findCommunityWatchActionByUUID: jest
.fn<any>()
.mockResolvedValue(baseAction),
clearCommunityWatchOriginalContent:
clearCommunityWatchOriginalContentService,
},
Expand Down
15 changes: 15 additions & 0 deletions src/mutations/comment/clearCommunityWatchOriginalContent.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import type { Context, GQLMutationResolvers } from '#definitions/index.js'

import { ForbiddenError } from '#common/errors.js'
import { enqueueSpamSample } from '#common/notifications/spamSample.js'

type ClearCommunityWatchOriginalContentInput = {
uuid: string
Expand All @@ -18,6 +19,20 @@ const resolver = async (
throw new ForbiddenError('viewer has no permission')
}

// Last chance to keep this content as a training sample before it's nulled
// (axis-2 L2). A reversed action means the removal was a false positive →
// hard-negative ham; otherwise confirmed spam. De-identified, best-effort.
const action = await commentService.findCommunityWatchActionByUUID(uuid)
if (action?.originalContent) {
await enqueueSpamSample({
label: action.reviewState === 'reversed' ? 0 : 1,
text: action.originalContent,
labelSource: `community_watch_clear:${action.reason}`,
commentId: action.commentId,
authorId: action.commentAuthorId,
})
}

return commentService.clearCommunityWatchOriginalContent({
uuid,
actorId: viewer.id,
Expand Down
14 changes: 14 additions & 0 deletions src/mutations/comment/communityWatchRemoveComment.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import {
UserInputError,
} from '#common/errors.js'
import { enqueueReportAlert } from '#common/notifications/reportAlert.js'
import { enqueueSpamSample } from '#common/notifications/spamSample.js'
import { fromGlobalId } from '#common/utils/index.js'
import { invalidateFQC } from '@matters/apollo-response-cache'
import { v4 } from 'uuid'
Expand Down Expand Up @@ -204,6 +205,19 @@ const resolver = async (
}
}

// Capture the removed comment as a confirmed-spam training sample (axis-2 L2).
// Done at removal time so it survives a later
// clearCommunityWatchOriginalContent / account purge. De-identified and
// best-effort inside enqueueSpamSample; never fails the removal.
await enqueueSpamSample({
label: 1,
text: updatedComment.content ?? '',
labelSource: `community_watch_remove:${reason}`,
commentId: updatedComment.id,
authorId: updatedComment.authorId,
score: updatedComment.spamScore ?? null,
})

await invalidateFQC({
node: {
id: updatedComment.targetId,
Expand Down
Loading