thematters · mashbean · Jun 15, 2026 · Jun 13, 2026 · Jun 14, 2026 · Jun 14, 2026
@@ -106,6 +106,8 @@ MATTERS_SPAM_DETECTION_API_URL=
 MATTERS_SHORT_CONTENT_SPAM_DETECTION_API_URL=
 MATTERS_COMMENT_SPAM_DETECTION_API_URL=
 MATTERS_COMMENT_SPAM_AUTO_COLLAPSE=false
+MATTERS_AWS_SPAM_SAMPLE_QUEUE_URL=
+MATTERS_SPAM_SAMPLE_HASH_SALT=
 MATTERS_CHANNEL_CLASSIFICATION_API_URL=
 MATTERS_LANGUAGE_DETECTION_API_URL=
 MATTERS_FEDERATION_EXPORT_TRIGGER_MODE=off

@@ -0,0 +1,19 @@
+# Codecov configuration.
+#
+# patch: strict — every PR's changed lines must stay covered vs base (this is
+#   the real "test your new code" gate; default target auto, no slack).
+# project: tolerate a 1% drop in total repo coverage. Total coverage on this
+#   repo fluctuates run-to-run because the suite is sharded across heavy
+#   integration tests, and codecov compares against the nearest ancestor that
+#   has a coverage upload (develop merge commits don't publish one), so a PR can
+#   show a small spurious project drop even when its own diff is fully covered.
+#   The 1% threshold absorbs that noise without weakening the patch gate.
+coverage:
+  status:
+    project:
+      default:
+        target: auto
+        threshold: 1%
+    patch:
+      default:
+        target: auto
@@ -9,6 +9,9 @@ export const QUEUE_URL = {
   // report alert (Telegram side-channel)
   reportAlert: environment?.awsReportAlertQueueUrl,
 
+  // spam training-sample capture (axis-2 L2)
+  spamSample: environment?.awsSpamSampleQueueUrl,
+
   // likecoin
   likecoinLike: environment?.awsLikecoinLikeUrl,
   likecoinSendPV: environment?.awsLikecoinSendPVUrl,

@@ -54,6 +54,11 @@ export const environment = {
   awsArchiveUserQueueUrl: process.env.MATTERS_AWS_ARCHIVE_USER_QUEUE_URL || '',
   awsReportAlertQueueUrl:
     process.env.MATTERS_AWS_REPORT_ALERT_QUEUE_URL || '',
+  // Spam training-sample capture (axis-2 L2): de-identified moderation events
+  // for the spam-model training corpus. Best-effort; off when unset.
+  awsSpamSampleQueueUrl:
+    process.env.MATTERS_AWS_SPAM_SAMPLE_QUEUE_URL || '',
+  spamSampleHashSalt: process.env.MATTERS_SPAM_SAMPLE_HASH_SALT || '',
   awsLikecoinLikeUrl: process.env.MATTERS_AWS_LIKECOIN_LIKE_QUEUE_URL || '',
   awsLikecoinSendPVUrl:
     process.env.MATTERS_AWS_LIKECOIN_SEND_PV_QUEUE_URL || '',

@@ -0,0 +1,87 @@
+import { QUEUE_URL } from '#common/enums/index.js'
+import { environment } from '#common/environment.js'
+import { getLogger } from '#common/logger.js'
+import { aws } from '#connectors/aws/index.js'
+import * as Sentry from '@sentry/node'
+import { createHmac } from 'node:crypto'
+
+const logger = getLogger('spam-sample')
+
+/**
+ * Shape of the SQS message emitted when a moderation event yields a labeled
+ * sample worth keeping for the spam-model training corpus (axis-2 L2).
+ *
+ * The point of L2 is to capture content at the moment of moderation so it
+ * survives later deletion: `clearCommunityWatchOriginalContent` nulls the
+ * snapshot, and account archival/ban can purge content — both would erase the
+ * training signal that L1's passive DB extraction relies on. A separate Lambda
+ * worker consumes this queue and appends de-identified rows to the S3 training
+ * bucket.
+ *
+ * De-identification happens HERE so no raw user/content ids ever enter the
+ * queue: ids are replaced with HMAC-SHA256(salt, id) (stable for dedup, not
+ * reversible). Only the text the model needs to learn from is carried verbatim.
+ */
+export type SpamSampleCaptured = {
+  /** 1 = spam (confirmed/blocked), 0 = ham (false-positive / reversed). */
+  label: 0 | 1
+  /** The content to train on (the only field carried verbatim). */
+  text: string
+  /** Where this label came from, e.g. 'community_watch_remove:porn_ad'. */
+  labelSource: string
+  /** Model spam score at capture time, if known. */
+  score?: number | null
+  /** HMAC of the comment id (dedup key, non-reversible). */
+  commentHash: string
+  /** HMAC of the author id (non-reversible). */
+  authorHash: string
+  /** ISO-8601 capture time, stamped by the producer. */
+  occurredAt: string
+}
+
+const hash = (value: string): string =>
+  createHmac('sha256', environment.spamSampleHashSalt)
+    .update(String(value))
+    .digest('hex')
+
+/**
+ * Emit a `SpamSampleCaptured` event to SQS. Mirrors `enqueueReportAlert`:
+ *   - Best-effort: NEVER throws — a queue/crypto issue must not fail the
+ *     moderation mutation that triggered it.
+ *   - No-op when the queue or salt is unconfigured (local/dev), so the salt is
+ *     never optional-but-empty in a way that would weaken the hash silently.
+ */
+export const enqueueSpamSample = async (input: {
+  label: 0 | 1
+  text: string
+  labelSource: string
+  commentId: string
+  authorId: string | null
+  score?: number | null
+}): Promise<void> => {
+  if (!QUEUE_URL.spamSample || !environment.spamSampleHashSalt) {
+    return
+  }
+  if (!input.text || !input.text.trim()) {
+    return
+  }
+
+  try {
+    const message: SpamSampleCaptured = {
+      label: input.label,
+      text: input.text,
+      labelSource: input.labelSource,
+      score: input.score ?? null,
+      commentHash: hash(input.commentId),
+      authorHash: input.authorId ? hash(input.authorId) : '',
+      occurredAt: new Date().toISOString(),
+    }
+    await aws.sqsSendMessage({
+      messageBody: message,
+      queueUrl: QUEUE_URL.spamSample,
+    })
+  } catch (err) {
+    logger.error(err, 'failed to enqueue spam sample')
+    Sentry.captureException(err)
+  }
+}
@@ -11,10 +11,13 @@ import {
   COMMENT_TYPE,
   NODE_TYPES,
   OFFICIAL_NOTICE_EXTEND_TYPE,
+  QUEUE_URL,
   USER_FEATURE_FLAG_TYPE,
   USER_STATE,
 } from '#common/enums/index.js'
+import { environment } from '#common/environment.js'
 import { toGlobalId } from '#common/utils/index.js'
+import { aws } from '#connectors/aws/index.js'
 import communityWatchRemoveComment from '#mutations/comment/communityWatchRemoveComment.js'
 
 const mutation = communityWatchRemoveComment as NonNullable<
@@ -163,6 +166,39 @@ const removeComment = (
     {} as any
   )
 
+// Exercise the spam-training-sample capture (axis-2 L2) so spamSample.ts's
+// enqueue body runs under coverage. The notifications dir has no CI test script,
+// so this — the mutation path that calls enqueueSpamSample — is its coverage.
+const originalSpamQueue = QUEUE_URL.spamSample
+const originalSpamSalt = environment.spamSampleHashSalt
+const originalSqsSend = aws.sqsSendMessage
+let sentSpamSamples: Array<{ messageBody: Record<string, unknown> }>
+
+beforeAll(() => {
+  ;(aws as { sqsSendMessage: typeof aws.sqsSendMessage }).sqsSendMessage =
+    (async (params) => {
+      sentSpamSamples.push({
+        messageBody: params.messageBody as Record<string, unknown>,
+      })
+    }) as typeof aws.sqsSendMessage
+  ;(QUEUE_URL as { spamSample: string }).spamSample = 'https://sqs.test/spam'
+  ;(environment as { spamSampleHashSalt: string }).spamSampleHashSalt =
+    'test-salt'
+})
+
+afterAll(() => {
+  ;(aws as { sqsSendMessage: typeof aws.sqsSendMessage }).sqsSendMessage =
+    originalSqsSend
+  ;(QUEUE_URL as { spamSample: string }).spamSample =
+    originalSpamQueue as string
+  ;(environment as { spamSampleHashSalt: string }).spamSampleHashSalt =
+    originalSpamSalt
+})
+
+beforeEach(() => {
+  sentSpamSamples = []
+})
+
 describe('communityWatchRemoveComment', () => {
   test('removes an article comment and writes an audit action', async () => {
     const { context, insertedActions, insertedReports, commentUpdates } =
@@ -214,6 +250,39 @@ describe('communityWatchRemoveComment', () => {
         recipientId: baseComment.authorId,
       })
     )
+
+    // a de-identified spam training sample is captured (axis-2 L2)
+    expect(sentSpamSamples).toHaveLength(1)
+    expect(sentSpamSamples[0].messageBody).toMatchObject({
+      label: 1,
+      labelSource: 'community_watch_remove:porn_ad',
+    })
+    expect(sentSpamSamples[0].messageBody.commentHash).toMatch(/^[0-9a-f]{64}$/)
+  })
+
+  test('still removes the comment when spam-sample enqueue throws', async () => {
+    const { context } = createContext()
+    const prev = aws.sqsSendMessage
+    ;(aws as { sqsSendMessage: typeof aws.sqsSendMessage }).sqsSendMessage =
+      (async () => {
+        throw new Error('SQS down')
+      }) as typeof aws.sqsSendMessage
+    try {
+      const result = await removeComment(context)
+      expect(result.state).toBe(COMMENT_STATE.banned)
+    } finally {
+      ;(aws as { sqsSendMessage: typeof aws.sqsSendMessage }).sqsSendMessage =
+        prev
+    }
+  })
+
+  test('skips the spam sample when the removed comment has no text', async () => {
+    const { context } = createContext({
+      comment: { ...baseComment, content: '   ' },
+    })
+    const result = await removeComment(context)
+    expect(result.state).toBe(COMMENT_STATE.banned)
+    expect(sentSpamSamples).toHaveLength(0)
   })
 
   test('removes a moment comment without an article title', async () => {

@@ -457,6 +457,12 @@ describe('community watch staff review mutations', () => {
       .mockResolvedValue({ ...baseAction, originalContent: null })
     const context = createMutationContext({
       commentService: {
+        // the clear mutation snapshots the action for the spam-training
+        // capture (axis-2 L2) before clearing; enqueueSpamSample itself
+        // no-ops without the queue/salt env, so nothing is actually sent.
+        findCommunityWatchActionByUUID: jest
+          .fn<any>()
+          .mockResolvedValue(baseAction),
         clearCommunityWatchOriginalContent:
           clearCommunityWatchOriginalContentService,
       },

@@ -1,6 +1,7 @@
 import type { Context, GQLMutationResolvers } from '#definitions/index.js'
 
 import { ForbiddenError } from '#common/errors.js'
+import { enqueueSpamSample } from '#common/notifications/spamSample.js'
 
 type ClearCommunityWatchOriginalContentInput = {
   uuid: string
@@ -18,6 +19,20 @@ const resolver = async (
     throw new ForbiddenError('viewer has no permission')
   }
 
+  // Last chance to keep this content as a training sample before it's nulled
+  // (axis-2 L2). A reversed action means the removal was a false positive →
+  // hard-negative ham; otherwise confirmed spam. De-identified, best-effort.
+  const action = await commentService.findCommunityWatchActionByUUID(uuid)
+  if (action?.originalContent) {
+    await enqueueSpamSample({
+      label: action.reviewState === 'reversed' ? 0 : 1,
+      text: action.originalContent,
+      labelSource: `community_watch_clear:${action.reason}`,
+      commentId: action.commentId,
+      authorId: action.commentAuthorId,
+    })
+  }
+
   return commentService.clearCommunityWatchOriginalContent({
     uuid,
     actorId: viewer.id,

@@ -22,6 +22,7 @@ import {
   UserInputError,
 } from '#common/errors.js'
 import { enqueueReportAlert } from '#common/notifications/reportAlert.js'
+import { enqueueSpamSample } from '#common/notifications/spamSample.js'
 import { fromGlobalId } from '#common/utils/index.js'
 import { invalidateFQC } from '@matters/apollo-response-cache'
 import { v4 } from 'uuid'
@@ -204,6 +205,19 @@ const resolver = async (
     }
   }
 
+  // Capture the removed comment as a confirmed-spam training sample (axis-2 L2).
+  // Done at removal time so it survives a later
+  // clearCommunityWatchOriginalContent / account purge. De-identified and
+  // best-effort inside enqueueSpamSample; never fails the removal.
+  await enqueueSpamSample({
+    label: 1,
+    text: updatedComment.content ?? '',
+    labelSource: `community_watch_remove:${reason}`,
+    commentId: updatedComment.id,
+    authorId: updatedComment.authorId,
+    score: updatedComment.spamScore ?? null,
+  })
+
   await invalidateFQC({
     node: {
       id: updatedComment.targetId,