From 01847a55934ee40a3e5355ada0db989958fc3380 Mon Sep 17 00:00:00 2001 From: Jan Krivanek Date: Thu, 4 Jun 2026 08:19:58 +0200 Subject: [PATCH 1/3] Fix malicious-scan repeat-spam + integrity-filter blocks Root causes (observed on PR #237): 1. Agent's emitted add_comment body did not include the HTML marker line (the agent's per-head-SHA `fingerprint` HTML comment), so both the orchestrator's pre-dispatch check and the agent's own Step 1 idempotency lookup failed to find a prior scan for the same head SHA. Result: hourly re-dispatch. 2. The github MCP tools (pull_request_read, list_pull_requests, search_pull_requests) are blocked by the gh-aw integrity filter on PRs from non-approved authors -- exactly the population this scanner targets. Result: 'Integrity filter blocked N items' notes in every comment. Fixes: - pr-malicious-scan.agent.md: drop the github MCP toolset, add 'gh' to the bash allowlist, and instruct the agent to use 'gh api' for all PR data reads (PAT-authenticated, not subject to the integrity filter). - Strengthen Step 5: the HTML marker MUST be the first line of the comment body. Add a defense-in-depth note that the orchestrator also accepts the visible-body sentinel. - pr-triage-batch.yml + pr-triage-act.sh: match prior scans by EITHER the HTML marker OR the visible-body sentinel ('Automated diff scan' + backticked sha7), so a missing marker on a previously-emitted comment no longer triggers re-dispatch. Workflow disabled remotely while this lands. 
--- .github/scripts/pr-triage-act.sh | 28 ++++++++- .../pr-malicious-scan.agent.lock.yml | 35 +++++------ .github/workflows/pr-malicious-scan.agent.md | 60 +++++++++++++++---- .github/workflows/pr-triage-batch.yml | 7 ++- 4 files changed, 98 insertions(+), 32 deletions(-) diff --git a/.github/scripts/pr-triage-act.sh b/.github/scripts/pr-triage-act.sh index b2fb5efec3..0c5de148a7 100644 --- a/.github/scripts/pr-triage-act.sh +++ b/.github/scripts/pr-triage-act.sh @@ -144,6 +144,26 @@ cooldown_seconds() { echo $(( COOLDOWN_DAYS * 86400 )) } +# Same as seconds_since_marker but matches a comment that contains TWO substrings +# (both must be present). Useful when the bot's HTML marker is missing and we +# fall back to the visible-body sentinel. +seconds_since_marker_visible() { + local sub_a="$1" + local sub_b="$2" + local newest + newest=$(gh api --paginate "repos/$REPO/issues/$PR_NUMBER/comments" \ + --jq ".[] | select(.user.login == \"$BOT_LOGIN\") | select((.body | contains(\"$sub_a\")) and (.body | contains(\"$sub_b\"))) | .created_at" \ + | sort | tail -n 1) + if [ -z "$newest" ] || [ "$newest" = "null" ]; then + echo "" + return + fi + local then now + then=$(date -u -d "$newest" +%s 2>/dev/null || date -u -j -f "%Y-%m-%dT%H:%M:%SZ" "$newest" +%s) + now=$(date -u +%s) + echo $(( now - then )) +} + post_comment() { local body="$1" if [ "$DRY_RUN" = "true" ]; then @@ -277,9 +297,15 @@ if [ -z "$STATE" ]; then EVAL_STATE=$(eval_status_state) log "reviewDecision=$REVIEW_DECISION unresolved_threads=$UNRESOLVED eval_status=$EVAL_STATE" - # Malicious scan precedence (non-bot, untrusted, no marker for current head) + # Malicious scan precedence (non-bot, untrusted, no marker for current head). + # Match either the HTML marker (preferred) or the visible-body sentinel — + # the scanner agent has been observed to drop the HTML comment line, in which + # case we still must not re-dispatch the scanner. 
if [ "$IS_BOT" = "false" ] && [ "$IS_TRUSTED" = "false" ]; then SECS=$(seconds_since_marker " ' comment BEFORE calling gh workflow run. That comment is the source of truth for 'a scan has been initiated for this head SHA' and survives every agent-side failure mode (PAT outage, integrity block, dropped HTML marker). Dedup matches either that orchestrator marker OR the agent's own fingerprint marker. - pr-triage-act.sh: drop the visible-body-sentinel fallback; match the orchestrator dispatched marker plus the agent fingerprint marker. Validated: gh aw compile clean; bash -n clean for both worker script and orchestrator embedded script; markdownlint clean; dedup query and POST api tested live against PR #713. --- .github/scripts/pr-triage-act.sh | 29 +----- .../pr-malicious-scan.agent.lock.yml | 90 +++++++------------ .github/workflows/pr-malicious-scan.agent.md | 84 ++++++++--------- .github/workflows/pr-triage-batch.yml | 38 ++++++-- 4 files changed, 111 insertions(+), 130 deletions(-) diff --git a/.github/scripts/pr-triage-act.sh b/.github/scripts/pr-triage-act.sh index 0c5de148a7..7098852a8b 100644 --- a/.github/scripts/pr-triage-act.sh +++ b/.github/scripts/pr-triage-act.sh @@ -144,26 +144,6 @@ cooldown_seconds() { echo $(( COOLDOWN_DAYS * 86400 )) } -# Same as seconds_since_marker but matches a comment that contains TWO substrings -# (both must be present). Useful when the bot's HTML marker is missing and we -# fall back to the visible-body sentinel. 
-seconds_since_marker_visible() { - local sub_a="$1" - local sub_b="$2" - local newest - newest=$(gh api --paginate "repos/$REPO/issues/$PR_NUMBER/comments" \ - --jq ".[] | select(.user.login == \"$BOT_LOGIN\") | select((.body | contains(\"$sub_a\")) and (.body | contains(\"$sub_b\"))) | .created_at" \ - | sort | tail -n 1) - if [ -z "$newest" ] || [ "$newest" = "null" ]; then - echo "" - return - fi - local then now - then=$(date -u -d "$newest" +%s 2>/dev/null || date -u -j -f "%Y-%m-%dT%H:%M:%SZ" "$newest" +%s) - now=$(date -u +%s) - echo $(( now - then )) -} - post_comment() { local body="$1" if [ "$DRY_RUN" = "true" ]; then @@ -298,13 +278,12 @@ if [ -z "$STATE" ]; then log "reviewDecision=$REVIEW_DECISION unresolved_threads=$UNRESOLVED eval_status=$EVAL_STATE" # Malicious scan precedence (non-bot, untrusted, no marker for current head). - # Match either the HTML marker (preferred) or the visible-body sentinel — - # the scanner agent has been observed to drop the HTML comment line, in which - # case we still must not re-dispatch the scanner. + # Match either the orchestrator-posted dispatched marker (source of truth) or + # the agent-posted fingerprint marker (set by a successful scan run). if [ "$IS_BOT" = "false" ] && [ "$IS_TRUSTED" = "false" ]; then - SECS=$(seconds_since_marker "") if [ -z "$SECS" ]; then - SECS=$(seconds_since_marker_visible "Automated diff scan" "\`$HEAD_SHA_SHORT\`") + SECS=$(seconds_since_marker "` for the **current head + SHA**'s short form (first 7 chars). The `dispatched` marker is posted by + the orchestrator before this workflow is dispatched; the `fingerprint` + marker is posted by a previous run of this workflow. + + If a match exists, **stop**: emit `noop` with reason + `already-scanned-this-head`. (This belt-and-braces check is only reached + if the orchestrator's pre-dispatch dedup somehow missed it; under normal + operation Step 1.4 always passes.) ## Step 2 — Fetch the diff -Use the GitHub API. 
Do not run `git checkout` on the PR head. +Use the GitHub API (MCP `pull_request_read` for `files`, or `gh api` from +bash). Do not run `git checkout` on the PR head. ```bash gh api --paginate "repos/${REPO}/pulls/${PR}/files" \ @@ -242,13 +249,10 @@ comment (Step 5). Do not apply labels. > The `add_comment` body **must begin with the literal HTML-comment marker > line on its own first line**. Do not add any prefix, blank line, indentation, > emoji, or other text before it. The orchestrator parses prior bot comments -> looking for this exact marker; if it is missing the scan will be repeated -> hourly. As a defense-in-depth fallback the orchestrator also matches the -> visible-body sentinel `Automated diff scan` plus the backticked sha7, so -> always include both `` `{sha7}` `` AND the marker line. +> looking for this exact marker. -Always post a single PR comment containing the marker so the orchestrator and -the per-PR worker can detect that this head SHA has been scanned. Use +Always post a single PR comment containing the marker so the orchestrator +and the per-PR worker can detect that this head SHA has been scanned. Use `add_comment` with body shaped exactly (the **first line** is the marker): - **Clean scan** (no findings): diff --git a/.github/workflows/pr-triage-batch.yml b/.github/workflows/pr-triage-batch.yml index c24ce237b1..788cddbc25 100644 --- a/.github/workflows/pr-triage-batch.yml +++ b/.github/workflows/pr-triage-batch.yml @@ -29,6 +29,7 @@ on: permissions: pull-requests: read + issues: write statuses: read actions: write contents: read @@ -129,15 +130,17 @@ jobs: # Compute state — same logic as worker, kept simple and deterministic. STATE="" if [ "$IS_BOT" = "false" ] && [ "$IS_TRUSTED" = "false" ]; then - # Look for prior malicious-scan signal on this head. 
Match either the - # HTML marker (preferred) or the visible-body sentinel — the agent has - # been observed to occasionally drop the HTML comment line, and we - # must not re-dispatch the scanner in that case. + # Look for prior malicious-scan signal on this head. Match either: + # (orchestrator-authored, posted + # just before `gh workflow run`; survives any agent-side failure mode), OR + # (agent-authored, posted by + # a successful scan run). + # Either marker means "do not re-dispatch for this head SHA". SHORT="${HEAD_SHA:0:7}" # NB: --paginate runs --jq per page, so aggregations like 'length' would emit one # number per page. Emit one .id per matching comment and count lines in the shell. MARKER=$(gh api --paginate "repos/$REPO/issues/$PR/comments" \ - --jq ".[] | select(.user.login == \"github-actions[bot]\") | select((.body | contains(\"\")) or (.body | contains(\"" \ + "🔍 Automated malicious-diff scan dispatched for \`$SHORT\`." \ + "_Results will be posted as code-scanning alerts and a follow-up comment by github-actions[bot]._") + if ! 
gh api -X POST "repos/$REPO/issues/$PR/comments" \ + -f body="$PRE_DISPATCH_BODY" >/dev/null 2>&1; then + echo "::warning::failed to post pre-dispatch marker for PR #$PR — skipping scanner dispatch" + continue + fi + if gh workflow run pr-malicious-scan.agent.lock.yml --repo "$REPO" \ + -f pr_number="$PR"; then + DISPATCHED=$((DISPATCHED + 1)) + else + echo "::warning::failed to dispatch scanner for PR #$PR" + fi else echo "::notice::scanner workflow not yet present; would dispatch for PR #$PR" fi From 6eb0d132ec6e4f503566fd272da9b265d0ef2e14 Mon Sep 17 00:00:00 2001 From: Jan Krivanek Date: Thu, 4 Jun 2026 12:28:52 +0200 Subject: [PATCH 3/3] address PR review: idempotency marker + image pinning - Step 1.4 idempotency check matches only fingerprint=, not dispatched= (the orchestrator-emitted marker would otherwise self-cancel every run) - tools.github sets allowed-repos: public + min-integrity: none (replaces gh api workaround) - Drop all gh api / base64 shell snippets from agent prompt; rely on MCP pull_request_read and repos.get_file_contents - Restore container image digest pinning (sha256) in lock manifest, comment block, and download_docker_images.sh args - Header comment in pr-triage-batch.yml notes orchestrator emits pre-dispatch idempotency marker --- .../pr-malicious-scan.agent.lock.yml | 32 +++++----- .github/workflows/pr-malicious-scan.agent.md | 64 +++++++++---------- .github/workflows/pr-triage-batch.yml | 9 ++- 3 files changed, 55 insertions(+), 50 deletions(-) diff --git a/.github/workflows/pr-malicious-scan.agent.lock.yml b/.github/workflows/pr-malicious-scan.agent.lock.yml index 57852091c0..4aa229432d 100644 --- a/.github/workflows/pr-malicious-scan.agent.lock.yml +++ b/.github/workflows/pr-malicious-scan.agent.lock.yml @@ -1,5 +1,5 @@ -# gh-aw-metadata: {"schema_version":"v3","frontmatter_hash":"0fdb9ff4661a928a6145a531d2598aa532edee55932739b3b4bf82b88fc12314","compiler_version":"v0.68.3","strict":true,"agent_id":"copilot"} -# gh-aw-manifest: 
{"version":1,"secrets":["COPILOT_GITHUB_TOKEN","COPILOT_GITHUB_TOKEN_2","COPILOT_GITHUB_TOKEN_3","COPILOT_GITHUB_TOKEN_4","COPILOT_GITHUB_TOKEN_5","COPILOT_GITHUB_TOKEN_6","COPILOT_GITHUB_TOKEN_7","COPILOT_GITHUB_TOKEN_8","GH_AW_GITHUB_MCP_SERVER_TOKEN","GH_AW_GITHUB_TOKEN","GITHUB_TOKEN"],"actions":[{"repo":"actions/checkout","sha":"de0fac2e4500dabe0009e67214ff5f5447ce83dd","version":"v6.0.2"},{"repo":"actions/download-artifact","sha":"3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c","version":"v8.0.1"},{"repo":"actions/github-script","sha":"373c709c69115d41ff229c7e5df9f8788daa9553","version":"v9"},{"repo":"actions/upload-artifact","sha":"043fb46d1a93c77aae656e7c1c64a875d1fc6a0a","version":"v7.0.1"},{"repo":"github/codeql-action/upload-sarif","sha":"0e9f55954318745b37b7933c693bc093f7336125","version":"v4.35.1"},{"repo":"github/gh-aw-actions/setup","sha":"ba90f2186d7ad780ec640f364005fa24e797b360","version":"v0.68.3"}],"containers":[{"image":"ghcr.io/github/gh-aw-firewall/agent:0.25.20"},{"image":"ghcr.io/github/gh-aw-firewall/api-proxy:0.25.20"},{"image":"ghcr.io/github/gh-aw-firewall/squid:0.25.20"},{"image":"ghcr.io/github/gh-aw-mcpg:v0.2.19"},{"image":"ghcr.io/github/github-mcp-server:v0.32.0"},{"image":"node:lts-alpine"}]} +# gh-aw-metadata: {"schema_version":"v3","frontmatter_hash":"0b17e77bda4dae603373a5ab53b26bcf12ee695128ef7c9c7808b3b47367941f","compiler_version":"v0.68.3","strict":true,"agent_id":"copilot"} +# gh-aw-manifest: 
{"version":1,"secrets":["COPILOT_GITHUB_TOKEN","COPILOT_GITHUB_TOKEN_2","COPILOT_GITHUB_TOKEN_3","COPILOT_GITHUB_TOKEN_4","COPILOT_GITHUB_TOKEN_5","COPILOT_GITHUB_TOKEN_6","COPILOT_GITHUB_TOKEN_7","COPILOT_GITHUB_TOKEN_8","GH_AW_GITHUB_MCP_SERVER_TOKEN","GH_AW_GITHUB_TOKEN","GITHUB_TOKEN"],"actions":[{"repo":"actions/checkout","sha":"de0fac2e4500dabe0009e67214ff5f5447ce83dd","version":"v6.0.2"},{"repo":"actions/download-artifact","sha":"3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c","version":"v8.0.1"},{"repo":"actions/github-script","sha":"373c709c69115d41ff229c7e5df9f8788daa9553","version":"v9"},{"repo":"actions/upload-artifact","sha":"043fb46d1a93c77aae656e7c1c64a875d1fc6a0a","version":"v7.0.1"},{"repo":"github/codeql-action/upload-sarif","sha":"0e9f55954318745b37b7933c693bc093f7336125","version":"v4.35.1"},{"repo":"github/gh-aw-actions/setup","sha":"ba90f2186d7ad780ec640f364005fa24e797b360","version":"v0.68.3"}],"containers":[{"image":"ghcr.io/github/gh-aw-firewall/agent:0.25.20"},{"image":"ghcr.io/github/gh-aw-firewall/api-proxy:0.25.20"},{"image":"ghcr.io/github/gh-aw-firewall/squid:0.25.20"},{"image":"ghcr.io/github/gh-aw-mcpg:v0.2.19"},{"image":"ghcr.io/github/github-mcp-server:v0.32.0","digest":"sha256:2763823c63bcca718ce53850a1d7fcf2f501ec84028394f1b63ce7e9f4f9be28","pinned_image":"ghcr.io/github/github-mcp-server:v0.32.0@sha256:2763823c63bcca718ce53850a1d7fcf2f501ec84028394f1b63ce7e9f4f9be28"},{"image":"node:lts-alpine","digest":"sha256:d1b3b4da11eefd5941e7f0b9cf17783fc99d9c6fc34884a665f40a06dbdfc94f","pinned_image":"node:lts-alpine@sha256:d1b3b4da11eefd5941e7f0b9cf17783fc99d9c6fc34884a665f40a06dbdfc94f"}]} # ___ _ _ # / _ \ | | (_) # | |_| | __ _ ___ _ __ | |_ _ ___ @@ -50,8 +50,8 @@ # - ghcr.io/github/gh-aw-firewall/api-proxy:0.25.20 # - ghcr.io/github/gh-aw-firewall/squid:0.25.20 # - ghcr.io/github/gh-aw-mcpg:v0.2.19 -# - ghcr.io/github/github-mcp-server:v0.32.0 -# - node:lts-alpine +# - 
ghcr.io/github/github-mcp-server:v0.32.0@sha256:2763823c63bcca718ce53850a1d7fcf2f501ec84028394f1b63ce7e9f4f9be28 +# - node:lts-alpine@sha256:d1b3b4da11eefd5941e7f0b9cf17783fc99d9c6fc34884a665f40a06dbdfc94f name: "PR Malicious Code Scan" "on": @@ -197,14 +197,14 @@ jobs: run: | bash "${RUNNER_TEMP}/gh-aw/actions/create_prompt_first.sh" { - cat << 'GH_AW_PROMPT_e204d17d76f28624_EOF' + cat << 'GH_AW_PROMPT_8487c2356260795e_EOF' - GH_AW_PROMPT_e204d17d76f28624_EOF + GH_AW_PROMPT_8487c2356260795e_EOF cat "${RUNNER_TEMP}/gh-aw/prompts/xpia.md" cat "${RUNNER_TEMP}/gh-aw/prompts/temp_folder_prompt.md" cat "${RUNNER_TEMP}/gh-aw/prompts/markdown.md" cat "${RUNNER_TEMP}/gh-aw/prompts/safe_outputs_prompt.md" - cat << 'GH_AW_PROMPT_e204d17d76f28624_EOF' + cat << 'GH_AW_PROMPT_8487c2356260795e_EOF' Tools: add_comment, add_labels(max:2), create_code_scanning_alert, missing_tool, missing_data, noop @@ -236,12 +236,12 @@ jobs: {{/if}} - GH_AW_PROMPT_e204d17d76f28624_EOF + GH_AW_PROMPT_8487c2356260795e_EOF cat "${RUNNER_TEMP}/gh-aw/prompts/github_mcp_tools_with_safeoutputs_prompt.md" - cat << 'GH_AW_PROMPT_e204d17d76f28624_EOF' + cat << 'GH_AW_PROMPT_8487c2356260795e_EOF' {{#runtime-import .github/workflows/pr-malicious-scan.agent.md}} - GH_AW_PROMPT_e204d17d76f28624_EOF + GH_AW_PROMPT_8487c2356260795e_EOF } > "$GH_AW_PROMPT" - name: Interpolate variables and render templates uses: actions/github-script@373c709c69115d41ff229c7e5df9f8788daa9553 # v9 @@ -406,15 +406,15 @@ jobs: GH_AW_APPROVAL_LABELS_VAR: ${{ vars.GH_AW_GITHUB_APPROVAL_LABELS || '' }} run: bash "${RUNNER_TEMP}/gh-aw/actions/parse_guard_list.sh" - name: Download container images - run: bash "${RUNNER_TEMP}/gh-aw/actions/download_docker_images.sh" ghcr.io/github/gh-aw-firewall/agent:0.25.20 ghcr.io/github/gh-aw-firewall/api-proxy:0.25.20 ghcr.io/github/gh-aw-firewall/squid:0.25.20 ghcr.io/github/gh-aw-mcpg:v0.2.19 ghcr.io/github/github-mcp-server:v0.32.0 node:lts-alpine + run: bash 
"${RUNNER_TEMP}/gh-aw/actions/download_docker_images.sh" ghcr.io/github/gh-aw-firewall/agent:0.25.20 ghcr.io/github/gh-aw-firewall/api-proxy:0.25.20 ghcr.io/github/gh-aw-firewall/squid:0.25.20 ghcr.io/github/gh-aw-mcpg:v0.2.19 ghcr.io/github/github-mcp-server:v0.32.0@sha256:2763823c63bcca718ce53850a1d7fcf2f501ec84028394f1b63ce7e9f4f9be28 node:lts-alpine@sha256:d1b3b4da11eefd5941e7f0b9cf17783fc99d9c6fc34884a665f40a06dbdfc94f - name: Write Safe Outputs Config run: | mkdir -p "${RUNNER_TEMP}/gh-aw/safeoutputs" mkdir -p /tmp/gh-aw/safeoutputs mkdir -p /tmp/gh-aw/mcp-logs/safeoutputs - cat > "${RUNNER_TEMP}/gh-aw/safeoutputs/config.json" << 'GH_AW_SAFE_OUTPUTS_CONFIG_397fd8764489d169_EOF' + cat > "${RUNNER_TEMP}/gh-aw/safeoutputs/config.json" << 'GH_AW_SAFE_OUTPUTS_CONFIG_7c6845d367e3c6a9_EOF' {"add_comment":{"max":1},"add_labels":{"max":2},"create_code_scanning_alert":{"driver":"PR Malicious Code Scanner"},"create_report_incomplete_issue":{},"missing_data":{},"missing_tool":{},"noop":{"max":1,"report-as-issue":"false"},"report_incomplete":{}} - GH_AW_SAFE_OUTPUTS_CONFIG_397fd8764489d169_EOF + GH_AW_SAFE_OUTPUTS_CONFIG_7c6845d367e3c6a9_EOF - name: Write Safe Outputs Tools env: GH_AW_TOOLS_META_JSON: | @@ -657,7 +657,7 @@ jobs: export MCP_GATEWAY_DOCKER_COMMAND='docker run -i --rm --network host -v /var/run/docker.sock:/var/run/docker.sock -e MCP_GATEWAY_PORT -e MCP_GATEWAY_DOMAIN -e MCP_GATEWAY_API_KEY -e MCP_GATEWAY_PAYLOAD_DIR -e MCP_GATEWAY_PAYLOAD_SIZE_THRESHOLD -e DEBUG -e MCP_GATEWAY_LOG_DIR -e GH_AW_MCP_LOG_DIR -e GH_AW_SAFE_OUTPUTS -e GH_AW_SAFE_OUTPUTS_CONFIG_PATH -e GH_AW_SAFE_OUTPUTS_TOOLS_PATH -e GH_AW_ASSETS_BRANCH -e GH_AW_ASSETS_MAX_SIZE_KB -e GH_AW_ASSETS_ALLOWED_EXTS -e DEFAULT_BRANCH -e GITHUB_MCP_SERVER_TOKEN -e GITHUB_MCP_GUARD_MIN_INTEGRITY -e GITHUB_MCP_GUARD_REPOS -e GITHUB_REPOSITORY -e GITHUB_SERVER_URL -e GITHUB_SHA -e GITHUB_WORKSPACE -e GITHUB_TOKEN -e GITHUB_RUN_ID -e GITHUB_RUN_NUMBER -e GITHUB_RUN_ATTEMPT -e GITHUB_JOB -e GITHUB_ACTION -e 
GITHUB_EVENT_NAME -e GITHUB_EVENT_PATH -e GITHUB_ACTOR -e GITHUB_ACTOR_ID -e GITHUB_TRIGGERING_ACTOR -e GITHUB_WORKFLOW -e GITHUB_WORKFLOW_REF -e GITHUB_WORKFLOW_SHA -e GITHUB_REF -e GITHUB_REF_NAME -e GITHUB_REF_TYPE -e GITHUB_HEAD_REF -e GITHUB_BASE_REF -e GH_AW_SAFE_OUTPUTS_PORT -e GH_AW_SAFE_OUTPUTS_API_KEY -v /tmp/gh-aw/mcp-payloads:/tmp/gh-aw/mcp-payloads:rw -v /opt:/opt:ro -v /tmp:/tmp:rw -v '"${GITHUB_WORKSPACE}"':'"${GITHUB_WORKSPACE}"':rw ghcr.io/github/gh-aw-mcpg:v0.2.19' mkdir -p /home/runner/.copilot - cat << GH_AW_MCP_CONFIG_5b948834b79b9124_EOF | bash "${RUNNER_TEMP}/gh-aw/actions/start_mcp_gateway.sh" + cat << GH_AW_MCP_CONFIG_bd35165c62e36b49_EOF | bash "${RUNNER_TEMP}/gh-aw/actions/start_mcp_gateway.sh" { "mcpServers": { "github": { @@ -674,7 +674,7 @@ jobs: "approval-labels": ${{ steps.parse-guard-vars.outputs.approval_labels }}, "blocked-users": ${{ steps.parse-guard-vars.outputs.blocked_users }}, "min-integrity": "none", - "repos": "all", + "repos": "public", "trusted-users": ${{ steps.parse-guard-vars.outputs.trusted_users }} } } @@ -701,7 +701,7 @@ jobs: "payloadDir": "${MCP_GATEWAY_PAYLOAD_DIR}" } } - GH_AW_MCP_CONFIG_5b948834b79b9124_EOF + GH_AW_MCP_CONFIG_bd35165c62e36b49_EOF - name: Download activation artifact uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1 with: diff --git a/.github/workflows/pr-malicious-scan.agent.md b/.github/workflows/pr-malicious-scan.agent.md index 07674e76fd..dc10aad60d 100644 --- a/.github/workflows/pr-malicious-scan.agent.md +++ b/.github/workflows/pr-malicious-scan.agent.md @@ -81,6 +81,11 @@ tools: # by `safe-outputs` (only code-scanning alerts, one comment, ≤2 labels) # and the `permissions: contents: read, pull-requests: read` block above. min-integrity: none + # Scope the github MCP guard to public repos only — this workflow only + # ever inspects this repo (which is public). 
`allowed-repos` accepts + # `all` or `public`; `public` is the tighter of the two and matches the + # pattern used by other gh-aw workflows in this repo. + allowed-repos: public bash: - "cat" - "grep" @@ -131,10 +136,9 @@ is to inspect the **diff** of a single pull request submitted by an external ## Target PR - PR number: `${{ inputs.pr_number }}` -- Head SHA: look it up via the GitHub MCP `pull_request_read` tool (or - `gh api repos/{owner}/{repo}/pulls/{pr_number}` from bash) at scan time. - Always use the head SHA reported by the API, not anything from the diff - body or the trigger payload. +- Head SHA: look it up via the GitHub MCP `pull_request_read` tool at scan + time. Always use the head SHA reported by the API, not anything from the + diff body. Use the GitHub MCP tools (`pull_request_read`, `repos`) to read PR data. The scanner runs with `min-integrity: none` so these tools are NOT filtered @@ -142,42 +146,38 @@ by the gh-aw integrity gateway. `safe-outputs` still gates every mutation. ## Step 1 — Eligibility -1. Fetch the PR (`pull_request_read` MCP tool, or `gh api repos/{owner}/{repo}/pulls/{pr_number}`). +1. Fetch the PR via the MCP `pull_request_read` tool. 2. If `author_association` ∈ `{OWNER, MEMBER, COLLABORATOR}`, **stop**: emit `noop` with reason `trusted-contributor`. Trusted contributors are scanned only by request. 3. If the author's login ends with `[bot]` or `.user.type == "Bot"`, **stop**: emit `noop` with reason `bot-author`. -4. **Idempotency check.** Fetch existing PR comments and look for any prior - comment authored by `github-actions[bot]` whose body contains the literal - string `` for the **current head - SHA**'s short form (first 7 chars). The `dispatched` marker is posted by - the orchestrator before this workflow is dispatched; the `fingerprint` - marker is posted by a previous run of this workflow. - - If a match exists, **stop**: emit `noop` with reason - `already-scanned-this-head`. 
(This belt-and-braces check is only reached - if the orchestrator's pre-dispatch dedup somehow missed it; under normal - operation Step 1.4 always passes.) +4. **Idempotency self-check.** Fetch existing PR comments via the MCP tools + and look for any prior comment authored by `github-actions[bot]` whose + body contains the literal string + `` + marker** — that one is posted by the orchestrator immediately *before* it + dispatches this workflow, so it will always be present at the start of + your run. Treating it as "already scanned" would cause every scan to + no-op. ## Step 2 — Fetch the diff -Use the GitHub API (MCP `pull_request_read` for `files`, or `gh api` from -bash). Do not run `git checkout` on the PR head. - -```bash -gh api --paginate "repos/${REPO}/pulls/${PR}/files" \ - --jq '.[] | {filename, status, additions, deletions, patch}' -``` - -For files where `patch` is null/empty (binary or oversized), record the -filename and treat it as `binary-or-oversized`. For at most 5 such files that -are also under a sensitive path (see Step 3), fetch the raw blob: - -```bash -gh api "repos/${REPO}/contents/${path}?ref=${HEAD_SHA}" --jq .content | base64 -d | head -c 8192 -``` +Use the GitHub MCP `pull_request_read` tool with the `files` action (or the +`repos` toolset for raw blob reads). Do not run `git checkout` on the PR +head, and do not invoke `gh` or `curl` from bash — only the MCP tools and +the text-processing utilities listed under `tools.bash` are available. + +For each changed file, capture `filename`, `status`, `additions`, +`deletions`, and `patch`. For files where `patch` is null/empty (binary or +oversized), record the filename and treat it as `binary-or-oversized`. For +at most 5 such files that are also under a sensitive path (see Step 3), +fetch the raw file content via the MCP `repos` toolset (`get_file_contents` +at `ref=`) and inspect the first ~8 KB. Limit total inspection to ~64 changed files / ~256 KB of patch text. 
If the diff is larger, scan the most-sensitive paths first diff --git a/.github/workflows/pr-triage-batch.yml b/.github/workflows/pr-triage-batch.yml index 788cddbc25..ab32018ec9 100644 --- a/.github/workflows/pr-triage-batch.yml +++ b/.github/workflows/pr-triage-batch.yml @@ -3,8 +3,13 @@ name: "PR Triage — Batch" # Hourly orchestrator. Enumerates open PRs, computes a deterministic state # for each, and dispatches the per-PR worker (pr-triage.yml) or the malicious- # code scanner (pr-malicious-scan.agent.lock.yml) for PRs that need action. -# No model calls; no comments; no labels are applied here. The worker owns the -# side effects. +# No model calls; no labels are applied here. The worker owns label and +# author-ping side effects. The orchestrator itself posts at most ONE comment +# per scanner dispatch — a deterministic +# `dispatched={sha7}` HTML-comment idempotency marker — before +# triggering the scanner workflow. That marker is the source of truth that +# survives any scanner-side failure mode (PAT outage, integrity block, +# dropped HTML marker by the agent). on: schedule: