Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
16 commits
Select commit Hold shift + click to select a range
d109ba2
Enhance CI workflows for staging and validation of Lambda layers
aavinash-nr Apr 24, 2026
e6eb859
Refactor CI workflows for Java, Node.js, Python, and Ruby to streamli…
aavinash-nr Apr 24, 2026
947f17c
fix: reduce S3 poll frequency to 5 min and extend default timeout to …
aavinash-nr Apr 27, 2026
8e78d79
feat: implement run_region_loop for layer publishing across all runti…
aavinash-nr Apr 27, 2026
fb8804a
fix: region-resilient publishing — continue on failure, report failed…
aavinash-nr Apr 27, 2026
db916d2
feat: implement Notify Slack Layer action for structured notification…
aavinash-nr Apr 27, 2026
aeac43a
feat: add automatic region retry on re-run for all publish workflows
aavinash-nr Apr 28, 2026
3c6998f
fix: remove expression syntax from composite action description fields
aavinash-nr Apr 28, 2026
60509e7
feat: add ECR error handling, per-image failure tracking, and Slack n…
aavinash-nr Apr 28, 2026
6a9f62b
feat: make regions, S3 bucket prefix, and ECR repository configurable…
aavinash-nr Apr 28, 2026
d8bc3b4
fix: pass LAYER_REGIONS, S3_BUCKET_PREFIX, ECR_REPOSITORY from Action…
aavinash-nr Apr 28, 2026
62ece92
fix: capture region failures from Docker via temp file to fix retry m…
aavinash-nr Apr 28, 2026
6065ca9
feat: enhance Slack notification action with failure_key and run_atte…
aavinash-nr Apr 28, 2026
47efb41
feat: decouple ECR from layer publish, show per-version regions in Slack
aavinash-nr Apr 28, 2026
11bcf42
feat: enhance layer publishing scripts to include ECR failure summari…
aavinash-nr Apr 28, 2026
d556865
chore: updated description to be generalized comments.
aavinash-nr May 7, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .github/actions/node-layer-setup/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ runs:
id: check-tag
shell: bash
run: |
if [[ ${{ github.event.ref }} =~ ^refs/tags/v[0-9]+(\.[0-9]+)*_nodejs$ ]]; then
if [[ "${{ github.event_name }}" == "workflow_dispatch" ]] || \
[[ "${{ github.event.ref }}" =~ ^refs/tags/v[0-9]+(\.[0-9]+)*_nodejs$ ]]; then
echo "match=true" >> $GITHUB_OUTPUT
fi
- name: Run Node unit tests
Expand Down
173 changes: 173 additions & 0 deletions .github/actions/notify-slack-layer/action.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
name: Notify Slack Layer Release
description: Builds and sends a structured Slack notification for lambda layer releases

# Every input carries a description: the action metadata syntax lists
# `inputs.<input_id>.description` as required, and it is what callers see.
inputs:
  language_name:
    description: Language display name, e.g. "Node.js"
    required: true
  versions_json:
    description: |
      JSON array describing each runtime version to include in the Slack message.
      Schema (one object per version):
        - key: unique identifier, e.g. "3.9"
        - label: display name shown in Slack, e.g. "Python 3.9"
        - job: GitHub Actions job name for API result lookup, e.g. "publish-python (3.9)"
        - fallback: result to use when the API lookup finds no match, e.g. "success" or "failure"
        - failure_key: job_key used by region-retry for per-version region failure detail, e.g. "python-3.9"
      Example:
        [
          {"key":"3.9","label":"Python 3.9","job":"publish-python (3.9)","fallback":"success","failure_key":"python-3.9"},
          {"key":"3.10","label":"Python 3.10","job":"publish-python (3.10)","fallback":"success","failure_key":"python-3.10"}
        ]
    required: true
  failure_summaries:
    description: Newline-separated failure_summary strings from publish jobs (used for ECR failures)
    required: false
    default: ""
  slack_webhook:
    description: Slack incoming webhook URL
    required: true
  gh_token:
    description: Token exported as GH_TOKEN for the gh CLI calls that list the run's jobs and download its artifacts.
    required: true
  repo:
    description: Repository in "owner/name" form; used for the jobs API path, the artifact download and the run link.
    required: true
  run_id:
    description: ID of the workflow run to report on (typically github.run_id from the calling workflow).
    required: true
  run_attempt:
    description: Pass github.run_attempt from the calling workflow.
    required: false
    default: "1"
  ref_name:
    description: Ref name shown as "Tag" in the Slack message.
    required: true
  actor:
    description: User shown as "Triggered by" in the Slack message.
    required: true
  server_url:
    description: Base URL used to build the "View Run" link, i.e. <server_url>/<repo>/actions/runs/<run_id>.
    required: true

runs:
  using: composite
  steps:
    # Step 1: assemble the Slack message and write it to /tmp/slack-payload.json.
    # All inputs are handed to the script through environment variables, and the
    # heredoc delimiter is quoted so the shell performs no expansion inside it.
    - name: Build Slack payload
      shell: bash
      env:
        LANGUAGE_NAME: ${{ inputs.language_name }}
        VERSIONS_JSON: ${{ inputs.versions_json }}
        FAILURE_SUMMARIES: ${{ inputs.failure_summaries }}
        GH_TOKEN: ${{ inputs.gh_token }}
        REPO: ${{ inputs.repo }}
        RUN_ID: ${{ inputs.run_id }}
        RUN_ATTEMPT: ${{ inputs.run_attempt }}
        TAG: ${{ inputs.ref_name }}
        ACTOR: ${{ inputs.actor }}
        SERVER_URL: ${{ inputs.server_url }}
      run: |
        python3 << 'PYEOF'
        import glob, json, os, subprocess


        def warn(message):
            """Emit a single-line GitHub Actions warning annotation."""
            print("::warning::" + " ".join(str(message).split()))


        versions = json.loads(os.environ["VERSIONS_JSON"])
        run_id = os.environ["RUN_ID"]
        run_attempt = os.environ.get("RUN_ATTEMPT", "1")
        repo = os.environ["REPO"]

        # ── Fetch per-job results from the GitHub API ────────────────────────
        # The jq filter emits one job object per line.
        jobs_proc = subprocess.run(
            ["gh", "api", "--paginate",
             f"repos/{repo}/actions/runs/{run_id}/jobs",
             "-q", ".jobs[]"],
            capture_output=True, text=True
        )
        if jobs_proc.returncode != 0:
            # Best effort: keep going, but make it visible that the results
            # below come from each version's "fallback" rather than the API.
            warn(
                f"gh api job lookup failed (exit {jobs_proc.returncode}); "
                f"using per-version fallback results. {jobs_proc.stderr}"
            )

        api_jobs = {}  # job name -> conclusion ("in_progress" when not concluded yet)
        for line in jobs_proc.stdout.strip().splitlines():
            try:
                job = json.loads(line)
                api_jobs[job["name"]] = job.get("conclusion") or "in_progress"
            except (ValueError, KeyError, TypeError, AttributeError) as exc:
                warn(f"Skipping unparseable job record: {exc}")

        # Result per version key; falls back to the caller-supplied value when
        # the job name is not present in the API response.
        results = {}
        for v in versions:
            results[v["key"]] = api_jobs.get(v["job"], v["fallback"])

        # ── Download per-version region failure artifacts ─────────────────────
        # Artifacts are named: failed-regions-{job_key}-{run_id}-attempt-{N}
        download_proc = subprocess.run(
            ["gh", "run", "download", run_id,
             "--pattern", "failed-regions-*",
             "--dir", "/tmp/region-artifacts",
             "--repo", repo],
            capture_output=True, text=True
        )
        if download_proc.returncode != 0:
            # Not fatal: the message is still built, just without region detail.
            print(
                f"gh run download exited {download_proc.returncode}; "
                "continuing without per-region failure detail."
            )

        version_failures = {}  # job_key -> comma-separated failed regions
        artifact_base = "/tmp/region-artifacts"
        if os.path.isdir(artifact_base):
            # Expected layout: one sub-directory per artifact, holding
            # failed-regions-{job_key}.txt.
            for artifact_dir in sorted(glob.glob(f"{artifact_base}/failed-regions-*")):
                artifact_name = os.path.basename(artifact_dir)
                # Strip "failed-regions-" prefix, then split off "-{run_id}-attempt-{N}"
                stripped = artifact_name[len("failed-regions-"):]
                parts = stripped.rsplit(f"-{run_id}-attempt-", 1)
                if len(parts) != 2:
                    continue
                job_key, attempt_num = parts
                # Only artifacts written by the current attempt are reported.
                if attempt_num != run_attempt:
                    continue
                txt_file = os.path.join(artifact_dir, f"failed-regions-{job_key}.txt")
                if os.path.isfile(txt_file):
                    with open(txt_file) as fh:
                        content = fh.read().strip()
                    if content:
                        version_failures[job_key] = content

        # ── Build Slack message ───────────────────────────────────────────────
        total = len(versions)
        # Anything other than "success" (including "in_progress") counts as failed.
        failed_count = sum(1 for v in versions if results.get(v["key"]) != "success")
        all_ok = failed_count == 0

        lang = os.environ["LANGUAGE_NAME"]
        icon = ":white_check_mark:" if all_ok else ":x:"
        status = "Succeeded" if all_ok else f"Failed ({failed_count}/{total} versions failed)"
        lines = [
            f"{icon} *{lang} Layer Release {status}*",
            f"Tag: `{os.environ['TAG']}`",
            f"Triggered by: {os.environ['ACTOR']}",
            "",
            "*Layer Results:*",
        ]
        for v in versions:
            res = results.get(v["key"], "unknown")
            em = ":white_check_mark:" if res == "success" else ":x:"
            if res == "success":
                detail = "Layer published successfully"
            else:
                # Prefer the per-region detail when an artifact exists for this version.
                fk = v.get("failure_key", "")
                regions_str = version_failures.get(fk, "")
                if regions_str:
                    regions = [name for name in regions_str.split(",") if name]
                    detail = f"{len(regions)} region(s) failed: {', '.join(regions)}"
                else:
                    detail = "Layer publish to AWS failed"
            lines.append(f"{em} {v['label']} — {detail}")

        # ECR failures (from failure_summaries — not stored in artifacts)
        skipped_ecr = []  # de-duplicated, in first-seen order
        for fs in os.environ.get("FAILURE_SUMMARIES", "").splitlines():
            fs = fs.strip()
            if "ECR images failed:" in fs:
                for img in fs.split("ECR images failed:")[-1].strip().split():
                    if img not in skipped_ecr:
                        skipped_ecr.append(img)
        if skipped_ecr:
            lines += ["", ":warning: *ECR images failed to publish:*"]
            lines += [f"• {i}" for i in skipped_ecr]

        run_url = f"{os.environ['SERVER_URL']}/{repo}/actions/runs/{run_id}"
        lines += ["", f"<{run_url}|View Run>"]

        # Consumed by the "Notify Slack" step below.
        with open("/tmp/slack-payload.json", "w") as f:
            json.dump({"text": "\n".join(lines)}, f)
        PYEOF

    # Step 2: post the payload file written above to the incoming webhook.
    - name: Notify Slack
      uses: slackapi/slack-github-action@v2.1.0
      with:
        webhook: ${{ inputs.slack_webhook }}
        webhook-type: incoming-webhook
        payload-file-path: /tmp/slack-payload.json
3 changes: 2 additions & 1 deletion .github/actions/python-layer-setup/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@ runs:
id: check-tag
shell: bash
run: |
if [[ ${{ github.event.ref }} =~ ^refs/tags/v[0-9]+(\.[0-9]+)*_python$ ]]; then
if [[ "${{ github.event_name }}" == "workflow_dispatch" ]] || \
[[ "${{ github.event.ref }}" =~ ^refs/tags/v[0-9]+(\.[0-9]+)*_python$ ]]; then
echo "match=true" >> $GITHUB_OUTPUT
fi
- name: Install python dependencies
Expand Down
105 changes: 105 additions & 0 deletions .github/actions/region-retry/action.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
name: Region Retry State
description: |
  Save failed regions after a publish attempt, or load them before a re-run.
  On attempt N > 1, load automatically restricts publishing to the regions
  that failed in attempt N-1. This makes "Re-run failed jobs" smart: it only
  retries the regions that actually need it.

  Usage in a publish job:
    # Before publish step:
    - uses: ./.github/actions/region-retry
      id: region-retry-load
      with: { mode: load, job_key: nodejs-20, run_id: ..., run_attempt: ... }

    # In publish step env:
    #   PUBLISH_REGIONS: steps.region-retry-load.outputs.publish_regions || inputs.regions

    # After publish step (if: always()):
    - uses: ./.github/actions/region-retry
      with: { mode: save, job_key: nodejs-20, failure_summary: ..., run_id: ..., run_attempt: ... }

inputs:
  # Selects which half of the action runs; every step is gated on this value.
  mode:
    description: >-
      "save" — write failed regions artifact after publish.
      "load" — read it before publish.
    required: true
  # Appears in both the artifact name and the failed-regions file name.
  job_key:
    description: >-
      Unique key for this publish job, e.g. "nodejs-20", "python-3.9", "java-java21".
    required: true
  failure_summary:
    description: failure_summary output from the publish step (save mode only).
    required: false
    default: ""
  run_id:
    description: Pass github.run_id from the calling workflow.
    required: true
  run_attempt:
    description: Pass github.run_attempt from the calling workflow.
    required: true

outputs:
  # Produced by the load-regions step, which only runs in load mode.
  publish_regions:
    description: >-
      Comma-separated regions to target (empty = all).
      Set PUBLISH_REGIONS env var to this value.
    value: ${{ steps.load-regions.outputs.publish_regions }}

runs:
  using: composite
  steps:
    # Inputs reach the shell through `env:` rather than being interpolated
    # into the script text, so quotes, `$(...)` or backticks inside a value
    # cannot alter the script.

    # ── SAVE: write failed regions to an artifact named by attempt number ─────
    - name: Write failed-regions file
      if: inputs.mode == 'save'
      id: write-file
      shell: bash
      env:
        FAILURE_SUMMARY: ${{ inputs.failure_summary }}
        JOB_KEY: ${{ inputs.job_key }}
      run: |
        fs="$FAILURE_SUMMARY"
        outfile="/tmp/failed-regions-${JOB_KEY}.txt"
        if [[ "$fs" == *"regions failed:"* ]]; then
          # Strip through the marker, using the same text the test above
          # matched, then trim surrounding whitespace so the list has no
          # leading or trailing comma.
          # NOTE(review): everything after the marker is treated as the region
          # list — confirm failure_summary carries nothing else after it.
          regions="${fs#*regions failed:}"
          regions="${regions#"${regions%%[![:space:]]*}"}"
          regions="${regions%"${regions##*[![:space:]]}"}"
          # Stored comma-separated, without a trailing newline.
          printf '%s' "${regions// /,}" > "$outfile"
          echo "has_failures=true" >> "$GITHUB_OUTPUT"
          echo "Captured failed regions: ${regions}"
        else
          echo "has_failures=false" >> "$GITHUB_OUTPUT"
        fi

    # Uploaded only when the step above captured at least one failed region.
    - name: Upload failed-regions artifact
      if: inputs.mode == 'save' && steps.write-file.outputs.has_failures == 'true'
      uses: actions/upload-artifact@v4
      with:
        name: failed-regions-${{ inputs.job_key }}-${{ inputs.run_id }}-attempt-${{ inputs.run_attempt }}
        path: /tmp/failed-regions-${{ inputs.job_key }}.txt
        retention-days: 7

    # ── LOAD: on re-run, download the previous attempt's failed-regions ────────
    - name: Compute previous attempt artifact name
      if: inputs.mode == 'load'
      id: artifact-name
      shell: bash
      env:
        JOB_KEY: ${{ inputs.job_key }}
        RUN_ID: ${{ inputs.run_id }}
        RUN_ATTEMPT: ${{ inputs.run_attempt }}
      run: |
        # Reject anything that is not a plain number before doing arithmetic on it.
        if ! [[ "$RUN_ATTEMPT" =~ ^[0-9]+$ ]]; then
          echo "::error::run_attempt must be a non-negative integer, got '${RUN_ATTEMPT}'"
          exit 1
        fi
        prev=$(( RUN_ATTEMPT - 1 ))
        echo "name=failed-regions-${JOB_KEY}-${RUN_ID}-attempt-${prev}" >> "$GITHUB_OUTPUT"

    # Skipped on the first attempt. A missing artifact is tolerated
    # (continue-on-error); the next step then reports an empty region list.
    - name: Download previous failed-regions artifact
      if: inputs.mode == 'load' && fromJSON(inputs.run_attempt) > 1
      id: download
      continue-on-error: true
      uses: actions/download-artifact@v4
      with:
        name: ${{ steps.artifact-name.outputs.name }}
        path: /tmp/prev-attempt-${{ inputs.job_key }}

    # Always sets publish_regions in load mode; empty means "no restriction".
    - name: Set publish_regions output
      if: inputs.mode == 'load'
      id: load-regions
      shell: bash
      env:
        JOB_KEY: ${{ inputs.job_key }}
      run: |
        regions=""
        f="/tmp/prev-attempt-${JOB_KEY}/failed-regions-${JOB_KEY}.txt"
        if [[ -f "$f" ]]; then
          regions=$(cat "$f")
          if [[ -n "$regions" ]]; then
            echo "Re-run: restricting to previously failed regions: ${regions}"
          else
            echo "Previous attempt had no failures — publishing to all regions."
          fi
        fi
        echo "publish_regions=${regions}" >> "$GITHUB_OUTPUT"
Loading
Loading