From dd33a6d079d9e4cde089afc53a25c9e824169ccc Mon Sep 17 00:00:00 2001 From: Markadrian6399 Date: Tue, 28 Apr 2026 13:55:36 +0100 Subject: [PATCH] fix/feat: runbook, pause banner, dispute events, is_terminal docs (#483 #485 #528 #529) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - #483: Add RUNBOOK.md covering emergency pause/unpause, admin key rotation via governance, stuck migration handling, webhook replay, storage TTL extension, and escalation contacts/SLA targets. Linked from README documentation section. - #485: Surface pause_reason in HealthStatus (health.rs) by reading the active PauseRecord from circuit_breaker_storage. ContractHealth.jsx now shows a dismissible red status banner with the human-readable pause reason, re-appears on next 60s poll if still paused, and disables transaction buttons (Withdraw Fees) while paused. Adds onPausedChange callback prop for parent components to gate submission buttons. - #528: Document in is_terminal() that Failed and Disputed are intentionally excluded — they are transient states with valid outbound transitions (Failed→Disputed, Disputed→Completed|Cancelled via resolve_dispute). Existing implementation is correct per state machine. - #529: Refactor emit_dispute_raised and emit_dispute_resolved to use the standard emit_event! macro, adding the (SCHEMA_VERSION, ledger_seq, ledger_ts) envelope consistent with all other events. emit_dispute_resolved now includes admin address and resulting_status symbol. resolve_dispute in lib.rs updated to pass caller. webhook-handler.ts gains handleDisputeRaised and handleDisputeResolved with audit log persistence and routes dispute_raised / dispute_resolved event types. 
--- README.md | 32 +- RUNBOOK.md | 350 +++++++++++++++++++++ backend/src/webhook-handler.ts | 49 +++ frontend/src/components/ContractHealth.jsx | 76 ++++- src/events.rs | 30 +- src/health.rs | 12 + src/lib.rs | 2 +- src/types.rs | 5 + 8 files changed, 525 insertions(+), 31 deletions(-) create mode 100644 RUNBOOK.md diff --git a/README.md b/README.md index 5fcb72bd..697d655b 100644 --- a/README.md +++ b/README.md @@ -313,6 +313,7 @@ SwiftRemit uses environment variables for configuration. This allows you to easi - **[CONFIGURATION.md](CONFIGURATION.md)**: Complete configuration reference with all variables, validation rules, and examples - **[MIGRATION.md](MIGRATION.md)**: Migration guide for existing developers +- **[RUNBOOK.md](RUNBOOK.md)**: Operational runbook — emergency pause/unpause, admin key rotation, stuck migrations, webhook replay, storage TTL extension - **[PRODUCTION_READINESS_REPORT.md](PRODUCTION_READINESS_REPORT.md)**: Current production readiness status — what's complete, what's pending, and known risks before mainnet ## Remittance Lifecycle — Sequence Diagram @@ -582,19 +583,20 @@ import { VerificationBadge } from './components/VerificationBadge'; - [ ] Agent reputation system - [ ] Dispute resolution mechanism - [ ] Time-locked escrow options - -## Error Codes & Troubleshooting - -| Code | Error Name | Common Cause | Resolution Steps | -| :--- | :--- | :--- | :--- | -| **1** | AlreadyInitialized | Attempting to call initialize() on an active contract. | No action required. If re-configuration is needed, check if an update function exists. | -| **2** | NotInitialized | Operations attempted before the contract setup is complete. | The administrator must call the initialize() function with valid parameters. | -| **3** | InvalidAmount | Providing zero or negative values for remittance. | Ensure the transfer amount is a positive integer greater than 0. | -| **4** | InvalidFeeBps | Fee percentage is set outside the 0-100% (0-10000 bps) range. 
| Adjust the basis points to fall within the valid range (e.g., 2.5% = 250 bps). | -| **5** | AgentNotRegistered | Using an address that hasn't been added to the whitelist. | Register the agent address first using the egister_agent function. | -| **6** | RemittanceNotFound | Querying an ID that does not exist on the ledger. | Verify the Remittance ID from your transaction history or event logs. | -| **7** | InvalidStatus | Operation not allowed in current state (e.g. canceling a settled payment). | Check the current status of the remittance via get_remittance before retrying. | -| **11** | SettlementExpired | The time-lock for the remittance has passed. | The sender may need to cancel and recreate the remittance with a new deadline. | -| **12** | DuplicateSettlement | The payment was already claimed or processed. | Check the transaction ledger; the funds have likely already been disbursed. | -| **13** | ContractPaused | Circuit breaker active due to maintenance or emergency. | Monitor the project's official status channels; wait for the admin to unpause. | + +## Error Codes & Troubleshooting + +| Code | Error Name | Common Cause | Resolution Steps | +| :--- | :--- | :--- | :--- | +| **1** | AlreadyInitialized | Attempting to call initialize() on an active contract. | No action required. If re-configuration is needed, check if an update function exists. | +| **2** | NotInitialized | Operations attempted before the contract setup is complete. | The administrator must call the initialize() function with valid parameters. | +| **3** | InvalidAmount | Providing zero or negative values for remittance. | Ensure the transfer amount is a positive integer greater than 0. | +| **4** | InvalidFeeBps | Fee percentage is set outside the 0-100% (0-10000 bps) range. | Adjust the basis points to fall within the valid range (e.g., 2.5% = 250 bps). | +| **5** | AgentNotRegistered | Using an address that hasn't been added to the whitelist. 
| Register the agent address first using the +egister_agent function. | +| **6** | RemittanceNotFound | Querying an ID that does not exist on the ledger. | Verify the Remittance ID from your transaction history or event logs. | +| **7** | InvalidStatus | Operation not allowed in current state (e.g. canceling a settled payment). | Check the current status of the remittance via get_remittance before retrying. | +| **11** | SettlementExpired | The time-lock for the remittance has passed. | The sender may need to cancel and recreate the remittance with a new deadline. | +| **12** | DuplicateSettlement | The payment was already claimed or processed. | Check the transaction ledger; the funds have likely already been disbursed. | +| **13** | ContractPaused | Circuit breaker active due to maintenance or emergency. | Monitor the project's official status channels; wait for the admin to unpause. | diff --git a/RUNBOOK.md b/RUNBOOK.md new file mode 100644 index 00000000..bd597d50 --- /dev/null +++ b/RUNBOOK.md @@ -0,0 +1,350 @@ +# SwiftRemit Operational Runbook + +On-call reference for common production procedures. All `soroban contract invoke` commands assume the following environment variables are set: + +```bash +export CONTRACT_ID= +export NETWORK=mainnet # or testnet +export RPC_URL= +export ADMIN_IDENTITY= +``` + +--- + +## 1. Emergency Pause + +Use when a security incident, suspicious activity, or external threat requires halting all contract operations immediately. 
+ +**Pause reasons:** `SecurityIncident` | `SuspiciousActivity` | `MaintenanceWindow` | `ExternalThreat` + +```bash +soroban contract invoke \ + --id $CONTRACT_ID \ + --source $ADMIN_IDENTITY \ + --network $NETWORK \ + -- \ + emergency_pause \ + --caller $ADMIN_ADDRESS \ + --reason SecurityIncident +``` + +Verify the pause took effect: + +```bash +soroban contract invoke \ + --id $CONTRACT_ID \ + --source $ADMIN_IDENTITY \ + --network $NETWORK \ + -- \ + health +``` + +Confirm `paused: true` and `pause_reason` matches the reason supplied. + +**After pausing:** +- Post an incident notice in the team Slack channel (`#incidents`). +- Open a GitHub issue tagged `incident` with the pause reason and ledger sequence. +- The frontend `ContractHealth` widget will automatically display the pause banner to users within 60 seconds. + +--- + +## 2. Unpause After Incident Resolution + +Unpausing requires admin quorum votes (default: 1). If a timelock is configured, the elapsed time since the pause must exceed `timelock_seconds` before the unpause is accepted. + +**Step 1 — each admin casts a vote:** + +```bash +soroban contract invoke \ + --id $CONTRACT_ID \ + --source $ADMIN_IDENTITY \ + --network $NETWORK \ + -- \ + vote_unpause \ + --caller $ADMIN_ADDRESS +``` + +Once quorum is reached the contract unpauses automatically. If quorum is already met and the timelock has elapsed, any admin can trigger the unpause directly: + +**Step 2 (optional direct unpause after quorum + timelock):** + +```bash +soroban contract invoke \ + --id $CONTRACT_ID \ + --source $ADMIN_IDENTITY \ + --network $NETWORK \ + -- \ + emergency_unpause \ + --caller $ADMIN_ADDRESS +``` + +Verify: + +```bash +soroban contract invoke \ + --id $CONTRACT_ID \ + --source $ADMIN_IDENTITY \ + --network $NETWORK \ + -- \ + health +``` + +Confirm `paused: false`. + +**After unpausing:** +- Close the incident GitHub issue. +- Post a resolution notice in `#incidents` with the ledger sequence of the unpause. 
+ +--- + +## 3. Rotate Admin Keys via Governance Proposal + +Admin key rotation uses the on-chain governance module. The process is: propose → vote → execute (after timelock). + +**Step 1 — propose adding the new admin:** + +```bash +soroban contract invoke \ + --id $CONTRACT_ID \ + --source $ADMIN_IDENTITY \ + --network $NETWORK \ + -- \ + propose \ + --proposer $CURRENT_ADMIN_ADDRESS \ + --action '{"AddAdmin": "<NEW_ADMIN_ADDRESS>"}' +``` + +Note the returned `proposal_id`. + +**Step 2 — each admin votes to approve:** + +```bash +soroban contract invoke \ + --id $CONTRACT_ID \ + --source $ADMIN_IDENTITY \ + --network $NETWORK \ + -- \ + vote \ + --voter $ADMIN_ADDRESS \ + --proposal_id <PROPOSAL_ID> +``` + +**Step 3 — execute after timelock elapses:** + +```bash +soroban contract invoke \ + --id $CONTRACT_ID \ + --source $ADMIN_IDENTITY \ + --network $NETWORK \ + -- \ + execute \ + --executor $ADMIN_ADDRESS \ + --proposal_id <PROPOSAL_ID> +``` + +**Step 4 — remove the old admin key (repeat steps 1–3 with `RemoveAdmin`):** + +```bash +# Propose removal +soroban contract invoke \ + --id $CONTRACT_ID \ + --source $ADMIN_IDENTITY \ + --network $NETWORK \ + -- \ + propose \ + --proposer $NEW_ADMIN_ADDRESS \ + --action '{"RemoveAdmin": "<OLD_ADMIN_ADDRESS>"}' +``` + +Vote and execute as above. Verify with: + +```bash +soroban contract invoke \ + --id $CONTRACT_ID \ + --source $ADMIN_IDENTITY \ + --network $NETWORK \ + -- \ + get_admin_count +``` + +--- + +## 4. Handle a Stuck Migration + +A migration can become stuck if a batch import fails mid-flight or the contract is paused during migration. + +**Check current migration state:** + +```bash +soroban contract invoke \ + --id $CONTRACT_ID \ + --source $ADMIN_IDENTITY \ + --network $NETWORK \ + -- \ + export_state +``` + +Inspect `schema_version` and whether a rollback snapshot exists. 
+ +**Option A — abort and reset to Idle:** + +```bash +soroban contract invoke \ + --id $CONTRACT_ID \ + --source $ADMIN_IDENTITY \ + --network $NETWORK \ + -- \ + abort_migration \ + --caller $ADMIN_ADDRESS +``` + +This emits a `mig.aborted` event and resets migration state. The contract returns to normal operation. + +**Option B — rollback to pre-migration snapshot:** + +```bash +soroban contract invoke \ + --id $CONTRACT_ID \ + --source $ADMIN_IDENTITY \ + --network $NETWORK \ + -- \ + rollback_migration +``` + +After rollback, verify the schema version has reverted and re-run the migration from batch 0. + +**Resuming a partial batch migration:** + +If only some batches were imported, resume from the next expected batch number (visible in the stuck state export): + +```bash +soroban contract invoke \ + --id $CONTRACT_ID \ + --source $ADMIN_IDENTITY \ + --network $NETWORK \ + -- \ + import_batch \ + --batch '' +``` + +--- + +## 5. Replay Failed Webhook Deliveries + +The webhook dispatcher persists delivery attempts in the `webhook_deliveries` table. Failed deliveries can be replayed via the backend admin API. 
+ +**List failed deliveries (last 100):** + +```bash +psql $DATABASE_URL -c " + SELECT id, event_type, anchor_id, created_at, attempt_count, last_error + FROM webhook_deliveries + WHERE status = 'failed' + ORDER BY created_at DESC + LIMIT 100; +" +``` + +**Replay a single delivery:** + +```bash +curl -X POST http://localhost:3001/admin/webhooks/replay \ + -H 'Content-Type: application/json' \ + -d '{"delivery_id": ""}' +``` + +**Replay all failed deliveries for an anchor:** + +```bash +curl -X POST http://localhost:3001/admin/webhooks/replay-anchor \ + -H 'Content-Type: application/json' \ + -d '{"anchor_id": "", "status": "failed"}' +``` + +**Replay dispute events specifically** (if `dispute_raised` or `dispute_resolved` deliveries failed): + +```bash +psql $DATABASE_URL -c " + SELECT id FROM webhook_deliveries + WHERE event_type IN ('dispute_raised', 'dispute_resolved') + AND status = 'failed'; +" | xargs -I{} curl -X POST http://localhost:3001/admin/webhooks/replay \ + -H 'Content-Type: application/json' \ + -d '{"delivery_id": "{}"}' +``` + +Monitor delivery status: + +```bash +psql $DATABASE_URL -c " + SELECT status, count(*) FROM webhook_deliveries GROUP BY status; +" +``` + +--- + +## 6. Extend Contract Storage TTL + +Soroban persistent storage entries expire after a set number of ledgers. Extend TTL before entries expire to avoid data loss. 
+ +**Check current TTL for a remittance entry:** + +```bash +soroban contract invoke \ + --id $CONTRACT_ID \ + --source $ADMIN_IDENTITY \ + --network $NETWORK \ + -- \ + get_remittance \ + --remittance_id <REMITTANCE_ID> +``` + +**Extend TTL via Soroban CLI (bump ledgers):** + +```bash +soroban contract extend \ + --id $CONTRACT_ID \ + --source $ADMIN_IDENTITY \ + --network $NETWORK \ + --ledgers-to-extend 500000 \ + --durability persistent +``` + +For individual storage keys (e.g., a specific remittance): + +```bash +soroban contract extend \ + --id $CONTRACT_ID \ + --source $ADMIN_IDENTITY \ + --network $NETWORK \ + --key '{"Remittance": <REMITTANCE_ID>}' \ + --ledgers-to-extend 500000 \ + --durability persistent +``` + +Recommended: run a scheduled job (weekly) to bump TTL on all active remittances before they approach expiry. The `process_expired_remittances` function handles logical expiry; this procedure handles Soroban storage-level TTL. + +--- + +## 7. Escalation Contacts and SLA Targets + +| Severity | Definition | Response SLA | Resolution SLA | Escalation Path | +|----------|-----------|-------------|----------------|-----------------| +| P0 | Contract paused / funds at risk | 15 min | 2 hours | On-call engineer → Lead engineer → CTO | +| P1 | Webhook delivery failures > 10% | 30 min | 4 hours | On-call engineer → Backend lead | +| P2 | Migration stuck / partial state | 1 hour | 8 hours | On-call engineer → Contract lead | +| P3 | TTL warnings / non-critical degradation | 4 hours | 24 hours | On-call engineer | + +**Escalation contacts:** + +| Role | Contact | +|------|---------| +| On-call engineer | Rotate weekly — see PagerDuty schedule | +| Contract lead | See `CONTRIBUTING.md` maintainers section | +| Backend lead | See `CONTRIBUTING.md` maintainers section | +| Security incidents | security@[your-domain] | + +**Incident channels:** +- Slack: `#incidents` (P0/P1), `#engineering` (P2/P3) +- GitHub: tag issues with `incident` label and severity (`P0`–`P3`) +- Post-mortems: required 
for all P0 incidents within 48 hours of resolution diff --git a/backend/src/webhook-handler.ts b/backend/src/webhook-handler.ts index fdfd6c4a..87b63ad5 100644 --- a/backend/src/webhook-handler.ts +++ b/backend/src/webhook-handler.ts @@ -152,6 +152,12 @@ export class WebhookHandler { case 'daily_limit_updated': await this.handleDailyLimitUpdated(req.body); break; + case 'dispute_raised': + await this.handleDisputeRaised(req.body); + break; + case 'dispute_resolved': + await this.handleDisputeResolved(req.body); + break; default: res.status(400).json({ error: 'Unknown event type' }); return; @@ -285,6 +291,49 @@ export class WebhookHandler { }); } + /** + * Handle dispute_raised contract event. + * Logs the dispute and notifies relevant webhook subscribers. + */ + private async handleDisputeRaised(payload: any): Promise { + const { remittance_id, sender, evidence_hash, ledger_sequence, timestamp } = payload; + console.info( + `[dispute_raised] remittance_id=${remittance_id} sender=${sender} ` + + `evidence_hash=${evidence_hash} ledger=${ledger_sequence} ts=${timestamp}` + ); + await this.pool.query( + `INSERT INTO dispute_audit_log + (remittance_id, event_type, sender, evidence_hash, ledger_sequence, event_timestamp, recorded_at) + VALUES ($1, 'raised', $2, $3, $4, to_timestamp($5), NOW()) + ON CONFLICT DO NOTHING`, + [remittance_id, sender, evidence_hash, ledger_sequence, timestamp] + ).catch((err: Error) => { + console.warn('[dispute_raised] audit log insert failed (table may not exist):', err.message); + }); + } + + /** + * Handle dispute_resolved contract event. + * Logs the resolution outcome and notifies relevant webhook subscribers. 
+ */ + private async handleDisputeResolved(payload: any): Promise { + const { remittance_id, admin, in_favour_of_sender, resulting_status, ledger_sequence, timestamp } = payload; + console.info( + `[dispute_resolved] remittance_id=${remittance_id} admin=${admin} ` + + `in_favour_of_sender=${in_favour_of_sender} resulting_status=${resulting_status} ` + + `ledger=${ledger_sequence} ts=${timestamp}` + ); + await this.pool.query( + `INSERT INTO dispute_audit_log + (remittance_id, event_type, admin_address, in_favour_of_sender, resulting_status, ledger_sequence, event_timestamp, recorded_at) + VALUES ($1, 'resolved', $2, $3, $4, $5, to_timestamp($6), NOW()) + ON CONFLICT DO NOTHING`, + [remittance_id, admin, in_favour_of_sender, resulting_status, ledger_sequence, timestamp] + ).catch((err: Error) => { + console.warn('[dispute_resolved] audit log insert failed (table may not exist):', err.message); + }); + } + /** * Handle SEP-24 deposit/withdrawal update webhook */ diff --git a/frontend/src/components/ContractHealth.jsx b/frontend/src/components/ContractHealth.jsx index e143f1f0..ff85418d 100644 --- a/frontend/src/components/ContractHealth.jsx +++ b/frontend/src/components/ContractHealth.jsx @@ -2,38 +2,52 @@ import { useState, useEffect, useCallback } from 'react' const AUTO_REFRESH_MS = 60_000 +const PAUSE_REASON_LABELS = { + SecurityIncident: 'Security Incident', + SuspiciousActivity: 'Suspicious Activity', + MaintenanceWindow: 'Maintenance Window', + ExternalThreat: 'External Threat', +} + /** * ContractHealth widget — polls the contract's health() function and displays * initialized status, pause state, admin count, total remittances, and * accumulated fees. Includes a withdraw fees button for admins. 
+ * + * Props: + * walletAddress — connected wallet address (optional) + * contractId — deployed contract ID + * onPausedChange — callback(isPaused: boolean) fired whenever pause state changes */ -export default function ContractHealth({ walletAddress, contractId }) { +export default function ContractHealth({ walletAddress, contractId, onPausedChange }) { const [health, setHealth] = useState(null) const [loading, setLoading] = useState(false) const [error, setError] = useState(null) const [lastChecked, setLastChecked] = useState(null) const [withdrawing, setWithdrawing] = useState(false) const [withdrawResult, setWithdrawResult] = useState(null) + const [bannerDismissed, setBannerDismissed] = useState(false) const fetchHealth = useCallback(async () => { if (!contractId) return setLoading(true) setError(null) try { - // In a real integration this would call the contract via Soroban RPC. - // Here we use the REST API proxy if available, otherwise show a placeholder. const apiBase = import.meta.env.VITE_API_URL || '' const res = await fetch(`${apiBase}/api/contract/health?contractId=${encodeURIComponent(contractId)}`) if (!res.ok) throw new Error(`Health check failed: ${res.status}`) const data = await res.json() setHealth(data) setLastChecked(new Date()) + // Re-show banner on next poll if still paused + if (data.paused) setBannerDismissed(false) + onPausedChange?.(data.paused) } catch (err) { setError(err.message || 'Failed to fetch contract health') } finally { setLoading(false) } - }, [contractId]) + }, [contractId, onPausedChange]) // Initial fetch + auto-refresh every 60 s useEffect(() => { @@ -73,8 +87,56 @@ export default function ContractHealth({ walletAddress, contractId }) { ) } + const pauseReasonLabel = health?.pause_reason + ? (PAUSE_REASON_LABELS[health.pause_reason] ?? health.pause_reason) + : null + return (
+ {/* Prominent pause banner — shown outside the panel when paused and not dismissed */} + {health?.paused && !bannerDismissed && ( +
+
+ 🔴 +
+ Service temporarily paused{pauseReasonLabel ? `: ${pauseReasonLabel}` : ''} +

+ Transaction submission is disabled until the service resumes. +

+
+
+ +
+ )} +

Contract Health

@@ -132,7 +196,7 @@ export default function ContractHealth({ walletAddress, contractId }) {