From 84e28ac1d5717dbfdb0b1613110f52320a1ece7c Mon Sep 17 00:00:00 2001 From: BisimJang Date: Wed, 29 Apr 2026 17:04:44 +0100 Subject: [PATCH 1/2] feat(backend): add keeper health degradation states and documentation --- keeper/README.md | 19 + keeper/__tests__/server.test.js | 52 + keeper/coverage/coverage-summary.json | 14 +- .../coverage/lcov-report/concurrency.js.html | 108 +- keeper/coverage/lcov-report/index.html | 76 +- keeper/coverage/lcov-report/logger.js.html | 48 +- keeper/coverage/lcov-report/poller.js.html | 327 ++- keeper/coverage/lcov-report/queue.js.html | 627 +++-- keeper/coverage/lcov-report/registry.js.html | 147 +- keeper/coverage/lcov-report/retry.js.html | 384 +-- keeper/coverage/lcov.info | 1516 +++++++----- keeper/package-lock.json | 2162 +++-------------- keeper/package.json | 3 +- keeper/src/metrics.js | 35 +- 14 files changed, 2425 insertions(+), 3093 deletions(-) diff --git a/keeper/README.md b/keeper/README.md index 4f198dff..6d4456c6 100644 --- a/keeper/README.md +++ b/keeper/README.md @@ -420,6 +420,25 @@ docker compose down docker compose up -d --build ``` +## Health & Operational Status + +The keeper provides a rich health endpoint at `/health` that exposes detailed operational states. This allows operators to distinguish between normal operation, temporary degradation, and total failure. + +### Health States + +| State | Description | HTTP Status | Recommended Action | +|-------|-------------|-------------|--------------------| +| `ok` | Normal operation. | 200 | No action required. | +| `degraded_rpc` | Partial RPC failure (Circuit Breaker is HALF_OPEN). | 200 | Monitor RPC connectivity; investigate network/provider stability. | +| `degraded_backlog` | High retry backlog pressure (>50 tasks). | 200 | Consider increasing `MAX_CONCURRENT_EXECUTIONS` or scale keeper instances. | +| `stale` | Polling activity has stopped or delayed beyond threshold. | 503 | Investigate if the main polling loop is hung. Sidecar will restart service. | +| `failing` | Total failure: RPC disconnected or Circuit Breaker is OPEN. | 503 | Check network connection, RPC endpoint availability, and credentials. | + +### Monitoring + +- **Health Check Sidecar**: Use the included `health-check-sidecar.sh` to automatically restart the keeper on `503` errors. +- **Prometheus**: Scrape `/metrics/prometheus` for real-time alerting on `keeper_rpc_connected` and `keeper_backlog_size`. + --- ## Need Help? diff --git a/keeper/__tests__/server.test.js b/keeper/__tests__/server.test.js index 721f7095..22dc5e2c 100644 --- a/keeper/__tests__/server.test.js +++ b/keeper/__tests__/server.test.js @@ -18,4 +18,56 @@ describe('Metrics', () => { const metrics = new Metrics(); expect(metrics).toBeDefined(); }); + + it('should return ok status when operating normally', () => { + const metrics = new Metrics(); + metrics.updateHealth({ rpcConnected: true }); + const health = metrics.getHealthStatus(60000); + expect(health.status).toBe('ok'); + expect(health.reason).toBe('Keeper is operating normally'); + }); + + it('should return failing status when rpc is disconnected', () => { + const metrics = new Metrics(); + metrics.updateHealth({ rpcConnected: false }); + const health = metrics.getHealthStatus(60000); + expect(health.status).toBe('failing'); + expect(health.reason).toBe('RPC connection lost or circuit breaker is OPEN'); + }); + + it('should return failing status when circuit breaker is OPEN', () => { + const metrics = new Metrics(); + metrics.updateHealth({ rpcConnected: true }); + metrics.record('rpcCircuitState', 2); // OPEN + const health = metrics.getHealthStatus(60000); + expect(health.status).toBe('failing'); + expect(health.reason).toBe('RPC connection lost or circuit breaker is OPEN'); + }); + + it('should return degraded_rpc when circuit breaker is HALF_OPEN', () => { + const metrics = new Metrics(); + metrics.updateHealth({ rpcConnected: true }); + metrics.record('rpcCircuitState', 1); // HALF_OPEN + const health = metrics.getHealthStatus(60000); + expect(health.status).toBe('degraded_rpc'); + expect(health.reason).toBe('Partial RPC failure, circuit breaker is HALF_OPEN'); + }); + + it('should return degraded_backlog when backlog is high', () => { + const metrics = new Metrics(); + metrics.updateHealth({ rpcConnected: true, backlogSize: 60 }); + const health = metrics.getHealthStatus(60000); + expect(health.status).toBe('degraded_backlog'); + expect(health.reason).toBe('High retry backlog pressure (60 tasks)'); + }); + + it('should return stale when polling is delayed', () => { + const metrics = new Metrics(); + metrics.updateHealth({ rpcConnected: true }); + // Set last poll to 2 minutes ago + metrics.lastPollAt = new Date(Date.now() - 120000); + const health = metrics.getHealthStatus(60000); + expect(health.status).toBe('stale'); + expect(health.reason).toBe('No polling activity for over 60000ms'); + }); }); diff --git a/keeper/coverage/coverage-summary.json b/keeper/coverage/coverage-summary.json index dd5276ff..587b3efe 100644 --- a/keeper/coverage/coverage-summary.json +++ b/keeper/coverage/coverage-summary.json @@ -1,8 +1,8 @@ -{"total": {"lines":{"total":384,"covered":350,"skipped":0,"pct":91.14},"statements":{"total":393,"covered":358,"skipped":0,"pct":91.09},"functions":{"total":67,"covered":66,"skipped":0,"pct":98.5},"branches":{"total":235,"covered":196,"skipped":0,"pct":83.4},"branchesTrue":{"total":0,"covered":0,"skipped":0,"pct":"Unknown"}} -,"C:\\Users\\c-christopher\\Desktop\\soro\\SoroTask\\keeper\\src\\concurrency.js": {"lines":{"total":22,"covered":22,"skipped":0,"pct":100},"functions":{"total":6,"covered":6,"skipped":0,"pct":100},"statements":{"total":22,"covered":22,"skipped":0,"pct":100},"branches":{"total":4,"covered":4,"skipped":0,"pct":100}} -,"C:\\Users\\c-christopher\\Desktop\\soro\\SoroTask\\keeper\\src\\logger.js": {"lines":{"total":33,"covered":33,"skipped":0,"pct":100},"functions":{"total":18,"covered":18,"skipped":0,"pct":100},"statements":{"total":33,"covered":33,"skipped":0,"pct":100},"branches":{"total":20,"covered":15,"skipped":0,"pct":75}} -,"C:\\Users\\c-christopher\\Desktop\\soro\\SoroTask\\keeper\\src\\poller.js": {"lines":{"total":117,"covered":95,"skipped":0,"pct":81.19},"functions":{"total":11,"covered":11,"skipped":0,"pct":100},"statements":{"total":118,"covered":96,"skipped":0,"pct":81.35},"branches":{"total":57,"covered":44,"skipped":0,"pct":77.19}} -,"C:\\Users\\c-christopher\\Desktop\\soro\\SoroTask\\keeper\\src\\queue.js": {"lines":{"total":69,"covered":66,"skipped":0,"pct":95.65},"functions":{"total":7,"covered":6,"skipped":0,"pct":85.71},"statements":{"total":71,"covered":67,"skipped":0,"pct":94.36},"branches":{"total":32,"covered":28,"skipped":0,"pct":87.5}} -,"C:\\Users\\c-christopher\\Desktop\\soro\\SoroTask\\keeper\\src\\registry.js": {"lines":{"total":72,"covered":64,"skipped":0,"pct":88.88},"functions":{"total":12,"covered":12,"skipped":0,"pct":100},"statements":{"total":74,"covered":66,"skipped":0,"pct":89.18},"branches":{"total":44,"covered":34,"skipped":0,"pct":77.27}} -,"C:\\Users\\c-christopher\\Desktop\\soro\\SoroTask\\keeper\\src\\retry.js": {"lines":{"total":71,"covered":70,"skipped":0,"pct":98.59},"functions":{"total":13,"covered":13,"skipped":0,"pct":100},"statements":{"total":75,"covered":74,"skipped":0,"pct":98.66},"branches":{"total":78,"covered":71,"skipped":0,"pct":91.02}} +{"total": {"lines":{"total":512,"covered":0,"skipped":0,"pct":0},"statements":{"total":526,"covered":0,"skipped":0,"pct":0},"functions":{"total":86,"covered":0,"skipped":0,"pct":0},"branches":{"total":314,"covered":0,"skipped":0,"pct":0},"branchesTrue":{"total":0,"covered":0,"skipped":0,"pct":"Unknown"}} +,"C:\\Users\\Jason\\Desktop\\DripWave\\Stellar\\SoroTask\\keeper\\src\\concurrency.js": {"lines":{"total":50,"covered":0,"skipped":0,"pct":0},"functions":{"total":8,"covered":0,"skipped":0,"pct":0},"statements":{"total":51,"covered":0,"skipped":0,"pct":0},"branches":{"total":26,"covered":0,"skipped":0,"pct":0}} +,"C:\\Users\\Jason\\Desktop\\DripWave\\Stellar\\SoroTask\\keeper\\src\\logger.js": {"lines":{"total":33,"covered":0,"skipped":0,"pct":0},"functions":{"total":18,"covered":0,"skipped":0,"pct":0},"statements":{"total":33,"covered":0,"skipped":0,"pct":0},"branches":{"total":20,"covered":0,"skipped":0,"pct":0}} +,"C:\\Users\\Jason\\Desktop\\DripWave\\Stellar\\SoroTask\\keeper\\src\\poller.js": {"lines":{"total":148,"covered":0,"skipped":0,"pct":0},"functions":{"total":17,"covered":0,"skipped":0,"pct":0},"statements":{"total":150,"covered":0,"skipped":0,"pct":0},"branches":{"total":81,"covered":0,"skipped":0,"pct":0}} +,"C:\\Users\\Jason\\Desktop\\DripWave\\Stellar\\SoroTask\\keeper\\src\\queue.js": {"lines":{"total":126,"covered":0,"skipped":0,"pct":0},"functions":{"total":13,"covered":0,"skipped":0,"pct":0},"statements":{"total":128,"covered":0,"skipped":0,"pct":0},"branches":{"total":55,"covered":0,"skipped":0,"pct":0}} +,"C:\\Users\\Jason\\Desktop\\DripWave\\Stellar\\SoroTask\\keeper\\src\\registry.js": {"lines":{"total":84,"covered":0,"skipped":0,"pct":0},"functions":{"total":17,"covered":0,"skipped":0,"pct":0},"statements":{"total":89,"covered":0,"skipped":0,"pct":0},"branches":{"total":54,"covered":0,"skipped":0,"pct":0}} +,"C:\\Users\\Jason\\Desktop\\DripWave\\Stellar\\SoroTask\\keeper\\src\\retry.js": {"lines":{"total":71,"covered":0,"skipped":0,"pct":0},"functions":{"total":13,"covered":0,"skipped":0,"pct":0},"statements":{"total":75,"covered":0,"skipped":0,"pct":0},"branches":{"total":78,"covered":0,"skipped":0,"pct":0}} } diff --git a/keeper/coverage/lcov-report/concurrency.js.html b/keeper/coverage/lcov-report/concurrency.js.html index 6e02217c..0a812fd8 100644 --- a/keeper/coverage/lcov-report/concurrency.js.html +++ b/keeper/coverage/lcov-report/concurrency.js.html @@ -103,44 +103,94 @@

All files concurrency.js

38 39 40 -41  -66x -66x -66x -66x +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 +84 +85 +86 +87 +88 +89 +90 +91 +92 +93 +94 +95 +96 +97 +98 +99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118    -66x -126x -64x     -62x -62x   -62x       -62x -62x       -66x -64x -64x -64x -  -  -66x -39x -2x -2x -  -  -  -66x -        @@ -371,7 +421,7 @@

All files concurrency.js