diff --git a/excalidraw/016-virtual-cluster-lifecycle.excalidraw b/excalidraw/016-virtual-cluster-lifecycle.excalidraw new file mode 100644 index 0000000..6931e2f --- /dev/null +++ b/excalidraw/016-virtual-cluster-lifecycle.excalidraw @@ -0,0 +1,1147 @@ +{ + "type": "excalidraw", + "version": 2, + "source": "https://excalidraw.com", + "elements": [ + { + "type": "rectangle", + "version": 2, + "versionNonce": 1793025876, + "isDeleted": false, + "id": "state_init", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 0, + "opacity": 100, + "angle": 0, + "x": 300, + "y": 30, + "strokeColor": "#1e1e1e", + "backgroundColor": "#dbe4ff", + "width": 180, + "height": 60, + "seed": 100001, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [ + { + "id": "text_init", + "type": "text" + }, + { + "id": "arrow_success", + "type": "arrow" + }, + { + "id": "arrow_failure", + "type": "arrow" + }, + { + "id": "arrow_reload", + "type": "arrow" + }, + { + "id": "arrow_retry", + "type": "arrow" + } + ], + "updated": 1772077195656, + "link": null, + "locked": false, + "index": "a0" + }, + { + "type": "text", + "version": 2, + "versionNonce": 1118595052, + "isDeleted": false, + "id": "text_init", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 0, + "opacity": 100, + "angle": 0, + "x": 330, + "y": 47, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 120, + "height": 25, + "seed": 100002, + "groupIds": [], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1772077195656, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 1, + "text": "initializing", + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "state_init", + "originalText": "initializing", + "lineHeight": 1.25, + "autoResize": true, + "index": "a1" + }, + { + "type": "rectangle", + "version": 2, + "versionNonce": 1520393428, + "isDeleted": 
false, + "id": "state_accept", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 0, + "opacity": 100, + "angle": 0, + "x": 60, + "y": 230, + "strokeColor": "#1e1e1e", + "backgroundColor": "#d3f9d8", + "width": 160, + "height": 60, + "seed": 100003, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [ + { + "id": "text_accept", + "type": "text" + }, + { + "id": "arrow_success", + "type": "arrow" + }, + { + "id": "arrow_shutdown", + "type": "arrow" + } + ], + "updated": 1772077195656, + "link": null, + "locked": false, + "index": "a2" + }, + { + "type": "text", + "version": 2, + "versionNonce": 657492588, + "isDeleted": false, + "id": "text_accept", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 0, + "opacity": 100, + "angle": 0, + "x": 93, + "y": 247, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 94, + "height": 25, + "seed": 100004, + "groupIds": [], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1772077195656, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 1, + "text": "accepting", + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "state_accept", + "originalText": "accepting", + "lineHeight": 1.25, + "autoResize": true, + "index": "a3" + }, + { + "type": "rectangle", + "version": 2, + "versionNonce": 163881556, + "isDeleted": false, + "id": "state_failed", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 0, + "opacity": 100, + "angle": 0, + "x": 530, + "y": 230, + "strokeColor": "#1e1e1e", + "backgroundColor": "#ffc9c9", + "width": 130, + "height": 60, + "seed": 100005, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [ + { + "id": "text_failed", + "type": "text" + }, + { + "id": "arrow_failure", + "type": "arrow" + }, + { + "id": "arrow_retry", + "type": "arrow" + }, + { + "id": 
"arrow_remove", + "type": "arrow" + } + ], + "updated": 1772077195656, + "link": null, + "locked": false, + "index": "a4" + }, + { + "type": "text", + "version": 2, + "versionNonce": 1544314092, + "isDeleted": false, + "id": "text_failed", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 0, + "opacity": 100, + "angle": 0, + "x": 565, + "y": 247, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 60, + "height": 25, + "seed": 100006, + "groupIds": [], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1772077195656, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 1, + "text": "failed", + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "state_failed", + "originalText": "failed", + "lineHeight": 1.25, + "autoResize": true, + "index": "a5" + }, + { + "type": "rectangle", + "version": 4, + "versionNonce": 630473964, + "isDeleted": false, + "id": "state_drain", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 0, + "opacity": 100, + "angle": 0, + "x": 60, + "y": 430, + "strokeColor": "#1e1e1e", + "backgroundColor": "#fff3bf", + "width": 150, + "height": 60, + "seed": 100007, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [ + { + "id": "text_drain", + "type": "text" + }, + { + "id": "arrow_shutdown", + "type": "arrow" + }, + { + "id": "arrow_timeout", + "type": "arrow" + }, + { + "id": "arrow_reload", + "type": "arrow" + } + ], + "updated": 1772077211400, + "link": null, + "locked": false, + "index": "a6" + }, + { + "type": "text", + "version": 2, + "versionNonce": 1523179372, + "isDeleted": false, + "id": "text_drain", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 0, + "opacity": 100, + "angle": 0, + "x": 93, + "y": 447, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 84, + "height": 25, + "seed": 100008, 
+ "groupIds": [], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1772077195656, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 1, + "text": "draining", + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "state_drain", + "originalText": "draining", + "lineHeight": 1.25, + "autoResize": true, + "index": "a7" + }, + { + "type": "rectangle", + "version": 2, + "versionNonce": 1993809236, + "isDeleted": false, + "id": "state_stop", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 0, + "opacity": 100, + "angle": 0, + "x": 270, + "y": 620, + "strokeColor": "#1e1e1e", + "backgroundColor": "#dee2e6", + "width": 170, + "height": 60, + "seed": 100009, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [ + { + "id": "text_stop", + "type": "text" + }, + { + "id": "arrow_timeout", + "type": "arrow" + }, + { + "id": "arrow_remove", + "type": "arrow" + } + ], + "updated": 1772077195656, + "link": null, + "locked": false, + "index": "a8" + }, + { + "type": "text", + "version": 2, + "versionNonce": 1538462188, + "isDeleted": false, + "id": "text_stop", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 0, + "opacity": 100, + "angle": 0, + "x": 310, + "y": 637, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 70, + "height": 25, + "seed": 100010, + "groupIds": [], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1772077195656, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 1, + "text": "stopped", + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "state_stop", + "originalText": "stopped", + "lineHeight": 1.25, + "autoResize": true, + "index": "a9" + }, + { + "type": "arrow", + "version": 2, + "versionNonce": 976245460, + "isDeleted": false, + "id": "arrow_success", + "fillStyle": "solid", + "strokeWidth": 2, + 
"strokeStyle": "solid", + "roughness": 0, + "opacity": 100, + "angle": 0, + "x": 340, + "y": 90, + "strokeColor": "#2f9e44", + "backgroundColor": "transparent", + "width": 200, + "height": 140, + "seed": 100011, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [ + { + "id": "label_success", + "type": "text" + } + ], + "updated": 1772077195656, + "link": null, + "locked": false, + "startBinding": { + "mode": "orbit", + "elementId": "state_init", + "fixedPoint": [ + 0.36556420693639274, + 0.6344357930636081 + ] + }, + "endBinding": { + "mode": "orbit", + "elementId": "state_accept", + "fixedPoint": [ + 0.3222625907224856, + 0.3222625907224857 + ] + }, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": "arrow", + "points": [ + [ + 0, + 0 + ], + [ + -200, + 140 + ] + ], + "index": "aA" + }, + { + "type": "text", + "version": 2, + "versionNonce": 2110911596, + "isDeleted": false, + "id": "label_success", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 0, + "opacity": 100, + "angle": 0, + "x": 210, + "y": 148, + "strokeColor": "#2f9e44", + "backgroundColor": "transparent", + "width": 64, + "height": 25, + "seed": 100012, + "groupIds": [], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1772077195656, + "link": null, + "locked": false, + "fontSize": 16, + "fontFamily": 1, + "text": "success", + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "arrow_success", + "originalText": "success", + "lineHeight": 1.25, + "autoResize": true, + "index": "aB" + }, + { + "type": "arrow", + "version": 2, + "versionNonce": 2009520212, + "isDeleted": false, + "id": "arrow_failure", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 0, + "opacity": 100, + "angle": 0, + "x": 440, + "y": 90, + "strokeColor": "#e03131", + "backgroundColor": "transparent", + "width": 155, + "height": 140, + "seed": 100013, + 
"groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [ + { + "id": "label_failure", + "type": "text" + } + ], + "updated": 1772077195656, + "link": null, + "locked": false, + "startBinding": { + "mode": "orbit", + "elementId": "state_init", + "fixedPoint": [ + 0.6807444026611206, + 0.68074440266112 + ] + }, + "endBinding": { + "mode": "orbit", + "elementId": "state_failed", + "fixedPoint": [ + 0.6787730383096512, + 0.3212269616903498 + ] + }, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": "arrow", + "points": [ + [ + 0, + 0 + ], + [ + 155, + 140 + ] + ], + "index": "aC" + }, + { + "type": "text", + "version": 2, + "versionNonce": 1861256940, + "isDeleted": false, + "id": "label_failure", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 0, + "opacity": 100, + "angle": 0, + "x": 490, + "y": 148, + "strokeColor": "#e03131", + "backgroundColor": "transparent", + "width": 54, + "height": 25, + "seed": 100014, + "groupIds": [], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1772077195656, + "link": null, + "locked": false, + "fontSize": 16, + "fontFamily": 1, + "text": "failure", + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "arrow_failure", + "originalText": "failure", + "lineHeight": 1.25, + "autoResize": true, + "index": "aD" + }, + { + "type": "arrow", + "version": 2, + "versionNonce": 460369364, + "isDeleted": false, + "id": "arrow_shutdown", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 0, + "opacity": 100, + "angle": 0, + "x": 140, + "y": 290, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 5, + "height": 140, + "seed": 100015, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [ + { + "id": "label_shutdown", + "type": "text" + } + ], + "updated": 1772077195656, + "link": null, + "locked": false, + 
"startBinding": { + "mode": "orbit", + "elementId": "state_accept", + "fixedPoint": [ + 0.5054381699530802, + 0.5054381699530798 + ] + }, + "endBinding": { + "mode": "orbit", + "elementId": "state_drain", + "fixedPoint": [ + 0.4941674554245221, + 0.4941674554245234 + ] + }, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": "arrow", + "points": [ + [ + 0, + 0 + ], + [ + -5, + 140 + ] + ], + "index": "aE" + }, + { + "type": "text", + "version": 2, + "versionNonce": 328780140, + "isDeleted": false, + "id": "label_shutdown", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 0, + "opacity": 100, + "angle": 0, + "x": 148, + "y": 348, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 130, + "height": 25, + "seed": 100016, + "groupIds": [], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1772077195656, + "link": null, + "locked": false, + "fontSize": 16, + "fontFamily": 1, + "text": "shutdown / reload", + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "arrow_shutdown", + "originalText": "shutdown / reload", + "lineHeight": 1.25, + "autoResize": true, + "index": "aF" + }, + { + "type": "arrow", + "version": 117, + "versionNonce": 1033204332, + "isDeleted": false, + "id": "arrow_reload", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "dashed", + "roughness": 0, + "opacity": 100, + "angle": 0, + "x": 54, + "y": 460.0054956271953, + "strokeColor": "#1971c2", + "backgroundColor": "transparent", + "width": 311.3757753572384, + "height": 400.0054956271953, + "seed": 100017, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [ + { + "id": "label_reload", + "type": "text" + } + ], + "updated": 1772077211400, + "link": null, + "locked": false, + "startBinding": { + "elementId": "state_drain", + "mode": "orbit", + "fixedPoint": [ + 0, + 0.5001 + ] + }, + "endBinding": null, + "lastCommittedPoint": 
null, + "startArrowhead": null, + "endArrowhead": "arrow", + "points": [ + [ + 0, + 0 + ], + [ + -65.37577535723841, + -0.0054956271952733005 + ], + [ + -65.37577535723841, + -400.0054956271953 + ], + [ + 246, + -400.0054956271953 + ] + ], + "index": "aG" + }, + { + "type": "text", + "version": 3, + "versionNonce": 550061420, + "isDeleted": false, + "id": "label_reload", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 0, + "opacity": 100, + "angle": 0, + "x": -80, + "y": 248, + "strokeColor": "#1971c2", + "backgroundColor": "transparent", + "width": 124.01591491699219, + "height": 20, + "seed": 100018, + "groupIds": [], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1772077207875, + "link": null, + "locked": false, + "fontSize": 16, + "fontFamily": 1, + "text": "drained (reload)", + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "arrow_reload", + "originalText": "drained (reload)", + "lineHeight": 1.25, + "autoResize": true, + "index": "aH" + }, + { + "type": "arrow", + "version": 2, + "versionNonce": 110512340, + "isDeleted": false, + "id": "arrow_timeout", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 0, + "opacity": 100, + "angle": 0, + "x": 135, + "y": 490, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 220, + "height": 130, + "seed": 100019, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [ + { + "id": "label_timeout", + "type": "text" + } + ], + "updated": 1772077195656, + "link": null, + "locked": false, + "startBinding": { + "mode": "orbit", + "elementId": "state_drain", + "fixedPoint": [ + 0.32257609284679245, + 0.6774239071532093 + ] + }, + "endBinding": { + "mode": "orbit", + "elementId": "state_stop", + "fixedPoint": [ + 0.6633629618042187, + 0.3366370381957798 + ] + }, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": "arrow", + 
"points": [ + [ + 0, + 0 + ], + [ + 220, + 130 + ] + ], + "index": "aI" + }, + { + "type": "text", + "version": 2, + "versionNonce": 191782508, + "isDeleted": false, + "id": "label_timeout", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 0, + "opacity": 100, + "angle": 0, + "x": 170, + "y": 543, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 130, + "height": 25, + "seed": 100020, + "groupIds": [], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1772077195656, + "link": null, + "locked": false, + "fontSize": 16, + "fontFamily": 1, + "text": "drained / timeout", + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "arrow_timeout", + "originalText": "drained / timeout", + "lineHeight": 1.25, + "autoResize": true, + "index": "aJ" + }, + { + "type": "arrow", + "version": 2, + "versionNonce": 1878920788, + "isDeleted": false, + "id": "arrow_retry", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "dashed", + "roughness": 0, + "opacity": 100, + "angle": 0, + "x": 660, + "y": 260, + "strokeColor": "#1971c2", + "backgroundColor": "transparent", + "width": 220, + "height": 200, + "seed": 100021, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [ + { + "id": "label_retry", + "type": "text" + } + ], + "updated": 1772077195656, + "link": null, + "locked": false, + "startBinding": { + "mode": "orbit", + "elementId": "state_failed", + "fixedPoint": [ + 0.5001, + 0.5001 + ] + }, + "endBinding": { + "mode": "orbit", + "elementId": "state_init", + "fixedPoint": [ + 0.5001, + 0.5001 + ] + }, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": "arrow", + "points": [ + [ + 0, + 0 + ], + [ + 40, + 0 + ], + [ + 40, + -200 + ], + [ + -180, + -200 + ] + ], + "index": "aK" + }, + { + "type": "text", + "version": 2, + "versionNonce": 560967916, + "isDeleted": false, + "id": "label_retry", + "fillStyle": 
"solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 0, + "opacity": 100, + "angle": 0, + "x": 702, + "y": 148, + "strokeColor": "#1971c2", + "backgroundColor": "transparent", + "width": 36, + "height": 25, + "seed": 100022, + "groupIds": [], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1772077195656, + "link": null, + "locked": false, + "fontSize": 16, + "fontFamily": 1, + "text": "retry", + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "arrow_retry", + "originalText": "retry", + "lineHeight": 1.25, + "autoResize": true, + "index": "aL" + }, + { + "type": "arrow", + "version": 2, + "versionNonce": 860955604, + "isDeleted": false, + "id": "arrow_remove", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 0, + "opacity": 100, + "angle": 0, + "x": 595, + "y": 290, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 195, + "height": 330, + "seed": 100023, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [ + { + "id": "label_remove", + "type": "text" + } + ], + "updated": 1772077195656, + "link": null, + "locked": false, + "startBinding": { + "mode": "orbit", + "elementId": "state_failed", + "fixedPoint": [ + 0.6176391868202962, + 0.6176391868202965 + ] + }, + "endBinding": { + "mode": "orbit", + "elementId": "state_stop", + "fixedPoint": [ + 0.7090851788550153, + 0.290914821144986 + ] + }, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": "arrow", + "points": [ + [ + 0, + 0 + ], + [ + -195, + 330 + ] + ], + "index": "aM" + }, + { + "type": "text", + "version": 2, + "versionNonce": 1084404588, + "isDeleted": false, + "id": "label_remove", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 0, + "opacity": 100, + "angle": 0, + "x": 430, + "y": 443, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 140, + "height": 25, + 
"seed": 100024, + "groupIds": [], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1772077195656, + "link": null, + "locked": false, + "fontSize": 16, + "fontFamily": 1, + "text": "remove / shutdown", + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "arrow_remove", + "originalText": "remove / shutdown", + "lineHeight": 1.25, + "autoResize": true, + "index": "aN" + } + ], + "appState": { + "gridSize": 20, + "gridStep": 5, + "gridModeEnabled": false, + "viewBackgroundColor": "#ffffff", + "lockedMultiSelections": {} + }, + "files": {} +} \ No newline at end of file diff --git a/proposals/016-virtual-cluster-lifecycle.md b/proposals/016-virtual-cluster-lifecycle.md new file mode 100644 index 0000000..95e8ec6 --- /dev/null +++ b/proposals/016-virtual-cluster-lifecycle.md @@ -0,0 +1,203 @@ +# Proposal 016: Virtual Cluster Lifecycle + +## Summary + +Introduce a lifecycle state model for virtual clusters so that each cluster has a well-defined operational state at all times. This enables the proxy to handle per-cluster failures gracefully — during startup, shutdown, and future configuration reload — without affecting other clusters. + +## Current Situation + +Today a virtual cluster is either fully registered (ports bound, accepting connections) or it does not exist. There is no intermediate or error state. + +This has several consequences: + +1. **Startup is all-or-nothing.** If one virtual cluster fails to start (e.g. port conflict, filter initialisation failure), the entire proxy process fails. Other clusters that could have started successfully are taken down with it. + +2. **Shutdown is unstructured.** The proxy stops accepting connections and closes channels, but there is no formal draining phase that ensures in-flight Kafka requests complete before the connection is torn down. + +3. 
**No foundation for partial failure.** Proposals such as [012 - Hot Reload](https://github.com/kroxylicious/design/pull/83) need the ability to express "cluster-b failed to apply new configuration but cluster-a is still serving traffic." Without a lifecycle model this state is undefined and unreportable. + +## Motivation + +A virtual cluster is the natural unit of independent operation — the smallest scope at which the proxy can contain a failure without affecting unrelated traffic. Today this independence is not modelled: the proxy treats all clusters as a single unit that either starts completely or fails completely. + +Making per-cluster independence explicit enables the proxy to isolate configuration errors, startup failures, and runtime problems to the cluster that caused them, rather than treating them as proxy-wide events. + +A lifecycle model provides: + +- **Resilient startup**: under the opt-in best-effort startup policy, clusters that can start do start; failures are reported rather than fatal. +- **Graceful shutdown**: in-flight requests complete (or time out) before connections close. +- **Observable state**: operators and tooling can query which clusters are operational and which have failed, and why. +- **A foundation for reload**: configuration reload ([012](https://github.com/kroxylicious/design/pull/83)) can define transitions on this state model rather than inventing its own. Lifecycle states are valuable independently of reload — they improve startup resilience and shutdown behaviour — so they are defined separately rather than being embedded in the reload proposal. + +## Proposal + +### Scope + +This proposal covers the lifecycle of individual virtual clusters — their filter chains and upstream connections. It does not cover proxy-level concerns such as port binding, management endpoint availability, or process-level shutdown sequencing. Port binding is managed by the proxy infrastructure and injected into the virtual cluster; the cluster's lifecycle does not own it.
A proxy-level lifecycle model is identified as future work. + +### Lifecycle States + +Each virtual cluster has exactly one state at any time: + +| State | Description | +|-------|-------------| +| **initializing** | The cluster is being set up. Not yet accepting connections. Used on first boot, when retrying from `failed`, and during configuration reload. | +| **accepting** | The proxy has completed setup for this cluster and is accepting connections. This state makes no claim about the availability of upstream brokers or other runtime dependencies — it means the proxy is ready to handle connection attempts. | +| **draining** | New connections are rejected. Existing connections remain open to give in-flight requests the opportunity to complete. Connections are closed once idle or when the drain timeout is reached. | +| **failed** | The proxy determined that the cluster's configuration is not viable. All partially-acquired resources are released on entry to this state. The proxy retains the cluster's configuration and failure reason for diagnostics and retry. | +| **stopped** | The cluster is no longer operational. All resources have been released. This is a terminal state. | + +### State Transitions + +![Virtual cluster lifecycle state diagram](diagrams/016-virtual-cluster-lifecycle.png) + +**Startup transitions:** +- `initializing` → `accepting`: configuration applied successfully. The proxy is ready to handle connection attempts for this cluster. +- `initializing` → `failed`: configuration could not be applied. Any partially-acquired resources are released before entering `failed`. The error is captured against the cluster state. + +**Shutdown transitions:** +- `accepting` → `draining`: the cluster is being shut down or removed. New connections are rejected; existing connections are given the opportunity to complete. +- `draining` → `stopped`: connections are closed (gracefully or via timeout). Terminal.
+- `failed` → `stopped`: the cluster is being removed or the proxy is shutting down. Since `failed` clusters have already released their resources, this is a bookkeeping transition. Terminal. + +**Reload transitions:** +- `accepting` → `draining`: connections are drained before reconfiguration. +- `draining` → `initializing`: the drain is complete and the cluster begins applying the new configuration. +- `initializing` → `accepting`: new configuration applied successfully. +- `initializing` → `failed`: new configuration could not be applied. Partial resources are cleaned up. + +Whether a previous configuration is available for rollback is implementation context that the runtime tracks, not a property of the lifecycle state. + +**Recovery transitions:** +- `failed` → `initializing`: a retry is requested (e.g. operator action, reload with corrected config). Since `failed` clusters have already released all resources, this is a clean start from scratch. + +### Proxy Startup Behaviour + +On startup, the proxy attempts to initialise each virtual cluster in the configuration. Clusters that succeed move to `accepting`. Clusters that fail move to `failed` with a captured reason. + +By default, the proxy fails to start if any cluster fails to initialise (fail-fast). This is the correct behaviour for most deployments — configuration errors should be surfaced immediately, especially in development and bare-metal environments. + +A configurable startup policy allows deployments where partial availability is preferable to no availability: + +```yaml +proxy: + startupPolicy: fail-fast # default — any cluster failure prevents startup + # startupPolicy: best-effort # start with whatever clusters succeed +``` + +In best-effort mode, the proxy starts and serves traffic for clusters that initialised successfully, while reporting failed clusters via health endpoints and logs. Kubernetes readiness probes or monitoring systems can apply their own thresholds (e.g.
"all clusters must be accepting" vs "at least one cluster must not be failed"). The operator would typically set this policy. + +### Graceful Shutdown + +When the proxy receives a shutdown signal: + +1. All `accepting` clusters transition to `draining`. +2. All `failed` clusters transition directly to `stopped`. +3. New connections are rejected for draining clusters. +4. For each existing connection, the proxy waits for in-flight requests to complete, up to a configurable drain timeout. +5. Once drained (or timed out), connections are closed and clusters move to `stopped`. +6. The proxy process exits. + +The drain timeout should be configurable. Kafka consumers with long poll timeouts (`max.poll.interval.ms` defaults to 5 minutes) or slow producers with `acks=all` can legitimately need more than the 30 seconds assumed in current code. + +```yaml +proxy: + drainTimeout: 60s # default TBD +``` + +### Observability + +Cluster lifecycle state should be observable — through management endpoints, logging, or metrics — so that operators and tooling can determine which clusters are accepting connections, which have failed, and why. The specific reporting mechanism is an implementation concern and not prescribed by this proposal. + +### Internal Representation + +Each virtual cluster holds a state object: + +```java +public record ClusterState( + LifecyclePhase phase, + Instant since, + @Nullable String reason) { + + public enum LifecyclePhase { + INITIALIZING, + ACCEPTING, + DRAINING, + FAILED, + STOPPED + } +} +``` + +State transitions should be validated — e.g. a cluster cannot move from `stopped` to any other state. Invalid transitions indicate a programming error and should throw. + +The component responsible for managing cluster state (likely an evolution of the existing `EndpointRegistry` or a new `ClusterLifecycleManager`) should be the single source of truth for state transitions, ensuring they are logged and observable. 
+ +## Affected/not affected projects + +**Affected:** +- **kroxylicious-proxy (runtime)**: startup logic, shutdown logic, endpoint registry, health endpoints. This is where the lifecycle state machine lives. +- **kroxylicious-operator**: may choose to inspect per-cluster state for readiness/health reporting. Not required to change immediately. + +**Not affected:** +- **kroxylicious-api**: the filter SPI is unaffected. Filters do not need to know about cluster lifecycle. +- **kroxylicious-kms** and other plugin modules: no changes needed. + +## Compatibility + +The default startup policy is fail-fast, which matches current behaviour — the proxy process exits if any cluster fails to initialise. Existing deployments are unaffected. + +The new best-effort startup policy is opt-in. Deployments that enable it should ensure they have appropriate health/readiness checks in place to detect partially-started proxies. + +## Rejected Alternatives + +### Single boolean health status + +We considered a simple healthy/unhealthy model rather than per-cluster states. This is insufficient because: +- It cannot distinguish "one cluster failed to start" from "the entire proxy is broken." +- It provides no information for recovery (which cluster? why?). +- It conflates cluster health with proxy health. + +### Automatic retry on failure + +We considered having the proxy automatically retry failed clusters on a backoff schedule. This adds complexity (retry policies, backoff configuration, thundering herd concerns) and is better left to external orchestration (Kubernetes controllers, operator logic) that already has retry infrastructure. The lifecycle model exposes the `failed` state; the decision to retry belongs to the operator. + +### Retaining resources in `failed` state + +We considered having `failed` clusters retain any resources they successfully acquired (e.g. partially-initialised filters, upstream connections) to make retry faster. 
However, this creates ambiguity about what state a `failed` cluster is actually in and complicates recovery logic. + +We decided against this: `failed` clusters release all partially-acquired resources on entry. This means a retry from `failed` is a clean `initializing` cycle. Clean teardown on failure keeps the `failed` state uniform: it always means "no resources held, here's what went wrong." + +### Separate `reinitializing` state for reload + +We considered a separate `reinitializing` state to distinguish first-time initialisation (no rollback target) from reload (previous configuration available). However, with port binding scoped to the proxy infrastructure rather than the virtual cluster, `initializing` is a clean slate in both cases from the cluster's perspective. Whether a previous configuration is available for rollback is implementation context the runtime tracks, not a lifecycle state concern. A single `initializing` state keeps the model simpler. + +### Reload through `stopped` + +We considered having the reload path go through `stopped` (`accepting` → `draining` → `stopped` → `initializing` → `accepting`). This would make `stopped` a non-terminal state, changing its meaning from "this cluster is done" to "this cluster might come back." This complicates the model — during shutdown, all clusters reach `stopped`, but some might be re-entering `initializing` for reload while others are genuinely finished. Keeping `stopped` terminal and routing reload through `draining` → `initializing` avoids this ambiguity. + +### Runtime health as lifecycle state + +We considered splitting the `accepting` state into `healthy` and `degraded` to model runtime health (upstream broker availability, KMS connectivity, etc.) as part of the lifecycle. However, `healthy` and `degraded` had identical inward and outward transitions — both could transition to `draining` for shutdown or reload, and neither gated any lifecycle decision. 
This is a strong signal that they are not lifecycle states. + +Runtime health is also inherently perspectival: different observers (direct clients, load balancers, monitoring systems) may define "healthy" differently, and health signals depend on polling mechanisms with inherent delays. Baking a health model into the lifecycle commits us to a definition we do not yet have and that may not be the same for all consumers. + +The lifecycle model's job is to track what the proxy is doing with a cluster — setting it up, accepting connections, draining them, or tearing it down. Whether the cluster can successfully serve traffic is a separate, orthogonal concern better addressed by readiness probes, health endpoints, or metrics that can evolve independently. + +### Runtime health model + +We considered defining a broader health model alongside the lifecycle — covering upstream broker reachability, KMS availability, filter readiness, and similar runtime concerns. This was ruled out of scope. Health depends on what the proxy is being used for: a proxy doing record encryption has different health criteria from one doing schema validation. The appropriate health model will vary by deployment, and may need to account for request-level routing where health is per-destination rather than per-cluster. Defining a health model prematurely would constrain future design options without providing immediate value. The lifecycle model intentionally leaves room for health to be addressed separately. + +## Future Enhancements + +### Reload without draining + +The current reload path requires draining connections before reinitialising (`accepting` → `draining` → `initializing`). In the future, it may be possible to skip the drain step for certain types of configuration change — for example, swapping the filter chain in place or reconnecting upstream without dropping client connections. + +This would introduce a direct `accepting` → `initializing` transition.
The state model as proposed accommodates this without structural changes: `initializing` already represents "setting up the cluster," and its exit transitions (`accepting` on success, `failed` on failure) remain the same regardless of whether draining preceded it. + +Some configuration changes will likely always require draining — for example, changes to the upstream cluster identity or TLS configuration that invalidate existing connections. The optimisation is about identifying changes where draining can be safely skipped, not about eliminating draining altogether. + +### Proxy-level lifecycle + +This proposal covers the lifecycle of individual virtual clusters. The proxy process itself has lifecycle concerns that sit above the per-cluster model: management port binding, process startup/shutdown sequencing, and aggregate health reporting. A proxy-level lifecycle model would define states and transitions for the process as a whole, with per-cluster states feeding into it. Port binding, which is managed by the proxy infrastructure and injected into virtual clusters, would naturally belong to this layer. diff --git a/proposals/diagrams/016-virtual-cluster-lifecycle.png b/proposals/diagrams/016-virtual-cluster-lifecycle.png new file mode 100644 index 0000000..2961844 Binary files /dev/null and b/proposals/diagrams/016-virtual-cluster-lifecycle.png differ