diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0824a607..2a0d71fb 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -32,14 +32,14 @@ jobs: with: node-version: ${{ env.NODE_VERSION }} - - name: Download Controller dependencies - working-directory: ./controller + - name: Download Kubernetes Controller dependencies + working-directory: ./k8s-controller run: | go mod tidy go mod download - - name: Lint Controller - working-directory: ./controller + - name: Lint Kubernetes Controller + working-directory: ./k8s-controller run: | go fmt ./... go vet ./... @@ -67,7 +67,7 @@ jobs: npm run lint test-controller: - name: Test Controller + name: Test Kubernetes Controller runs-on: ubuntu-latest steps: - name: Checkout code @@ -84,18 +84,18 @@ jobs: path: | ~/.cache/go-build ~/go/pkg/mod - key: ${{ runner.os }}-go-${{ hashFiles('controller/go.sum', 'controller/go.mod') }} + key: ${{ runner.os }}-go-${{ hashFiles('k8s-controller/go.sum', 'k8s-controller/go.mod') }} restore-keys: | ${{ runner.os }}-go- - name: Download dependencies - working-directory: ./controller + working-directory: ./k8s-controller run: | go mod download go mod tidy - name: Run tests - working-directory: ./controller + working-directory: ./k8s-controller run: | go test -v -race -coverprofile=coverage.out -covermode=atomic ./... 
go tool cover -func=coverage.out @@ -103,9 +103,9 @@ jobs: - name: Upload coverage to Codecov uses: codecov/codecov-action@v4 with: - files: ./controller/coverage.out - flags: controller - name: controller-coverage + files: ./k8s-controller/coverage.out + flags: k8s-controller + name: k8s-controller-coverage test-api: name: Test API @@ -231,17 +231,17 @@ jobs: with: node-version: ${{ env.NODE_VERSION }} - - name: Download Controller dependencies - working-directory: ./controller + - name: Download Kubernetes Controller dependencies + working-directory: ./k8s-controller run: | go mod tidy go mod download - - name: Build Controller - working-directory: ./controller + - name: Build Kubernetes Controller + working-directory: ./k8s-controller run: | CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -a -o bin/manager cmd/main.go - echo "Controller binary size: $(ls -lh bin/manager | awk '{print $5}')" + echo "Kubernetes Controller binary size: $(ls -lh bin/manager | awk '{print $5}')" - name: Download API dependencies working-directory: ./api @@ -262,11 +262,11 @@ jobs: npm run build echo "UI build size: $(du -sh build | awk '{print $1}')" - - name: Upload Controller artifact + - name: Upload Kubernetes Controller artifact uses: actions/upload-artifact@v4 with: - name: controller-binary - path: controller/bin/manager + name: k8s-controller-binary + path: k8s-controller/bin/manager - name: Upload API artifact uses: actions/upload-artifact@v4 diff --git a/.github/workflows/container-images.yml b/.github/workflows/container-images.yml index 83f6a28e..f5e5a631 100644 --- a/.github/workflows/container-images.yml +++ b/.github/workflows/container-images.yml @@ -9,7 +9,7 @@ on: - 'v*' paths: - 'api/**' - - 'controller/**' + - 'k8s-controller/**' - 'ui/**' - '.github/workflows/container-images.yml' pull_request: @@ -31,7 +31,7 @@ permissions: jobs: build-and-sign-controller: - name: Build & Sign Controller + name: Build & Sign Kubernetes Controller runs-on: ubuntu-latest steps: - 
name: Checkout code @@ -58,7 +58,7 @@ jobs: id: meta uses: docker/metadata-action@v5 with: - images: ${{ env.IMAGE_PREFIX }}-controller + images: ${{ env.IMAGE_PREFIX }}-kubernetes-controller tags: | type=ref,event=branch type=ref,event=pr @@ -75,12 +75,12 @@ jobs: echo "COMMIT=${{ github.sha }}" >> $GITHUB_OUTPUT echo "BUILD_DATE=$(date -u +"%Y-%m-%dT%H:%M:%SZ")" >> $GITHUB_OUTPUT - - name: Build and push Controller image + - name: Build and push Kubernetes Controller image id: build uses: docker/build-push-action@v5 with: - context: ./controller - file: ./controller/Dockerfile + context: ./k8s-controller + file: ./k8s-controller/Dockerfile platforms: linux/amd64,linux/arm64 push: ${{ github.event_name != 'pull_request' }} tags: ${{ steps.meta.outputs.tags }} @@ -139,20 +139,20 @@ jobs: fi done - IMAGE_REF="${{ env.IMAGE_PREFIX }}-controller@${DIGEST}" + IMAGE_REF="${{ env.IMAGE_PREFIX }}-kubernetes-controller@${DIGEST}" echo "Image reference for signing: $IMAGE_REF" cosign sign --yes "$IMAGE_REF" - - name: Generate SBOM for Controller + - name: Generate SBOM for Kubernetes Controller if: github.event_name != 'pull_request' uses: anchore/sbom-action@v0 with: - path: ./controller - artifact-name: streamspace-controller-sbom.spdx.json - output-file: sbom-controller.spdx.json + path: ./k8s-controller + artifact-name: streamspace-kubernetes-controller-sbom.spdx.json + output-file: sbom-kubernetes-controller.spdx.json format: spdx-json - - name: Attest Controller SBOM + - name: Attest Kubernetes Controller SBOM if: github.event_name != 'pull_request' env: COSIGN_EXPERIMENTAL: "true" @@ -185,18 +185,18 @@ jobs: fi done - IMAGE_REF="${{ env.IMAGE_PREFIX }}-controller@${DIGEST}" + IMAGE_REF="${{ env.IMAGE_PREFIX }}-kubernetes-controller@${DIGEST}" echo "Using digest for SBOM attestation: $DIGEST" cosign attest --yes --type spdxjson \ - --predicate sbom-controller.spdx.json \ + --predicate sbom-kubernetes-controller.spdx.json \ "$IMAGE_REF" - - name: Upload Controller SBOM 
+ - name: Upload Kubernetes Controller SBOM if: github.event_name != 'pull_request' uses: actions/upload-artifact@v4 with: - name: sbom-controller - path: sbom-controller.spdx.json + name: sbom-kubernetes-controller + path: sbom-kubernetes-controller.spdx.json retention-days: 90 build-and-sign-api: @@ -544,7 +544,7 @@ jobs: needs: [build-and-sign-controller, build-and-sign-api, build-and-sign-ui] strategy: matrix: - component: [controller, api, ui] + component: [kubernetes-controller, api, ui] steps: - name: Install Cosign uses: sigstore/cosign-installer@v3 @@ -725,7 +725,7 @@ jobs: echo "## 🐳 Container Images Built" >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY echo "### Images" >> $GITHUB_STEP_SUMMARY - echo "- βœ… \`${{ env.IMAGE_PREFIX }}-controller:latest\`" >> $GITHUB_STEP_SUMMARY + echo "- βœ… \`${{ env.IMAGE_PREFIX }}-kubernetes-controller:latest\`" >> $GITHUB_STEP_SUMMARY echo "- βœ… \`${{ env.IMAGE_PREFIX }}-api:latest\`" >> $GITHUB_STEP_SUMMARY echo "- βœ… \`${{ env.IMAGE_PREFIX }}-ui:latest\`" >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index b6a51806..79c3d131 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -87,7 +87,7 @@ jobs: All images are available for both \`linux/amd64\` and \`linux/arm64\` platforms: - - Controller: \`ghcr.io/${{ github.repository_owner }}/streamspace-controller:${{ steps.version.outputs.VERSION }}\` + - Kubernetes Controller: \`ghcr.io/${{ github.repository_owner }}/streamspace-kubernetes-controller:${{ steps.version.outputs.VERSION }}\` - API: \`ghcr.io/${{ github.repository_owner }}/streamspace-api:${{ steps.version.outputs.VERSION }}\` - UI: \`ghcr.io/${{ github.repository_owner }}/streamspace-ui:${{ steps.version.outputs.VERSION }}\` @@ -171,7 +171,7 @@ jobs: needs: release strategy: matrix: - component: [controller, api, ui] + component: [kubernetes-controller, api, ui] steps: - name: Extract 
version id: version diff --git a/.github/workflows/security-scan.yml b/.github/workflows/security-scan.yml index 2be2a730..96b4edfd 100644 --- a/.github/workflows/security-scan.yml +++ b/.github/workflows/security-scan.yml @@ -21,7 +21,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - component: [api, ui, controller] + component: [api, ui, kubernetes-controller] steps: - name: Checkout code uses: actions/checkout@v4 @@ -35,8 +35,8 @@ jobs: docker build -t streamspace-api:scan ./api elif [ "${{ matrix.component }}" = "ui" ]; then docker build -t streamspace-ui:scan ./ui - elif [ "${{ matrix.component }}" = "controller" ]; then - docker build -t streamspace-controller:scan ./controller + elif [ "${{ matrix.component }}" = "kubernetes-controller" ]; then + docker build -t streamspace-kubernetes-controller:scan ./k8s-controller fi - name: Run Trivy vulnerability scanner @@ -77,7 +77,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - component: [api, controller] + component: [api, k8s-controller] steps: - name: Checkout code uses: actions/checkout@v4 @@ -241,7 +241,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - component: [api, ui, controller] + component: [api, ui, k8s-controller] steps: - name: Checkout code uses: actions/checkout@v4 diff --git a/CLAUDE.md b/CLAUDE.md index a2c6665c..af469d9c 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -363,7 +363,7 @@ streamspace/ β”‚ β”œβ”€β”€ PLUGIN_DEVELOPMENT.md # Plugin development guide β”‚ -β”œβ”€β”€ controller/ # Go controller using Kubebuilder +β”œβ”€β”€ k8s-controller/ # Go Kubernetes controller using Kubebuilder β”‚ β”œβ”€β”€ cmd/ # Main entry point β”‚ β”œβ”€β”€ internal/ # Controller logic, reconcilers β”‚ β”œβ”€β”€ api/ # CRD type definitions @@ -408,7 +408,7 @@ streamspace/ - **`scripts/`**: Automation scripts for template generation and utilities -- **`controller/`**: Go-based Kubernetes controller (Kubebuilder) +- **`k8s-controller/`**: Go-based Kubernetes controller (Kubebuilder) - Manages Session 
lifecycle and hibernation - Reconciles CRD resources with Kubernetes state diff --git a/DEPLOYMENT.md b/DEPLOYMENT.md index cb7978d0..c037bb3b 100644 --- a/DEPLOYMENT.md +++ b/DEPLOYMENT.md @@ -32,7 +32,7 @@ kubectl create namespace streamspace ### 2. Deploy CRDs ```bash -kubectl apply -f controller/config/crd/bases/ +kubectl apply -f k8s-controller/config/crd/bases/ ``` Verify: @@ -93,7 +93,7 @@ Edit the deployment manifests to use your registry: ```bash # Update controller image sed -i 's|your-registry/streamspace-controller:v0.2.0|ghcr.io/yourname/streamspace-controller:v0.2.0|' \ - controller/config/manager/controller-deployment.yaml + k8s-controller/config/manager/controller-deployment.yaml # Update API image sed -i 's|your-registry/streamspace-api:v0.2.0|ghcr.io/yourname/streamspace-api:v0.2.0|' \ @@ -225,7 +225,7 @@ The controller watches Session and Template CRDs and manages their lifecycle. **Configuration via Environment Variables:** -Edit `controller/config/manager/controller-deployment.yaml`: +Edit `k8s-controller/config/manager/controller-deployment.yaml`: ```yaml env: @@ -526,7 +526,7 @@ When updating CRDs: kubectl get sessions -n streamspace -o yaml > sessions-backup.yaml # Update CRDs -kubectl apply -f controller/config/crd/bases/ +kubectl apply -f k8s-controller/config/crd/bases/ # Verify no resources were lost kubectl get sessions -n streamspace diff --git a/MIGRATION_SUMMARY.md b/MIGRATION_SUMMARY.md index c0c8e552..71627281 100644 --- a/MIGRATION_SUMMARY.md +++ b/MIGRATION_SUMMARY.md @@ -29,7 +29,7 @@ streamspace/ β”‚ β”œβ”€β”€ config/ # Deployment manifests β”‚ β”œβ”€β”€ templates/ # 22 application templates β”‚ └── monitoring/ # Grafana, Prometheus, Alerts -β”œβ”€β”€ controller/ # Go workspace controller +β”œβ”€β”€ k8s-controller/ # Go Kubernetes controller β”œβ”€β”€ api/ # API backend (to be built) β”œβ”€β”€ ui/ # React frontend (to be built) β”œβ”€β”€ chart/ # Helm chart diff --git a/README.md b/README.md index e79feb99..94cde370 100644 --- 
a/README.md +++ b/README.md @@ -14,6 +14,7 @@ StreamSpace is a Kubernetes-native platform that delivers browser-based access t ### Core Features - 🌐 **Browser-Based Access** - Access any application via web browser using open source VNC +- πŸ–₯️ **Multi-Platform Support** - Deploy on Kubernetes, Docker, or hybrid environments - πŸ‘₯ **Multi-User Support** - Isolated sessions with SSO (Authentik/Keycloak) - πŸ’Ύ **Persistent Home Directories** - User files persist across sessions (NFS) - ⚑ **On-Demand Auto-Hibernation** - Idle workspaces automatically scale to zero @@ -103,6 +104,8 @@ StreamSpace has completed **Phase 5 (Production-Ready)** with all core and enter ## πŸ—οΈ Architecture +StreamSpace uses a **multi-platform event-driven architecture** that supports Kubernetes, Docker, and future platforms through NATS messaging. + ``` β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ Web UI (React) β”‚ @@ -111,30 +114,41 @@ StreamSpace has completed **Phase 5 (Production-Ready)** with all core and enter β”‚ REST API + WebSocket ↓ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” -β”‚ StreamSpace Controller (Go) β”‚ -β”‚ Session Lifecycle β€’ Auto-Hibernation β€’ User Management β”‚ +β”‚ API Backend (Go/Gin) β”‚ +β”‚ Session CRUD β€’ Auth β€’ Plugins β€’ Repository Sync β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ - β”‚ Kubernetes API + β”‚ NATS Events ↓ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” -β”‚ Kubernetes Cluster β”‚ -β”‚ 
β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ -β”‚ β”‚ Session β”‚ β”‚ Session β”‚ β”‚ Session β”‚ β”‚ -β”‚ β”‚ Pod β”‚ β”‚ Pod β”‚ β”‚ Pod β”‚ β”‚ -β”‚ β”‚(VNC) β”‚ β”‚(VNC) β”‚ β”‚(VNC) β”‚ β”‚ -β”‚ β””β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”˜ β”‚ -β”‚ β”‚ β”‚ β”‚ β”‚ -β”‚ /home/user1 /home/user2 /home/user3 β”‚ -β”‚ (NFS PVC) (NFS PVC) (NFS PVC) β”‚ -β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +β”‚ NATS JetStream Message Queue β”‚ +β”‚ Durable Events β€’ Platform Routing β€’ Event Sourcing β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ β”‚ + ↓ ↓ +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ Kubernetes Controller β”‚ β”‚ Docker Controller β”‚ +β”‚ (k8s-controller/) β”‚ β”‚ (docker-controller/) β”‚ +β”‚ Session Lifecycle β”‚ β”‚ Docker Compose β”‚ +β”‚ Auto-Hibernation β”‚ β”‚ Container Lifecycle β”‚ +β”‚ CRD Reconciliation β”‚ β”‚ Volume Management β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ β”‚ + ↓ ↓ +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ Kubernetes Cluster β”‚ β”‚ Docker Host β”‚ +β”‚ Sessions (Pods/CRDs) β”‚ β”‚ Sessions (Containers) β”‚ +β”‚ NFS PVC Storage β”‚ β”‚ Local Volume Storage β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ 
β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ ``` **Key Components**: -- **Controller**: Manages session lifecycle, hibernation, and provisioning -- **API Backend**: REST/WebSocket API for UI and integrations +- **API Backend**: REST/WebSocket API, publishes events to NATS for platform controllers +- **NATS JetStream**: Event-driven messaging for multi-platform coordination +- **Kubernetes Controller**: Manages sessions on Kubernetes clusters via CRDs +- **Docker Controller**: Manages sessions on standalone Docker hosts - **Web UI**: User-facing dashboard and workspace catalog - **Sessions**: Containerized applications with VNC streaming to your browser -- **User Storage**: Persistent NFS volumes mounted across all sessions +- **User Storage**: Persistent volumes (NFS for K8s, local for Docker) ## πŸ“¦ Prerequisites @@ -620,10 +634,10 @@ Access Grafana: `kubectl port-forward -n observability svc/grafana 3000:80` ## πŸ› οΈ Development -### Build Controller +### Build Kubernetes Controller ```bash -cd controller +cd k8s-controller # Initialize Go project go mod init github.com/yourusername/streamspace @@ -640,11 +654,23 @@ kubebuilder create api --group stream --version v1alpha1 --kind Session kubebuilder create api --group stream --version v1alpha1 --kind Template # Build -make docker-build docker-push IMG=yourregistry/streamspace-controller:latest +make docker-build docker-push IMG=yourregistry/streamspace-kubernetes-controller:latest ``` See full guide: [docs/CONTROLLER_GUIDE.md](docs/CONTROLLER_GUIDE.md) +### Build Docker Controller + +```bash +cd docker-controller + +# Build the Docker controller +go build -o streamspace-docker-controller + +# Or use Docker Compose for development +./scripts/docker-dev.sh +``` + ### Build API Backend ```bash @@ -676,10 +702,14 @@ npm run build ## πŸ§ͺ Testing ```bash -# Run controller tests -cd controller +# Run Kubernetes controller tests +cd k8s-controller make test +# Run Docker controller 
tests +cd docker-controller +go test ./... -v + # Run API tests cd api go test ./... -v @@ -691,6 +721,11 @@ npm test # Integration tests cd tests ./run-integration-tests.sh + +# Docker development environment +./scripts/docker-dev.sh # Start NATS + controllers +./scripts/test-nats.sh # Test NATS connectivity +./scripts/docker-dev-stop.sh # Stop development environment ``` ## 🀝 Contributing @@ -747,8 +782,8 @@ Contributions welcome! Please read [CONTRIBUTING.md](CONTRIBUTING.md) first. ### Sessions not starting ```bash -# Check controller logs -kubectl logs -n streamspace deploy/streamspace-controller +# Check Kubernetes controller logs +kubectl logs -n streamspace deploy/streamspace-kubernetes-controller # Check session events kubectl describe session -n streamspace diff --git a/api/cmd/main.go b/api/cmd/main.go index bba634fc..566ada85 100644 --- a/api/cmd/main.go +++ b/api/cmd/main.go @@ -18,6 +18,7 @@ import ( "github.com/streamspace/streamspace/api/internal/auth" "github.com/streamspace/streamspace/api/internal/cache" "github.com/streamspace/streamspace/api/internal/db" + "github.com/streamspace/streamspace/api/internal/events" "github.com/streamspace/streamspace/api/internal/handlers" "github.com/streamspace/streamspace/api/internal/k8s" "github.com/streamspace/streamspace/api/internal/middleware" @@ -92,9 +93,54 @@ func main() { log.Fatalf("Failed to initialize Kubernetes client: %v", err) } + // Initialize NATS event publisher + // This enables event-driven communication with platform controllers + log.Println("Initializing NATS event publisher...") + natsURL := getEnv("NATS_URL", "") + natsUser := getEnv("NATS_USER", "") + natsPassword := getEnv("NATS_PASSWORD", "") + eventPublisher, err := events.NewPublisher(events.Config{ + URL: natsURL, + User: natsUser, + Password: natsPassword, + }) + if err != nil { + log.Printf("Warning: Failed to initialize NATS publisher: %v", err) + log.Println("Event publishing will be disabled - controllers will not receive 
events") + } + defer eventPublisher.Close() + + // Get platform from environment (for multi-platform support) + platform := os.Getenv("PLATFORM") + if platform == "" { + platform = events.PlatformKubernetes // Default platform + } + + // Initialize NATS event subscriber for receiving status updates from controllers + log.Println("Initializing NATS event subscriber...") + eventSubscriber, err := events.NewSubscriber(events.Config{ + URL: natsURL, + User: natsUser, + Password: natsPassword, + }, database.DB()) + if err != nil { + log.Printf("Warning: Failed to initialize NATS subscriber: %v", err) + log.Println("Status feedback from controllers will be disabled") + } + defer eventSubscriber.Close() + + // Start subscriber in background to receive controller status events + subscriberCtx, cancelSubscriber := context.WithCancel(context.Background()) + defer cancelSubscriber() + go func() { + if err := eventSubscriber.Start(subscriberCtx); err != nil { + log.Printf("NATS subscriber error: %v", err) + } + }() + // Initialize connection tracker log.Println("Starting connection tracker...") - connTracker := tracker.NewConnectionTracker(database, k8sClient) + connTracker := tracker.NewConnectionTracker(database, k8sClient, eventPublisher, platform) go connTracker.Start() defer connTracker.Stop() @@ -125,7 +171,7 @@ func main() { // Initialize activity tracker log.Println("Initializing activity tracker...") - activityTracker := activity.NewTracker(k8sClient) + activityTracker := activity.NewTracker(k8sClient, eventPublisher, platform) // Start idle session monitor (check every 1 minute) idleCheckInterval := getEnv("IDLE_CHECK_INTERVAL", "1m") @@ -216,7 +262,22 @@ func main() { Issuer: "streamspace-api", TokenDuration: 24 * time.Hour, } - jwtManager := auth.NewJWTManager(jwtConfig) + // Use session-aware JWT manager for server-side session tracking + // This enables proper logout, session invalidation, and forced re-login on restart + jwtManager := 
auth.NewJWTManagerWithSessions(jwtConfig, redisCache) + + // Clear all sessions on startup to force users to re-login + // This is a security feature that ensures tokens from previous server runs are invalid + if redisCache.IsEnabled() { + log.Println("Clearing existing sessions (forcing re-login)...") + clearCtx, clearCancel := context.WithTimeout(context.Background(), 10*time.Second) + if err := jwtManager.ClearAllSessions(clearCtx); err != nil { + log.Printf("Warning: Failed to clear sessions: %v", err) + } else { + log.Println("Sessions cleared - users will need to re-login") + } + clearCancel() + } // Initialize SAML authentication (optional) var samlAuth *auth.SAMLAuthenticator @@ -235,7 +296,7 @@ func main() { } // Initialize API handlers - apiHandler := api.NewHandler(database, k8sClient, connTracker, syncService, wsManager, quotaEnforcer) + apiHandler := api.NewHandler(database, k8sClient, eventPublisher, connTracker, syncService, wsManager, quotaEnforcer, platform) userHandler := handlers.NewUserHandler(userDB, groupDB) groupHandler := handlers.NewGroupHandler(groupDB, userDB) authHandler := auth.NewAuthHandler(userDB, jwtManager, samlAuth) @@ -256,7 +317,7 @@ func main() { batchHandler := handlers.NewBatchHandler(database) monitoringHandler := handlers.NewMonitoringHandler(database) quotasHandler := handlers.NewQuotasHandler(database) - nodeHandler := handlers.NewNodeHandler(database, k8sClient) + nodeHandler := handlers.NewNodeHandler(database, k8sClient, eventPublisher, platform) // NOTE: WebSocket routes now use wsManager directly (see ws.GET routes below) consoleHandler := handlers.NewConsoleHandler(database) collaborationHandler := handlers.NewCollaborationHandler(database) @@ -266,12 +327,7 @@ func main() { securityHandler := handlers.NewSecurityHandler(database) templateVersioningHandler := handlers.NewTemplateVersioningHandler(database) setupHandler := handlers.NewSetupHandler(database) - // Get namespace from environment (same as api.NewHandler) 
- appNamespace := os.Getenv("NAMESPACE") - if appNamespace == "" { - appNamespace = "streamspace" // Default namespace - } - applicationHandler := handlers.NewApplicationHandler(database, k8sClient, appNamespace) + applicationHandler := handlers.NewApplicationHandler(database, eventPublisher, platform) // NOTE: Billing is now handled by the streamspace-billing plugin // SECURITY: Initialize webhook authentication diff --git a/api/go.mod b/api/go.mod index cf344eda..aeb41b60 100644 --- a/api/go.mod +++ b/api/go.mod @@ -13,6 +13,7 @@ require ( github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674 github.com/lib/pq v1.10.9 github.com/microcosm-cc/bluemonday v1.0.27 + github.com/nats-io/nats.go v1.37.0 github.com/pquerna/otp v1.5.0 github.com/redis/go-redis/v9 v9.16.0 github.com/robfig/cron/v3 v3.0.1 @@ -65,6 +66,9 @@ require ( github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect + github.com/nats-io/nkeys v0.4.7 // indirect + github.com/nats-io/nuid v1.0.1 // indirect + github.com/klauspost/compress v1.17.9 // indirect github.com/pelletier/go-toml/v2 v2.0.8 // indirect github.com/pkg/errors v0.9.1 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect diff --git a/api/internal/activity/tracker.go b/api/internal/activity/tracker.go index 1c3192de..475ee973 100644 --- a/api/internal/activity/tracker.go +++ b/api/internal/activity/tracker.go @@ -41,6 +41,7 @@ import ( "log" "time" + "github.com/streamspace/streamspace/api/internal/events" "github.com/streamspace/streamspace/api/internal/k8s" ) @@ -58,11 +59,15 @@ import ( // // Example: // -// tracker := NewTracker(k8sClient) +// tracker := NewTracker(k8sClient, publisher, "kubernetes") // err := tracker.UpdateSessionActivity(ctx, namespace, sessionName) type Tracker struct { // k8sClient 
interacts with Kubernetes to read and update Sessions. k8sClient *k8s.Client + // publisher publishes NATS events for platform-agnostic operations. + publisher *events.Publisher + // platform identifies the target platform (kubernetes, docker, etc.) + platform string } // NewTracker creates a new activity tracker instance. @@ -71,11 +76,16 @@ type Tracker struct { // // Example: // -// tracker := NewTracker(k8sClient) +// tracker := NewTracker(k8sClient, publisher, "kubernetes") // go tracker.StartIdleMonitor(ctx, "streamspace", 1*time.Minute) -func NewTracker(k8sClient *k8s.Client) *Tracker { +func NewTracker(k8sClient *k8s.Client, publisher *events.Publisher, platform string) *Tracker { + if platform == "" { + platform = events.PlatformKubernetes + } return &Tracker{ k8sClient: k8sClient, + publisher: publisher, + platform: platform, } } @@ -233,6 +243,16 @@ func (t *Tracker) HibernateIdleSession(ctx context.Context, namespace, sessionNa return fmt.Errorf("failed to hibernate session: %w", err) } + // Publish hibernate event for controllers + event := &events.SessionHibernateEvent{ + SessionID: sessionName, + UserID: session.User, + Platform: t.platform, + } + if err := t.publisher.PublishSessionHibernate(ctx, event); err != nil { + log.Printf("Warning: Failed to publish session hibernate event: %v", err) + } + log.Printf("Auto-hibernated idle session: %s/%s (idle for %v)", namespace, sessionName, status.IdleDuration) return nil } diff --git a/api/internal/api/handlers.go b/api/internal/api/handlers.go index 1ba8ad74..37afa3d4 100644 --- a/api/internal/api/handlers.go +++ b/api/internal/api/handlers.go @@ -106,6 +106,7 @@ import ( "github.com/gin-gonic/gin" "github.com/google/uuid" "github.com/streamspace/streamspace/api/internal/db" + "github.com/streamspace/streamspace/api/internal/events" "github.com/streamspace/streamspace/api/internal/k8s" "github.com/streamspace/streamspace/api/internal/quota" "github.com/streamspace/streamspace/api/internal/sync" @@ 
-154,12 +155,15 @@ var ( // Each request gets its own Gin context with isolated state. type Handler struct { db *db.Database // Database for caching and metadata + sessionDB *db.SessionDB // Session database operations k8sClient *k8s.Client // Kubernetes client for CRD operations + publisher *events.Publisher // NATS event publisher connTracker *tracker.ConnectionTracker // Active connection tracking syncService *sync.SyncService // Repository synchronization wsManager *websocket.Manager // WebSocket connection manager quotaEnforcer *quota.Enforcer // Resource quota enforcement namespace string // Kubernetes namespace for resources + platform string // Target platform (kubernetes, docker, etc.) } // NewHandler creates a new API handler with injected dependencies. @@ -168,10 +172,12 @@ type Handler struct { // // - database: PostgreSQL database connection for caching and metadata // - k8sClient: Kubernetes client for Session/Template CRD operations +// - publisher: NATS event publisher for platform-agnostic operations // - connTracker: Connection tracker for active session monitoring // - syncService: Service for syncing external template repositories // - wsManager: Manager for WebSocket connections and real-time updates // - quotaEnforcer: Enforcer for validating resource quotas +// - platform: Target platform (kubernetes, docker, hyperv, vcenter) // // NAMESPACE RESOLUTION: // @@ -180,24 +186,30 @@ type Handler struct { // // EXAMPLE USAGE: // -// handler := NewHandler(db, k8sClient, connTracker, syncService, wsManager, quotaEnforcer) +// handler := NewHandler(db, k8sClient, publisher, connTracker, syncService, wsManager, quotaEnforcer, "kubernetes") // router := gin.Default() // router.GET("/api/sessions", handler.ListSessions) // router.POST("/api/sessions", handler.CreateSession) -func NewHandler(database *db.Database, k8sClient *k8s.Client, connTracker *tracker.ConnectionTracker, syncService *sync.SyncService, wsManager *websocket.Manager, quotaEnforcer 
*quota.Enforcer) *Handler { +func NewHandler(database *db.Database, k8sClient *k8s.Client, publisher *events.Publisher, connTracker *tracker.ConnectionTracker, syncService *sync.SyncService, wsManager *websocket.Manager, quotaEnforcer *quota.Enforcer, platform string) *Handler { // Read namespace from environment variable for deployment flexibility namespace := os.Getenv("NAMESPACE") if namespace == "" { namespace = "streamspace" // Default namespace } + if platform == "" { + platform = events.PlatformKubernetes // Default platform + } return &Handler{ db: database, + sessionDB: db.NewSessionDB(database.DB()), k8sClient: k8sClient, + publisher: publisher, connTracker: connTracker, syncService: syncService, wsManager: wsManager, quotaEnforcer: quotaEnforcer, namespace: namespace, + platform: platform, } } @@ -252,26 +264,43 @@ func (h *Handler) ListSessions(c *gin.Context) { ctx := c.Request.Context() userID := c.Query("user") - var sessions []*k8s.Session + // Use database as source of truth for multi-platform support + var dbSessions []*db.Session var err error if userID != "" { - sessions, err = h.k8sClient.ListSessionsByUser(ctx, h.namespace, userID) + dbSessions, err = h.sessionDB.ListSessionsByUser(ctx, userID) } else { - sessions, err = h.k8sClient.ListSessions(ctx, h.namespace) + dbSessions, err = h.sessionDB.ListSessions(ctx) } if err != nil { - c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) + // Fall back to Kubernetes for backward compatibility + log.Printf("Database session query failed, falling back to k8s: %v", err) + var k8sSessions []*k8s.Session + if userID != "" { + k8sSessions, err = h.k8sClient.ListSessionsByUser(ctx, h.namespace, userID) + } else { + k8sSessions, err = h.k8sClient.ListSessions(ctx, h.namespace) + } + if err != nil { + c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) + return + } + enriched := h.enrichSessionsWithDBInfo(ctx, k8sSessions) + c.JSON(http.StatusOK, gin.H{ + "sessions": 
enriched, + "total": len(enriched), + }) return } - // Enrich with database info (active connections) - enriched := h.enrichSessionsWithDBInfo(ctx, sessions) + // Convert database sessions to API response format + sessions := h.convertDBSessionsToResponse(dbSessions) c.JSON(http.StatusOK, gin.H{ - "sessions": enriched, - "total": len(enriched), + "sessions": sessions, + "total": len(sessions), }) } @@ -281,16 +310,24 @@ func (h *Handler) GetSession(c *gin.Context) { ctx := c.Request.Context() sessionID := c.Param("id") - session, err := h.k8sClient.GetSession(ctx, h.namespace, sessionID) + // Use database as source of truth for multi-platform support + dbSession, err := h.sessionDB.GetSession(ctx, sessionID) if err != nil { - c.JSON(http.StatusNotFound, gin.H{"error": "Session not found"}) + // Fall back to Kubernetes for backward compatibility + log.Printf("Database session query failed, falling back to k8s: %v", err) + k8sSession, k8sErr := h.k8sClient.GetSession(ctx, h.namespace, sessionID) + if k8sErr != nil { + c.JSON(http.StatusNotFound, gin.H{"error": "Session not found"}) + return + } + enriched := h.enrichSessionWithDBInfo(ctx, k8sSession) + c.JSON(http.StatusOK, enriched) return } - // Enrich with database info - enriched := h.enrichSessionWithDBInfo(ctx, session) - - c.JSON(http.StatusOK, enriched) + // Convert to API response format + session := h.convertDBSessionToResponse(dbSession) + c.JSON(http.StatusOK, session) } // CreateSession creates a new container session for a user. 
@@ -472,6 +509,21 @@ func (h *Handler) CreateSession(c *gin.Context) { log.Printf("Failed to cache session in database: %v", err) } + // Publish session create event for controllers + // This enables platform-agnostic session management + createEvent := &events.SessionCreateEvent{ + SessionID: sessionName, + UserID: req.User, + TemplateID: req.Template, + Platform: h.platform, + Resources: events.ResourceSpec{Memory: memory, CPU: cpu}, + PersistentHome: session.PersistentHome, + IdleTimeout: session.IdleTimeout, + } + if err := h.publisher.PublishSessionCreate(ctx, createEvent); err != nil { + log.Printf("Warning: Failed to publish session create event: %v", err) + } + c.JSON(http.StatusCreated, created) } @@ -508,6 +560,28 @@ func (h *Handler) UpdateSession(c *gin.Context) { log.Printf("Failed to update session in database: %v", err) } + // Publish state change event for controllers + switch req.State { + case "hibernated": + event := &events.SessionHibernateEvent{ + SessionID: sessionID, + UserID: updated.User, + Platform: h.platform, + } + if err := h.publisher.PublishSessionHibernate(ctx, event); err != nil { + log.Printf("Warning: Failed to publish session hibernate event: %v", err) + } + case "running": + event := &events.SessionWakeEvent{ + SessionID: sessionID, + UserID: updated.User, + Platform: h.platform, + } + if err := h.publisher.PublishSessionWake(ctx, event); err != nil { + log.Printf("Warning: Failed to publish session wake event: %v", err) + } + } + c.JSON(http.StatusOK, updated) } @@ -517,8 +591,8 @@ func (h *Handler) DeleteSession(c *gin.Context) { ctx := c.Request.Context() sessionID := c.Param("id") - // Verify session exists before deletion - _, err := h.k8sClient.GetSession(ctx, h.namespace, sessionID) + // Verify session exists before deletion and get user info for event + session, err := h.k8sClient.GetSession(ctx, h.namespace, sessionID) if err != nil { c.JSON(http.StatusNotFound, gin.H{"error": "Session not found"}) return @@ -535,6 
+609,16 @@ func (h *Handler) DeleteSession(c *gin.Context) { log.Printf("Failed to delete session from database: %v", err) } + // Publish session delete event for controllers + deleteEvent := &events.SessionDeleteEvent{ + SessionID: sessionID, + UserID: session.User, + Platform: h.platform, + } + if err := h.publisher.PublishSessionDelete(ctx, deleteEvent); err != nil { + log.Printf("Warning: Failed to publish session delete event: %v", err) + } + c.JSON(http.StatusOK, gin.H{"message": "Session deleted"}) } @@ -909,6 +993,18 @@ func (h *Handler) CreateTemplate(c *gin.Context) { return } + // Publish template create event for controllers + createEvent := &events.TemplateCreateEvent{ + TemplateID: created.Name, + DisplayName: created.DisplayName, + Category: created.Category, + BaseImage: created.BaseImage, + Platform: h.platform, + } + if err := h.publisher.PublishTemplateCreate(ctx, createEvent); err != nil { + log.Printf("Warning: Failed to publish template create event: %v", err) + } + c.JSON(http.StatusCreated, created) } @@ -923,6 +1019,15 @@ func (h *Handler) DeleteTemplate(c *gin.Context) { return } + // Publish template delete event for controllers + deleteEvent := &events.TemplateDeleteEvent{ + TemplateName: templateID, + Platform: h.platform, + } + if err := h.publisher.PublishTemplateDelete(ctx, deleteEvent); err != nil { + log.Printf("Warning: Failed to publish template delete event: %v", err) + } + c.JSON(http.StatusOK, gin.H{"message": "Template deleted"}) } @@ -1636,6 +1741,50 @@ func (h *Handler) enrichSessionWithDBInfo(ctx context.Context, session *k8s.Sess return result } +// convertDBSessionsToResponse converts database sessions to API response format. 
+func (h *Handler) convertDBSessionsToResponse(sessions []*db.Session) []map[string]interface{} { + result := make([]map[string]interface{}, 0, len(sessions)) + for _, session := range sessions { + result = append(result, h.convertDBSessionToResponse(session)) + } + return result +} + +// convertDBSessionToResponse converts a database session to API response format. +func (h *Handler) convertDBSessionToResponse(session *db.Session) map[string]interface{} { + result := map[string]interface{}{ + "name": session.ID, + "namespace": session.Namespace, + "user": session.UserID, + "template": session.TemplateName, + "state": session.State, + "persistentHome": session.PersistentHome, + "idleTimeout": session.IdleTimeout, + "maxSessionDuration": session.MaxSessionDuration, + "createdAt": session.CreatedAt, + "platform": session.Platform, + "activeConnections": session.ActiveConnections, + "status": map[string]interface{}{ + "phase": session.State, + "url": session.URL, + "podName": session.PodName, + }, + } + + if session.Memory != "" || session.CPU != "" { + result["resources"] = map[string]string{ + "memory": session.Memory, + "cpu": session.CPU, + } + } + + if session.LastActivity != nil { + result["status"].(map[string]interface{})["lastActivity"] = session.LastActivity + } + + return result +} + // cacheSessionInDB caches a session in the PostgreSQL database. 
// // DATABASE TRANSACTION BOUNDARY: @@ -1669,14 +1818,26 @@ func (h *Handler) enrichSessionWithDBInfo(ctx context.Context, session *k8s.Sess // log.Printf("Cache update failed (non-fatal): %v", err) // } func (h *Handler) cacheSessionInDB(ctx context.Context, session *k8s.Session) error { - _, err := h.db.DB().ExecContext(ctx, ` - INSERT INTO sessions (id, user_id, template_name, state, app_type, namespace, url, created_at, updated_at) - VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9) - ON CONFLICT (id) DO UPDATE - SET user_id = $2, template_name = $3, state = $4, updated_at = $9 - `, session.Name, session.User, session.Template, session.State, "desktop", session.Namespace, session.Status.URL, session.CreatedAt, time.Now()) - - return err + dbSession := &db.Session{ + ID: session.Name, + UserID: session.User, + TemplateName: session.Template, + State: session.State, + AppType: "desktop", + Namespace: session.Namespace, + Platform: h.platform, + URL: session.Status.URL, + PodName: session.Status.PodName, + Memory: session.Resources.Memory, + CPU: session.Resources.CPU, + PersistentHome: session.PersistentHome, + IdleTimeout: session.IdleTimeout, + MaxSessionDuration: session.MaxSessionDuration, + CreatedAt: session.CreatedAt, + LastActivity: session.Status.LastActivity, + } + + return h.sessionDB.CreateSession(ctx, dbSession) } // updateSessionInDB updates a cached session in the database. 
diff --git a/api/internal/auth/handlers.go b/api/internal/auth/handlers.go index 82024a54..a60ccc46 100644 --- a/api/internal/auth/handlers.go +++ b/api/internal/auth/handlers.go @@ -190,8 +190,12 @@ func (h *AuthHandler) Login(c *gin.Context) { groupIDs = []string{} // Continue without groups if error } - // Generate JWT token - token, err := h.jwtManager.GenerateToken(user.ID, user.Username, user.Email, user.Role, groupIDs) + // Capture client info for session tracking + ipAddress := c.ClientIP() + userAgent := c.Request.UserAgent() + + // Generate JWT token with session tracking + token, err := h.jwtManager.GenerateTokenWithContext(ctx, user.ID, user.Username, user.Email, user.Role, groupIDs, ipAddress, userAgent) if err != nil { c.JSON(http.StatusInternalServerError, gin.H{ "error": "Failed to generate token", @@ -268,10 +272,21 @@ func (h *AuthHandler) RefreshToken(c *gin.Context) { }) } -// Logout handles logout (client-side token invalidation) +// Logout handles logout and invalidates the session in Redis func (h *AuthHandler) Logout(c *gin.Context) { - // With JWT, logout is primarily client-side (remove token) - // Could implement token blacklist here if needed + // Get session ID from context (set by auth middleware) + sessionID, exists := c.Get("sessionID") + if exists && sessionID != nil { + if sid, ok := sessionID.(string); ok && sid != "" { + // Invalidate session in Redis + ctx := c.Request.Context() + if err := h.jwtManager.InvalidateSession(ctx, sid); err != nil { + // Log error but don't fail logout + log.Printf("Warning: Failed to invalidate session %s: %v", sid, err) + } + } + } + c.JSON(http.StatusOK, gin.H{ "message": "Logged out successfully", }) @@ -417,8 +432,12 @@ func (h *AuthHandler) SAMLCallback(c *gin.Context) { groupIDs = []string{} // Continue without groups if error } - // Generate JWT token - token, err := h.jwtManager.GenerateToken(user.ID, user.Username, user.Email, user.Role, groupIDs) + // Capture client info for session 
tracking + ipAddress := c.ClientIP() + userAgent := c.Request.UserAgent() + + // Generate JWT token with session tracking + token, err := h.jwtManager.GenerateTokenWithContext(ctx, user.ID, user.Username, user.Email, user.Role, groupIDs, ipAddress, userAgent) if err != nil { c.JSON(http.StatusInternalServerError, gin.H{ "error": "Failed to generate token", diff --git a/api/internal/auth/jwt.go b/api/internal/auth/jwt.go index 447df939..46f2eaec 100644 --- a/api/internal/auth/jwt.go +++ b/api/internal/auth/jwt.go @@ -99,11 +99,13 @@ package auth import ( + "context" "errors" "fmt" "time" "github.com/golang-jwt/jwt/v5" + "github.com/streamspace/streamspace/api/internal/cache" ) // JWTConfig holds JWT configuration. @@ -188,7 +190,8 @@ type Claims struct { // JWTManager handles JWT token operations type JWTManager struct { - config *JWTConfig + config *JWTConfig + sessionStore *SessionStore } // NewJWTManager creates a new JWT manager @@ -204,6 +207,23 @@ func NewJWTManager(config *JWTConfig) *JWTManager { } } +// SetSessionStore sets the session store for server-side session tracking +func (m *JWTManager) SetSessionStore(store *SessionStore) { + m.sessionStore = store +} + +// NewJWTManagerWithSessions creates a new JWT manager with session tracking +func NewJWTManagerWithSessions(config *JWTConfig, cacheClient *cache.Cache) *JWTManager { + manager := NewJWTManager(config) + manager.sessionStore = NewSessionStore(cacheClient) + return manager +} + +// GetSessionStore returns the session store +func (m *JWTManager) GetSessionStore() *SessionStore { + return m.sessionStore +} + // GenerateToken generates a new JWT token for a user. // // This function creates a cryptographically signed JWT token containing user @@ -285,8 +305,21 @@ func NewJWTManager(config *JWTConfig) *JWTManager { // NOTE: The generated token contains sensitive information (user identity, role). // Always transmit tokens over HTTPS to prevent interception. 
func (m *JWTManager) GenerateToken(userID, username, email, role string, groups []string) (string, error) { + // Use background context for backward compatibility + return m.GenerateTokenWithContext(context.Background(), userID, username, email, role, groups, "", "") +} + +// GenerateTokenWithContext generates a new JWT token with session tracking +func (m *JWTManager) GenerateTokenWithContext(ctx context.Context, userID, username, email, role string, groups []string, ipAddress, userAgent string) (string, error) { // Get current time for timestamp claims now := time.Now() + expiresAt := now.Add(m.config.TokenDuration) + + // Generate unique session ID for server-side tracking + sessionID, err := GenerateSessionID() + if err != nil { + return "", fmt.Errorf("failed to generate session ID: %w", err) + } // STEP 1: Build Claims structure // This includes both custom claims (user info) and standard JWT claims @@ -300,6 +333,10 @@ func (m *JWTManager) GenerateToken(userID, username, email, role string, groups // Standard JWT claims - defined by RFC 7519 RegisteredClaims: jwt.RegisteredClaims{ + // ID (jti): Unique identifier for this token (session ID) + // Used for server-side session tracking and revocation + ID: sessionID, + // Issuer (iss): Identifies who created the token // Used to prevent tokens from other systems being accepted Issuer: m.config.Issuer, @@ -315,7 +352,7 @@ func (m *JWTManager) GenerateToken(userID, username, email, role string, groups // Expires At (exp): When the token expires // SECURITY: Limits exposure window for stolen tokens // Default: 24 hours from now - ExpiresAt: jwt.NewNumericDate(now.Add(m.config.TokenDuration)), + ExpiresAt: jwt.NewNumericDate(expiresAt), // Not Before (nbf): Token cannot be used before this time // Prevents premature token usage (e.g., for scheduled access) @@ -338,10 +375,63 @@ func (m *JWTManager) GenerateToken(userID, username, email, role string, groups return "", fmt.Errorf("failed to sign token: %w", err) } + 
// STEP 4: Store session in Redis for server-side tracking + if m.sessionStore != nil && m.sessionStore.IsEnabled() { + session := &SessionData{ + SessionID: sessionID, + UserID: userID, + Username: username, + Role: role, + CreatedAt: now, + ExpiresAt: expiresAt, + IPAddress: ipAddress, + UserAgent: userAgent, + } + + if err := m.sessionStore.CreateSession(ctx, session, m.config.TokenDuration); err != nil { + // Log the error but don't fail token generation + // This allows graceful degradation if Redis is temporarily unavailable + fmt.Printf("Warning: Failed to store session in Redis: %v\n", err) + } + } + // Return the complete token: "header.payload.signature" return tokenString, nil } +// InvalidateSession invalidates a session by its ID (logout) +func (m *JWTManager) InvalidateSession(ctx context.Context, sessionID string) error { + if m.sessionStore == nil { + return nil + } + return m.sessionStore.DeleteSession(ctx, sessionID) +} + +// InvalidateUserSessions invalidates all sessions for a user +func (m *JWTManager) InvalidateUserSessions(ctx context.Context, userID string) error { + if m.sessionStore == nil { + return nil + } + return m.sessionStore.DeleteUserSessions(ctx, userID) +} + +// ValidateSession checks if a session is valid (exists in Redis) +func (m *JWTManager) ValidateSession(ctx context.Context, sessionID string) (bool, error) { + if m.sessionStore == nil { + // No session store = all sessions valid (backward compatibility) + return true, nil + } + return m.sessionStore.ValidateSession(ctx, sessionID) +} + +// ClearAllSessions clears all sessions (force re-login on restart) +func (m *JWTManager) ClearAllSessions(ctx context.Context) error { + if m.sessionStore == nil { + return nil + } + return m.sessionStore.ClearAllSessions(ctx) +} + // ValidateToken validates a JWT token and returns the claims. 
// // This function performs comprehensive validation of a JWT token, including: diff --git a/api/internal/auth/middleware.go b/api/internal/auth/middleware.go index 639cba17..c1846d1b 100644 --- a/api/internal/auth/middleware.go +++ b/api/internal/auth/middleware.go @@ -216,6 +216,23 @@ func Middleware(jwtManager *JWTManager, userDB *db.UserDB) gin.HandlerFunc { return } + // Validate session exists in Redis (server-side session tracking) + // This ensures tokens can be invalidated on logout or server restart + if claims.ID != "" { + valid, err := jwtManager.ValidateSession(c.Request.Context(), claims.ID) + if err != nil || !valid { + if isWebSocket { + c.AbortWithStatus(http.StatusUnauthorized) + return + } + c.JSON(http.StatusUnauthorized, gin.H{ + "error": "Session expired or invalidated", + }) + c.Abort() + return + } + } + // Verify user still exists and is active user, err := userDB.GetUser(c.Request.Context(), claims.UserID) if err != nil { @@ -249,6 +266,7 @@ func Middleware(jwtManager *JWTManager, userDB *db.UserDB) gin.HandlerFunc { c.Set("userRole", claims.Role) c.Set("userGroups", claims.Groups) c.Set("claims", claims) + c.Set("sessionID", claims.ID) // For logout/session management c.Next() } @@ -277,6 +295,16 @@ func OptionalAuth(jwtManager *JWTManager, userDB *db.UserDB) gin.HandlerFunc { return } + // Validate session exists in Redis + if claims.ID != "" { + valid, err := jwtManager.ValidateSession(c.Request.Context(), claims.ID) + if err != nil || !valid { + // Session invalid, continue without user context + c.Next() + return + } + } + // Set user info if valid user, err := userDB.GetUser(c.Request.Context(), claims.UserID) if err == nil && user.Active { @@ -285,6 +313,7 @@ func OptionalAuth(jwtManager *JWTManager, userDB *db.UserDB) gin.HandlerFunc { c.Set("userEmail", claims.Email) c.Set("userRole", claims.Role) c.Set("userGroups", claims.Groups) + c.Set("sessionID", claims.ID) } c.Next() diff --git a/api/internal/auth/session_store.go 
b/api/internal/auth/session_store.go new file mode 100644 index 00000000..63933db9 --- /dev/null +++ b/api/internal/auth/session_store.go @@ -0,0 +1,191 @@ +// Package auth provides authentication and authorization mechanisms for StreamSpace. +// This file implements server-side session tracking using Redis. +// +// SESSION TRACKING: +// +// StreamSpace uses server-side session tracking to provide: +// - Session invalidation on logout +// - Force re-login on application restart +// - Ability to revoke all sessions for a user +// - Session audit trail +// +// HOW IT WORKS: +// +// 1. Token Generation: +// - Each JWT gets a unique session ID (jti claim) +// - Session metadata stored in Redis: session:{jti} +// - TTL matches token expiration +// +// 2. Token Validation: +// - Middleware checks if session exists in Redis +// - Missing session = invalid token (expired, revoked, or from before restart) +// - Valid session = allow request +// +// 3. Logout: +// - Delete session from Redis +// - Token immediately becomes invalid +// +// 4. 
Application Restart: +// - Redis pattern delete clears all sessions +// - All users must re-login +// +// SECURITY BENEFITS: +// +// - True logout: Sessions can be immediately invalidated +// - Compromise response: Revoke all user sessions on suspected breach +// - Multi-device management: Users can see and revoke active sessions +// - Forced re-authentication: Restart clears all sessions +package auth + +import ( + "context" + "crypto/rand" + "encoding/hex" + "fmt" + "time" + + "github.com/streamspace/streamspace/api/internal/cache" +) + +// SessionStore manages server-side session tracking in Redis +type SessionStore struct { + cache *cache.Cache +} + +// SessionData represents a stored session +type SessionData struct { + SessionID string `json:"session_id"` + UserID string `json:"user_id"` + Username string `json:"username"` + Role string `json:"role"` + CreatedAt time.Time `json:"created_at"` + ExpiresAt time.Time `json:"expires_at"` + IPAddress string `json:"ip_address,omitempty"` + UserAgent string `json:"user_agent,omitempty"` +} + +// NewSessionStore creates a new session store +func NewSessionStore(cache *cache.Cache) *SessionStore { + return &SessionStore{ + cache: cache, + } +} + +// GenerateSessionID creates a cryptographically random session ID +func GenerateSessionID() (string, error) { + bytes := make([]byte, 32) + if _, err := rand.Read(bytes); err != nil { + return "", fmt.Errorf("failed to generate session ID: %w", err) + } + return hex.EncodeToString(bytes), nil +} + +// CreateSession stores a new session in Redis +func (s *SessionStore) CreateSession(ctx context.Context, session *SessionData, ttl time.Duration) error { + if !s.cache.IsEnabled() { + // If Redis is disabled, sessions won't be tracked + // This is acceptable for development but not recommended for production + return nil + } + + key := s.sessionKey(session.SessionID) + return s.cache.Set(ctx, key, session, ttl) +} + +// GetSession retrieves a session from Redis +func (s 
*SessionStore) GetSession(ctx context.Context, sessionID string) (*SessionData, error) { + if !s.cache.IsEnabled() { + // If Redis is disabled, assume all sessions are valid + return nil, nil + } + + var session SessionData + key := s.sessionKey(sessionID) + err := s.cache.Get(ctx, key, &session) + if err != nil { + return nil, err + } + return &session, nil +} + +// ValidateSession checks if a session exists and is valid +func (s *SessionStore) ValidateSession(ctx context.Context, sessionID string) (bool, error) { + if !s.cache.IsEnabled() { + // If Redis is disabled, assume all sessions are valid + return true, nil + } + + key := s.sessionKey(sessionID) + return s.cache.Exists(ctx, key) +} + +// DeleteSession removes a session from Redis (logout) +func (s *SessionStore) DeleteSession(ctx context.Context, sessionID string) error { + if !s.cache.IsEnabled() { + return nil + } + + key := s.sessionKey(sessionID) + return s.cache.Delete(ctx, key) +} + +// DeleteUserSessions removes all sessions for a specific user +func (s *SessionStore) DeleteUserSessions(ctx context.Context, userID string) error { + if !s.cache.IsEnabled() { + return nil + } + + // Delete all sessions matching user pattern + // Note: This requires listing sessions and checking userID + // For simplicity, we'll use a user-indexed key pattern + pattern := fmt.Sprintf("session:user:%s:*", userID) + return s.cache.DeletePattern(ctx, pattern) +} + +// ClearAllSessions removes all sessions from Redis (force all users to re-login) +func (s *SessionStore) ClearAllSessions(ctx context.Context) error { + if !s.cache.IsEnabled() { + return nil + } + + // Delete all session keys + pattern := "session:*" + return s.cache.DeletePattern(ctx, pattern) +} + +// RefreshSession extends the TTL of an existing session +func (s *SessionStore) RefreshSession(ctx context.Context, sessionID string, newExpiresAt time.Time) error { + if !s.cache.IsEnabled() { + return nil + } + + // Get existing session + session, err := 
s.GetSession(ctx, sessionID) + if err != nil { + return err + } + + // Update expiration + session.ExpiresAt = newExpiresAt + + // Calculate new TTL + ttl := time.Until(newExpiresAt) + if ttl <= 0 { + // Session has expired, delete it + return s.DeleteSession(ctx, sessionID) + } + + // Re-store with new TTL + key := s.sessionKey(sessionID) + return s.cache.Set(ctx, key, session, ttl) +} + +// sessionKey generates the Redis key for a session +func (s *SessionStore) sessionKey(sessionID string) string { + return fmt.Sprintf("session:%s", sessionID) +} + +// IsEnabled returns whether session tracking is enabled +func (s *SessionStore) IsEnabled() bool { + return s.cache != nil && s.cache.IsEnabled() +} diff --git a/api/internal/cache/middleware.go b/api/internal/cache/middleware.go index 5b42783b..b70b52ac 100644 --- a/api/internal/cache/middleware.go +++ b/api/internal/cache/middleware.go @@ -96,8 +96,15 @@ func CacheMiddleware(cache *Cache, ttl time.Duration) gin.HandlerFunc { return } - // Generate cache key from request path and query params - cacheKey := generateCacheKey(c.Request.URL.RequestURI()) + // Generate cache key from request path, query params, and userID for user-specific endpoints + // This ensures each user gets their own cached response for endpoints like /applications/user + userID := "" + if uid, exists := c.Get("userID"); exists { + if id, ok := uid.(string); ok { + userID = id + } + } + cacheKey := generateCacheKey(c.Request.URL.RequestURI(), userID) // Try to get cached response var cachedResp CachedResponse @@ -123,10 +130,19 @@ func CacheMiddleware(cache *Cache, ttl time.Duration) gin.HandlerFunc { // Only cache successful responses if c.Writer.Status() >= 200 && c.Writer.Status() < 300 { - // Capture headers + // Capture headers, excluding security-sensitive ones that shouldn't be cached headers := make(map[string]string) + excludeHeaders := map[string]bool{ + "X-Csrf-Token": true, // CSRF tokens must be fresh per-request + "X-CSRF-Token": 
true, // CSRF tokens (alternate case) + "Set-Cookie": true, // Cookies are user-specific + "Authorization": true, // Auth headers shouldn't be cached + "X-Request-Id": true, // Request IDs are unique per request + } for key := range c.Writer.Header() { - headers[key] = c.Writer.Header().Get(key) + if !excludeHeaders[key] { + headers[key] = c.Writer.Header().Get(key) + } } // Store in cache @@ -146,9 +162,16 @@ func CacheMiddleware(cache *Cache, ttl time.Duration) gin.HandlerFunc { } } -// generateCacheKey creates a consistent cache key from the request URI -func generateCacheKey(uri string) string { - hash := sha256.Sum256([]byte(uri)) +// generateCacheKey creates a consistent cache key from the request URI and optional userID +// Including userID ensures user-specific responses are cached separately +func generateCacheKey(uri string, userID string) string { + // Combine URI and userID for the hash + // This ensures each user gets their own cache entry for user-specific endpoints + keyInput := uri + if userID != "" { + keyInput = fmt.Sprintf("%s:user:%s", uri, userID) + } + hash := sha256.Sum256([]byte(keyInput)) return fmt.Sprintf("response:%s", hex.EncodeToString(hash[:])) } @@ -174,11 +197,15 @@ func CacheControl(maxAge time.Duration) gin.HandlerFunc { return func(c *gin.Context) { path := c.Request.URL.Path - // Never cache authentication/authorization endpoints + // Never cache authentication/authorization or user-specific endpoints noCachePaths := []string{ - "/api/v1/auth/", // All auth endpoints (login, logout, setup, etc.) - "/api/v1/users/me", // Current user info - "/api/v1/sessions/", // Session state (dynamic) + "/api/v1/auth/", // All auth endpoints (login, logout, setup, etc.) 
+ "/api/v1/users/me", // Current user info + "/api/v1/sessions/", // Session state (dynamic) + "/api/v1/applications/user", // User-specific installed applications + "/api/v1/dashboard/me", // User dashboard + "/api/v1/notifications", // User notifications + "/api/v1/preferences", // User preferences } shouldCache := true diff --git a/api/internal/db/applications.go b/api/internal/db/applications.go index ff2a26fd..d0edb6ae 100644 --- a/api/internal/db/applications.go +++ b/api/internal/db/applications.go @@ -133,7 +133,7 @@ func (a *ApplicationDB) InstallApplication(ctx context.Context, req *models.Inst _, err = a.db.ExecContext(ctx, query, app.ID, app.CatalogTemplateID, app.Name, app.DisplayName, app.FolderPath, - app.Enabled, configJSON, app.CreatedBy, app.CreatedAt, app.UpdatedAt, + app.Enabled, string(configJSON), app.CreatedBy, app.CreatedAt, app.UpdatedAt, ) if err != nil { return nil, fmt.Errorf("failed to install application: %w", err) @@ -157,7 +157,8 @@ func (a *ApplicationDB) InstallApplication(ctx context.Context, req *models.Inst // GetApplication retrieves an installed application by ID func (a *ApplicationDB) GetApplication(ctx context.Context, appID string) (*models.InstalledApplication, error) { app := &models.InstalledApplication{} - var configJSON []byte + var configJSON sql.NullString + var catalogTemplateID sql.NullInt64 query := ` SELECT @@ -173,21 +174,26 @@ func (a *ApplicationDB) GetApplication(ctx context.Context, appID string) (*mode ` err := a.db.QueryRowContext(ctx, query, appID).Scan( - &app.ID, &app.CatalogTemplateID, &app.Name, &app.DisplayName, &app.FolderPath, + &app.ID, &catalogTemplateID, &app.Name, &app.DisplayName, &app.FolderPath, &app.Enabled, &configJSON, &app.CreatedBy, &app.CreatedAt, &app.UpdatedAt, &app.TemplateName, &app.TemplateDisplayName, &app.Description, &app.Category, &app.AppType, &app.IconURL, &app.Manifest, ) if err != nil { if err == sql.ErrNoRows { - return nil, fmt.Errorf("application not found") + return 
nil, fmt.Errorf("application not found: %s", appID) } - return nil, err + return nil, fmt.Errorf("database error scanning application %s: %w", appID, err) + } + + // Handle NULL catalog_template_id + if catalogTemplateID.Valid { + app.CatalogTemplateID = int(catalogTemplateID.Int64) } - // Unmarshal configuration - if len(configJSON) > 0 { - json.Unmarshal(configJSON, &app.Configuration) + // Unmarshal configuration from JSONB string + if configJSON.Valid && len(configJSON.String) > 0 { + json.Unmarshal([]byte(configJSON.String), &app.Configuration) } return app, nil @@ -297,7 +303,7 @@ func (a *ApplicationDB) UpdateApplication(ctx context.Context, appID string, req return fmt.Errorf("failed to marshal configuration: %w", err) } updates = append(updates, fmt.Sprintf("configuration = $%d", argIdx)) - args = append(args, configJSON) + args = append(args, string(configJSON)) // Convert to string for JSONB argIdx++ } diff --git a/api/internal/db/database.go b/api/internal/db/database.go index 29b7b031..d460bb48 100644 --- a/api/internal/db/database.go +++ b/api/internal/db/database.go @@ -2009,6 +2009,76 @@ func (d *Database) Migrate() error { `CREATE INDEX IF NOT EXISTS idx_compliance_violations_severity ON compliance_violations(severity)`, `CREATE INDEX IF NOT EXISTS idx_compliance_reports_framework_id ON compliance_reports(framework_id)`, `CREATE INDEX IF NOT EXISTS idx_compliance_reports_generated_at ON compliance_reports(generated_at DESC)`, + + // ========== NATS Event-Driven Architecture ========== + + // Platform controllers (registered platform controllers - K8s, Docker, Hyper-V, etc.) 
+ `CREATE TABLE IF NOT EXISTS platform_controllers ( + id VARCHAR(255) PRIMARY KEY, + controller_id VARCHAR(255) UNIQUE NOT NULL, + platform VARCHAR(50) NOT NULL, + display_name VARCHAR(255), + status VARCHAR(50) DEFAULT 'unknown', + version VARCHAR(50), + capabilities JSONB DEFAULT '[]', + cluster_info JSONB DEFAULT '{}', + last_heartbeat TIMESTAMP, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP + )`, + + // Create indexes for platform controllers + `CREATE INDEX IF NOT EXISTS idx_platform_controllers_platform ON platform_controllers(platform)`, + `CREATE INDEX IF NOT EXISTS idx_platform_controllers_status ON platform_controllers(status)`, + `CREATE INDEX IF NOT EXISTS idx_platform_controllers_heartbeat ON platform_controllers(last_heartbeat)`, + + // Event log (audit log of all NATS events for debugging and replay) + `CREATE TABLE IF NOT EXISTS event_log ( + id BIGSERIAL PRIMARY KEY, + event_id VARCHAR(255) NOT NULL, + subject VARCHAR(255) NOT NULL, + payload JSONB NOT NULL, + published_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + processed_at TIMESTAMP, + processed_by VARCHAR(255), + status VARCHAR(50) DEFAULT 'published', + error_message TEXT + )`, + + // Create indexes for event log + `CREATE INDEX IF NOT EXISTS idx_event_log_event_id ON event_log(event_id)`, + `CREATE INDEX IF NOT EXISTS idx_event_log_subject ON event_log(subject)`, + `CREATE INDEX IF NOT EXISTS idx_event_log_status ON event_log(status)`, + `CREATE INDEX IF NOT EXISTS idx_event_log_published_at ON event_log(published_at)`, + + // Add platform fields to installed_applications for async installation tracking + `ALTER TABLE installed_applications ADD COLUMN IF NOT EXISTS install_status VARCHAR(50) DEFAULT 'pending'`, + `ALTER TABLE installed_applications ADD COLUMN IF NOT EXISTS install_message TEXT`, + `ALTER TABLE installed_applications ADD COLUMN IF NOT EXISTS platform VARCHAR(50) DEFAULT 'kubernetes'`, + + // Create index for install status + 
`CREATE INDEX IF NOT EXISTS idx_installed_applications_status ON installed_applications(install_status)`, + `CREATE INDEX IF NOT EXISTS idx_installed_applications_platform ON installed_applications(platform)`, + + // Add platform fields to sessions for multi-platform support + `ALTER TABLE sessions ADD COLUMN IF NOT EXISTS platform VARCHAR(50) DEFAULT 'kubernetes'`, + `ALTER TABLE sessions ADD COLUMN IF NOT EXISTS controller_id VARCHAR(255)`, + + // Create indexes for session platform tracking + `CREATE INDEX IF NOT EXISTS idx_sessions_platform ON sessions(platform)`, + `CREATE INDEX IF NOT EXISTS idx_sessions_controller_id ON sessions(controller_id)`, + + // Add additional session fields for multi-platform support + `ALTER TABLE sessions ADD COLUMN IF NOT EXISTS pod_name VARCHAR(255)`, + `ALTER TABLE sessions ADD COLUMN IF NOT EXISTS memory VARCHAR(50)`, + `ALTER TABLE sessions ADD COLUMN IF NOT EXISTS cpu VARCHAR(50)`, + `ALTER TABLE sessions ADD COLUMN IF NOT EXISTS persistent_home BOOLEAN DEFAULT false`, + `ALTER TABLE sessions ADD COLUMN IF NOT EXISTS idle_timeout VARCHAR(50)`, + `ALTER TABLE sessions ADD COLUMN IF NOT EXISTS max_session_duration VARCHAR(50)`, + `ALTER TABLE sessions ADD COLUMN IF NOT EXISTS last_activity TIMESTAMP`, + + // Create index for idle session queries + `CREATE INDEX IF NOT EXISTS idx_sessions_last_activity ON sessions(last_activity)`, } // Execute migrations diff --git a/api/internal/db/sessions.go b/api/internal/db/sessions.go new file mode 100644 index 00000000..8ad6bc60 --- /dev/null +++ b/api/internal/db/sessions.go @@ -0,0 +1,354 @@ +// Package db provides PostgreSQL database access for StreamSpace. +// +// This file implements session management operations for multi-platform support. +// Sessions are the source of truth in the database, updated by controller status events. 
package db

import (
	"context"
	"database/sql"
	"fmt"
	"time"

	"github.com/google/uuid"
)

// Session represents a StreamSpace session in the database.
// This mirrors the k8s.Session structure for API compatibility.
//
// Each field maps to the column of the same snake_case name in the sessions
// table (see the INSERT in CreateSession).
type Session struct {
	ID                 string     `json:"id"`                // primary key; CreateSession generates a UUID when empty
	UserID             string     `json:"user_id"`           // owner of the session
	TeamID             string     `json:"team_id,omitempty"` // stored as SQL NULL when empty (see nullString)
	TemplateName       string     `json:"template_name"`
	State              string     `json:"state"` // running, hibernated, terminated, pending, failed
	AppType            string     `json:"app_type"`
	ActiveConnections  int        `json:"active_connections"`
	URL                string     `json:"url,omitempty"`
	Namespace          string     `json:"namespace"`
	Platform           string     `json:"platform"`
	PodName            string     `json:"pod_name,omitempty"`
	Memory             string     `json:"memory,omitempty"`
	CPU                string     `json:"cpu,omitempty"`
	PersistentHome     bool       `json:"persistent_home"`
	IdleTimeout        string     `json:"idle_timeout,omitempty"` // NOTE(review): GetIdleSessions appends ' seconds' to this value — presumably a number of seconds; confirm
	MaxSessionDuration string     `json:"max_session_duration,omitempty"`
	CreatedAt          time.Time  `json:"created_at"`
	UpdatedAt          time.Time  `json:"updated_at"`
	LastConnection     *time.Time `json:"last_connection,omitempty"` // nil when the column is NULL
	LastDisconnect     *time.Time `json:"last_disconnect,omitempty"` // nil when the column is NULL
	LastActivity       *time.Time `json:"last_activity,omitempty"`   // nil when the column is NULL
}

// SessionDB handles database operations for sessions.
type SessionDB struct {
	db *sql.DB // connection pool used by every query in this file
}

// NewSessionDB creates a new SessionDB instance.
// The handle is stored as-is; no connectivity check is performed here.
func NewSessionDB(db *sql.DB) *SessionDB {
	return &SessionDB{db: db}
}

// CreateSession creates a new session in the database.
+func (s *SessionDB) CreateSession(ctx context.Context, session *Session) error { + if session.ID == "" { + session.ID = uuid.New().String() + } + if session.CreatedAt.IsZero() { + session.CreatedAt = time.Now() + } + session.UpdatedAt = time.Now() + + query := ` + INSERT INTO sessions ( + id, user_id, team_id, template_name, state, app_type, + active_connections, url, namespace, platform, pod_name, + memory, cpu, persistent_home, idle_timeout, max_session_duration, + created_at, updated_at, last_connection, last_disconnect, last_activity + ) + VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21) + ON CONFLICT (id) DO UPDATE SET + state = EXCLUDED.state, + url = EXCLUDED.url, + pod_name = EXCLUDED.pod_name, + updated_at = EXCLUDED.updated_at + ` + + _, err := s.db.ExecContext(ctx, query, + session.ID, session.UserID, nullString(session.TeamID), session.TemplateName, session.State, session.AppType, + session.ActiveConnections, session.URL, session.Namespace, session.Platform, session.PodName, + session.Memory, session.CPU, session.PersistentHome, session.IdleTimeout, session.MaxSessionDuration, + session.CreatedAt, session.UpdatedAt, session.LastConnection, session.LastDisconnect, session.LastActivity, + ) + return err +} + +// GetSession retrieves a session by ID. 
+func (s *SessionDB) GetSession(ctx context.Context, sessionID string) (*Session, error) { + session := &Session{} + + query := ` + SELECT + id, user_id, COALESCE(team_id, ''), template_name, state, COALESCE(app_type, 'desktop'), + active_connections, COALESCE(url, ''), COALESCE(namespace, 'streamspace'), + COALESCE(platform, 'kubernetes'), COALESCE(pod_name, ''), + COALESCE(memory, ''), COALESCE(cpu, ''), COALESCE(persistent_home, false), + COALESCE(idle_timeout, ''), COALESCE(max_session_duration, ''), + created_at, updated_at, last_connection, last_disconnect, last_activity + FROM sessions + WHERE id = $1 + ` + + err := s.db.QueryRowContext(ctx, query, sessionID).Scan( + &session.ID, &session.UserID, &session.TeamID, &session.TemplateName, &session.State, &session.AppType, + &session.ActiveConnections, &session.URL, &session.Namespace, &session.Platform, &session.PodName, + &session.Memory, &session.CPU, &session.PersistentHome, &session.IdleTimeout, &session.MaxSessionDuration, + &session.CreatedAt, &session.UpdatedAt, &session.LastConnection, &session.LastDisconnect, &session.LastActivity, + ) + if err != nil { + if err == sql.ErrNoRows { + return nil, fmt.Errorf("session not found: %s", sessionID) + } + return nil, err + } + + return session, nil +} + +// ListSessions retrieves all sessions. 
+func (s *SessionDB) ListSessions(ctx context.Context) ([]*Session, error) { + query := ` + SELECT + id, user_id, COALESCE(team_id, ''), template_name, state, COALESCE(app_type, 'desktop'), + active_connections, COALESCE(url, ''), COALESCE(namespace, 'streamspace'), + COALESCE(platform, 'kubernetes'), COALESCE(pod_name, ''), + COALESCE(memory, ''), COALESCE(cpu, ''), COALESCE(persistent_home, false), + COALESCE(idle_timeout, ''), COALESCE(max_session_duration, ''), + created_at, updated_at, last_connection, last_disconnect, last_activity + FROM sessions + WHERE state != 'deleted' + ORDER BY created_at DESC + ` + + return s.querySessions(ctx, query) +} + +// ListSessionsByUser retrieves all sessions for a specific user. +func (s *SessionDB) ListSessionsByUser(ctx context.Context, userID string) ([]*Session, error) { + query := ` + SELECT + id, user_id, COALESCE(team_id, ''), template_name, state, COALESCE(app_type, 'desktop'), + active_connections, COALESCE(url, ''), COALESCE(namespace, 'streamspace'), + COALESCE(platform, 'kubernetes'), COALESCE(pod_name, ''), + COALESCE(memory, ''), COALESCE(cpu, ''), COALESCE(persistent_home, false), + COALESCE(idle_timeout, ''), COALESCE(max_session_duration, ''), + created_at, updated_at, last_connection, last_disconnect, last_activity + FROM sessions + WHERE user_id = $1 AND state != 'deleted' + ORDER BY created_at DESC + ` + + rows, err := s.db.QueryContext(ctx, query, userID) + if err != nil { + return nil, err + } + defer rows.Close() + + return s.scanSessions(rows) +} + +// ListSessionsByState retrieves all sessions with a specific state. 
+func (s *SessionDB) ListSessionsByState(ctx context.Context, state string) ([]*Session, error) { + query := ` + SELECT + id, user_id, COALESCE(team_id, ''), template_name, state, COALESCE(app_type, 'desktop'), + active_connections, COALESCE(url, ''), COALESCE(namespace, 'streamspace'), + COALESCE(platform, 'kubernetes'), COALESCE(pod_name, ''), + COALESCE(memory, ''), COALESCE(cpu, ''), COALESCE(persistent_home, false), + COALESCE(idle_timeout, ''), COALESCE(max_session_duration, ''), + created_at, updated_at, last_connection, last_disconnect, last_activity + FROM sessions + WHERE state = $1 + ORDER BY created_at DESC + ` + + rows, err := s.db.QueryContext(ctx, query, state) + if err != nil { + return nil, err + } + defer rows.Close() + + return s.scanSessions(rows) +} + +// UpdateSessionState updates the state of a session. +func (s *SessionDB) UpdateSessionState(ctx context.Context, sessionID, state string) error { + query := ` + UPDATE sessions + SET state = $1, updated_at = $2 + WHERE id = $3 + ` + + result, err := s.db.ExecContext(ctx, query, state, time.Now(), sessionID) + if err != nil { + return err + } + + rows, _ := result.RowsAffected() + if rows == 0 { + return fmt.Errorf("session not found: %s", sessionID) + } + + return nil +} + +// UpdateSessionURL updates the URL of a session. +func (s *SessionDB) UpdateSessionURL(ctx context.Context, sessionID, url string) error { + query := ` + UPDATE sessions + SET url = $1, updated_at = $2 + WHERE id = $3 + ` + + _, err := s.db.ExecContext(ctx, query, url, time.Now(), sessionID) + return err +} + +// UpdateSessionStatus updates session state, URL, and pod name from controller status events. 
+func (s *SessionDB) UpdateSessionStatus(ctx context.Context, sessionID, state, url, podName string) error { + query := ` + UPDATE sessions + SET state = $1, url = $2, pod_name = $3, updated_at = $4 + WHERE id = $5 + ` + + result, err := s.db.ExecContext(ctx, query, state, url, podName, time.Now(), sessionID) + if err != nil { + return err + } + + rows, _ := result.RowsAffected() + if rows == 0 { + return fmt.Errorf("session not found: %s", sessionID) + } + + return nil +} + +// UpdateLastActivity updates the last activity timestamp. +func (s *SessionDB) UpdateLastActivity(ctx context.Context, sessionID string) error { + query := ` + UPDATE sessions + SET last_activity = $1, updated_at = $1 + WHERE id = $2 + ` + + _, err := s.db.ExecContext(ctx, query, time.Now(), sessionID) + return err +} + +// UpdateActiveConnections updates the connection count for a session. +func (s *SessionDB) UpdateActiveConnections(ctx context.Context, sessionID string, count int) error { + now := time.Now() + query := ` + UPDATE sessions + SET active_connections = $1, last_connection = $2, updated_at = $2 + WHERE id = $3 + ` + + _, err := s.db.ExecContext(ctx, query, count, now, sessionID) + return err +} + +// DeleteSession marks a session as deleted. +func (s *SessionDB) DeleteSession(ctx context.Context, sessionID string) error { + query := ` + UPDATE sessions + SET state = 'deleted', updated_at = $1 + WHERE id = $2 + ` + + _, err := s.db.ExecContext(ctx, query, time.Now(), sessionID) + return err +} + +// HardDeleteSession permanently removes a session from the database. +func (s *SessionDB) HardDeleteSession(ctx context.Context, sessionID string) error { + _, err := s.db.ExecContext(ctx, "DELETE FROM sessions WHERE id = $1", sessionID) + return err +} + +// CountSessionsByUser returns the number of active sessions for a user. 
+func (s *SessionDB) CountSessionsByUser(ctx context.Context, userID string) (int, error) { + var count int + err := s.db.QueryRowContext(ctx, ` + SELECT COUNT(*) FROM sessions + WHERE user_id = $1 AND state IN ('running', 'pending', 'hibernated') + `, userID).Scan(&count) + return count, err +} + +// GetIdleSessions returns sessions that have been idle beyond their timeout. +func (s *SessionDB) GetIdleSessions(ctx context.Context) ([]*Session, error) { + query := ` + SELECT + id, user_id, COALESCE(team_id, ''), template_name, state, COALESCE(app_type, 'desktop'), + active_connections, COALESCE(url, ''), COALESCE(namespace, 'streamspace'), + COALESCE(platform, 'kubernetes'), COALESCE(pod_name, ''), + COALESCE(memory, ''), COALESCE(cpu, ''), COALESCE(persistent_home, false), + COALESCE(idle_timeout, ''), COALESCE(max_session_duration, ''), + created_at, updated_at, last_connection, last_disconnect, last_activity + FROM sessions + WHERE state = 'running' + AND idle_timeout != '' + AND last_activity IS NOT NULL + AND last_activity < NOW() - (idle_timeout || ' seconds')::INTERVAL + ORDER BY last_activity ASC + ` + + return s.querySessions(ctx, query) +} + +// querySessions executes a query and returns sessions. +func (s *SessionDB) querySessions(ctx context.Context, query string, args ...interface{}) ([]*Session, error) { + rows, err := s.db.QueryContext(ctx, query, args...) + if err != nil { + return nil, err + } + defer rows.Close() + + return s.scanSessions(rows) +} + +// scanSessions scans rows into Session structs. 
// The caller owns rows and is responsible for closing it. Scanning stops at
// the first row that fails to scan, in which case no sessions are returned.
func (s *SessionDB) scanSessions(rows *sql.Rows) ([]*Session, error) {
	// Declared without allocation, so a result set with no rows yields a nil slice.
	var sessions []*Session

	for rows.Next() {
		session := &Session{}
		// Destination order must match the SELECT column order used by the list/get queries.
		err := rows.Scan(
			&session.ID, &session.UserID, &session.TeamID, &session.TemplateName, &session.State, &session.AppType,
			&session.ActiveConnections, &session.URL, &session.Namespace, &session.Platform, &session.PodName,
			&session.Memory, &session.CPU, &session.PersistentHome, &session.IdleTimeout, &session.MaxSessionDuration,
			&session.CreatedAt, &session.UpdatedAt, &session.LastConnection, &session.LastDisconnect, &session.LastActivity,
		)
		if err != nil {
			return nil, err
		}
		sessions = append(sessions, session)
	}

	// rows.Next returning false can also mean iteration failed; surface that error.
	if err := rows.Err(); err != nil {
		return nil, err
	}

	return sessions, nil
}

// nullString returns a sql.NullString for empty strings.
// An empty input yields the zero NullString (Valid == false, written as SQL
// NULL); any other input is wrapped with Valid set to true.
func nullString(s string) sql.NullString {
	if s == "" {
		return sql.NullString{}
	}
	return sql.NullString{String: s, Valid: true}
}
diff --git a/api/internal/events/publisher.go b/api/internal/events/publisher.go new file mode 100644 index 00000000..cca786d5 --- /dev/null +++ b/api/internal/events/publisher.go @@ -0,0 +1,353 @@
package events

import (
	"context"
	"encoding/json"
	"fmt"
	"log"
	"os"
	"time"

	"github.com/google/uuid"
	"github.com/nats-io/nats.go"
)

// Publisher handles publishing events to NATS.
type Publisher struct {
	conn    *nats.Conn            // nil for a disabled publisher
	js      nats.JetStreamContext // nil when JetStream is unavailable or stream setup failed
	enabled bool                  // false when NATS is not configured or the connection attempt failed
}

// Config holds NATS connection configuration.
type Config struct {
	URL      string // server URL; NewPublisher falls back to the NATS_URL environment variable when empty
	User     string // optional; authentication is only configured when non-empty
	Password string
	TLS      bool // NOTE(review): not read by NewPublisher or NewSubscriber in the visible code
}

// NewPublisher creates a new NATS event publisher.
// If NATS is unavailable, returns a disabled publisher that logs warnings.
+func NewPublisher(cfg Config) (*Publisher, error) { + if cfg.URL == "" { + cfg.URL = os.Getenv("NATS_URL") + } + if cfg.URL == "" { + log.Println("Warning: NATS_URL not configured, event publishing disabled") + return &Publisher{enabled: false}, nil + } + + // Build connection options + opts := []nats.Option{ + nats.Name("streamspace-api"), + nats.ReconnectWait(2 * time.Second), + nats.MaxReconnects(10), + nats.DisconnectErrHandler(func(nc *nats.Conn, err error) { + if err != nil { + log.Printf("NATS disconnected: %v", err) + } + }), + nats.ReconnectHandler(func(nc *nats.Conn) { + log.Printf("NATS reconnected to %s", nc.ConnectedUrl()) + }), + nats.ErrorHandler(func(nc *nats.Conn, sub *nats.Subscription, err error) { + log.Printf("NATS error: %v", err) + }), + } + + // Add authentication if configured + if cfg.User != "" { + opts = append(opts, nats.UserInfo(cfg.User, cfg.Password)) + } + + // Connect to NATS + conn, err := nats.Connect(cfg.URL, opts...) + if err != nil { + log.Printf("Warning: Failed to connect to NATS at %s: %v", cfg.URL, err) + log.Println("Event publishing disabled - controllers will not receive events") + return &Publisher{enabled: false}, nil + } + + log.Printf("Connected to NATS at %s", conn.ConnectedUrl()) + + // Try to get JetStream context for persistence (optional) + js, err := conn.JetStream() + if err != nil { + log.Printf("JetStream not available: %v (using core NATS)", err) + } else { + // Create streams for durable message delivery + if err := createStreams(js); err != nil { + log.Printf("Warning: Failed to create JetStream streams: %v", err) + log.Println("Events will be published without durability guarantees") + js = nil + } else { + log.Println("JetStream streams configured for durable event delivery") + } + } + + return &Publisher{ + conn: conn, + js: js, + enabled: true, + }, nil +} + +// createStreams creates JetStream streams for durable event delivery. 
+func createStreams(js nats.JetStreamContext) error { + streams := []struct { + name string + subjects []string + }{ + { + name: "STREAMSPACE_SESSIONS", + subjects: []string{ + "streamspace.session.>", + }, + }, + { + name: "STREAMSPACE_APPS", + subjects: []string{ + "streamspace.app.>", + }, + }, + { + name: "STREAMSPACE_TEMPLATES", + subjects: []string{ + "streamspace.template.>", + }, + }, + { + name: "STREAMSPACE_NODES", + subjects: []string{ + "streamspace.node.>", + }, + }, + { + name: "STREAMSPACE_CONTROLLERS", + subjects: []string{ + "streamspace.controller.>", + }, + }, + } + + for _, s := range streams { + _, err := js.AddStream(&nats.StreamConfig{ + Name: s.name, + Subjects: s.subjects, + Retention: nats.WorkQueuePolicy, // Messages deleted after acknowledgment + MaxAge: 24 * time.Hour, // Keep messages for 24 hours max + Storage: nats.FileStorage, // Persist to disk + Replicas: 1, // Single replica for simplicity + }) + if err != nil { + // Stream might already exist, try to update it + if err.Error() != "stream name already in use" { + return fmt.Errorf("failed to create stream %s: %w", s.name, err) + } + } + } + + return nil +} + +// Close closes the NATS connection. +func (p *Publisher) Close() { + if p.conn != nil { + p.conn.Drain() + p.conn.Close() + } +} + +// IsEnabled returns whether event publishing is enabled. +func (p *Publisher) IsEnabled() bool { + return p.enabled +} + +// Publish publishes an event to the given subject. 
+func (p *Publisher) Publish(subject string, event interface{}) error { + if !p.enabled { + log.Printf("Event publishing disabled, skipping: %s", subject) + return nil + } + + data, err := json.Marshal(event) + if err != nil { + return fmt.Errorf("failed to marshal event: %w", err) + } + + if err := p.conn.Publish(subject, data); err != nil { + return fmt.Errorf("failed to publish to %s: %w", subject, err) + } + + log.Printf("Published event to %s", subject) + return nil +} + +// PublishWithPlatform publishes an event to a platform-specific subject. +func (p *Publisher) PublishWithPlatform(subject, platform string, event interface{}) error { + // Publish to both generic and platform-specific subjects + if err := p.Publish(subject, event); err != nil { + return err + } + return p.Publish(SubjectWithPlatform(subject, platform), event) +} + +// Request publishes a request and waits for a response. +func (p *Publisher) Request(subject string, event interface{}, timeout time.Duration) (*nats.Msg, error) { + if !p.enabled { + return nil, fmt.Errorf("event publishing disabled") + } + + data, err := json.Marshal(event) + if err != nil { + return nil, fmt.Errorf("failed to marshal event: %w", err) + } + + return p.conn.Request(subject, data, timeout) +} + +// Subscribe subscribes to a subject with a handler. +func (p *Publisher) Subscribe(subject string, handler nats.MsgHandler) (*nats.Subscription, error) { + if !p.enabled { + return nil, fmt.Errorf("event publishing disabled") + } + return p.conn.Subscribe(subject, handler) +} + +// QueueSubscribe subscribes to a subject with a queue group. +func (p *Publisher) QueueSubscribe(subject, queue string, handler nats.MsgHandler) (*nats.Subscription, error) { + if !p.enabled { + return nil, fmt.Errorf("event publishing disabled") + } + return p.conn.QueueSubscribe(subject, queue, handler) +} + +// Helper methods for publishing specific events + +// PublishSessionCreate publishes a session create event. 
+func (p *Publisher) PublishSessionCreate(ctx context.Context, event *SessionCreateEvent) error { + if event.EventID == "" { + event.EventID = uuid.New().String() + } + if event.Timestamp.IsZero() { + event.Timestamp = time.Now() + } + return p.PublishWithPlatform(SubjectSessionCreate, event.Platform, event) +} + +// PublishSessionDelete publishes a session delete event. +func (p *Publisher) PublishSessionDelete(ctx context.Context, event *SessionDeleteEvent) error { + if event.EventID == "" { + event.EventID = uuid.New().String() + } + if event.Timestamp.IsZero() { + event.Timestamp = time.Now() + } + return p.PublishWithPlatform(SubjectSessionDelete, event.Platform, event) +} + +// PublishSessionHibernate publishes a session hibernate event. +func (p *Publisher) PublishSessionHibernate(ctx context.Context, event *SessionHibernateEvent) error { + if event.EventID == "" { + event.EventID = uuid.New().String() + } + if event.Timestamp.IsZero() { + event.Timestamp = time.Now() + } + return p.PublishWithPlatform(SubjectSessionHibernate, event.Platform, event) +} + +// PublishSessionWake publishes a session wake event. +func (p *Publisher) PublishSessionWake(ctx context.Context, event *SessionWakeEvent) error { + if event.EventID == "" { + event.EventID = uuid.New().String() + } + if event.Timestamp.IsZero() { + event.Timestamp = time.Now() + } + return p.PublishWithPlatform(SubjectSessionWake, event.Platform, event) +} + +// PublishAppInstall publishes an application install event. +func (p *Publisher) PublishAppInstall(ctx context.Context, event *AppInstallEvent) error { + if event.EventID == "" { + event.EventID = uuid.New().String() + } + if event.Timestamp.IsZero() { + event.Timestamp = time.Now() + } + return p.PublishWithPlatform(SubjectAppInstall, event.Platform, event) +} + +// PublishAppUninstall publishes an application uninstall event. 
+func (p *Publisher) PublishAppUninstall(ctx context.Context, event *AppUninstallEvent) error { + if event.EventID == "" { + event.EventID = uuid.New().String() + } + if event.Timestamp.IsZero() { + event.Timestamp = time.Now() + } + return p.PublishWithPlatform(SubjectAppUninstall, event.Platform, event) +} + +// PublishTemplateCreate publishes a template create event. +func (p *Publisher) PublishTemplateCreate(ctx context.Context, event *TemplateCreateEvent) error { + if event.EventID == "" { + event.EventID = uuid.New().String() + } + if event.Timestamp.IsZero() { + event.Timestamp = time.Now() + } + return p.PublishWithPlatform(SubjectTemplateCreate, event.Platform, event) +} + +// PublishTemplateDelete publishes a template delete event. +func (p *Publisher) PublishTemplateDelete(ctx context.Context, event *TemplateDeleteEvent) error { + if event.EventID == "" { + event.EventID = uuid.New().String() + } + if event.Timestamp.IsZero() { + event.Timestamp = time.Now() + } + return p.PublishWithPlatform(SubjectTemplateDelete, event.Platform, event) +} + +// PublishNodeCordon publishes a node cordon event. +func (p *Publisher) PublishNodeCordon(ctx context.Context, event *NodeCordonEvent) error { + if event.EventID == "" { + event.EventID = uuid.New().String() + } + if event.Timestamp.IsZero() { + event.Timestamp = time.Now() + } + return p.PublishWithPlatform(SubjectNodeCordon, event.Platform, event) +} + +// PublishNodeUncordon publishes a node uncordon event. +func (p *Publisher) PublishNodeUncordon(ctx context.Context, event *NodeUncordonEvent) error { + if event.EventID == "" { + event.EventID = uuid.New().String() + } + if event.Timestamp.IsZero() { + event.Timestamp = time.Now() + } + return p.PublishWithPlatform(SubjectNodeUncordon, event.Platform, event) +} + +// PublishNodeDrain publishes a node drain event. 
//
// An empty EventID is replaced with a fresh UUID and a zero Timestamp with the
// current time (the event is modified in place) before it is published to the
// generic and platform-specific subjects. ctx is accepted but not used.
func (p *Publisher) PublishNodeDrain(ctx context.Context, event *NodeDrainEvent) error {
	if event.EventID == "" {
		event.EventID = uuid.New().String()
	}
	if event.Timestamp.IsZero() {
		event.Timestamp = time.Now()
	}
	return p.PublishWithPlatform(SubjectNodeDrain, event.Platform, event)
}

// GetConnection returns the underlying NATS connection.
// Use with caution - prefer using Publisher methods.
// The result is nil for a disabled publisher.
func (p *Publisher) GetConnection() *nats.Conn {
	return p.conn
}
diff --git a/api/internal/events/subjects.go b/api/internal/events/subjects.go new file mode 100644 index 00000000..5aaf2905 --- /dev/null +++ b/api/internal/events/subjects.go @@ -0,0 +1,47 @@
package events

// NATS subject constants for StreamSpace events.
// Format: streamspace.<domain>.<action>[.<platform>]

const (
	// Session events
	SubjectSessionCreate    = "streamspace.session.create"
	SubjectSessionDelete    = "streamspace.session.delete"
	SubjectSessionHibernate = "streamspace.session.hibernate"
	SubjectSessionWake      = "streamspace.session.wake"
	SubjectSessionStatus    = "streamspace.session.status"

	// Application events
	SubjectAppInstall   = "streamspace.app.install"
	SubjectAppUninstall = "streamspace.app.uninstall"
	SubjectAppStatus    = "streamspace.app.status"

	// Template events
	SubjectTemplateCreate = "streamspace.template.create"
	SubjectTemplateDelete = "streamspace.template.delete"

	// Node management events
	SubjectNodeCordon   = "streamspace.node.cordon"
	SubjectNodeUncordon = "streamspace.node.uncordon"
	SubjectNodeDrain    = "streamspace.node.drain"

	// Controller events
	SubjectControllerHeartbeat = "streamspace.controller.heartbeat"

	// Dead letter queue prefix
	SubjectDLQPrefix = "streamspace.dlq"
)

// SubjectWithPlatform returns a platform-specific subject.
// Example: SubjectWithPlatform(SubjectSessionCreate, PlatformKubernetes)
// Returns: "streamspace.session.create.kubernetes"
//
// The platform is appended verbatim after a dot; an empty platform therefore
// yields a subject that ends in ".".
func SubjectWithPlatform(subject, platform string) string {
	return subject + "." + platform
}

// DLQSubject returns the dead letter queue subject for a given subject.
// Example: DLQSubject(SubjectSessionCreate)
// Returns: "streamspace.dlq.streamspace.session.create"
func DLQSubject(subject string) string {
	return SubjectDLQPrefix + "." + subject
}
diff --git a/api/internal/events/subscriber.go b/api/internal/events/subscriber.go new file mode 100644 index 00000000..1d4e3c87 --- /dev/null +++ b/api/internal/events/subscriber.go @@ -0,0 +1,220 @@
// Package events provides NATS event publishing and subscribing for StreamSpace.
//
// The subscriber handles incoming status events from platform controllers
// and updates the API database accordingly.
package events

import (
	"context"
	"database/sql"
	"encoding/json"
	"fmt"
	"log"
	"time"

	"github.com/nats-io/nats.go"
)

// Subscriber handles receiving events from NATS.
type Subscriber struct {
	conn         *nats.Conn           // nil for a disabled subscriber
	db           *sql.DB              // database updated by the status handlers
	enabled      bool                 // false when NATS is not configured or the connection attempt failed
	controllerID string               // NOTE(review): not set by NewSubscriber in the visible code
	subs         []*nats.Subscription // subscriptions registered by Start
}

// NewSubscriber creates a new NATS event subscriber.
// If NATS is unavailable, returns a disabled subscriber.
+func NewSubscriber(cfg Config, db *sql.DB) (*Subscriber, error) { + if cfg.URL == "" { + log.Println("Warning: NATS_URL not configured, event subscription disabled") + return &Subscriber{enabled: false}, nil + } + + // Build connection options + opts := []nats.Option{ + nats.Name("streamspace-api-subscriber"), + nats.ReconnectWait(2 * time.Second), + nats.MaxReconnects(10), + nats.DisconnectErrHandler(func(nc *nats.Conn, err error) { + if err != nil { + log.Printf("NATS subscriber disconnected: %v", err) + } + }), + nats.ReconnectHandler(func(nc *nats.Conn) { + log.Printf("NATS subscriber reconnected to %s", nc.ConnectedUrl()) + }), + nats.ErrorHandler(func(nc *nats.Conn, sub *nats.Subscription, err error) { + log.Printf("NATS subscriber error: %v", err) + }), + } + + // Add authentication if configured + if cfg.User != "" { + opts = append(opts, nats.UserInfo(cfg.User, cfg.Password)) + } + + // Connect to NATS + conn, err := nats.Connect(cfg.URL, opts...) + if err != nil { + log.Printf("Warning: Failed to connect subscriber to NATS at %s: %v", cfg.URL, err) + log.Println("Event subscription disabled - API will not receive controller status updates") + return &Subscriber{enabled: false}, nil + } + + log.Printf("API subscriber connected to NATS at %s", conn.ConnectedUrl()) + + return &Subscriber{ + conn: conn, + db: db, + enabled: true, + subs: make([]*nats.Subscription, 0), + }, nil +} + +// Start begins subscribing to status events from controllers. 
+func (s *Subscriber) Start(ctx context.Context) error { + if !s.enabled { + log.Println("NATS subscriber disabled, not starting") + return nil + } + + // Subscribe to session status events (from all platforms) + sessionSub, err := s.conn.Subscribe(SubjectSessionStatus, func(msg *nats.Msg) { + s.handleSessionStatus(msg.Data) + }) + if err != nil { + return fmt.Errorf("failed to subscribe to session status: %w", err) + } + s.subs = append(s.subs, sessionSub) + log.Printf("Subscribed to %s", SubjectSessionStatus) + + // Subscribe to app status events (from all platforms) + appSub, err := s.conn.Subscribe(SubjectAppStatus, func(msg *nats.Msg) { + s.handleAppStatus(msg.Data) + }) + if err != nil { + return fmt.Errorf("failed to subscribe to app status: %w", err) + } + s.subs = append(s.subs, appSub) + log.Printf("Subscribed to %s", SubjectAppStatus) + + // Subscribe to controller heartbeats + heartbeatSub, err := s.conn.Subscribe(SubjectControllerHeartbeat, func(msg *nats.Msg) { + s.handleControllerHeartbeat(msg.Data) + }) + if err != nil { + return fmt.Errorf("failed to subscribe to controller heartbeat: %w", err) + } + s.subs = append(s.subs, heartbeatSub) + log.Printf("Subscribed to %s", SubjectControllerHeartbeat) + + log.Println("API event subscriber started, listening for controller status events") + + // Wait for context cancellation + <-ctx.Done() + return nil +} + +// Close closes the NATS connection and unsubscribes from all subjects. +func (s *Subscriber) Close() { + if s.conn != nil { + for _, sub := range s.subs { + sub.Unsubscribe() + } + s.conn.Drain() + s.conn.Close() + } +} + +// IsEnabled returns whether event subscription is enabled. +func (s *Subscriber) IsEnabled() bool { + return s.enabled +} + +// handleSessionStatus processes session status events from controllers. 
+func (s *Subscriber) handleSessionStatus(data []byte) { + var event SessionStatusEvent + if err := json.Unmarshal(data, &event); err != nil { + log.Printf("Failed to unmarshal session status event: %v", err) + return + } + + log.Printf("Received session status: session=%s status=%s phase=%s from=%s", + event.SessionID, event.Status, event.Phase, event.ControllerID) + + // Update session in database + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + // Update the session state and URL + query := ` + UPDATE sessions + SET state = $1, url = $2, updated_at = $3 + WHERE id = $4 + ` + + result, err := s.db.ExecContext(ctx, query, event.Status, event.URL, time.Now(), event.SessionID) + if err != nil { + log.Printf("Failed to update session %s status: %v", event.SessionID, err) + return + } + + rows, _ := result.RowsAffected() + if rows == 0 { + log.Printf("Session %s not found in database (may not be created yet)", event.SessionID) + } else { + log.Printf("Updated session %s to status=%s", event.SessionID, event.Status) + } +} + +// handleAppStatus processes application installation status events from controllers. 
//
// The payload is decoded as an AppStatusEvent and written to the
// installed_applications row whose id equals the event's InstallID. Failures
// are logged; nothing is returned.
func (s *Subscriber) handleAppStatus(data []byte) {
	var event AppStatusEvent
	if err := json.Unmarshal(data, &event); err != nil {
		log.Printf("Failed to unmarshal app status event: %v", err)
		return
	}

	log.Printf("Received app status: install=%s status=%s from=%s",
		event.InstallID, event.Status, event.ControllerID)

	// Update installed application in database
	// The handler has no caller context, so the write gets its own 5-second deadline.
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()

	// NOTE(review): assumes installed_applications has an updated_at column — not visible here; confirm.
	query := `
		UPDATE installed_applications
		SET install_status = $1, install_message = $2, updated_at = $3
		WHERE id = $4
	`

	result, err := s.db.ExecContext(ctx, query, event.Status, event.Message, time.Now(), event.InstallID)
	if err != nil {
		log.Printf("Failed to update app %s status: %v", event.InstallID, err)
		return
	}

	// The RowsAffected error is discarded; on failure rows is 0 and the
	// "not found" branch is logged.
	rows, _ := result.RowsAffected()
	if rows == 0 {
		log.Printf("Application %s not found in database", event.InstallID)
	} else {
		log.Printf("Updated application %s to status=%s", event.InstallID, event.Status)
	}
}

// handleControllerHeartbeat processes heartbeat events from controllers.
// It currently only decodes and logs the heartbeat; nothing is persisted.
func (s *Subscriber) handleControllerHeartbeat(data []byte) {
	var event ControllerHeartbeatEvent
	if err := json.Unmarshal(data, &event); err != nil {
		log.Printf("Failed to unmarshal controller heartbeat: %v", err)
		return
	}

	log.Printf("Controller heartbeat: id=%s platform=%s status=%s",
		event.ControllerID, event.Platform, event.Status)

	// Could update a controllers table here to track controller health
	// For now, just log it
}
diff --git a/api/internal/events/types.go b/api/internal/events/types.go new file mode 100644 index 00000000..f208f57e --- /dev/null +++ b/api/internal/events/types.go @@ -0,0 +1,195 @@
// Package events provides NATS event publishing for StreamSpace.
//
// This package enables event-driven communication between the API and
// platform controllers (Kubernetes, Docker, Hyper-V, vCenter, etc.).
//
// Events are published to NATS subjects and consumed by controllers
// that perform platform-specific operations.
package events

import (
	"time"
)

// SessionCreateEvent is published when a new session is requested.
type SessionCreateEvent struct {
	EventID        string            `json:"event_id"`  // generated by PublishSessionCreate when empty
	Timestamp      time.Time         `json:"timestamp"` // set by PublishSessionCreate when zero
	SessionID      string            `json:"session_id"`
	UserID         string            `json:"user_id"`
	TemplateID     string            `json:"template_id"`
	Platform       string            `json:"platform"` // also used as the suffix of the platform-specific subject
	Resources      ResourceSpec      `json:"resources"`
	PersistentHome bool              `json:"persistent_home"`
	IdleTimeout    string            `json:"idle_timeout"`
	Metadata       map[string]string `json:"metadata,omitempty"`
}

// SessionDeleteEvent is published when a session should be deleted.
type SessionDeleteEvent struct {
	EventID   string    `json:"event_id"`  // generated by PublishSessionDelete when empty
	Timestamp time.Time `json:"timestamp"` // set by PublishSessionDelete when zero
	SessionID string    `json:"session_id"`
	UserID    string    `json:"user_id"`
	Platform  string    `json:"platform"`
	Force     bool      `json:"force"`
}

// SessionHibernateEvent is published when a session should be hibernated.
type SessionHibernateEvent struct {
	EventID   string    `json:"event_id"`  // generated by PublishSessionHibernate when empty
	Timestamp time.Time `json:"timestamp"` // set by PublishSessionHibernate when zero
	SessionID string    `json:"session_id"`
	UserID    string    `json:"user_id"`
	Platform  string    `json:"platform"`
}

// SessionWakeEvent is published when a hibernated session should be woken.
type SessionWakeEvent struct {
	EventID   string    `json:"event_id"`  // generated by PublishSessionWake when empty
	Timestamp time.Time `json:"timestamp"` // set by PublishSessionWake when zero
	SessionID string    `json:"session_id"`
	UserID    string    `json:"user_id"`
	Platform  string    `json:"platform"`
}

// SessionStatusEvent is published by controllers when session status changes.
+type SessionStatusEvent struct {
+	EventID   string    `json:"event_id"`
+	Timestamp time.Time `json:"timestamp"`
+	SessionID string    `json:"session_id"`
+	// NOTE(review): presumably one of the Status* constants - confirm.
+	Status string `json:"status"`
+	Phase  string `json:"phase"`
+	// URL, PodName, Message and ResourceUsage are optional: each is omitted
+	// from the JSON payload when empty (ResourceUsage when nil).
+	URL           string        `json:"url,omitempty"`
+	PodName       string        `json:"pod_name,omitempty"`
+	Message       string        `json:"message,omitempty"`
+	ResourceUsage *ResourceSpec `json:"resource_usage,omitempty"`
+	ControllerID  string        `json:"controller_id"`
+}
+
+// AppInstallEvent is published when an application should be installed.
+//
+// Description, Category and IconURL are optional and omitted from the JSON
+// payload when empty; every other field is always serialized.
+type AppInstallEvent struct {
+	EventID           string    `json:"event_id"`
+	Timestamp         time.Time `json:"timestamp"`
+	InstallID         string    `json:"install_id"`
+	CatalogTemplateID int       `json:"catalog_template_id"`
+	TemplateName      string    `json:"template_name"`
+	DisplayName       string    `json:"display_name"`
+	Description       string    `json:"description,omitempty"`
+	Category          string    `json:"category,omitempty"`
+	IconURL           string    `json:"icon_url,omitempty"`
+	Manifest          string    `json:"manifest"`
+	InstalledBy       string    `json:"installed_by"`
+	Platform          string    `json:"platform"`
+}
+
+// AppUninstallEvent is published when an application should be uninstalled.
+type AppUninstallEvent struct {
+	EventID      string    `json:"event_id"`
+	Timestamp    time.Time `json:"timestamp"`
+	InstallID    string    `json:"install_id"`
+	TemplateName string    `json:"template_name"`
+	Platform     string    `json:"platform"`
+}
+
+// AppStatusEvent is published by controllers when app installation status changes.
+//
+// TemplateName, TemplateNamespace and Message are optional and omitted from
+// the JSON payload when empty.
+type AppStatusEvent struct {
+	EventID           string    `json:"event_id"`
+	Timestamp         time.Time `json:"timestamp"`
+	InstallID         string    `json:"install_id"`
+	Status            string    `json:"status"` // pending, installing, ready, failed
+	TemplateName      string    `json:"template_name,omitempty"`
+	TemplateNamespace string    `json:"template_namespace,omitempty"`
+	Message           string    `json:"message,omitempty"`
+	ControllerID      string    `json:"controller_id"`
+}
+
+// TemplateCreateEvent is published when a template is created.
+type TemplateCreateEvent struct {
+	EventID     string    `json:"event_id"`
+	Timestamp   time.Time `json:"timestamp"`
+	TemplateID  string    `json:"template_id"`
+	DisplayName string    `json:"display_name"`
+	// Category, BaseImage, Manifest and CreatedBy are optional and omitted
+	// from the JSON payload when empty.
+	Category  string `json:"category,omitempty"`
+	BaseImage string `json:"base_image,omitempty"`
+	Manifest  string `json:"manifest,omitempty"`
+	Platform  string `json:"platform"`
+	CreatedBy string `json:"created_by,omitempty"`
+}
+
+// TemplateDeleteEvent is published when a template should be deleted.
+//
+// Unlike TemplateCreateEvent, the template is identified by TemplateName
+// (JSON key "template_name") rather than by a template ID.
+type TemplateDeleteEvent struct {
+	EventID      string    `json:"event_id"`
+	Timestamp    time.Time `json:"timestamp"`
+	TemplateName string    `json:"template_name"`
+	Platform     string    `json:"platform"`
+}
+
+// NodeCordonEvent is published when a node should be cordoned.
+//
+// It has the same fields and JSON keys as NodeUncordonEvent.
+type NodeCordonEvent struct {
+	EventID   string    `json:"event_id"`
+	Timestamp time.Time `json:"timestamp"`
+	NodeName  string    `json:"node_name"`
+	Platform  string    `json:"platform"`
+}
+
+// NodeUncordonEvent is published when a node should be uncordoned.
+//
+// It has the same fields and JSON keys as NodeCordonEvent.
+type NodeUncordonEvent struct {
+	EventID   string    `json:"event_id"`
+	Timestamp time.Time `json:"timestamp"`
+	NodeName  string    `json:"node_name"`
+	Platform  string    `json:"platform"`
+}
+
+// NodeDrainEvent is published when a node should be drained.
+type NodeDrainEvent struct {
+	EventID   string    `json:"event_id"`
+	Timestamp time.Time `json:"timestamp"`
+	NodeName  string    `json:"node_name"`
+	Platform  string    `json:"platform"`
+	// A pointer so that an unset grace period is omitted from the JSON
+	// payload instead of being sent as an explicit 0.
+	GracePeriodSeconds *int64 `json:"grace_period_seconds,omitempty"`
+}
+
+// ControllerHeartbeatEvent is published by controllers to indicate health.
+type ControllerHeartbeatEvent struct { + ControllerID string `json:"controller_id"` + Platform string `json:"platform"` + Timestamp time.Time `json:"timestamp"` + Status string `json:"status"` // healthy, unhealthy + Version string `json:"version"` + Capabilities []string `json:"capabilities"` + ClusterInfo map[string]interface{} `json:"cluster_info,omitempty"` +} + +// ResourceSpec defines resource requirements. +type ResourceSpec struct { + Memory string `json:"memory,omitempty"` + CPU string `json:"cpu,omitempty"` +} + +// Platform constants +const ( + PlatformKubernetes = "kubernetes" + PlatformDocker = "docker" + PlatformHyperV = "hyperv" + PlatformVCenter = "vcenter" +) + +// Status constants +const ( + StatusPending = "pending" + StatusCreating = "creating" + StatusRunning = "running" + StatusHibernated = "hibernated" + StatusFailed = "failed" + StatusDeleting = "deleting" + StatusDeleted = "deleted" +) + +// Install status constants +const ( + InstallStatusPending = "pending" + InstallStatusInstalling = "installing" + InstallStatusReady = "ready" + InstallStatusFailed = "failed" +) diff --git a/api/internal/handlers/applications.go b/api/internal/handlers/applications.go index 154d6a2a..eb5cc54b 100644 --- a/api/internal/handlers/applications.go +++ b/api/internal/handlers/applications.go @@ -35,19 +35,18 @@ // // Example Usage: // -// handler := NewApplicationHandler(database, k8sClient, "streamspace") +// handler := NewApplicationHandler(database, publisher, "kubernetes") // handler.RegisterRoutes(router.Group("/api/v1")) package handlers import ( - "fmt" + "context" "log" "net/http" - "strings" "github.com/gin-gonic/gin" "github.com/streamspace/streamspace/api/internal/db" - "github.com/streamspace/streamspace/api/internal/k8s" + "github.com/streamspace/streamspace/api/internal/events" "github.com/streamspace/streamspace/api/internal/models" ) @@ -55,17 +54,20 @@ import ( type ApplicationHandler struct { db *db.Database appDB *db.ApplicationDB - 
k8sClient *k8s.Client - namespace string + publisher *events.Publisher + platform string } // NewApplicationHandler creates a new application handler -func NewApplicationHandler(database *db.Database, k8sClient *k8s.Client, namespace string) *ApplicationHandler { +func NewApplicationHandler(database *db.Database, publisher *events.Publisher, platform string) *ApplicationHandler { + if platform == "" { + platform = events.PlatformKubernetes + } return &ApplicationHandler{ db: database, appDB: db.NewApplicationDB(database.DB()), - k8sClient: k8sClient, - namespace: namespace, + publisher: publisher, + platform: platform, } } @@ -88,6 +90,18 @@ func (h *ApplicationHandler) RegisterRoutes(router *gin.RouterGroup) { } } +// updateInstallStatus updates the installation status of an application in the database +func (h *ApplicationHandler) updateInstallStatus(ctx context.Context, appID, status, message string) { + _, err := h.db.DB().ExecContext(ctx, ` + UPDATE installed_applications + SET install_status = $1, install_message = $2, updated_at = NOW() + WHERE id = $3 + `, status, message, appID) + if err != nil { + log.Printf("Failed to update install status for %s: %v", appID, err) + } +} + // ListApplications godoc // @Summary List all installed applications // @Description Get all installed applications with optional filtering @@ -141,13 +155,14 @@ func (h *ApplicationHandler) ListApplications(c *gin.Context) { // Installation Flow: // 1. Validate request and authenticate user // 2. Fetch template manifest from catalog_templates database -// 3. Create ApplicationInstall CRD (controller will create Template) -// 4. Create installed_applications database record -// 5. Grant group access permissions if specified +// 3. Create installed_applications database record (status: pending) +// 4. Grant group access permissions if specified +// 5. Publish NATS event for controller to process // 6. 
Return the created application with full details // -// The controller watches ApplicationInstall resources and creates the corresponding -// Template CRD. This pattern provides automatic retry and proper separation of concerns. +// The controller subscribes to NATS events and creates platform-specific resources +// (Kubernetes Template CRD, Docker container, Hyper-V VM, etc.). This pattern +// decouples the API from platform-specific operations. func (h *ApplicationHandler) InstallApplication(c *gin.Context) { ctx := c.Request.Context() @@ -198,64 +213,44 @@ func (h *ApplicationHandler) InstallApplication(c *gin.Context) { return } - // Step 3: Create ApplicationInstall CRD - // The controller will watch this and create the corresponding Template CRD - if h.k8sClient == nil { - log.Printf("Error: k8sClient is nil, cannot create ApplicationInstall for %s", name) + // Step 3: Create database record in installed_applications table + // The record is created with install_status = 'pending' + app, err := h.appDB.InstallApplication(ctx, &req, userID.(string)) + if err != nil { c.JSON(http.StatusInternalServerError, ErrorResponse{ - Error: "Kubernetes client not configured", - Message: "Cannot install application: Kubernetes client is not available. Please check API server configuration.", + Error: "Installation failed", + Message: err.Error(), }) return } - // Generate unique name for ApplicationInstall - appInstallName := fmt.Sprintf("%s-%d", name, req.CatalogTemplateID) + // Step 4: Grant initial group access permissions if specified in request + for _, groupID := range req.GroupIDs { + h.appDB.AddGroupAccess(ctx, app.ID, groupID, "launch") + } - appInstall := &k8s.ApplicationInstall{ - Name: appInstallName, - Namespace: h.namespace, + // Step 5: Publish NATS event for controller to process + // The controller will create the platform-specific resources (Template CRD, Docker container, etc.) 
+ installEvent := &events.AppInstallEvent{ + InstallID: app.ID, CatalogTemplateID: req.CatalogTemplateID, TemplateName: name, DisplayName: displayName, Description: description, Category: category, - Icon: iconURL, + IconURL: iconURL, Manifest: manifest, InstalledBy: userID.(string), + Platform: h.platform, } - _, err = h.k8sClient.CreateApplicationInstall(ctx, appInstall) - if err != nil { - // "already exists" is OK - application may have been installed before - errStr := err.Error() - if strings.Contains(errStr, "already exists") { - log.Printf("ApplicationInstall %s already exists, continuing with database record", appInstallName) - } else { - log.Printf("Failed to create ApplicationInstall %s: %v", appInstallName, err) - c.JSON(http.StatusInternalServerError, ErrorResponse{ - Error: "Failed to create application install request", - Message: fmt.Sprintf("Could not create ApplicationInstall '%s': %v", appInstallName, err), - }) - return - } + if err := h.publisher.PublishAppInstall(ctx, installEvent); err != nil { + // Log error but don't fail - the database record exists and controller can retry + log.Printf("Warning: Failed to publish app install event for %s: %v", app.ID, err) + // Update install status to indicate event publishing failed + h.updateInstallStatus(ctx, app.ID, events.InstallStatusPending, "Event publish failed, waiting for retry") } else { - log.Printf("Successfully created ApplicationInstall %s (controller will create Template)", appInstallName) - } - - // Step 4: Create database record in installed_applications table - app, err := h.appDB.InstallApplication(ctx, &req, userID.(string)) - if err != nil { - c.JSON(http.StatusInternalServerError, ErrorResponse{ - Error: "Installation failed", - Message: err.Error(), - }) - return - } - - // Step 5: Grant initial group access permissions if specified in request - for _, groupID := range req.GroupIDs { - h.appDB.AddGroupAccess(ctx, app.ID, groupID, "launch") + log.Printf("Published app install 
event for %s (controller will create resources)", app.ID) } // Step 7: Fetch complete application record with template info and group access @@ -373,9 +368,21 @@ func (h *ApplicationHandler) UpdateApplication(c *gin.Context) { // @Failure 500 {object} ErrorResponse // @Router /api/v1/applications/{id} [delete] func (h *ApplicationHandler) DeleteApplication(c *gin.Context) { + ctx := c.Request.Context() appID := c.Param("id") - err := h.appDB.DeleteApplication(c.Request.Context(), appID) + // Get application info before deleting (for the uninstall event) + app, err := h.appDB.GetApplication(ctx, appID) + if err != nil { + c.JSON(http.StatusNotFound, ErrorResponse{ + Error: "Application not found", + Message: err.Error(), + }) + return + } + + // Delete from database + err = h.appDB.DeleteApplication(ctx, appID) if err != nil { c.JSON(http.StatusInternalServerError, ErrorResponse{ Error: "Delete failed", @@ -384,6 +391,20 @@ func (h *ApplicationHandler) DeleteApplication(c *gin.Context) { return } + // Publish uninstall event for controller to clean up platform resources + uninstallEvent := &events.AppUninstallEvent{ + InstallID: appID, + TemplateName: app.TemplateName, + Platform: h.platform, + } + + if err := h.publisher.PublishAppUninstall(ctx, uninstallEvent); err != nil { + // Log error but don't fail - database record is already deleted + log.Printf("Warning: Failed to publish app uninstall event for %s: %v", appID, err) + } else { + log.Printf("Published app uninstall event for %s", appID) + } + c.JSON(http.StatusOK, gin.H{ "message": "Application deleted successfully", }) diff --git a/api/internal/handlers/nodes.go b/api/internal/handlers/nodes.go index bdeab240..c50b04de 100644 --- a/api/internal/handlers/nodes.go +++ b/api/internal/handlers/nodes.go @@ -63,11 +63,13 @@ package handlers import ( "context" "fmt" + "log" "net/http" "time" "github.com/gin-gonic/gin" "github.com/streamspace/streamspace/api/internal/db" + 
"github.com/streamspace/streamspace/api/internal/events" "github.com/streamspace/streamspace/api/internal/k8s" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" @@ -77,13 +79,20 @@ import ( type NodeHandler struct { db *db.Database k8sClient *k8s.Client + publisher *events.Publisher + platform string } // NewNodeHandler creates a new node management handler -func NewNodeHandler(database *db.Database, k8sClient *k8s.Client) *NodeHandler { +func NewNodeHandler(database *db.Database, k8sClient *k8s.Client, publisher *events.Publisher, platform string) *NodeHandler { + if platform == "" { + platform = events.PlatformKubernetes + } return &NodeHandler{ db: database, k8sClient: k8sClient, + publisher: publisher, + platform: platform, } } @@ -387,6 +396,15 @@ func (h *NodeHandler) CordonNode(c *gin.Context) { return } + // Publish node cordon event for controllers + event := &events.NodeCordonEvent{ + NodeName: nodeName, + Platform: h.platform, + } + if err := h.publisher.PublishNodeCordon(ctx, event); err != nil { + log.Printf("Warning: Failed to publish node cordon event: %v", err) + } + c.JSON(http.StatusOK, gin.H{"message": "Node cordoned successfully"}) } @@ -409,6 +427,15 @@ func (h *NodeHandler) UncordonNode(c *gin.Context) { return } + // Publish node uncordon event for controllers + event := &events.NodeUncordonEvent{ + NodeName: nodeName, + Platform: h.platform, + } + if err := h.publisher.PublishNodeUncordon(ctx, event); err != nil { + log.Printf("Warning: Failed to publish node uncordon event: %v", err) + } + c.JSON(http.StatusOK, gin.H{"message": "Node uncordoned successfully"}) } @@ -439,6 +466,16 @@ func (h *NodeHandler) DrainNode(c *gin.Context) { return } + // Publish node drain event for controllers + event := &events.NodeDrainEvent{ + NodeName: nodeName, + Platform: h.platform, + GracePeriodSeconds: req.GracePeriodSeconds, + } + if err := h.publisher.PublishNodeDrain(ctx, event); err != nil { + log.Printf("Warning: Failed to publish 
node drain event: %v", err) + } + c.JSON(http.StatusOK, gin.H{"message": "Node drained successfully"}) } diff --git a/api/internal/tracker/tracker.go b/api/internal/tracker/tracker.go index 18fabd9b..4ae93a8b 100644 --- a/api/internal/tracker/tracker.go +++ b/api/internal/tracker/tracker.go @@ -48,6 +48,7 @@ import ( "time" "github.com/streamspace/streamspace/api/internal/db" + "github.com/streamspace/streamspace/api/internal/events" "github.com/streamspace/streamspace/api/internal/k8s" ) @@ -74,6 +75,12 @@ type ConnectionTracker struct { // k8sClient interacts with Kubernetes to manage session state. k8sClient *k8s.Client + // publisher publishes NATS events for platform-agnostic operations. + publisher *events.Publisher + + // platform identifies the target platform (kubernetes, docker, etc.) + platform string + // connections is the in-memory map of active connections. // Key: connection ID, Value: Connection struct // Protected by mu for thread safety. @@ -156,12 +163,17 @@ type Connection struct { // // Example: // -// tracker := NewConnectionTracker(database, k8sClient) +// tracker := NewConnectionTracker(database, k8sClient, publisher, "kubernetes") // go tracker.Start() // Run in background -func NewConnectionTracker(database *db.Database, k8sClient *k8s.Client) *ConnectionTracker { +func NewConnectionTracker(database *db.Database, k8sClient *k8s.Client, publisher *events.Publisher, platform string) *ConnectionTracker { + if platform == "" { + platform = events.PlatformKubernetes + } return &ConnectionTracker{ db: database, k8sClient: k8sClient, + publisher: publisher, + platform: platform, connections: make(map[string]*Connection), checkInterval: 30 * time.Second, // Check every 30 seconds heartbeatWindow: 60 * time.Second, // Disconnect if no heartbeat for 60s @@ -466,6 +478,16 @@ func (ct *ConnectionTracker) autoStartSession(ctx context.Context, sessionID str return } + // Publish wake event for controllers + event := &events.SessionWakeEvent{ + 
SessionID: sessionID, + UserID: session.User, + Platform: ct.platform, + } + if err := ct.publisher.PublishSessionWake(ctx, event); err != nil { + log.Printf("Warning: Failed to publish session wake event: %v", err) + } + log.Printf("Session auto-started: %s", sessionID) } @@ -515,6 +537,16 @@ func (ct *ConnectionTracker) autoHibernateSession(ctx context.Context, sessionID return } + // Publish hibernate event for controllers + event := &events.SessionHibernateEvent{ + SessionID: sessionID, + UserID: session.User, + Platform: ct.platform, + } + if err := ct.publisher.PublishSessionHibernate(ctx, event); err != nil { + log.Printf("Warning: Failed to publish session hibernate event: %v", err) + } + log.Printf("Session auto-hibernated: %s", sessionID) } diff --git a/chart/values.yaml b/chart/values.yaml index 763f4bf6..bf072d9b 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -11,13 +11,15 @@ global: # Storage class for all PVCs storageClass: "" -## StreamSpace Controller +## StreamSpace Kubernetes Controller +## This is the Kubernetes-specific platform controller for the multi-platform architecture. +## For Docker environments, use the Docker controller (docker-controller/). 
controller: enabled: true image: registry: ghcr.io - repository: streamspace/streamspace-controller + repository: streamspace/streamspace-kubernetes-controller tag: "v0.2.0" pullPolicy: IfNotPresent diff --git a/docker-compose.yml b/docker-compose.yml index d09e5358..bc3d56d8 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -21,6 +21,29 @@ services: networks: - streamspace + # NATS message broker for event-driven architecture + nats: + image: nats:2.10-alpine + container_name: streamspace-nats + command: + - "--jetstream" + - "--store_dir=/data" + - "--http_port=8222" + ports: + - "4222:4222" # Client connections + - "8222:8222" # HTTP monitoring + - "6222:6222" # Cluster routing + volumes: + - nats-data:/data + healthcheck: + test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:8222/healthz"] + interval: 10s + timeout: 5s + retries: 5 + networks: + - streamspace + restart: unless-stopped + # StreamSpace API Backend api: build: @@ -34,6 +57,8 @@ services: depends_on: postgres: condition: service_healthy + nats: + condition: service_healthy environment: # Database configuration DB_HOST: postgres @@ -49,6 +74,12 @@ services: # JWT configuration JWT_SECRET: dev-secret-change-in-production + # NATS configuration + NATS_URL: nats://nats:4222 + NATS_USER: "" + NATS_PASSWORD: "" + PLATFORM: kubernetes + # Sync configuration SYNC_INTERVAL: 1h @@ -69,6 +100,29 @@ services: - streamspace restart: unless-stopped + # StreamSpace Docker Controller (for Docker platform support) + docker-controller: + build: + context: ./docker-controller + dockerfile: Dockerfile + container_name: streamspace-docker-controller + depends_on: + nats: + condition: service_healthy + environment: + NATS_URL: nats://nats:4222 + NATS_USER: "" + NATS_PASSWORD: "" + CONTROLLER_ID: streamspace-docker-controller-1 + DOCKER_NETWORK: streamspace + volumes: + - /var/run/docker.sock:/var/run/docker.sock:ro + networks: + - streamspace + profiles: + - docker + restart: 
unless-stopped + # pgAdmin for database management (optional, for development) pgadmin: image: dpage/pgadmin4:latest @@ -134,6 +188,8 @@ networks: volumes: postgres-data: name: streamspace-postgres-data + nats-data: + name: streamspace-nats-data pgadmin-data: name: streamspace-pgadmin-data prometheus-data: diff --git a/docker-controller/Dockerfile b/docker-controller/Dockerfile new file mode 100644 index 00000000..f4e3b54d --- /dev/null +++ b/docker-controller/Dockerfile @@ -0,0 +1,33 @@ +# Build stage +FROM golang:1.21-alpine AS builder + +WORKDIR /app + +# Install build dependencies +RUN apk add --no-cache git ca-certificates + +# Copy source code (cache bust: v2) +COPY . . + +# Download dependencies and generate go.sum if missing +RUN go mod tidy && go mod download + +# Build binary +RUN CGO_ENABLED=0 GOOS=linux go build -o docker-controller ./cmd/main.go + +# Runtime stage +FROM alpine:3.19 + +WORKDIR /app + +# Install runtime dependencies +RUN apk add --no-cache ca-certificates + +# Copy binary from builder +COPY --from=builder /app/docker-controller /app/docker-controller + +# Run as non-root user +RUN adduser -D -u 1000 controller +USER controller + +ENTRYPOINT ["/app/docker-controller"] diff --git a/docker-controller/cmd/main.go b/docker-controller/cmd/main.go new file mode 100644 index 00000000..3eae7607 --- /dev/null +++ b/docker-controller/cmd/main.go @@ -0,0 +1,102 @@ +// Package main is the entry point for the StreamSpace Docker controller. +// +// This controller manages StreamSpace sessions using Docker containers instead +// of Kubernetes. It subscribes to NATS events and performs Docker operations. 
+// +// Key responsibilities: +// - Session container lifecycle (create, start, stop, remove) +// - Container networking and port mapping +// - Volume management for persistent home directories +// - Auto-hibernation (stop containers) and wake (start containers) +// +// Architecture: +// - Subscribes to NATS events on streamspace.*.docker subjects +// - Uses Docker API to manage containers +// - Publishes status events back to NATS +// +// Deployment: +// The controller can run as a standalone binary or Docker container with: +// - Access to Docker socket (/var/run/docker.sock) +// - NATS connection for event communication +package main + +import ( + "context" + "flag" + "log" + "os" + "os/signal" + "syscall" + + "github.com/streamspace/docker-controller/pkg/docker" + "github.com/streamspace/docker-controller/pkg/events" +) + +func main() { + var natsURL string + var natsUser string + var natsPassword string + var controllerID string + var dockerHost string + var networkName string + + // Parse command-line flags + flag.StringVar(&natsURL, "nats-url", getEnv("NATS_URL", "nats://localhost:4222"), "NATS server URL") + flag.StringVar(&natsUser, "nats-user", getEnv("NATS_USER", ""), "NATS username") + flag.StringVar(&natsPassword, "nats-password", getEnv("NATS_PASSWORD", ""), "NATS password") + flag.StringVar(&controllerID, "controller-id", getEnv("CONTROLLER_ID", "streamspace-docker-controller-1"), "Unique controller ID") + flag.StringVar(&dockerHost, "docker-host", getEnv("DOCKER_HOST", "unix:///var/run/docker.sock"), "Docker host") + flag.StringVar(&networkName, "network", getEnv("DOCKER_NETWORK", "streamspace"), "Docker network name") + flag.Parse() + + log.Printf("StreamSpace Docker Controller starting...") + log.Printf("NATS URL: %s", natsURL) + log.Printf("Controller ID: %s", controllerID) + log.Printf("Docker Host: %s", dockerHost) + + // Initialize Docker client + dockerClient, err := docker.NewClient(dockerHost, networkName) + if err != nil { + 
log.Fatalf("Failed to create Docker client: %v", err) + } + defer dockerClient.Close() + + // Initialize NATS event subscriber + subscriber, err := events.NewSubscriber(events.Config{ + URL: natsURL, + User: natsUser, + Password: natsPassword, + }, dockerClient, controllerID) + + if err != nil { + log.Fatalf("Failed to create NATS subscriber: %v", err) + } + defer subscriber.Close() + + // Start subscriber in background + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + go func() { + if err := subscriber.Start(ctx); err != nil { + log.Printf("NATS subscriber error: %v", err) + } + }() + + log.Printf("Docker controller started successfully") + + // Wait for shutdown signal + sigCh := make(chan os.Signal, 1) + signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM) + <-sigCh + + log.Printf("Shutting down Docker controller...") +} + +// getEnv gets an environment variable with a default fallback +func getEnv(key, defaultValue string) string { + if value := os.Getenv(key); value != "" { + return value + } + return defaultValue +} diff --git a/docker-controller/go.mod b/docker-controller/go.mod new file mode 100644 index 00000000..5652fd4b --- /dev/null +++ b/docker-controller/go.mod @@ -0,0 +1,33 @@ +module github.com/streamspace/docker-controller + +go 1.21 + +require ( + github.com/docker/docker v24.0.7+incompatible + github.com/docker/go-connections v0.4.0 + github.com/google/uuid v1.6.0 + github.com/nats-io/nats.go v1.37.0 +) + +require ( + github.com/Microsoft/go-winio v0.6.1 // indirect + github.com/distribution/reference v0.5.0 // indirect + github.com/docker/distribution v2.8.3+incompatible // indirect + github.com/docker/go-units v0.5.0 // indirect + github.com/gogo/protobuf v1.3.2 // indirect + github.com/klauspost/compress v1.17.2 // indirect + github.com/moby/term v0.5.0 // indirect + github.com/morikuni/aec v1.0.0 // indirect + github.com/nats-io/nkeys v0.4.7 // indirect + github.com/nats-io/nuid v1.0.1 // indirect + 
github.com/opencontainers/go-digest v1.0.0 // indirect + github.com/opencontainers/image-spec v1.0.2 // indirect + github.com/pkg/errors v0.9.1 // indirect + golang.org/x/crypto v0.18.0 // indirect + golang.org/x/mod v0.8.0 // indirect + golang.org/x/net v0.20.0 // indirect + golang.org/x/sys v0.16.0 // indirect + golang.org/x/time v0.5.0 // indirect + golang.org/x/tools v0.6.0 // indirect + gotest.tools/v3 v3.5.1 // indirect +) diff --git a/docker-controller/pkg/docker/client.go b/docker-controller/pkg/docker/client.go new file mode 100644 index 00000000..19d8b3e8 --- /dev/null +++ b/docker-controller/pkg/docker/client.go @@ -0,0 +1,291 @@ +// Package docker provides Docker container management for StreamSpace sessions. +package docker + +import ( + "context" + "fmt" + "log" + "strings" + + "github.com/docker/docker/api/types" + "github.com/docker/docker/api/types/container" + "github.com/docker/docker/api/types/filters" + "github.com/docker/docker/api/types/mount" + "github.com/docker/docker/api/types/network" + "github.com/docker/docker/api/types/volume" + "github.com/docker/docker/client" + "github.com/docker/go-connections/nat" +) + +// Client wraps the Docker API client for StreamSpace operations. +type Client struct { + docker *client.Client + networkName string +} + +// NewClient creates a new Docker client. +func NewClient(host, networkName string) (*Client, error) { + opts := []client.Opt{ + client.FromEnv, + client.WithAPIVersionNegotiation(), + } + + if host != "" && host != "unix:///var/run/docker.sock" { + opts = append(opts, client.WithHost(host)) + } + + cli, err := client.NewClientWithOpts(opts...) 
+ if err != nil { + return nil, fmt.Errorf("failed to create Docker client: %w", err) + } + + // Test connection + ctx := context.Background() + _, err = cli.Ping(ctx) + if err != nil { + return nil, fmt.Errorf("failed to connect to Docker: %w", err) + } + + return &Client{ + docker: cli, + networkName: networkName, + }, nil +} + +// Close closes the Docker client. +func (c *Client) Close() error { + return c.docker.Close() +} + +// SessionConfig holds configuration for creating a session container. +type SessionConfig struct { + SessionID string + UserID string + TemplateID string + Image string + Memory int64 // bytes + CPUShares int64 + VNCPort int + PersistentHome bool + HomeVolume string + Env map[string]string +} + +// CreateSession creates a new session container. +func (c *Client) CreateSession(ctx context.Context, config SessionConfig) (string, error) { + containerName := fmt.Sprintf("ss-%s", config.SessionID) + + // Build environment variables + env := []string{ + fmt.Sprintf("SESSION_ID=%s", config.SessionID), + fmt.Sprintf("USER_ID=%s", config.UserID), + fmt.Sprintf("TEMPLATE_ID=%s", config.TemplateID), + } + for k, v := range config.Env { + env = append(env, fmt.Sprintf("%s=%s", k, v)) + } + + // Configure port bindings + exposedPorts := nat.PortSet{} + portBindings := nat.PortMap{} + + if config.VNCPort > 0 { + vncPort := nat.Port(fmt.Sprintf("%d/tcp", config.VNCPort)) + exposedPorts[vncPort] = struct{}{} + portBindings[vncPort] = []nat.PortBinding{ + {HostIP: "0.0.0.0", HostPort: ""}, // Auto-assign host port + } + } + + // Configure mounts + var mounts []mount.Mount + if config.PersistentHome && config.HomeVolume != "" { + mounts = append(mounts, mount.Mount{ + Type: mount.TypeVolume, + Source: config.HomeVolume, + Target: "/config", + }) + } + + // Container configuration + containerConfig := &container.Config{ + Image: config.Image, + Env: env, + ExposedPorts: exposedPorts, + Labels: map[string]string{ + "streamspace.io/managed": "true", + 
"streamspace.io/session": config.SessionID, + "streamspace.io/user": config.UserID, + "streamspace.io/template": config.TemplateID, + }, + } + + // Host configuration + hostConfig := &container.HostConfig{ + PortBindings: portBindings, + Mounts: mounts, + Resources: container.Resources{ + Memory: config.Memory, + CPUShares: config.CPUShares, + }, + RestartPolicy: container.RestartPolicy{ + Name: "unless-stopped", + }, + } + + // Network configuration + networkConfig := &network.NetworkingConfig{ + EndpointsConfig: map[string]*network.EndpointSettings{ + c.networkName: {}, + }, + } + + // Create container + resp, err := c.docker.ContainerCreate(ctx, containerConfig, hostConfig, networkConfig, nil, containerName) + if err != nil { + return "", fmt.Errorf("failed to create container: %w", err) + } + + // Start container + if err := c.docker.ContainerStart(ctx, resp.ID, types.ContainerStartOptions{}); err != nil { + // Clean up on failure + c.docker.ContainerRemove(ctx, resp.ID, types.ContainerRemoveOptions{Force: true}) + return "", fmt.Errorf("failed to start container: %w", err) + } + + log.Printf("Created and started container %s for session %s", containerName, config.SessionID) + return resp.ID, nil +} + +// StopSession stops (hibernates) a session container. +func (c *Client) StopSession(ctx context.Context, sessionID string) error { + containerName := fmt.Sprintf("ss-%s", sessionID) + + timeout := 30 // seconds + if err := c.docker.ContainerStop(ctx, containerName, container.StopOptions{Timeout: &timeout}); err != nil { + if strings.Contains(err.Error(), "No such container") { + return nil // Already stopped/removed + } + return fmt.Errorf("failed to stop container: %w", err) + } + + log.Printf("Stopped container %s for session %s", containerName, sessionID) + return nil +} + +// StartSession starts (wakes) a hibernated session container. 
+func (c *Client) StartSession(ctx context.Context, sessionID string) error {
+	// Session containers are addressed by their deterministic name.
+	containerName := fmt.Sprintf("ss-%s", sessionID)
+
+	if err := c.docker.ContainerStart(ctx, containerName, types.ContainerStartOptions{}); err != nil {
+		return fmt.Errorf("failed to start container: %w", err)
+	}
+
+	log.Printf("Started container %s for session %s", containerName, sessionID)
+	return nil
+}
+
+// RemoveSession removes a session container.
+//
+// force removes the container even when it is still running. Volumes are
+// never removed, so persistent home data survives. A container that no
+// longer exists is treated as already removed and is not an error.
+func (c *Client) RemoveSession(ctx context.Context, sessionID string, force bool) error {
+	containerName := fmt.Sprintf("ss-%s", sessionID)
+
+	if err := c.docker.ContainerRemove(ctx, containerName, types.ContainerRemoveOptions{
+		Force:         force,
+		RemoveVolumes: false, // Keep volumes for data persistence
+	}); err != nil {
+		// Prefer the typed not-found check; the message match is kept as a
+		// fallback for errors that arrive without a typed 404.
+		if client.IsErrNotFound(err) || strings.Contains(err.Error(), "No such container") {
+			return nil // Already removed
+		}
+		return fmt.Errorf("failed to remove container: %w", err)
+	}
+
+	log.Printf("Removed container %s for session %s", containerName, sessionID)
+	return nil
+}
+
+// GetSessionStatus returns the status of a session container.
+//
+// The result is one of "running", "paused", "stopped" or "not_found".
+// "not_found" is returned with a nil error when the container does not
+// exist; any other inspect failure is returned as an error.
+func (c *Client) GetSessionStatus(ctx context.Context, sessionID string) (string, error) {
+	containerName := fmt.Sprintf("ss-%s", sessionID)
+
+	info, err := c.docker.ContainerInspect(ctx, containerName)
+	if err != nil {
+		if client.IsErrNotFound(err) || strings.Contains(err.Error(), "No such container") {
+			return "not_found", nil
+		}
+		return "", fmt.Errorf("failed to inspect container: %w", err)
+	}
+
+	// Paused must be checked before Running: Docker reports a paused
+	// container with both Running and Paused set, so testing Running first
+	// made the "paused" result unreachable.
+	if info.State.Paused {
+		return "paused", nil
+	}
+	if info.State.Running {
+		return "running", nil
+	}
+	return "stopped", nil
+}
+
+// GetSessionURL returns the URL to access the session.
+func (c *Client) GetSessionURL(ctx context.Context, sessionID string, vncPort int) (string, error) { + containerName := fmt.Sprintf("ss-%s", sessionID) + + info, err := c.docker.ContainerInspect(ctx, containerName) + if err != nil { + return "", fmt.Errorf("failed to inspect container: %w", err) + } + + portKey := fmt.Sprintf("%d/tcp", vncPort) + if bindings, ok := info.NetworkSettings.Ports[nat.Port(portKey)]; ok && len(bindings) > 0 { + return fmt.Sprintf("http://localhost:%s", bindings[0].HostPort), nil + } + + return "", fmt.Errorf("VNC port not exposed") +} + +// EnsureUserVolume creates a volume for user's persistent home if it doesn't exist. +func (c *Client) EnsureUserVolume(ctx context.Context, userID string) (string, error) { + volumeName := fmt.Sprintf("streamspace-home-%s", userID) + + // Check if volume exists + _, err := c.docker.VolumeInspect(ctx, volumeName) + if err == nil { + return volumeName, nil // Already exists + } + + // Create volume + _, err = c.docker.VolumeCreate(ctx, volume.CreateOptions{ + Name: volumeName, + Labels: map[string]string{ + "streamspace.io/managed": "true", + "streamspace.io/user": userID, + "streamspace.io/type": "home", + }, + }) + if err != nil { + return "", fmt.Errorf("failed to create volume: %w", err) + } + + log.Printf("Created volume %s for user %s", volumeName, userID) + return volumeName, nil +} + +// ListSessions returns all StreamSpace session containers. 
+func (c *Client) ListSessions(ctx context.Context) ([]string, error) { + containers, err := c.docker.ContainerList(ctx, types.ContainerListOptions{ + All: true, + Filters: filters.NewArgs( + filters.Arg("label", "streamspace.io/managed=true"), + ), + }) + if err != nil { + return nil, fmt.Errorf("failed to list containers: %w", err) + } + + var sessions []string + for _, c := range containers { + if sessionID, ok := c.Labels["streamspace.io/session"]; ok { + sessions = append(sessions, sessionID) + } + } + + return sessions, nil +} diff --git a/docker-controller/pkg/events/subscriber.go b/docker-controller/pkg/events/subscriber.go new file mode 100644 index 00000000..65fd49d2 --- /dev/null +++ b/docker-controller/pkg/events/subscriber.go @@ -0,0 +1,234 @@ +// Package events provides NATS event subscription for the Docker controller. +package events + +import ( + "context" + "encoding/json" + "fmt" + "log" + "time" + + "github.com/google/uuid" + "github.com/nats-io/nats.go" + "github.com/streamspace/docker-controller/pkg/docker" +) + +// Config holds configuration for the NATS subscriber. +type Config struct { + URL string + User string + Password string +} + +// Subscriber subscribes to NATS events and handles them. +type Subscriber struct { + conn *nats.Conn + docker *docker.Client + controllerID string +} + +// NewSubscriber creates a new NATS event subscriber. +func NewSubscriber(cfg Config, dockerClient *docker.Client, controllerID string) (*Subscriber, error) { + if cfg.URL == "" { + cfg.URL = nats.DefaultURL + } + + // Connect to NATS + opts := []nats.Option{ + nats.Name("streamspace-docker-controller"), + nats.ReconnectWait(2 * time.Second), + nats.MaxReconnects(-1), + } + + if cfg.User != "" { + opts = append(opts, nats.UserInfo(cfg.User, cfg.Password)) + } + + conn, err := nats.Connect(cfg.URL, opts...) 
+ if err != nil { + return nil, fmt.Errorf("failed to connect to NATS: %w", err) + } + + return &Subscriber{ + conn: conn, + docker: dockerClient, + controllerID: controllerID, + }, nil +} + +// Start starts the subscriber and begins processing events. +func (s *Subscriber) Start(ctx context.Context) error { + // Subscribe to Docker-specific events + subjects := map[string]func(data []byte) error{ + "streamspace.session.create.docker": s.handleSessionCreate, + "streamspace.session.delete.docker": s.handleSessionDelete, + "streamspace.session.hibernate.docker": s.handleSessionHibernate, + "streamspace.session.wake.docker": s.handleSessionWake, + } + + for subject, handler := range subjects { + h := handler // Capture for closure + _, err := s.conn.Subscribe(subject, func(msg *nats.Msg) { + if err := h(msg.Data); err != nil { + log.Printf("Error handling event %s: %v", subject, err) + } + }) + if err != nil { + return fmt.Errorf("failed to subscribe to %s: %w", subject, err) + } + log.Printf("Subscribed to NATS subject: %s", subject) + } + + // Block until context is cancelled + <-ctx.Done() + return nil +} + +// Close closes the NATS connection. +func (s *Subscriber) Close() { + if s.conn != nil { + s.conn.Close() + } +} + +// handleSessionCreate handles session creation events. 
+func (s *Subscriber) handleSessionCreate(data []byte) error { + var event SessionCreateEvent + if err := json.Unmarshal(data, &event); err != nil { + return fmt.Errorf("failed to unmarshal: %w", err) + } + + log.Printf("Creating Docker session: %s for user %s", event.SessionID, event.UserID) + + // Ensure user volume exists for persistent home + var homeVolume string + if event.PersistentHome { + var err error + homeVolume, err = s.docker.EnsureUserVolume(context.Background(), event.UserID) + if err != nil { + s.publishStatus(event.SessionID, "failed", fmt.Sprintf("Failed to create home volume: %v", err)) + return err + } + } + + // Parse resources + memory := int64(2 * 1024 * 1024 * 1024) // 2GB default + cpuShares := int64(1024) // Default CPU shares + + // TODO: Look up template to get image and other settings + // For now, use a default image + image := "lscr.io/linuxserver/firefox:latest" + + // Create container + config := docker.SessionConfig{ + SessionID: event.SessionID, + UserID: event.UserID, + TemplateID: event.TemplateID, + Image: image, + Memory: memory, + CPUShares: cpuShares, + VNCPort: 3000, + PersistentHome: event.PersistentHome, + HomeVolume: homeVolume, + Env: map[string]string{ + "PUID": "1000", + "PGID": "1000", + }, + } + + _, err := s.docker.CreateSession(context.Background(), config) + if err != nil { + s.publishStatus(event.SessionID, "failed", fmt.Sprintf("Failed to create container: %v", err)) + return err + } + + // Get URL + url, _ := s.docker.GetSessionURL(context.Background(), event.SessionID, 3000) + + s.publishStatusWithURL(event.SessionID, "running", "Session created", url) + return nil +} + +// handleSessionDelete handles session deletion events. 
+func (s *Subscriber) handleSessionDelete(data []byte) error { + var event SessionDeleteEvent + if err := json.Unmarshal(data, &event); err != nil { + return fmt.Errorf("failed to unmarshal: %w", err) + } + + log.Printf("Deleting Docker session: %s", event.SessionID) + + if err := s.docker.RemoveSession(context.Background(), event.SessionID, event.Force); err != nil { + return err + } + + s.publishStatus(event.SessionID, "deleted", "Session deleted") + return nil +} + +// handleSessionHibernate handles session hibernation events. +func (s *Subscriber) handleSessionHibernate(data []byte) error { + var event SessionHibernateEvent + if err := json.Unmarshal(data, &event); err != nil { + return fmt.Errorf("failed to unmarshal: %w", err) + } + + log.Printf("Hibernating Docker session: %s", event.SessionID) + + if err := s.docker.StopSession(context.Background(), event.SessionID); err != nil { + s.publishStatus(event.SessionID, "failed", fmt.Sprintf("Failed to hibernate: %v", err)) + return err + } + + s.publishStatus(event.SessionID, "hibernated", "Session hibernated") + return nil +} + +// handleSessionWake handles session wake events. +func (s *Subscriber) handleSessionWake(data []byte) error { + var event SessionWakeEvent + if err := json.Unmarshal(data, &event); err != nil { + return fmt.Errorf("failed to unmarshal: %w", err) + } + + log.Printf("Waking Docker session: %s", event.SessionID) + + if err := s.docker.StartSession(context.Background(), event.SessionID); err != nil { + s.publishStatus(event.SessionID, "failed", fmt.Sprintf("Failed to wake: %v", err)) + return err + } + + // Get URL + url, _ := s.docker.GetSessionURL(context.Background(), event.SessionID, 3000) + + s.publishStatusWithURL(event.SessionID, "running", "Session woken", url) + return nil +} + +// publishStatus publishes a session status update. 
+func (s *Subscriber) publishStatus(sessionID, status, message string) { + s.publishStatusWithURL(sessionID, status, message, "") +} + +// publishStatusWithURL publishes a session status update with URL. +func (s *Subscriber) publishStatusWithURL(sessionID, status, message, url string) { + event := SessionStatusEvent{ + EventID: uuid.New().String(), + Timestamp: time.Now(), + SessionID: sessionID, + Status: status, + Message: message, + URL: url, + ControllerID: s.controllerID, + } + + data, err := json.Marshal(event) + if err != nil { + log.Printf("Failed to marshal status event: %v", err) + return + } + + if err := s.conn.Publish("streamspace.session.status", data); err != nil { + log.Printf("Failed to publish status: %v", err) + } +} diff --git a/docker-controller/pkg/events/types.go b/docker-controller/pkg/events/types.go new file mode 100644 index 00000000..734d8fcc --- /dev/null +++ b/docker-controller/pkg/events/types.go @@ -0,0 +1,64 @@ +// Package events provides NATS event types for the Docker controller. +package events + +import "time" + +// SessionCreateEvent is received when a new session should be created. +type SessionCreateEvent struct { + EventID string `json:"event_id"` + Timestamp time.Time `json:"timestamp"` + SessionID string `json:"session_id"` + UserID string `json:"user_id"` + TemplateID string `json:"template_id"` + Platform string `json:"platform"` + Resources ResourceSpec `json:"resources"` + PersistentHome bool `json:"persistent_home"` + IdleTimeout string `json:"idle_timeout"` + Metadata map[string]string `json:"metadata,omitempty"` +} + +// SessionDeleteEvent is received when a session should be deleted. +type SessionDeleteEvent struct { + EventID string `json:"event_id"` + Timestamp time.Time `json:"timestamp"` + SessionID string `json:"session_id"` + UserID string `json:"user_id"` + Platform string `json:"platform"` + Force bool `json:"force"` +} + +// SessionHibernateEvent is received when a session should be hibernated. 
type SessionHibernateEvent struct {
	EventID   string    `json:"event_id"`
	Timestamp time.Time `json:"timestamp"`
	SessionID string    `json:"session_id"` // session to hibernate
	UserID    string    `json:"user_id"`
	Platform  string    `json:"platform"`
}

// SessionWakeEvent is received when a hibernated session should be woken.
type SessionWakeEvent struct {
	EventID   string    `json:"event_id"`
	Timestamp time.Time `json:"timestamp"`
	SessionID string    `json:"session_id"` // session to wake
	UserID    string    `json:"user_id"`
	Platform  string    `json:"platform"`
}

// SessionStatusEvent is published when session status changes.
// Phase, URL and Message are left out of the JSON when empty.
type SessionStatusEvent struct {
	EventID      string    `json:"event_id"`
	Timestamp    time.Time `json:"timestamp"`
	SessionID    string    `json:"session_id"`
	Status       string    `json:"status"`
	Phase        string    `json:"phase,omitempty"`
	URL          string    `json:"url,omitempty"`
	Message      string    `json:"message,omitempty"`
	ControllerID string    `json:"controller_id"` // identifies the controller that published the event
}

// ResourceSpec defines resource requirements.
// Both fields are plain strings and are left out of the JSON when empty.
type ResourceSpec struct {
	Memory string `json:"memory,omitempty"`
	CPU    string `json:"cpu,omitempty"`
}
diff --git a/docs/architecture/NATS_EVENT_ARCHITECTURE.md b/docs/architecture/NATS_EVENT_ARCHITECTURE.md
new file mode 100644
index 00000000..39527238
--- /dev/null
+++ b/docs/architecture/NATS_EVENT_ARCHITECTURE.md
@@ -0,0 +1,377 @@
# NATS Event Architecture

## Overview

StreamSpace uses NATS as the message broker between the API and platform controllers.
This enables:
- Event-driven communication (millisecond latency)
- Multiple platform controllers (Kubernetes, Docker, Hyper-V, vCenter)
- Clean decoupling of API from platform-specific operations
- Scalable and fault-tolerant architecture

## Architecture Diagram

```
┌─────────────┐     ┌──────────────┐     ┌──────────────┐
│   Web UI    │ ──► │     API      │ ──► │   Database   │
└─────────────┘     └──────┬───────┘     │   (state)    │
                           │             └──────────────┘
                           │ publish
                           ▼
                    ┌──────────────┐
                    │     NATS     │
                    └──────┬───────┘
                           │ subscribe
           ┌───────────────┼───────────────┐
           ▼               ▼               ▼
    ┌────────────┐  ┌────────────┐  ┌────────────┐
    │    K8s     │  │   Docker   │  │  vCenter   │
    │ Controller │  │ Controller │  │ Controller │
    └────────────┘  └────────────┘  └────────────┘
```

## Subject Naming Convention

Format: `streamspace.<resource>.<action>[.<platform>]`

### Core Subjects

| Subject | Description | Publisher | Subscriber |
|---------|-------------|-----------|------------|
| `streamspace.session.create` | Create new session | API | Controllers |
| `streamspace.session.delete` | Delete session | API | Controllers |
| `streamspace.session.hibernate` | Hibernate session | API | Controllers |
| `streamspace.session.wake` | Wake hibernated session | API | Controllers |
| `streamspace.session.status` | Session status update | Controllers | API |
| `streamspace.app.install` | Install application | API | Controllers |
| `streamspace.app.uninstall` | Uninstall application | API | Controllers |
+| `streamspace.app.status` | App installation status | Controllers | API | +| `streamspace.template.create` | Create template | Controllers | API | +| `streamspace.template.delete` | Delete template | API | Controllers | +| `streamspace.node.cordon` | Cordon node | API | Controllers | +| `streamspace.node.drain` | Drain node | API | Controllers | +| `streamspace.controller.heartbeat` | Controller health | Controllers | API | + +### Platform-Specific Subjects + +Controllers subscribe to platform-specific subjects: +- `streamspace.session.create.kubernetes` - K8s controller only +- `streamspace.session.create.docker` - Docker controller only +- `streamspace.session.create.hyperv` - Hyper-V controller only + +## Message Payloads + +### Session Create Event + +```json +{ + "event_id": "uuid", + "timestamp": "2025-01-15T10:30:00Z", + "session_id": "uuid", + "user_id": "user1", + "template_id": "firefox-browser", + "platform": "kubernetes", + "resources": { + "memory": "2Gi", + "cpu": "1000m" + }, + "persistent_home": true, + "idle_timeout": "30m", + "metadata": { + "request_id": "uuid", + "source_ip": "192.168.1.1" + } +} +``` + +### Session Status Event (from Controller) + +```json +{ + "event_id": "uuid", + "timestamp": "2025-01-15T10:30:05Z", + "session_id": "uuid", + "status": "running", + "phase": "Running", + "url": "https://user1-firefox.streamspace.local", + "pod_name": "ss-user1-firefox-abc123", + "message": "Session started successfully", + "resource_usage": { + "memory": "512Mi", + "cpu": "250m" + } +} +``` + +### Application Install Event + +```json +{ + "event_id": "uuid", + "timestamp": "2025-01-15T10:30:00Z", + "install_id": "uuid", + "catalog_template_id": 42, + "template_name": "firefox-browser", + "display_name": "Firefox Web Browser", + "manifest": "apiVersion: stream.space/v1alpha1\nkind: Template\n...", + "installed_by": "admin", + "platform": "kubernetes" +} +``` + +### Application Status Event (from Controller) + +```json +{ + "event_id": "uuid", 
+ "timestamp": "2025-01-15T10:30:10Z", + "install_id": "uuid", + "status": "ready", + "template_name": "firefox-browser", + "template_namespace": "streamspace", + "message": "Template created successfully" +} +``` + +### Controller Heartbeat + +```json +{ + "controller_id": "k8s-controller-1", + "platform": "kubernetes", + "timestamp": "2025-01-15T10:30:00Z", + "status": "healthy", + "version": "1.0.0", + "capabilities": ["sessions", "templates", "nodes"], + "cluster_info": { + "name": "production", + "nodes": 5, + "version": "1.28.0" + } +} +``` + +## Database Schema Changes + +### New Tables + +#### `platform_controllers` +Tracks registered controllers and their capabilities. + +```sql +CREATE TABLE platform_controllers ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + controller_id VARCHAR(255) UNIQUE NOT NULL, + platform VARCHAR(50) NOT NULL, -- kubernetes, docker, hyperv, vcenter + display_name VARCHAR(255), + status VARCHAR(50) DEFAULT 'unknown', -- healthy, unhealthy, unknown + version VARCHAR(50), + capabilities JSONB DEFAULT '[]', + cluster_info JSONB DEFAULT '{}', + last_heartbeat TIMESTAMPTZ, + created_at TIMESTAMPTZ DEFAULT NOW(), + updated_at TIMESTAMPTZ DEFAULT NOW() +); +``` + +#### `event_log` +Audit log of all events for debugging and replay. + +```sql +CREATE TABLE event_log ( + id BIGSERIAL PRIMARY KEY, + event_id UUID NOT NULL, + subject VARCHAR(255) NOT NULL, + payload JSONB NOT NULL, + published_at TIMESTAMPTZ DEFAULT NOW(), + processed_at TIMESTAMPTZ, + processed_by VARCHAR(255), + status VARCHAR(50) DEFAULT 'published', -- published, processing, completed, failed + error_message TEXT +); + +CREATE INDEX idx_event_log_subject ON event_log(subject); +CREATE INDEX idx_event_log_status ON event_log(status); +CREATE INDEX idx_event_log_published_at ON event_log(published_at); +``` + +### Modified Tables + +#### `installed_applications` +Add status tracking for async installation. 
+ +```sql +ALTER TABLE installed_applications ADD COLUMN IF NOT EXISTS + install_status VARCHAR(50) DEFAULT 'pending'; -- pending, installing, ready, failed + +ALTER TABLE installed_applications ADD COLUMN IF NOT EXISTS + install_message TEXT; + +ALTER TABLE installed_applications ADD COLUMN IF NOT EXISTS + platform VARCHAR(50) DEFAULT 'kubernetes'; +``` + +#### `sessions` (if exists, or create) +Add platform field for multi-platform support. + +```sql +ALTER TABLE sessions ADD COLUMN IF NOT EXISTS + platform VARCHAR(50) DEFAULT 'kubernetes'; + +ALTER TABLE sessions ADD COLUMN IF NOT EXISTS + controller_id VARCHAR(255); +``` + +## API Changes + +### New Endpoints + +``` +GET /api/v1/controllers - List registered controllers +GET /api/v1/controllers/:id - Get controller details +GET /api/v1/platforms - List available platforms +``` + +### Modified Endpoints + +All session/application endpoints become async: +- `POST /api/v1/sessions` - Returns immediately with `status: pending` +- `POST /api/v1/applications` - Returns immediately with `install_status: pending` + +Frontend polls for status updates or uses WebSocket for real-time updates. 
+ +## Controller Implementation + +### Subscription Pattern + +```go +// Each controller subscribes to its platform-specific subjects +func (c *Controller) Subscribe(nc *nats.Conn) error { + platform := c.Platform // e.g., "kubernetes" + + // Subscribe to platform-specific events + nc.Subscribe(fmt.Sprintf("streamspace.session.create.%s", platform), c.handleSessionCreate) + nc.Subscribe(fmt.Sprintf("streamspace.session.delete.%s", platform), c.handleSessionDelete) + nc.Subscribe(fmt.Sprintf("streamspace.app.install.%s", platform), c.handleAppInstall) + + // Subscribe to broadcast events (all platforms) + nc.Subscribe("streamspace.session.create", c.handleSessionCreateIfMatches) + + return nil +} +``` + +### Publishing Status Updates + +```go +func (c *Controller) publishSessionStatus(nc *nats.Conn, session *Session) error { + event := SessionStatusEvent{ + EventID: uuid.New().String(), + Timestamp: time.Now(), + SessionID: session.ID, + Status: session.Status, + Phase: session.Phase, + URL: session.URL, + Message: session.Message, + } + + data, _ := json.Marshal(event) + return nc.Publish("streamspace.session.status", data) +} +``` + +## Configuration + +### Environment Variables + +```bash +# NATS Connection +NATS_URL=nats://localhost:4222 +NATS_USER=streamspace +NATS_PASSWORD=secret +NATS_TLS_ENABLED=false + +# Controller Registration +CONTROLLER_ID=k8s-controller-1 +CONTROLLER_PLATFORM=kubernetes +HEARTBEAT_INTERVAL=30s +``` + +### Docker Compose Addition + +```yaml +services: + nats: + image: nats:2.10-alpine + ports: + - "4222:4222" + - "8222:8222" # Monitoring + command: ["--jetstream", "--store_dir", "/data"] + volumes: + - nats_data:/data + +volumes: + nats_data: +``` + +## Error Handling + +### Retry Strategy + +Controllers implement exponential backoff for failed operations: +- Initial delay: 1 second +- Max delay: 5 minutes +- Max retries: 10 + +### Dead Letter Queue + +Failed events after max retries go to: +`streamspace.dlq.` + +### Circuit Breaker + 
+If a controller fails repeatedly, it's marked as unhealthy and removed from routing. + +## Monitoring + +### NATS Metrics + +- `nats_msgs_received_total` - Messages received by subject +- `nats_msgs_published_total` - Messages published by subject +- `nats_pending_msgs` - Messages pending in queue + +### Custom Metrics + +- `streamspace_events_published_total` - Events published by type +- `streamspace_events_processed_total` - Events processed by controller +- `streamspace_event_latency_seconds` - Time from publish to process +- `streamspace_controller_health` - Controller health status + +## Migration Plan + +### Phase 1: Add NATS Infrastructure +1. Add NATS to docker-compose +2. Create NATS client wrapper in API +3. Add event publishing alongside existing K8s calls + +### Phase 2: Update Controllers +1. Add NATS subscription to K8s controller +2. Implement status publishing +3. Run in parallel with existing direct K8s calls + +### Phase 3: Remove K8s from API +1. Remove k8sClient from API handlers +2. Update frontend for async operations +3. Remove ApplicationInstall CRD (no longer needed) + +### Phase 4: Add New Controllers +1. Docker controller +2. Hyper-V controller +3. 
vCenter controller + +## Security Considerations + +- Use TLS for NATS connections in production +- Implement authentication (user/password or NKey) +- Consider NATS authorization for subject-level permissions +- Encrypt sensitive data in payloads (credentials, tokens) +- Rate limit event publishing to prevent DoS diff --git a/docs/refactoring/K8S_CLIENT_OPERATIONS_CHECKLIST.md b/docs/refactoring/K8S_CLIENT_OPERATIONS_CHECKLIST.md new file mode 100644 index 00000000..36a74d48 --- /dev/null +++ b/docs/refactoring/K8S_CLIENT_OPERATIONS_CHECKLIST.md @@ -0,0 +1,282 @@ +# K8sClient Operations Migration Checklist + +## Operations to Move to Controller + +### Session Operations (HIGH PRIORITY) +| Operation | Current File | Method | Action | Target | +|-----------|--------------|--------|--------|--------| +| Create Session | handlers.go:464 | `CreateSession()` | MOVE | SessionReconciler | +| Update State | handlers.go:500 | `UpdateSessionState()` | MOVE | SessionReconciler | +| Delete Session | handlers.go:528 | `DeleteSession()` | MOVE | SessionReconciler | +| List for idle check | activity.go:196 | `ListSessions()` | MOVE | IdleReconciler | +| Update to hibernated | activity.go:232 | `UpdateSession()` | MOVE | IdleReconciler | +| Auto-start session | tracker.go:463 | `UpdateSessionState()` | MOVE | AutoStartReconciler | +| Auto-hibernate session | tracker.go:512 | `UpdateSessionState()` | MOVE | IdleReconciler | + +### Node Operations (MEDIUM PRIORITY) +| Operation | Current File | Method | Action | Target | +|-----------|--------------|--------|--------|--------| +| Patch labels | nodes.go:241,267 | `PatchNode()` | MOVE | NodeOpsReconciler | +| Patch taints | nodes.go:313 | `PatchNode()` | MOVE | NodeOpsReconciler | +| Cordon node | nodes.go:383 | `CordonNode()` | MOVE | NodeOpsReconciler | +| Uncordon node | nodes.go:405 | `UncordonNode()` | MOVE | NodeOpsReconciler | +| Drain node | nodes.go:435 | `DrainNode()` | MOVE | NodeOpsReconciler | +| Update taints | 
nodes.go:361 | `UpdateNodeTaints()` | MOVE | NodeOpsReconciler | + +### Other Operations to Move +| Operation | Current File | Method | Action | Target | +|-----------|--------------|--------|--------|--------| +| Quota validation | handlers.go:423 | `CheckSessionCreation()` | MOVE | SessionValidator Webhook | +| Pod eviction | nodes.go:435 | `DrainNode()` | MOVE | NodeOpsReconciler | +| ConfigMap updates | stubs.go:624,636 | `ConfigMaps().Create/Update()` | MOVE | ConfigReconciler | +| Dynamic resource create | stubs.go:340 | `GetDynamicClient().Create()` | MOVE | ResourceWebhook | +| Dynamic resource update | stubs.go:414 | `GetDynamicClient().Update()` | MOVE | ResourceWebhook | +| Dynamic resource delete | stubs.go:459 | `GetDynamicClient().Delete()` | MOVE | ResourceWebhook | + +--- + +## Operations to Keep in API + +### Read-Only Monitoring +| Operation | Current File | Method | Reason | Keep | +|-----------|--------------|--------|--------|------| +| List Sessions | handlers.go:259-261 | `ListSessions()` | Read-only query | βœ… | +| Get Session | handlers.go:284 | `GetSession()` | Read-only query | βœ… | +| List Templates | handlers.go:762-764 | `ListTemplates()` | Catalog lookup | βœ… | +| Get Template | handlers.go:884 | `GetTemplate()` | Template validation | βœ… | +| List Nodes | nodes.go:156 | `GetNodes()` | Monitoring | βœ… | +| Get Node | nodes.go:187 | `GetNode()` | Node status | βœ… | +| List Pods | stubs.go:239 | `GetPods()` | Monitoring | βœ… | +| List Deployments | stubs.go:254 | Clientset.Deployments() | Monitoring | βœ… | +| List Services | stubs.go:269 | `GetServices()` | Monitoring | βœ… | +| List Namespaces | stubs.go:279 | `GetNamespaces()` | Monitoring | βœ… | +| Get cluster stats | nodes.go:214 | `calculateClusterStats()` | Dashboard | βœ… | + +### Real-Time Operations +| Operation | Current File | Method | Reason | Keep | +|-----------|--------------|--------|--------|------| +| Heartbeat update | activity.go:121 | 
`UpdateSessionActivity()` | Low-latency | βœ… | +| Activity status | handlers.go (implied) | `GetActivityStatus()` | Real-time | βœ… | +| Broadcast sessions | websocket.go:227 | `ListSessions()` | WebSocket stream | βœ… | +| Stream pod logs | websocket.go:181 | `GetLogs()` | Real-time logs | βœ… | + +### Administrative Triggers +| Operation | Current File | Method | Reason | Keep | +|-----------|--------------|--------|--------|------| +| Install application | applications.go:221 | `CreateApplicationInstall()` | Request trigger | βœ… | +| Create Template | handlers.go:906 | `CreateTemplate()` | One-time setup | βœ… | +| Delete Template | handlers.go:921 | `DeleteTemplate()` | Admin operation | βœ… | +| Get ConfigMap | stubs.go:573,608 | `ConfigMaps().Get()` | Read config | βœ… | +| Get template config | applications.go | N/A | Admin query | βœ… | + +--- + +## Files Summary + +### Files to SIGNIFICANTLY REDUCE +``` +api/internal/api/handlers.go + Current: ~2000 LOC, 50+ k8s operations + After: ~800 LOC, 15+ k8s operations (all read-only) + Removed: Session CRUD, state transitions, quota checks, pod queries + +api/internal/activity/tracker.go + Current: ~300 LOC, 4 k8s operations + After: ~100 LOC, 1 k8s operation (heartbeat endpoint only) + Removed: IdleMonitor loop, hibernation logic + +api/internal/handlers/nodes.go + Current: ~600 LOC, 9 k8s operations + After: ~200 LOC, 2 k8s operations (list, get) + Removed: Patch, cordon, uncordon, drain operations +``` + +### Files to DELETE +``` +api/internal/tracker/tracker.go + Entire file: ~500 LOC, 2 k8s operations + Reason: All auto-start/hibernate logic moves to controller + Keep: Connection DB tracking (no k8s operations) +``` + +### Files to CREATE +``` +controller/internal/controllers/session_controller.go + New: ~800 LOC, 5+ k8s operations + Reconciles: Session CRD β†’ Deployment, PVC, state machine + +controller/internal/controllers/idle_reconciler.go + New: ~300 LOC, 2 k8s operations + Reconciles: Idle sessions 
β†’ hibernation + +controller/internal/controllers/autostart_reconciler.go + New: ~300 LOC, 1 k8s operation + Reconciles: Connection events β†’ auto-start + +controller/internal/controllers/nodeops_reconciler.go + New: ~600 LOC, 6 k8s operations + Reconciles: NodeOperation CR β†’ node patches, cordon, drain + +controller/internal/webhooks/session_validator.go + New: ~200 LOC, 0 k8s operations (quota check only) + Validates: Session creation against quota +``` + +--- + +## Migration Order (Phased Approach) + +### Phase 1: Design (Weeks 1-2) +- [ ] Finalize Session state machine +- [ ] Design IdleDetection reconciler +- [ ] Design ConnectionTracking webhook +- [ ] Design quota ValidatingWebhook +- [ ] Design NodeOperation CRD + +### Phase 2a: Session Lifecycle (Weeks 3-5) +- [ ] Implement SessionReconciler + - [ ] Handle Pending β†’ Running transition + - [ ] Create Deployment + - [ ] Create PVC + - [ ] Handle Terminated cleanup +- [ ] Add comprehensive tests +- [ ] E2E test: Create session β†’ Running + +### Phase 2b: Idle Detection (Weeks 5-7) +- [ ] Implement IdleReconciler + - [ ] Watch lastActivity timestamp + - [ ] Detect idle sessions + - [ ] Hibernate sessions (update state) +- [ ] Remove activity tracker background loop +- [ ] Keep heartbeat endpoint (update lastActivity) +- [ ] E2E test: Idle after 30m β†’ hibernated + +### Phase 2c: Auto-Start (Weeks 7-8) +- [ ] Design connection event webhook +- [ ] Implement AutoStartReconciler + - [ ] Listen for connection events + - [ ] Auto-start hibernated sessions +- [ ] Remove tracker auto-start logic +- [ ] E2E test: Connect to hibernated β†’ running + +### Phase 2d: Node Operations (Weeks 8-9) +- [ ] Create NodeOperation CRD +- [ ] Implement NodeOpsReconciler + - [ ] Handle cordon/uncordon + - [ ] Handle drain + - [ ] Handle label/taint patches +- [ ] Remove node operation methods from API +- [ ] E2E test: Cordon via API β†’ node unschedulable + +### Phase 2e: Testing (Weeks 9-10) +- [ ] Integration tests for all 
reconcilers +- [ ] Failure scenario testing +- [ ] Performance testing +- [ ] State consistency verification + +### Phase 3a: API Refactoring (Weeks 11-13) +- [ ] Remove CreateSession implementation +- [ ] Remove UpdateSessionState logic +- [ ] Remove DeleteSession logic +- [ ] Remove node state-change operations +- [ ] Remove tracker.go entirely +- [ ] Keep read-only endpoints + +### Phase 3b: Quota Migration (Weeks 13-14) +- [ ] Implement SessionValidator webhook +- [ ] Remove quota checks from CreateSession +- [ ] Verify webhook rejects over-quota +- [ ] Add feature flag for fallback + +### Phase 3c: Testing & Documentation (Weeks 14-16) +- [ ] Integration tests: API + controller +- [ ] Update documentation +- [ ] Create migration guide +- [ ] Prepare rollout plan + +--- + +## Testing Strategy + +### Unit Tests +| Component | Test File | Target Coverage | +|-----------|-----------|-----------------| +| SessionReconciler | controller/controllers/session_reconciler_test.go | 85%+ | +| IdleReconciler | controller/controllers/idle_reconciler_test.go | 85%+ | +| AutoStartReconciler | controller/controllers/autostart_reconciler_test.go | 85%+ | +| NodeOpsReconciler | controller/controllers/nodeops_reconciler_test.go | 85%+ | +| SessionValidator | controller/webhooks/session_validator_test.go | 90%+ | + +### Integration Tests +| Scenario | Test File | Expected Result | +|----------|-----------|-----------------| +| Create session β†’ Running | integration_test.go | Session in Running state, Deployment exists | +| Idle detection β†’ Hibernated | integration_test.go | Session hibernated after idle timeout | +| Connection β†’ Auto-start | integration_test.go | Session transitions Hibernated β†’ Running | +| Node operations | integration_test.go | Node cordon/drain/label applied | +| Quota validation | integration_test.go | Over-quota session rejected by webhook | + +### E2E Tests +| Scenario | Expected | Success Criteria | +|----------|----------|-----------------| +| 
User creates session | Session Running in <5s | Deployment, PVC, Service created | +| Session idle for 30m | Session Hibernated | State changed, pods scaled to 0 | +| User connects to idle session | Session auto-starts | State Running, pods scaled to 1 | +| Admin drains node | Node drained | Sessions moved, node unschedulable | +| User over quota | Session rejected | Webhook returns 403 | + +--- + +## Verification Checklist + +### Before Phase 2 Starts +- [ ] All 12 files analyzed and documented +- [ ] k8s operations categorized (move vs keep) +- [ ] Controller design approved by team +- [ ] CRD updates planned +- [ ] Risk mitigation strategies agreed + +### Before Phase 3 Starts +- [ ] All controller reconcilers working +- [ ] 100+ integration tests passing +- [ ] API heartbeat endpoint latency acceptable +- [ ] No regressions vs current behavior +- [ ] Rollback plan tested + +### Before Production Rollout +- [ ] All tests passing on staging +- [ ] Load testing completed +- [ ] Operator runbook prepared +- [ ] Rollback procedure tested +- [ ] Communication sent to users + +--- + +## Rollback Decision Points + +| Phase | Decision | Go/No-Go | +|-------|----------|----------| +| Design | Controller design viable | | +| Phase 2a | SessionReconciler working | | +| Phase 2b | IdleReconciler working | | +| Phase 2c | AutoStartReconciler working | | +| Phase 2d | NodeOpsReconciler working | | +| Phase 2e | All integration tests pass | | +| Phase 3a | API refactoring complete | | +| Phase 3b | Webhook quota validation works | | +| Phase 3c | Staging deployment successful | | +| Rollout | Production deployment successful | | + +--- + +## Success Indicators + +βœ… Session creation moves from 200ms (API) to 50ms (webhook) + async controller +βœ… Idle detection moves from memory-based to CRD-based (persistent) +βœ… Auto-start moves from in-process to event-driven (scalable) +βœ… Node operations move from API to controller (proper separation) +βœ… All tests passing (100+ 
integration, 500+ unit) +✅ Code duplication reduced (tracker.go deleted) +✅ Controller responsibility clear (state machine) +✅ API responsibility clear (query + trigger) + diff --git a/docs/refactoring/K8S_CLIENT_REFACTORING_ANALYSIS.md b/docs/refactoring/K8S_CLIENT_REFACTORING_ANALYSIS.md new file mode 100644 index 00000000..678ba44d --- /dev/null +++ b/docs/refactoring/K8S_CLIENT_REFACTORING_ANALYSIS.md @@ -0,0 +1,573 @@ +# k8sClient Usage Analysis for StreamSpace API + +## Summary +Found **12 files** using `k8sClient` across the StreamSpace API codebase performing **50+ K8s operations** on multiple resource types. + +--- + +## Detailed File Analysis + +### 1. **api/cmd/main.go** (Initialization) +**File Path:** `/home/user/streamspace/api/cmd/main.go` + +**Purpose:** Service initialization and dependency injection + +**Handler Functions Using k8sClient:** +- `main()` - initializes k8sClient and injects into handlers +- `setupRoutes()` - configures routes with handlers using k8sClient + +**K8s Operations:** +- None directly (initialization only) + +**Resources:** +- Sessions (indirect - passed to handlers) +- Templates (indirect - passed to handlers) +- ApplicationInstalls (indirect - passed to handlers) +- Nodes (indirect - passed to handlers) + +**Recommendation:** ✅ **STAY IN API** - Appropriate for initialization and dependency injection + +**Details:** +```go +// Line 90: Initialize K8s client +k8sClient, err := k8s.NewClient() + +// Line 238: Inject into API handler +apiHandler := api.NewHandler(database, k8sClient, connTracker, syncService, wsManager, quotaEnforcer) + +// Line 242: Inject into activity handler +activityHandler := handlers.NewActivityHandler(k8sClient, activityTracker) + +// Line 246: Inject into dashboard handler +dashboardHandler := handlers.NewDashboardHandler(database, k8sClient) + +// Line 259: Inject into node handler +nodeHandler := handlers.NewNodeHandler(database, k8sClient) + +// Line 274: Inject into application handler 
+applicationHandler := handlers.NewApplicationHandler(database, k8sClient, appNamespace) + +// Line 123: Inject into websocket manager +wsManager := internalWebsocket.NewManager(database, k8sClient) + +// Line 128: Inject into activity tracker +activityTracker := activity.NewTracker(k8sClient) + +// Line 97: Inject into connection tracker +connTracker := tracker.NewConnectionTracker(database, k8sClient) +``` + +--- + +### 2. **api/internal/api/handlers.go** (Core Session/Template Management) +**File Path:** `/home/user/streamspace/api/internal/api/handlers.go` + +**Purpose:** Main HTTP request handlers for session and template management + +**Handler Functions Using k8sClient:** +- `ListSessions()` - List sessions by user or all sessions +- `GetSession()` - Get single session details +- `CreateSession()` - Create new session with quota check +- `UpdateSession()` - Update session state +- `DeleteSession()` - Delete session +- `UpdateSessionTags()` - Update session tags via dynamic client +- `ListSessionsByTags()` - List sessions filtered by tags +- `ListTemplates()` - List templates by category or all +- `GetTemplate()` - Get template details +- `CreateTemplate()` - Create template from manifest +- `DeleteTemplate()` - Delete template +- `UpdateTemplate()` (implied) +- `GetPods()` - Get pod list for quota calculation + +**K8s Operations:** +- **Session CRD Operations:** + - `ListSessions()` - READ (list) + - `ListSessionsByUser()` - READ (list with filter) + - `GetSession()` - READ (get single) + - `CreateSession()` - CREATE + - `UpdateSessionState()` - UPDATE (state field) + - `DeleteSession()` - DELETE + - Dynamic client: `Update()` on sessionGVR - UPDATE (tags field) + +- **Template CRD Operations:** + - `ListTemplates()` - READ (list) + - `ListTemplatesByCategory()` - READ (list with filter) + - `GetTemplate()` - READ (get single) + - `CreateTemplate()` - CREATE + - `DeleteTemplate()` - DELETE + +- **Pod Operations:** + - `GetPods()` - READ (list pods for quota 
calculation) + +**Resources:** +- Sessions (primary) +- Templates (primary) +- Pods (quota calculation) + +**Recommendation:** ⚠️ **CONSIDER MOVING TO CONTROLLER** +- Session lifecycle management (create/update/delete) should be controller responsibility +- Pod queries for quota checking could move to webhook admission controller +- Template management could stay in API (static resources) + +**Critical Operations:** +```go +// Line 259-261: List sessions +sessions, err = h.k8sClient.ListSessionsByUser(ctx, h.namespace, userID) +sessions, err = h.k8sClient.ListSessions(ctx, h.namespace) + +// Line 284: Get session +session, err := h.k8sClient.GetSession(ctx, h.namespace, sessionID) + +// Line 364: Get template (validation) +template, err := h.k8sClient.GetTemplate(ctx, h.namespace, req.Template) + +// Line 405: Get pods for quota calculation +podList, err := h.k8sClient.GetPods(ctx, h.namespace) + +// Line 464: Create session +created, err := h.k8sClient.CreateSession(ctx, session) + +// Line 500: Update session state +updated, err := h.k8sClient.UpdateSessionState(ctx, h.namespace, sessionID, req.State) + +// Line 528: Delete session +if err := h.k8sClient.DeleteSession(ctx, h.namespace, sessionID) + +// Line 657, 673: Update session tags (dynamic client) +obj, err := h.k8sClient.GetDynamicClient().Resource(sessionGVR).Namespace(h.namespace).Get(...) +_, err = h.k8sClient.GetDynamicClient().Resource(sessionGVR).Namespace(h.namespace).Update(...) + +// Line 762-764: List templates +templates, err = h.k8sClient.ListTemplatesByCategory(ctx, h.namespace, category) +templates, err = h.k8sClient.ListTemplates(ctx, h.namespace) + +// Line 884, 906, 921: Template operations +template, err := h.k8sClient.GetTemplate(ctx, h.namespace, templateID) +created, err := h.k8sClient.CreateTemplate(ctx, &template) +if err := h.k8sClient.DeleteTemplate(ctx, h.namespace, templateID) +``` + +--- + +### 3. 
**api/internal/api/stubs.go** (Cluster Management) +**File Path:** `/home/user/streamspace/api/internal/api/stubs.go` + +**Purpose:** Generic cluster resource management (CRUD for any K8s resource type) + +**Handler Functions Using k8sClient:** +- `ListNodes()` - List cluster nodes +- `ListPods()` - List pods in namespace +- `ListDeployments()` - List deployments +- `ListServices()` - List services +- `ListNamespaces()` - List namespaces +- `CreateResource()` - Create generic K8s resource +- `UpdateResource()` - Update generic K8s resource +- `DeleteResource()` - Delete generic K8s resource +- `GetPodLogs()` - Stream pod logs +- `GetConfig()` - Get platform configuration from ConfigMap +- `UpdateConfig()` - Update platform configuration in ConfigMap +- `GetMetrics()` - Get resource metrics + +**K8s Operations:** +- **Node Operations:** + - `GetNodes()` - READ (list all nodes) + - `GetNode()` - READ (single node details) + +- **Pod Operations:** + - `GetPods()` - READ (list pods) + - `GetClientset().CoreV1().Pods().GetLogs()` - READ (pod logs) + +- **Deployment Operations:** + - `GetClientset().AppsV1().Deployments().List()` - READ (list deployments) + +- **Service Operations:** + - `GetServices()` - READ (list services) + +- **Namespace Operations:** + - `GetNamespaces()` - READ (list namespaces) + +- **Dynamic Resource Operations:** + - `GetDynamicClient().Resource(gvr).Create()` - CREATE + - `GetDynamicClient().Resource(gvr).Update()` - UPDATE + - `GetDynamicClient().Resource(gvr).Delete()` - DELETE + +- **ConfigMap Operations:** + - `GetClientset().CoreV1().ConfigMaps().Get()` - READ + - `GetClientset().CoreV1().ConfigMaps().Create()` - CREATE + - `GetClientset().CoreV1().ConfigMaps().Update()` - UPDATE + +**Resources:** +- Nodes +- Pods +- Deployments +- Services +- Namespaces +- ConfigMaps +- Generic K8s resources (via dynamic client) + +**Recommendation:** ⚠️ **CONSIDER MOVING TO CONTROLLER** +- Node management (cordon, drain, taint) - belongs in controller 
+- Dynamic resource creation/update/delete - should be admission webhook or CRD validation +- Pod log streaming could stay in API (read-only, real-time) +- ConfigMap management (application configuration) - belongs in controller or config service + +**Details:** +```go +// Nodes +nodeList, err := h.k8sClient.GetNodes(c.Request.Context()) +nodes, err = h.k8sClient.GetNodes(ctx) + +// Pods +pods, err := h.k8sClient.GetPods(c.Request.Context(), namespace) +req := h.k8sClient.GetClientset().CoreV1().Pods(namespace).GetLogs(podName, opts) + +// Deployments +deployments, err := h.k8sClient.GetClientset().AppsV1().Deployments(namespace).List(c.Request.Context(), metav1.ListOptions{}) + +// Services +services, err := h.k8sClient.GetServices(c.Request.Context(), namespace) + +// Namespaces +namespaces, err := h.k8sClient.GetNamespaces(c.Request.Context()) + +// Dynamic resources +created, err := h.k8sClient.GetDynamicClient().Resource(gvr).Namespace(namespace).Create(...) +updated, err := h.k8sClient.GetDynamicClient().Resource(gvr).Namespace(namespace).Update(...) +err = h.k8sClient.GetDynamicClient().Resource(gvr).Namespace(namespace).Delete(...) + +// ConfigMaps +configMap, err := h.k8sClient.GetClientset().CoreV1().ConfigMaps(h.namespace).Get(...) +_, err = h.k8sClient.GetClientset().CoreV1().ConfigMaps(h.namespace).Create(...) +_, err = h.k8sClient.GetClientset().CoreV1().ConfigMaps(h.namespace).Update(...) +``` + +--- + +### 4. 
**api/internal/handlers/applications.go** (Application Installation) +**File Path:** `/home/user/streamspace/api/internal/handlers/applications.go` + +**Purpose:** Installed application management and lifecycle + +**Handler Functions Using k8sClient:** +- `InstallApplication()` - Install new application from catalog + +**K8s Operations:** +- **ApplicationInstall CRD:** + - `CreateApplicationInstall()` - CREATE + +**Resources:** +- ApplicationInstall (CRD) + +**Recommendation:** ✅ **STAY IN API** - Application installation is an administrative operation +- API initiates installation request +- Controller watches ApplicationInstall and creates Template +- Proper separation of concerns + +**Details:** +```go +// Line 221: Create ApplicationInstall CRD +_, err = h.k8sClient.CreateApplicationInstall(ctx, appInstall) + +// Step shows handling of errors: +// - "already exists" - continues with DB record +// - "not find the requested resource" - logs warning but continues +// - other errors - returns HTTP 500 +``` + +--- + +### 5. 
**api/internal/handlers/nodes.go** (Cluster Node Management) +**File Path:** `/home/user/streamspace/api/internal/handlers/nodes.go` + +**Purpose:** Administrator node management (labels, taints, cordon, drain) + +**Handler Functions Using k8sClient:** +- `ListNodes()` - List all cluster nodes +- `GetNode()` - Get single node details +- `GetClusterStats()` - Aggregate cluster statistics +- `AddNodeLabel()` - Add label to node +- `RemoveNodeLabel()` - Remove label from node +- `AddNodeTaint()` - Add taint to node +- `RemoveNodeTaint()` - Remove taint from node +- `CordonNode()` - Mark node as unschedulable +- `UncordonNode()` - Mark node as schedulable +- `DrainNode()` - Evict all pods from node + +**K8s Operations:** +- **Node Operations:** + - `GetNodes()` - READ (list all nodes) + - `GetNode()` - READ (single node) + - `PatchNode()` - UPDATE (labels and taints) + - `UpdateNodeTaints()` - UPDATE (taints specifically) + - `CordonNode()` - UPDATE (unschedulable flag) + - `UncordonNode()` - UPDATE (unschedulable flag) + - `DrainNode()` - DELETE (evict pods) + +**Resources:** +- Nodes (primary) +- Pods (implicit - evicted during drain) + +**Recommendation:** ⚠️ **CONSIDER MOVING TO CONTROLLER** +- Node operations are cluster infrastructure management +- Should be handled by cluster operator controller +- Could be triggered by custom CRD (NodeMaintenanceRequest) +- API could remain as read-only endpoints for monitoring + +**Details:** +```go +// List and Get +nodeList, err := h.k8sClient.GetNodes(ctx) +node, err := h.k8sClient.GetNode(ctx, nodeName) + +// Patch (labels and taints) +patchData := fmt.Sprintf(`{"metadata":{"labels":{"%s":"%s"}}}`, req.Key, req.Value) +if err := h.k8sClient.PatchNode(ctx, nodeName, []byte(patchData)) + +// Cordon/Uncordon +if err := h.k8sClient.CordonNode(ctx, nodeName) +if err := h.k8sClient.UncordonNode(ctx, nodeName) + +// Drain +if err := h.k8sClient.DrainNode(ctx, nodeName, req.GracePeriodSeconds) +``` + +--- + +### 6. 
**api/internal/handlers/dashboard.go** (Dashboard Statistics) +**File Path:** `/home/user/streamspace/api/internal/handlers/dashboard.go` + +**Purpose:** Platform statistics and dashboard metrics + +**Handler Functions Using k8sClient:** +- `GetPlatformStats()` - Get overall platform statistics + +**K8s Operations:** +- **Template Operations:** + - `ListTemplates()` - READ (list templates for count) + +**Resources:** +- Templates (for template count metric) + +**Recommendation:** ✅ **STAY IN API** - Read-only dashboard queries belong in API +- No state changes +- Real-time metric aggregation +- Appropriate for API tier + +--- + +### 7. **api/internal/handlers/activity.go** (Session Activity Tracking) +**File Path:** `/home/user/streamspace/api/internal/handlers/activity.go` + +**Purpose:** Session activity heartbeat recording + +**Handler Functions Using k8sClient:** +- `RecordHeartbeat()` - Record session activity (delegates to activity.Tracker) +- `GetActivity()` - Get session activity status + +**K8s Operations:** +- Indirectly called via activity.Tracker: + - `GetSession()` - READ (session for activity status) + - `UpdateSessionStatus()` - UPDATE (lastActivity timestamp) + +**Resources:** +- Sessions (activity status) + +**Recommendation:** ✅ **STAY IN API** - Activity heartbeats must be low-latency responses +- Real-time heartbeat updates +- Cannot defer to controller (latency unacceptable) +- API layer appropriate for this + +--- + +### 8. 
**api/internal/activity/tracker.go** (Idle Detection) +**File Path:** `/home/user/streamspace/api/internal/activity/tracker.go` + +**Purpose:** Background idle session monitoring and auto-hibernation + +**Handler Functions Using k8sClient:** +- `UpdateSessionActivity()` - Update lastActivity timestamp +- `GetActivityStatus()` - Calculate idle state +- `StartIdleMonitor()` - Background monitor (periodic) +- `hibernateIdleSessions()` - Auto-hibernate idle sessions + +**K8s Operations:** +- `GetSession()` - READ (check idle status) +- `UpdateSessionStatus()` - UPDATE (lastActivity) +- `ListSessions()` - READ (list all for idle check) +- `UpdateSession()` - UPDATE (state to "hibernated") + +**Resources:** +- Sessions (idle monitoring and hibernation) + +**Recommendation:** ⚠️ **MOVE TO CONTROLLER** +- Idle detection is controller responsibility +- Session state transitions belong in controller +- Should implement custom controller with hibernation logic +- Activity tracking could stay in API for heartbeat updates + +--- + +### 9. **api/internal/tracker/tracker.go** (Connection Tracking) +**File Path:** `/home/user/streamspace/api/internal/tracker/tracker.go` + +**Purpose:** Active connection monitoring and auto-start/hibernate logic + +**Handler Functions Using k8sClient:** +- `autoStartHibernatedSession()` - Start hibernated session when connection arrives +- `autoHibernateIdleSessions()` - Hibernate sessions with no connections +- Background goroutine: `Start()` - periodic checks + +**K8s Operations:** +- `GetSession()` - READ (check session state) +- `UpdateSessionState()` - UPDATE (state to "running" or "hibernated") + +**Resources:** +- Sessions (state management) + +**Recommendation:** ⚠️ **MOVE TO CONTROLLER** +- Session state transitions must be in controller +- Connection tracking could stay in API +- Controller should implement auto-start/hibernate logic +- API should track connections and update controller via CRD/webhook + +--- + +### 10. 
**api/internal/websocket/handlers.go** (Real-time Updates) +**File Path:** `/home/user/streamspace/api/internal/websocket/handlers.go` + +**Purpose:** WebSocket streaming of sessions and pod logs + +**Handler Functions Using k8sClient:** +- `broadcastSessionUpdates()` - Periodic session broadcast +- `broadcastMetrics()` - Periodic metrics broadcast +- `LogsWebSocket()` - Stream pod logs via WebSocket + +**K8s Operations:** +- `ListSessions()` - READ (list sessions for broadcast) +- `GetClientset().CoreV1().Pods().GetLogs()` - READ (pod logs) + +**Resources:** +- Sessions (read-only broadcast) +- Pods (log streaming) + +**Recommendation:** ✅ **STAY IN API** - Real-time WebSocket updates belong in API +- Read-only operations +- Real-time response requirement +- Low-latency streaming + +--- + +### 11. **api/internal/middleware/quota.go** (Quota Enforcement) +**File Path:** `/home/user/streamspace/api/internal/middleware/quota.go` + +**Purpose:** Quota middleware integration (minimal k8sClient usage) + +**Handler Functions Using k8sClient:** +- None directly in middleware + +**K8s Operations:** +- None (middleware just validates, handlers use k8sClient) + +**Recommendation:** ✅ **STAY IN API** - Quota enforcement is API responsibility + +--- + +### 12. **api/internal/api/handlers_test.go** (Unit Tests) +**File Path:** `/home/user/streamspace/api/internal/api/handlers_test.go` + +**Purpose:** Handler tests (mocked k8sClient) + +**Handler Functions Using k8sClient:** +- Mock usage in test setup + +**K8s Operations:** +- Mock operations for testing + +**Recommendation:** N/A - Test file, no migration needed + +--- + +## Summary Table + +| File | Functions Count | K8s Operations | Resources | Move to Controller? 
| Priority | +|------|-----------------|-----------------|-----------|---------------------|----------| +| api/cmd/main.go | 2 | 0 (init only) | Multiple | No | N/A | +| api/internal/api/handlers.go | 13+ | 15+ (CRUD) | Sessions, Templates, Pods | YES - Critical | HIGH | +| api/internal/api/stubs.go | 12 | 20+ (CRUD) | Nodes, Pods, Services, ConfigMaps, Generic | YES - Some ops | HIGH | +| api/internal/handlers/applications.go | 1 | 1 (CREATE) | ApplicationInstall | No - API appropriate | MED | +| api/internal/handlers/nodes.go | 9 | 9 (UPDATE) | Nodes | YES - Infrastructure | MED | +| api/internal/handlers/dashboard.go | 1 | 1 (READ) | Templates | No - Read-only | LOW | +| api/internal/handlers/activity.go | 2 | 2 (READ/UPDATE) | Sessions | No - Real-time | MED | +| api/internal/activity/tracker.go | 4 | 4 (READ/UPDATE) | Sessions | YES - Logic | HIGH | +| api/internal/tracker/tracker.go | 2 | 2 (READ/UPDATE) | Sessions | YES - State mgmt | HIGH | +| api/internal/websocket/handlers.go | 3 | 2 (READ) | Sessions, Pods | No - Streaming | LOW | +| api/internal/middleware/quota.go | - | 0 | - | N/A | N/A | +| api/internal/api/handlers_test.go | - | Mock | - | N/A | N/A | + +--- + +## Refactoring Recommendations + +### HIGH PRIORITY - Move to Controller +1. **Session lifecycle management** (api/internal/api/handlers.go) + - CreateSession → Controller creation logic + - UpdateSessionState → Controller state machine + - DeleteSession → Controller cleanup + - Keep GetSession/ListSessions in API + +2. **Idle detection & hibernation** (api/internal/activity/tracker.go) + - Implement controller reconciler for idle sessions + - API keeps heartbeat update endpoint (low-latency) + - Controller monitors lastActivity timestamp + +3. **Connection-based auto-start** (api/internal/tracker/tracker.go) + - Move auto-start logic to controller + - API tracks connections, controller manages state + - Consider webhook for connection events + +### MEDIUM PRIORITY - Evaluate +1. 
**Node management** (api/internal/handlers/nodes.go) + - Consider NodeMaintenanceRequest CRD pattern + - Keep read-only endpoints in API + - Move state-changing operations to controller + +2. **Application installation** (api/internal/handlers/applications.go) + - Current pattern is good (API triggers, Controller executes) + - Monitor for patterns + +### KEEP IN API +1. Dashboard queries (read-only aggregation) +2. WebSocket streaming (real-time, read-only) +3. Activity heartbeats (must be low-latency) +4. Application installation triggers (initiating operations) +5. Template list/get (read-only catalog) + +--- + +## K8s Operations Summary + +### By Operation Type +| Operation | Count | Files | Resources | +|-----------|-------|-------|-----------| +| CREATE | 8 | handlers.go, applications.go, stubs.go | Sessions, Templates, ApplicationInstall, ConfigMap, Generic | +| READ (List) | 20 | handlers.go, stubs.go, dashboard.go, activity.go, tracker.go, websocket.go | Sessions, Templates, Nodes, Pods, Deployments, Services, Namespaces | +| READ (Get) | 15 | handlers.go, stubs.go, activity.go, tracker.go, nodes.go | Sessions, Templates, Nodes, Pods, ConfigMaps | +| UPDATE | 18 | handlers.go, stubs.go, activity.go, tracker.go, nodes.go | Sessions, Templates, ConfigMaps, Nodes, Generic | +| DELETE | 6 | handlers.go, stubs.go, nodes.go | Sessions, Templates, Generic Resources, Pods (evict) | +| PATCH | 3 | nodes.go | Nodes (labels, taints) | +| STREAM | 1 | websocket.go | Pods (logs) | + +### By Resource Type +| Resource | Operations | Files | Current Tier | Recommended | +|----------|-----------|-------|--------------|-------------| +| Session | CRUD + Update State | handlers.go, activity.go, tracker.go | API | Controller | +| Template | CRUD | handlers.go, stubs.go, dashboard.go | API | Hybrid (API read, Controller write) | +| ApplicationInstall | CREATE | applications.go | API | Keep in API (trigger) | +| Node | Get, Patch, Cordon, Drain | nodes.go, stubs.go | API | 
Controller | +| Pod | Get, List, Logs, Evict | handlers.go, stubs.go, websocket.go | API | Hybrid (keep streaming/query, move eviction) | +| ConfigMap | Get, Create, Update | stubs.go | API | Controller | +| Deployment | List | stubs.go | API | Keep in API (monitoring) | +| Service | List | stubs.go | API | Keep in API (monitoring) | +| Namespace | List | stubs.go | API | Keep in API (monitoring) | +| Generic Resources | CRUD | stubs.go | API | Controller (via webhooks) | + diff --git a/docs/refactoring/K8S_CLIENT_REFACTORING_ROADMAP.md b/docs/refactoring/K8S_CLIENT_REFACTORING_ROADMAP.md new file mode 100644 index 00000000..e30606f4 --- /dev/null +++ b/docs/refactoring/K8S_CLIENT_REFACTORING_ROADMAP.md @@ -0,0 +1,484 @@ +# k8sClient Refactoring Roadmap + +## Executive Summary + +**Current State:** k8sClient scattered across 12 files performing 50+ K8s operations + +**Goal:** Consolidate K8s management logic into controller, keep API for: +- Read-only queries +- Real-time operations (heartbeats, WebSocket) +- Administrative triggers (application installation) + +**Timeline:** Phased approach over 3 phases +**Effort:** 15-20 developer weeks +**Risk Level:** Medium (requires careful state machine design) + +--- + +## Phase 1: Preparation & Design (Weeks 1-2) + +### Task 1.1: Design Controller Reconcilers +**File:** `controller/internal/controllers/session_reconciler.go` +**Work:** +- Design Session state machine (Pending → Running/Hibernated → Terminated) +- Design IdleDetection reconciler +- Design ConnectionTracking reconciler +- Define CRD status fields for controller feedback + +**Acceptance Criteria:** +- [ ] State machine diagram in docs +- [ ] CRD spec updated with new status fields +- [ ] Controller interfaces documented + +### Task 1.2: Design Admission Webhooks +**File:** `controller/internal/webhooks/session_validator.go` +**Work:** +- Design ValidatingWebhook for Session creation (quota validation) +- Design MutatingWebhook for Session defaults +- 
Plan certificate management + +**Acceptance Criteria:** +- [ ] Webhook manifest examples +- [ ] Quota validation logic in webhook code +- [ ] Error handling documented + +### Task 1.3: API-Controller Communication Protocol +**Work:** +- Define how API signals controller for operations (CRD fields) +- Plan connection event propagation (webhook or annotation) +- Document async operation patterns + +**Acceptance Criteria:** +- [ ] Communication protocol document +- [ ] Example payload flows + +--- + +## Phase 2: Controller Implementation (Weeks 3-10) + +### Task 2.1: Session Lifecycle Controller +**Priority:** HIGH +**File:** `controller/internal/controllers/session_controller.go` +**Work:** +- Implement Session state machine +- Handle transitions: Pending → Running/Hibernated → Terminated +- Implement Deployment/PVC creation logic (from API) +- Implement session cleanup + +**Changes to API:** +- `CreateSession()` → Creates Session CRD only (state: Pending) +- Controller creates Deployment +- API watches for Running status + +**K8s Operations Moved:** +- `h.k8sClient.CreateSession()` → Controller (creates Pod/Deployment) +- `h.k8sClient.UpdateSessionState()` → Controller (state transitions) +- `h.k8sClient.DeleteSession()` → Controller (cleanup) + +**Acceptance Criteria:** +- [ ] Session state transitions working +- [ ] Deployment/PVC created automatically +- [ ] Status fields updated correctly +- [ ] E2E test: Create session → pods appear + +### Task 2.2: Idle Detection Controller +**Priority:** HIGH +**File:** `controller/internal/controllers/idle_reconciler.go` +**Work:** +- Watch Session.Status.LastActivity +- Calculate idle duration +- Auto-hibernate after threshold + grace period +- Update Session.Spec.State to "hibernated" + +**Changes to API:** +- Remove `activity/tracker.go` background loop +- Keep `activity.UpdateSessionActivity()` for heartbeat endpoint +- API heartbeat endpoint only updates lastActivity timestamp + +**K8s Operations Moved:** 
+- `tracker.ListSessions()` for idle check → Controller +- `session.UpdateSession()` for hibernation → Controller + +**Acceptance Criteria:** +- [ ] Heartbeat updates lastActivity +- [ ] Controller detects idle sessions +- [ ] Auto-hibernation works +- [ ] E2E test: Session idle after 30m → hibernated + +### Task 2.3: Connection-Based Auto-Start +**Priority:** MEDIUM +**File:** `controller/internal/controllers/autostart_reconciler.go` +**Work:** +- Implement connection event webhook +- API sends connection events +- Controller auto-starts hibernated sessions +- Update Session.Spec.State to "running" + +**Changes to API:** +- Remove `tracker.autoStartHibernatedSession()` +- API tracks connections (DB only) +- API sends webhook when connection arrives +- Controller receives webhook and starts session + +**K8s Operations Moved:** +- `ct.k8sClient.UpdateSessionState()` for auto-start → Controller + +**Acceptance Criteria:** +- [ ] Connection events logged +- [ ] Webhook integration working +- [ ] Auto-start on connection works +- [ ] E2E test: Connection to hibernated session → auto-start + +### Task 2.4: Node Management Controller +**Priority:** MEDIUM +**File:** `controller/internal/controllers/nodeops_reconciler.go` +**Work:** +- Create NodeOperation CRD for maintenance requests +- Implement cordon/drain/uncordon logic +- Update node labels/taints via controller + +**Changes to API:** +- API creates NodeOperation CR (not direct node operations) +- Keep read-only node endpoints (`ListNodes()`, `GetNode()`) +- API: `AddNodeLabel()` → Create NodeOperation CR +- Controller: watches NodeOperation and applies changes + +**K8s Operations Moved:** +- `h.k8sClient.PatchNode()` → Controller +- `h.k8sClient.CordonNode()` → Controller +- `h.k8sClient.DrainNode()` → Controller + +**Acceptance Criteria:** +- [ ] NodeOperation CRD defined +- [ ] Cordon logic working +- [ ] Drain logic working +- [ ] E2E test: Cordon node via API → node unschedulable + +### Task 
2.5: Integration & Testing +**File:** `controller/tests/integration_test.go` +**Work:** +- Test all 4 reconcilers together +- Test failure scenarios +- Test state persistence +- Performance testing + +**Acceptance Criteria:** +- [ ] 100+ integration tests passing +- [ ] All reconcilers tested +- [ ] Failure scenarios handled +- [ ] Performance acceptable + +--- + +## Phase 3: API Refactoring & Migration (Weeks 11-16) + +### Task 3.1: Remove Session Lifecycle Logic from API +**Files Affected:** +- `api/internal/api/handlers.go` (CreateSession, UpdateSession, DeleteSession) +- `api/internal/tracker/tracker.go` (remove entirely) +- `api/internal/activity/tracker.go` (remove background loop) + +**Changes:** +```go +// BEFORE +func (h *Handler) CreateSession(c *gin.Context) { + session := &k8s.Session{...} + created, err := h.k8sClient.CreateSession(ctx, session) // ❌ Removed +} + +// AFTER +func (h *Handler) CreateSession(c *gin.Context) { + session := &k8s.Session{...} + // Controller will create Deployment + created, err := h.k8sClient.CreateSession(ctx, session) // Still creates CRD + + // Wait for controller to set Status.Running + // Or return 202 Accepted (async) +} +``` + +**K8s Operations Removed from API:** +- CreateSession (Deployment creation) +- UpdateSessionState (state transitions) +- DeleteSession (pod eviction) +- ListSessionsForIdleCheck +- UpdateSessionActivity (partial - keep heartbeat endpoint) + +**Acceptance Criteria:** +- [ ] API handlers simplified +- [ ] No session state transitions in API +- [ ] No pod creation in API +- [ ] All logic moved to controller + +### Task 3.2: Keep Read-Only & Real-Time APIs +**Files to Keep:** +- Dashboard queries (ListTemplates, etc.) 
+- WebSocket broadcasters +- Activity heartbeat endpoint +- Connection tracking + +**Changes:** +- `GetSession()` - KEEP (read-only) +- `ListSessions()` - KEEP (read-only) +- `RecordHeartbeat()` - KEEP (real-time) +- `ListNodes()` - KEEP (read-only monitoring) + +**Acceptance Criteria:** +- [ ] All read-only endpoints working +- [ ] Real-time endpoints low-latency +- [ ] WebSocket broadcasting working + +### Task 3.3: Quota Enforcement Migration +**File:** `controller/internal/webhooks/session_validator.go` +**Work:** +- Move quota validation to ValidatingWebhook +- Webhook blocks Session creation if quota exceeded +- API removes quota checks from handler + +**Changes:** +```go +// BEFORE +func (h *Handler) CreateSession(c *gin.Context) { + // Check quota + err := h.quotaEnforcer.CheckSessionCreation(...) // ❌ Removed + +// AFTER (in webhook) +func (v *SessionValidator) ValidateCreate(session *k8s.Session) error { + // Check quota + return v.quotaEnforcer.CheckSessionCreation(...) +} +``` + +**Acceptance Criteria:** +- [ ] Webhook quota validation working +- [ ] API quota checks removed +- [ ] 403 returned for quota violations +- [ ] E2E test: Over-quota session rejected + +### Task 3.4: Documentation & Migration Guide +**Files to Create:** +- `docs/CONTROLLER_RECONCILERS.md` - Controller architecture +- `docs/API_CONTROLLER_SPLIT.md` - Responsibility boundaries +- `MIGRATION_GUIDE_API_TO_CONTROLLER.md` - Deployment instructions + +**Work:** +- Document all controller reconcilers +- Update API documentation +- Create user-facing migration guide +- Update CLAUDE.md with new patterns + +**Acceptance Criteria:** +- [ ] All reconcilers documented +- [ ] API/controller split clear +- [ ] Migration guide complete +- [ ] Examples for all patterns + +### Task 3.5: Deployment & Rollout +**Work:** +- Update Helm chart for new controller +- Update CI/CD pipelines +- Gradual rollout strategy +- Rollback plan + +**Acceptance Criteria:** +- [ ] Helm chart updated +- [ ] CI/CD 
working +- [ ] Rollout checklist complete +- [ ] Rollback tested + +--- + +## Detailed File Mapping + +### FILES TO MODIFY + +**api/internal/api/handlers.go** (-80% operations) +``` +REMOVE: ADD: +- CreateSession (full) → - Wait for controller status +- UpdateSessionState (all) → - Return 202 Accepted for async ops +- DeleteSession (full) → - Error handling for webhook rejections +- GetPods (quota) → - Check CRD status +- enrichSessionWithDBInfo → +``` + +**api/internal/activity/tracker.go** (-70% operations) +``` +REMOVE: KEEP: +- StartIdleMonitor loop → - UpdateSessionActivity (heartbeat) +- hibernateIdleSessions → - GetActivityStatus (read-only) +- Check idle logic → +- Update state to hibernated→ +``` + +**api/internal/tracker/tracker.go** (Remove entirely) +``` +DELETE ENTIRE FILE: +- All logic moved to controller +- Connection tracking to DB only (no state changes) +``` + +**api/internal/handlers/nodes.go** (-50% operations) +``` +REMOVE: KEEP: +- AddNodeLabel → - ListNodes +- RemoveNodeLabel → - GetNode +- AddNodeTaint → - GetClusterStats +- RemoveNodeTaint → +- CordonNode → +- UncordonNode → +- DrainNode → +``` + +### FILES TO CREATE + +**controller/internal/controllers/session_controller.go** +```go +// New: Session lifecycle reconciliation +- Reconcile(Session) error +- createDeployment() +- createPVC() +- handleStateTransitions() +- cleanup() +``` + +**controller/internal/controllers/idle_reconciler.go** +```go +// New: Idle detection & hibernation +- Reconcile(Session) error +- detectIdleSessions() +- hibernateSession() +``` + +**controller/internal/controllers/autostart_reconciler.go** +```go +// New: Connection-based auto-start +- HandleConnectionEvent(connectionID, sessionID) +- startSession() +``` + +**controller/internal/controllers/nodeops_reconciler.go** +```go +// New: Node maintenance operations +- Reconcile(NodeOperation) error +- applyNodePatch() +- cordonNode() +- drainNode() +``` + 
+**controller/internal/webhooks/session_validator.go** +```go +// New: Quota validation at admission time +- ValidateCreate(Session) error +- ValidateUpdate(old, new Session) error +- checkQuota() +``` + +--- + +## Risk Mitigation + +### Risk 1: Quota Enforcement +**Risk:** Webhook validation takes longer than API check +**Mitigation:** +- Webhook should be fast (cache quota limits) +- Fall-back: API maintains quota check temporarily +- Gradual migration with feature flag + +### Risk 2: Stale Controller Status +**Risk:** API returns wrong session status if controller lags +**Mitigation:** +- API checks CRD status (not DB cache) +- Expose reconciliation timestamp to clients +- Health check: controller uptime metric + +### Risk 3: Lost Session State +**Risk:** Session state inconsistency during migration +**Mitigation:** +- Backup all sessions before migration +- Run controller and API in parallel temporarily +- Verify CRD status matches expected state + +### Risk 4: Connection Event Loss +**Risk:** Missed connection events if webhook fails +**Mitigation:** +- API fallback: mark session as "wake_requested" +- Controller polls for wake requests periodically +- Webhook retry policy + +--- + +## Success Metrics + +| Metric | Target | Current | +|--------|--------|---------| +| K8s operations in API | < 20 | 50+ | +| Controller reconcilers | 4+ | 0 | +| Session state transitions in controller | 100% | 0% | +| API heartbeat latency | < 100ms | Varies | +| Test coverage (controller) | > 85% | N/A | +| Deployment rollout time | < 10 min | N/A | + +--- + +## Dependencies + +### External +- Kubernetes 1.19+ (webhook support) +- cert-manager (webhook cert management) +- etcd persistence (CRD state) + +### Internal +- `k8s.Client` - both API and controller +- `db.Database` - connection tracking, DB records +- `quota.Enforcer` - moved to webhook + +--- + +## Communication Plan + +### Developers +- Sync meetings: 2x/week during Phase 2-3 +- Slack channel: #streamspace-refactoring 
+- Decision log in `/docs/REFACTORING_DECISIONS.md` + +### Operators +- Deployment guide in `/docs/DEPLOYMENT_GUIDE.md` +- Backward compatibility for 2 releases +- Gradual rollout (staging → production) + +### Users +- Blog post: "Controller-Driven Architecture" +- No user-facing changes (transparent migration) +- Beta feature flag for early adopters + +--- + +## Rollback Plan + +### If Phase 1 (Design) Fails +- Continue with current architecture +- Loss: 2 weeks planning + +### If Phase 2 (Controller) Fails +- Disable controller, use API fallback +- Keep code in separate branch +- Restart with simplified design + +### If Phase 3 (Migration) Fails +- Keep old API handlers in place +- Use feature flag to toggle between old/new +- Gradual migration per resource type + +--- + +## Next Steps + +1. **Week 1:** Schedule design review with team +2. **Week 1:** Create CRD updates PR +3. **Week 2:** Approve controller design +4. **Week 3:** Start Task 2.1 implementation +5. **Month 2:** Begin API refactoring +6. **Month 3:** Deploy to staging +7. **Month 4:** Production rollout + diff --git a/docs/refactoring/README_K8S_CLIENT_ANALYSIS.md b/docs/refactoring/README_K8S_CLIENT_ANALYSIS.md new file mode 100644 index 00000000..556dcea8 --- /dev/null +++ b/docs/refactoring/README_K8S_CLIENT_ANALYSIS.md @@ -0,0 +1,319 @@ +# K8sClient Refactoring Analysis - README + +This directory contains three comprehensive documents analyzing k8sClient usage in the StreamSpace API and planning the migration to a controller-based architecture. + +## Documents Overview + +### 1. 
**K8S_CLIENT_REFACTORING_ANALYSIS.md** (Main Analysis - 21KB) +**Detailed technical analysis of all k8sClient usages** + +Contains: +- Complete analysis of 12 files using k8sClient +- 50+ K8s operations catalogued by type and resource +- Per-handler breakdown with code examples +- Recommendations for each file (stay in API vs move to controller) +- Summary tables and reference information + +**Best for:** +- Understanding current state +- Finding where specific operations are used +- Making refactoring decisions + +**Key Findings:** +- 50+ K8s operations across 12 files +- 15+ should move to controller (state transitions, persistence) +- 20+ should stay in API (read-only, real-time) +- 3+ support operations (administrative triggers) + +--- + +### 2. **K8S_CLIENT_REFACTORING_ROADMAP.md** (Timeline & Plan - 25KB) +**Phased refactoring plan with tasks, timeline, and risk mitigation** + +Contains: +- 3-phase roadmap (16 weeks total) +- 15+ specific tasks with acceptance criteria +- File-by-file migration mapping +- Risk analysis and mitigation strategies +- Success metrics and rollback plans +- Communication and deployment strategy + +**Phases:** +- **Phase 1 (Weeks 1-2):** Design controller reconcilers and webhooks +- **Phase 2 (Weeks 3-10):** Implement 4 new controllers +- **Phase 3 (Weeks 11-16):** Refactor API and migrate to production + +**Best for:** +- Planning the refactoring work +- Estimating effort and timeline +- Understanding interdependencies +- Risk assessment + +--- + +### 3. 
**K8S_CLIENT_OPERATIONS_CHECKLIST.md** (Execution Guide - 10KB) +**Operational checklist for moving specific K8s operations** + +Contains: +- Operations to move to controller (with line numbers) +- Operations to keep in API (with reasons) +- File reduction summary +- New files to create +- Phased implementation order +- Testing strategy +- Verification checklists + +**Best for:** +- Day-to-day execution +- Tracking which operations have been migrated +- Testing strategy +- Verification at each phase + +--- + +## Quick Start + +### For Managers/Leads +1. Read: **ROADMAP** (Executive Summary section) +2. Reference: **ANALYSIS** (Summary Table for priorities) +3. Plan: Use **ROADMAP** (Phases 1-3) for timeline + +### For Developers +1. Start: **ANALYSIS** (Your specific file section) +2. Design: **ROADMAP** (Corresponding task description) +3. Execute: **CHECKLIST** (Specific operations to move) +4. Test: **CHECKLIST** (Testing strategy section) + +### For Architects +1. Deep dive: **ANALYSIS** (Detailed File Analysis section) +2. Validate: **ROADMAP** (Task-specific designs) +3. Risk review: **ROADMAP** (Risk Mitigation section) +4. 
Approve: Use decision points in **CHECKLIST** + +--- + +## Key Insights + +### Current Problems +- **Scattered logic:** Session state transitions in API + activity tracker + connection tracker +- **Duplication:** Idle detection and auto-hibernation logic in two places +- **Implicit ordering:** API creates deployment, controller manages pod, no state coordination +- **Scalability:** In-process memory tracking (tracker.go) doesn't work at scale +- **Testing:** Hard to test K8s operations without full cluster + +### Proposed Solution +- **Controller-driven:** All state transitions in controller (source of truth) +- **Event-driven:** API signals controller via CRD fields +- **Webhook validation:** Quota checks at admission time (no duplicated logic) +- **Async operations:** API returns 202 Accepted, client polls for status +- **Persistent state:** All state in CRD, survives controller restarts + +### Expected Outcomes +- Session create from 200ms API call → 50ms webhook + async controller +- Idle detection from memory-based → CRD-based (survives restarts) +- Auto-start from in-process loop → event-driven (scales horizontally) +- Node ops from direct API calls → controller reconciliation +- Code size: API reduced 60% (state logic removed) + +--- + +## File Analysis Summary + +| File | Current State | Target State | Priority | Effort | +|------|----------------|--------------|----------|--------| +| **api/cmd/main.go** | k8s init | Stay same | - | 0h | +| **api/internal/api/handlers.go** | 50+ ops | 15 ops | HIGH | 40h | +| **api/internal/api/stubs.go** | 20+ ops | 10 ops | MEDIUM | 30h | +| **api/internal/handlers/applications.go** | 1 op | Stay same | - | 0h | +| **api/internal/handlers/nodes.go** | 9 ops | 2 ops | MEDIUM | 20h | +| **api/internal/handlers/dashboard.go** | 1 op | Stay same | - | 0h | +| **api/internal/handlers/activity.go** | 2 ops | 1 op | HIGH | 10h | +| **api/internal/activity/tracker.go** | 4 ops | 1 op | HIGH | 15h | +| 
**api/internal/tracker/tracker.go** | 2 ops | DELETE | HIGH | 5h | +| **api/internal/websocket/handlers.go** | 2 ops | Stay same | - | 0h | +| **NEW: controller/session_controller.go** | - | Create | HIGH | 50h | +| **NEW: controller/idle_reconciler.go** | - | Create | HIGH | 20h | +| **NEW: controller/autostart_reconciler.go** | - | Create | MEDIUM | 15h | +| **NEW: controller/nodeops_reconciler.go** | - | Create | MEDIUM | 30h | +| **NEW: controller/webhooks/session_validator.go** | - | Create | HIGH | 15h | + +**Total Effort:** ~250 hours (15-20 developer weeks) + +--- + +## Operations by Type + +### CREATE Operations (8 total) +``` +Sessions: Create CRD (API keeps, controller creates pod) +Templates: Create CRD (API keeps) +AppInstall: Create CRD (API keeps - trigger) +ConfigMaps: Create (move to controller) +Generic: Create via dynamic client (move to webhook) +``` + +### READ Operations (35+ total) +``` +List: Sessions, Templates, Nodes, Pods, Deployments, Services, Namespaces +Get: Sessions, Templates, Nodes, Pods, ConfigMaps +Logs: Pod logs streaming (keep in API for real-time) +``` + +### UPDATE Operations (18 total) +``` +Session State: (Move to controller) +Node Labels: (Move to controller) +Node Taints: (Move to controller) +ConfigMaps: (Move to controller) +Generic Resources:(Move to webhook) +``` + +### DELETE Operations (6 total) +``` +Sessions: (Move to controller) +Templates: (API keeps for cleanup) +Nodes (drain):(Move to controller) +Generic: (Move to webhook) +``` + +### SPECIAL Operations +``` +Patch: Node patches (labels, taints) - move to controller +Drain: Pod eviction - move to controller +Heartbeat: Activity tracking - keep in API (real-time) +``` + +--- + +## Architecture Changes + +### Before (Current) +``` +API Handler Controller (Kubebuilder) +├── CreateSession ├── Watch Session CRD +│ ├── Create Session CRD └── Create Deployment/PVC +│ └── Wait (BLOCKING) +│ +├── UpdateSessionState (DIRECT) +│ └── 
Update Session.Spec.State +│ +├── DeleteSession +│ └── Delete Session CRD (cascade) +│ +├── Activity Tracker (background) +│ └── Hibernation logic (IMPLICIT) +│ +└── Connection Tracker (background) + └── Auto-start logic (IMPLICIT) +``` + +### After (Proposed) +``` +API Handler (HTTP) WebSocket Admission Controller +├── CreateSession +│ ├── Create Session CRD (Pending) +│ └── Return 202 Accepted +│ └── Client polls for status +│ +├── ListSessions (read-only) +│ +├── RecordHeartbeat ← Update lastActivity +│ └── Update Session.Status.LastActivity +│ +└── Connection Events + └── Webhook:Connected() + Controller (Reconcilers) + ├── SessionReconciler + │ └── Pending→Running + │ Create Deployment/PVC + │ + ├── IdleReconciler + │ └── Watch lastActivity + │ Hibernated (scale 0) + │ + ├── AutoStartReconciler + │ └── Connection event + │ Running (scale 1) + │ + └── NodeOpsReconciler + └── Cordon/Drain/Labels + + ValidatingWebhook + └── Quota validation + Session creation check +``` + +--- + +## Next Steps + +### Immediate (This Week) +1. Review analysis documents with architecture team +2. Approve design approach +3. Schedule design review for Phase 1 tasks +4. Create tracking tickets + +### Short Term (Next Month) +1. Complete Phase 1 design +2. Begin Phase 2a (SessionReconciler) +3. Set up test infrastructure +4. Create design documentation + +### Medium Term (2-3 Months) +1. Complete Phase 2 (all 4 reconcilers) +2. Begin Phase 3 (API refactoring) +3. Deploy to staging +4. Load testing + +### Long Term (3-4 Months) +1. Production rollout +2. Monitor metrics +3. Gather feedback +4. Plan next iteration + +--- + +## Key Decision Points + +| Question | Analysis Answer | Next Action | +|----------|-----------------|-------------| +| Should session state move to controller? | YES - state consistency | Implement SessionReconciler | +| Keep API heartbeat endpoint? 
| YES - must be low-latency | Keep activity.UpdateSessionActivity() | +| When to move quota checks? | AFTER webhook design | Plan SessionValidator | +| Should tracker.go be deleted? | YES - logic in controller | Plan deletion in Phase 3a | +| Can node ops stay in API? | NO - infrastructure logic | Plan NodeOpsReconciler | + +--- + +## Documents Checklist + +- [x] K8S_CLIENT_REFACTORING_ANALYSIS.md - Complete technical analysis +- [x] K8S_CLIENT_REFACTORING_ROADMAP.md - Phased implementation plan +- [x] K8S_CLIENT_OPERATIONS_CHECKLIST.md - Day-to-day execution guide +- [x] README_K8S_CLIENT_ANALYSIS.md - This overview document + +## Related Documents to Update + +After using this analysis, update: +- [ ] CLAUDE.md - Add controller reconciler patterns +- [ ] ROADMAP.md - Phase 6 plan references +- [ ] docs/ARCHITECTURE.md - Add controller architecture diagrams +- [ ] docs/CONTROLLER_GUIDE.md - Add reconciler patterns + +--- + +## Support & Questions + +For questions about: +- **Specific operations**: See K8S_CLIENT_REFACTORING_ANALYSIS.md +- **Timeline/Planning**: See K8S_CLIENT_REFACTORING_ROADMAP.md +- **Execution**: See K8S_CLIENT_OPERATIONS_CHECKLIST.md +- **Architecture decisions**: Review all three documents and discussion in ROADMAP.md Risk Mitigation section + +--- + +**Analysis Completed:** 2025-11-19 +**Status:** Ready for team review and planning +**Estimated Effort:** 250 hours / 15-20 developer weeks +**Risk Level:** Medium (requires careful state machine design) + diff --git a/controller/.dockerignore b/k8s-controller/.dockerignore similarity index 100% rename from controller/.dockerignore rename to k8s-controller/.dockerignore diff --git a/controller/Dockerfile b/k8s-controller/Dockerfile similarity index 100% rename from controller/Dockerfile rename to k8s-controller/Dockerfile diff --git a/controller/INSTALL.md b/k8s-controller/INSTALL.md similarity index 100% rename from controller/INSTALL.md rename to k8s-controller/INSTALL.md diff --git 
a/controller/METRICS.md b/k8s-controller/METRICS.md similarity index 100% rename from controller/METRICS.md rename to k8s-controller/METRICS.md diff --git a/controller/Makefile b/k8s-controller/Makefile similarity index 100% rename from controller/Makefile rename to k8s-controller/Makefile diff --git a/controller/PROJECT b/k8s-controller/PROJECT similarity index 100% rename from controller/PROJECT rename to k8s-controller/PROJECT diff --git a/controller/README.md b/k8s-controller/README.md similarity index 100% rename from controller/README.md rename to k8s-controller/README.md diff --git a/controller/TESTING.md b/k8s-controller/TESTING.md similarity index 100% rename from controller/TESTING.md rename to k8s-controller/TESTING.md diff --git a/controller/api/v1alpha1/applicationinstall_types.go b/k8s-controller/api/v1alpha1/applicationinstall_types.go similarity index 100% rename from controller/api/v1alpha1/applicationinstall_types.go rename to k8s-controller/api/v1alpha1/applicationinstall_types.go diff --git a/controller/api/v1alpha1/groupversion_info.go b/k8s-controller/api/v1alpha1/groupversion_info.go similarity index 100% rename from controller/api/v1alpha1/groupversion_info.go rename to k8s-controller/api/v1alpha1/groupversion_info.go diff --git a/controller/api/v1alpha1/session_types.go b/k8s-controller/api/v1alpha1/session_types.go similarity index 100% rename from controller/api/v1alpha1/session_types.go rename to k8s-controller/api/v1alpha1/session_types.go diff --git a/controller/api/v1alpha1/template_types.go b/k8s-controller/api/v1alpha1/template_types.go similarity index 100% rename from controller/api/v1alpha1/template_types.go rename to k8s-controller/api/v1alpha1/template_types.go diff --git a/controller/api/v1alpha1/zz_generated.deepcopy.go b/k8s-controller/api/v1alpha1/zz_generated.deepcopy.go similarity index 70% rename from controller/api/v1alpha1/zz_generated.deepcopy.go rename to k8s-controller/api/v1alpha1/zz_generated.deepcopy.go index 
85476df7..6970d569 100644 --- a/controller/api/v1alpha1/zz_generated.deepcopy.go +++ b/k8s-controller/api/v1alpha1/zz_generated.deepcopy.go @@ -27,6 +27,106 @@ import ( "k8s.io/apimachinery/pkg/runtime" ) +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ApplicationInstall) DeepCopyInto(out *ApplicationInstall) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + out.Spec = in.Spec + in.Status.DeepCopyInto(&out.Status) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ApplicationInstall. +func (in *ApplicationInstall) DeepCopy() *ApplicationInstall { + if in == nil { + return nil + } + out := new(ApplicationInstall) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *ApplicationInstall) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ApplicationInstallList) DeepCopyInto(out *ApplicationInstallList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]ApplicationInstall, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ApplicationInstallList. +func (in *ApplicationInstallList) DeepCopy() *ApplicationInstallList { + if in == nil { + return nil + } + out := new(ApplicationInstallList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. 
+func (in *ApplicationInstallList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ApplicationInstallSpec) DeepCopyInto(out *ApplicationInstallSpec) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ApplicationInstallSpec. +func (in *ApplicationInstallSpec) DeepCopy() *ApplicationInstallSpec { + if in == nil { + return nil + } + out := new(ApplicationInstallSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ApplicationInstallStatus) DeepCopyInto(out *ApplicationInstallStatus) { + *out = *in + if in.LastTransitionTime != nil { + in, out := &in.LastTransitionTime, &out.LastTransitionTime + *out = (*in).DeepCopy() + } + if in.Conditions != nil { + in, out := &in.Conditions, &out.Conditions + *out = make([]metav1.Condition, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ApplicationInstallStatus. +func (in *ApplicationInstallStatus) DeepCopy() *ApplicationInstallStatus { + if in == nil { + return nil + } + out := new(ApplicationInstallStatus) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *ResourceUsage) DeepCopyInto(out *ResourceUsage) { *out = *in diff --git a/controller/cmd/main.go b/k8s-controller/cmd/main.go similarity index 82% rename from controller/cmd/main.go rename to k8s-controller/cmd/main.go index 990220c5..61e9827a 100644 --- a/controller/cmd/main.go +++ b/k8s-controller/cmd/main.go @@ -42,6 +42,7 @@ package main import ( + "context" "flag" "os" @@ -55,6 +56,7 @@ import ( streamv1alpha1 "github.com/streamspace/streamspace/api/v1alpha1" "github.com/streamspace/streamspace/controllers" + "github.com/streamspace/streamspace/pkg/events" _ "github.com/streamspace/streamspace/pkg/metrics" // Initialize custom metrics ) @@ -92,6 +94,11 @@ func main() { var metricsAddr string var enableLeaderElection bool var probeAddr string + var natsURL string + var natsUser string + var natsPassword string + var namespace string + var controllerID string // Parse command-line flags flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.") @@ -99,6 +106,11 @@ func main() { flag.BoolVar(&enableLeaderElection, "leader-elect", false, "Enable leader election for controller manager. 
"+ "Enabling this will ensure there is only one active controller manager.") + flag.StringVar(&natsURL, "nats-url", getEnv("NATS_URL", "nats://localhost:4222"), "NATS server URL") + flag.StringVar(&natsUser, "nats-user", getEnv("NATS_USER", ""), "NATS username") + flag.StringVar(&natsPassword, "nats-password", getEnv("NATS_PASSWORD", ""), "NATS password") + flag.StringVar(&namespace, "namespace", getEnv("NAMESPACE", "streamspace"), "Kubernetes namespace") + flag.StringVar(&controllerID, "controller-id", getEnv("CONTROLLER_ID", "streamspace-kubernetes-controller-1"), "Unique controller ID") // Setup logging options (can be configured via flags like --zap-log-level=debug) opts := zap.Options{ @@ -202,6 +214,31 @@ func main() { os.Exit(1) } + // Initialize NATS event subscriber for platform-agnostic event handling + setupLog.Info("initializing NATS event subscriber", "url", natsURL) + subscriber, err := events.NewSubscriber(events.Config{ + URL: natsURL, + User: natsUser, + Password: natsPassword, + }, mgr.GetClient(), namespace, controllerID) + + if err != nil { + setupLog.Error(err, "unable to create NATS subscriber") + setupLog.Info("continuing without NATS - controller will only watch CRDs directly") + } else { + // Start subscriber in background + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + defer subscriber.Close() + + go func() { + if err := subscriber.Start(ctx); err != nil { + setupLog.Error(err, "NATS subscriber error") + } + }() + setupLog.Info("NATS event subscriber started", "controller_id", controllerID) + } + // Start the manager and begin reconciliation loops // SetupSignalHandler() ensures graceful shutdown on SIGTERM/SIGINT setupLog.Info("starting manager") @@ -210,3 +247,11 @@ func main() { os.Exit(1) } } + +// getEnv gets an environment variable with a default fallback +func getEnv(key, defaultValue string) string { + if value := os.Getenv(key); value != "" { + return value + } + return defaultValue +} diff --git 
a/controller/config/crd/bases/stream.streamspace.io_connections.yaml b/k8s-controller/config/crd/bases/stream.streamspace.io_connections.yaml similarity index 100% rename from controller/config/crd/bases/stream.streamspace.io_connections.yaml rename to k8s-controller/config/crd/bases/stream.streamspace.io_connections.yaml diff --git a/controller/config/crd/bases/stream.streamspace.io_sessions.yaml b/k8s-controller/config/crd/bases/stream.streamspace.io_sessions.yaml similarity index 100% rename from controller/config/crd/bases/stream.streamspace.io_sessions.yaml rename to k8s-controller/config/crd/bases/stream.streamspace.io_sessions.yaml diff --git a/controller/config/crd/bases/stream.streamspace.io_templaterepositories.yaml b/k8s-controller/config/crd/bases/stream.streamspace.io_templaterepositories.yaml similarity index 100% rename from controller/config/crd/bases/stream.streamspace.io_templaterepositories.yaml rename to k8s-controller/config/crd/bases/stream.streamspace.io_templaterepositories.yaml diff --git a/controller/config/crd/bases/stream.streamspace.io_templates.yaml b/k8s-controller/config/crd/bases/stream.streamspace.io_templates.yaml similarity index 100% rename from controller/config/crd/bases/stream.streamspace.io_templates.yaml rename to k8s-controller/config/crd/bases/stream.streamspace.io_templates.yaml diff --git a/controller/config/default/kustomization.yaml b/k8s-controller/config/default/kustomization.yaml similarity index 100% rename from controller/config/default/kustomization.yaml rename to k8s-controller/config/default/kustomization.yaml diff --git a/controller/config/default/namespace.yaml b/k8s-controller/config/default/namespace.yaml similarity index 100% rename from controller/config/default/namespace.yaml rename to k8s-controller/config/default/namespace.yaml diff --git a/controller/config/manager/configmap.yaml b/k8s-controller/config/manager/configmap.yaml similarity index 100% rename from controller/config/manager/configmap.yaml 
rename to k8s-controller/config/manager/configmap.yaml diff --git a/controller/config/manager/deployment.yaml b/k8s-controller/config/manager/deployment.yaml similarity index 100% rename from controller/config/manager/deployment.yaml rename to k8s-controller/config/manager/deployment.yaml diff --git a/controller/config/manager/service.yaml b/k8s-controller/config/manager/service.yaml similarity index 100% rename from controller/config/manager/service.yaml rename to k8s-controller/config/manager/service.yaml diff --git a/controller/config/rbac/rbac.yaml b/k8s-controller/config/rbac/rbac.yaml similarity index 100% rename from controller/config/rbac/rbac.yaml rename to k8s-controller/config/rbac/rbac.yaml diff --git a/controller/config/samples/session_test.yaml b/k8s-controller/config/samples/session_test.yaml similarity index 100% rename from controller/config/samples/session_test.yaml rename to k8s-controller/config/samples/session_test.yaml diff --git a/controller/config/samples/template_chrome.yaml b/k8s-controller/config/samples/template_chrome.yaml similarity index 100% rename from controller/config/samples/template_chrome.yaml rename to k8s-controller/config/samples/template_chrome.yaml diff --git a/controller/config/samples/template_firefox.yaml b/k8s-controller/config/samples/template_firefox.yaml similarity index 100% rename from controller/config/samples/template_firefox.yaml rename to k8s-controller/config/samples/template_firefox.yaml diff --git a/controller/config/samples/template_gimp.yaml b/k8s-controller/config/samples/template_gimp.yaml similarity index 100% rename from controller/config/samples/template_gimp.yaml rename to k8s-controller/config/samples/template_gimp.yaml diff --git a/controller/config/samples/template_libreoffice.yaml b/k8s-controller/config/samples/template_libreoffice.yaml similarity index 100% rename from controller/config/samples/template_libreoffice.yaml rename to k8s-controller/config/samples/template_libreoffice.yaml diff 
--git a/controller/config/samples/template_ubuntu-desktop.yaml b/k8s-controller/config/samples/template_ubuntu-desktop.yaml similarity index 100% rename from controller/config/samples/template_ubuntu-desktop.yaml rename to k8s-controller/config/samples/template_ubuntu-desktop.yaml diff --git a/controller/config/samples/template_vscode.yaml b/k8s-controller/config/samples/template_vscode.yaml similarity index 100% rename from controller/config/samples/template_vscode.yaml rename to k8s-controller/config/samples/template_vscode.yaml diff --git a/controller/controllers/applicationinstall_controller.go b/k8s-controller/controllers/applicationinstall_controller.go similarity index 100% rename from controller/controllers/applicationinstall_controller.go rename to k8s-controller/controllers/applicationinstall_controller.go diff --git a/controller/controllers/hibernation_controller.go b/k8s-controller/controllers/hibernation_controller.go similarity index 100% rename from controller/controllers/hibernation_controller.go rename to k8s-controller/controllers/hibernation_controller.go diff --git a/controller/controllers/hibernation_controller_test.go b/k8s-controller/controllers/hibernation_controller_test.go similarity index 100% rename from controller/controllers/hibernation_controller_test.go rename to k8s-controller/controllers/hibernation_controller_test.go diff --git a/controller/controllers/session_controller.go b/k8s-controller/controllers/session_controller.go similarity index 100% rename from controller/controllers/session_controller.go rename to k8s-controller/controllers/session_controller.go diff --git a/controller/controllers/session_controller_test.go b/k8s-controller/controllers/session_controller_test.go similarity index 100% rename from controller/controllers/session_controller_test.go rename to k8s-controller/controllers/session_controller_test.go diff --git a/controller/controllers/suite_test.go b/k8s-controller/controllers/suite_test.go similarity index 
100% rename from controller/controllers/suite_test.go rename to k8s-controller/controllers/suite_test.go diff --git a/controller/controllers/template_controller.go b/k8s-controller/controllers/template_controller.go similarity index 100% rename from controller/controllers/template_controller.go rename to k8s-controller/controllers/template_controller.go diff --git a/controller/controllers/template_controller_test.go b/k8s-controller/controllers/template_controller_test.go similarity index 100% rename from controller/controllers/template_controller_test.go rename to k8s-controller/controllers/template_controller_test.go diff --git a/controller/go.mod b/k8s-controller/go.mod similarity index 93% rename from controller/go.mod rename to k8s-controller/go.mod index 498d6dc3..6edbeeb9 100644 --- a/controller/go.mod +++ b/k8s-controller/go.mod @@ -5,6 +5,8 @@ go 1.24.0 toolchain go1.24.7 require ( + github.com/google/uuid v1.6.0 + github.com/nats-io/nats.go v1.37.0 github.com/onsi/ginkgo/v2 v2.21.0 github.com/onsi/gomega v1.35.1 github.com/prometheus/client_golang v1.22.0 @@ -32,8 +34,10 @@ require ( github.com/google/gnostic-models v0.7.0 // indirect github.com/google/go-cmp v0.7.0 // indirect github.com/google/pprof v0.0.0-20241029153458-d1b30febd7db // indirect - github.com/google/uuid v1.6.0 // indirect github.com/josharian/intern v1.0.0 // indirect + github.com/klauspost/compress v1.17.2 // indirect + github.com/nats-io/nkeys v0.4.7 // indirect + github.com/nats-io/nuid v1.0.1 // indirect github.com/json-iterator/go v1.1.12 // indirect github.com/mailru/easyjson v0.7.7 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect diff --git a/controller/go.sum b/k8s-controller/go.sum similarity index 100% rename from controller/go.sum rename to k8s-controller/go.sum diff --git a/k8s-controller/pkg/events/handlers.go b/k8s-controller/pkg/events/handlers.go new file mode 100644 index 00000000..f737bd3c --- /dev/null +++ 
b/k8s-controller/pkg/events/handlers.go
@@ -0,0 +1,441 @@
+// Package events provides NATS event handlers for the StreamSpace controller.
+package events
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"log"
+	"time"
+
+	"github.com/google/uuid"
+	streamv1alpha1 "github.com/streamspace/streamspace/api/v1alpha1"
+	appsv1 "k8s.io/api/apps/v1"
+	corev1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/errors"
+	"k8s.io/apimachinery/pkg/api/resource"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/types"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+)
+
+// handleSessionCreate handles session creation events.
+//
+// It decodes a SessionCreateEvent from data and creates a Session object named
+// after event.SessionID in the subscriber's namespace. The requested memory and
+// CPU are applied as both requests and limits.
+//
+// An error is returned when the payload cannot be decoded, when a resource
+// quantity is malformed, or when the create call fails for a reason other than
+// AlreadyExists. Quantity and create failures are also reported through
+// publishSessionStatus with status "failed". An already existing Session is
+// logged and treated as success.
+func (s *Subscriber) handleSessionCreate(ctx context.Context, data []byte) error {
+	var event SessionCreateEvent
+	if err := json.Unmarshal(data, &event); err != nil {
+		return fmt.Errorf("failed to unmarshal SessionCreateEvent: %w", err)
+	}
+
+	log.Printf("Handling session create event: %s for user %s", event.SessionID, event.UserID)
+
+	// The quantities come from an event payload, so parse them with
+	// ParseQuantity and report the failure: MustParse would panic on an empty
+	// or malformed value.
+	memory, err := resource.ParseQuantity(event.Resources.Memory)
+	if err != nil {
+		s.publishSessionStatus(event.SessionID, "failed", "", fmt.Sprintf("Invalid memory quantity %q: %v", event.Resources.Memory, err))
+		return fmt.Errorf("invalid memory quantity %q: %w", event.Resources.Memory, err)
+	}
+	cpu, err := resource.ParseQuantity(event.Resources.CPU)
+	if err != nil {
+		s.publishSessionStatus(event.SessionID, "failed", "", fmt.Sprintf("Invalid CPU quantity %q: %v", event.Resources.CPU, err))
+		return fmt.Errorf("invalid CPU quantity %q: %w", event.Resources.CPU, err)
+	}
+
+	// Create Session CRD
+	session := &streamv1alpha1.Session{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      event.SessionID,
+			Namespace: s.namespace,
+			Labels: map[string]string{
+				"streamspace.io/user":     event.UserID,
+				"streamspace.io/template": event.TemplateID,
+			},
+		},
+		Spec: streamv1alpha1.SessionSpec{
+			User:           event.UserID,
+			Template:       event.TemplateID,
+			State:          "running",
+			PersistentHome: event.PersistentHome,
+			IdleTimeout:    event.IdleTimeout,
+			Resources: corev1.ResourceRequirements{
+				Requests: corev1.ResourceList{
+					corev1.ResourceMemory: memory,
+					corev1.ResourceCPU:    cpu,
+				},
+				// Independent copies so requests and limits never share state.
+				Limits: corev1.ResourceList{
+					corev1.ResourceMemory: memory.DeepCopy(),
+					corev1.ResourceCPU:    cpu.DeepCopy(),
+				},
+			},
+		},
+	}
+
+	if err := s.client.Create(ctx, session); err != nil {
+		if errors.IsAlreadyExists(err) {
+			// Nothing was created by this event; do not log a creation.
+			log.Printf("Session %s already exists", event.SessionID)
+			return nil
+		}
+		s.publishSessionStatus(event.SessionID, "failed", "", fmt.Sprintf("Failed to create session: %v", err))
+		return fmt.Errorf("failed to create session: %w", err)
+	}
+
+	log.Printf("Session %s created successfully", event.SessionID)
+	return nil
+}
+
+// handleSessionDelete handles session deletion events.
+//
+// It decodes a SessionDeleteEvent and deletes the Session named
+// event.SessionID in the subscriber's namespace. A Session that is already
+// gone is logged and treated as success; any other delete failure is returned.
+func (s *Subscriber) handleSessionDelete(ctx context.Context, data []byte) error {
+	var event SessionDeleteEvent
+	if err := json.Unmarshal(data, &event); err != nil {
+		return fmt.Errorf("failed to unmarshal SessionDeleteEvent: %w", err)
+	}
+
+	log.Printf("Handling session delete event: %s", event.SessionID)
+
+	// Delete Session CRD
+	session := &streamv1alpha1.Session{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      event.SessionID,
+			Namespace: s.namespace,
+		},
+	}
+
+	if err := s.client.Delete(ctx, session); err != nil {
+		if errors.IsNotFound(err) {
+			// Nothing was deleted by this event; do not log a deletion.
+			log.Printf("Session %s already deleted", event.SessionID)
+			return nil
+		}
+		return fmt.Errorf("failed to delete session: %w", err)
+	}
+
+	log.Printf("Session %s deleted successfully", event.SessionID)
+	return nil
+}
+
+// handleSessionHibernate handles session hibernation events.
+//
+// The Session named event.SessionID is fetched, its Spec.State is set to
+// "hibernated" and written back. The Deployment "ss-<SessionID>" is then
+// scaled to zero replicas; a missing Deployment is tolerated. On success a
+// "hibernated" status is published.
+func (s *Subscriber) handleSessionHibernate(ctx context.Context, data []byte) error {
+	evt := SessionHibernateEvent{}
+	if decodeErr := json.Unmarshal(data, &evt); decodeErr != nil {
+		return fmt.Errorf("failed to unmarshal SessionHibernateEvent: %w", decodeErr)
+	}
+
+	log.Printf("Handling session hibernate event: %s", evt.SessionID)
+
+	// Load the Session and flip its desired state.
+	sessionKey := types.NamespacedName{Name: evt.SessionID, Namespace: s.namespace}
+	var current streamv1alpha1.Session
+	if getErr := s.client.Get(ctx, sessionKey, &current); getErr != nil {
+		return fmt.Errorf("failed to get session: %w", getErr)
+	}
+	current.Spec.State = "hibernated"
+	if updateErr := s.client.Update(ctx, &current); updateErr != nil {
+		return fmt.Errorf("failed to update session state: %w", updateErr)
+	}
+
+	// Scale the backing Deployment down; NotFound is not an error here.
+	workloadKey := types.NamespacedName{Name: "ss-" + evt.SessionID, Namespace: s.namespace}
+	var workload appsv1.Deployment
+	switch getErr := s.client.Get(ctx, workloadKey, &workload); {
+	case getErr == nil:
+		var none int32
+		workload.Spec.Replicas = &none
+		if scaleErr := s.client.Update(ctx, &workload); scaleErr != nil {
+			return fmt.Errorf("failed to scale deployment to 0: %w", scaleErr)
+		}
+	case !errors.IsNotFound(getErr):
+		return fmt.Errorf("failed to get deployment: %w", getErr)
+	}
+
+	s.publishSessionStatus(evt.SessionID, "hibernated", "Hibernated", "Session hibernated")
+	log.Printf("Session %s hibernated successfully", evt.SessionID)
+	return nil
+}
+
+// handleSessionWake handles session wake events.
+//
+// The Session named event.SessionID is fetched, its Spec.State is set to
+// "running" and written back. The Deployment "ss-<SessionID>" is then scaled
+// to one replica; a missing Deployment is tolerated. On success a "running"
+// status is published.
+func (s *Subscriber) handleSessionWake(ctx context.Context, data []byte) error {
+	evt := SessionWakeEvent{}
+	if decodeErr := json.Unmarshal(data, &evt); decodeErr != nil {
+		return fmt.Errorf("failed to unmarshal SessionWakeEvent: %w", decodeErr)
+	}
+
+	log.Printf("Handling session wake event: %s", evt.SessionID)
+
+	// Load the Session and flip its desired state.
+	sessionKey := types.NamespacedName{Name: evt.SessionID, Namespace: s.namespace}
+	var current streamv1alpha1.Session
+	if getErr := s.client.Get(ctx, sessionKey, &current); getErr != nil {
+		return fmt.Errorf("failed to get session: %w", getErr)
+	}
+	current.Spec.State = "running"
+	if updateErr := s.client.Update(ctx, &current); updateErr != nil {
+		return fmt.Errorf("failed to update session state: %w", updateErr)
+	}
+
+	// Scale the backing Deployment back up; NotFound is not an error here.
+	workloadKey := types.NamespacedName{Name: "ss-" + evt.SessionID, Namespace: s.namespace}
+	var workload appsv1.Deployment
+	switch getErr := s.client.Get(ctx, workloadKey, &workload); {
+	case getErr == nil:
+		single := int32(1)
+		workload.Spec.Replicas = &single
+		if scaleErr := s.client.Update(ctx, &workload); scaleErr != nil {
+			return fmt.Errorf("failed to scale deployment to 1: %w", scaleErr)
+		}
+	case !errors.IsNotFound(getErr):
+		return fmt.Errorf("failed to get deployment: %w", getErr)
+	}
+
+	s.publishSessionStatus(evt.SessionID, "running", "Running", "Session woken")
+	log.Printf("Session %s woken successfully", evt.SessionID)
+	return nil
+}
+
+// handleAppInstall handles application installation events.
+//
+// It decodes an AppInstallEvent and creates an ApplicationInstall object named
+// event.InstallID in the subscriber's namespace, copying the template
+// metadata and manifest from the event into its spec. An already existing
+// object is logged and treated as success. Any other create failure is
+// published through publishAppStatus with status "failed" and returned.
+func (s *Subscriber) handleAppInstall(ctx context.Context, data []byte) error {
+	var event AppInstallEvent
+	if err := json.Unmarshal(data, &event); err != nil {
+		return fmt.Errorf("failed to unmarshal AppInstallEvent: %w", err)
+	}
+
+	log.Printf("Handling app install event: %s (%s)", event.InstallID, event.TemplateName)
+
+	// Create ApplicationInstall CRD
+	// NOTE(review): label values are taken verbatim from the event; values that
+	// are not valid Kubernetes label values will make the create call fail.
+	appInstall := &streamv1alpha1.ApplicationInstall{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      event.InstallID,
+			Namespace: s.namespace,
+			Labels: map[string]string{
+				"streamspace.io/template":     event.TemplateName,
+				"streamspace.io/category":     event.Category,
+				"streamspace.io/installed-by": event.InstalledBy,
+			},
+		},
+		Spec: streamv1alpha1.ApplicationInstallSpec{
+			TemplateName:      event.TemplateName,
+			DisplayName:       event.DisplayName,
+			Description:       event.Description,
+			Category:          event.Category,
+			Icon:              event.IconURL,
+			Manifest:          event.Manifest,
+			CatalogTemplateID: event.CatalogTemplateID,
+		},
+	}
+
+	if err := s.client.Create(ctx, appInstall); err != nil {
+		if errors.IsAlreadyExists(err) {
+			// Nothing was created by this event; do not log a creation.
+			log.Printf("ApplicationInstall %s already exists", event.InstallID)
+			return nil
+		}
+		s.publishAppStatus(event.InstallID, "failed", event.TemplateName, fmt.Sprintf("Failed to create ApplicationInstall: %v", err))
+		return fmt.Errorf("failed to create ApplicationInstall: %w", err)
+	}
+
+	log.Printf("ApplicationInstall %s created successfully", event.InstallID)
+	return nil
+}
+
+// handleAppUninstall handles application uninstallation events.
+//
+// It decodes an AppUninstallEvent and deletes the ApplicationInstall named
+// event.InstallID in the subscriber's namespace. An object that is already
+// gone is logged and treated as success; any other delete failure is returned.
+func (s *Subscriber) handleAppUninstall(ctx context.Context, data []byte) error {
+	var event AppUninstallEvent
+	if err := json.Unmarshal(data, &event); err != nil {
+		return fmt.Errorf("failed to unmarshal AppUninstallEvent: %w", err)
+	}
+
+	log.Printf("Handling app uninstall event: %s", event.InstallID)
+
+	// Delete ApplicationInstall CRD (will cascade delete Template due to owner reference)
+	appInstall := &streamv1alpha1.ApplicationInstall{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      event.InstallID,
+			Namespace: s.namespace,
+		},
+	}
+
+	if err := s.client.Delete(ctx, appInstall); err != nil {
+		if errors.IsNotFound(err) {
+			// Nothing was deleted by this event; do not log a deletion.
+			log.Printf("ApplicationInstall %s already deleted", event.InstallID)
+			return nil
+		}
+		return fmt.Errorf("failed to delete ApplicationInstall: %w", err)
+	}
+
+	log.Printf("ApplicationInstall %s deleted successfully", event.InstallID)
+	return nil
+}
+
+// handleTemplateCreate handles template creation events.
+//
+// The event is decoded and logged only; no Kubernetes object is created here.
+// A decode failure is returned as an error.
+func (s *Subscriber) handleTemplateCreate(ctx context.Context, data []byte) error {
+	var event TemplateCreateEvent
+	if err := json.Unmarshal(data, &event); err != nil {
+		return fmt.Errorf("failed to unmarshal TemplateCreateEvent: %w", err)
+	}
+
+	log.Printf("Handling template create event: %s", event.TemplateID)
+	// Templates are typically created via the API's k8sClient or via ApplicationInstall
+	// This handler is for future use when templates are created purely through events
+	log.Printf("Template create event received for %s (handled by API)", event.TemplateID)
+	return nil
+}
+
+// handleTemplateDelete handles template deletion events.
+//
+// The event is decoded and logged only; no Kubernetes object is deleted here.
+// A decode failure is returned as an error.
+func (s *Subscriber) handleTemplateDelete(ctx context.Context, data []byte) error {
+	evt := TemplateDeleteEvent{}
+	if decodeErr := json.Unmarshal(data, &evt); decodeErr != nil {
+		return fmt.Errorf("failed to unmarshal TemplateDeleteEvent: %w", decodeErr)
+	}
+
+	// Deletion itself is performed elsewhere (see the log text); this handler
+	// only records that the event arrived.
+	templateID := evt.TemplateID
+	log.Printf("Handling template delete event: %s", templateID)
+	log.Printf("Template delete event received for %s (handled by API)", templateID)
+	return nil
+}
+
+// handleNodeCordon handles node cordon events.
+//
+// The Node named event.NodeName is fetched, Spec.Unschedulable is set to true
+// and the Node is written back. Decode, get and update failures are returned.
+func (s *Subscriber) handleNodeCordon(ctx context.Context, data []byte) error {
+	evt := NodeCordonEvent{}
+	if decodeErr := json.Unmarshal(data, &evt); decodeErr != nil {
+		return fmt.Errorf("failed to unmarshal NodeCordonEvent: %w", decodeErr)
+	}
+
+	log.Printf("Handling node cordon event: %s", evt.NodeName)
+
+	// Nodes are cluster-scoped, so the key carries a name only.
+	nodeKey := types.NamespacedName{Name: evt.NodeName}
+	var target corev1.Node
+	if getErr := s.client.Get(ctx, nodeKey, &target); getErr != nil {
+		return fmt.Errorf("failed to get node: %w", getErr)
+	}
+
+	target.Spec.Unschedulable = true
+	if updateErr := s.client.Update(ctx, &target); updateErr != nil {
+		return fmt.Errorf("failed to cordon node: %w", updateErr)
+	}
+
+	log.Printf("Node %s cordoned successfully", evt.NodeName)
+	return nil
+}
+
+// handleNodeUncordon handles node uncordon events.
+//
+// The Node named event.NodeName is fetched, Spec.Unschedulable is cleared and
+// the Node is written back. Decode, get and update failures are returned.
+func (s *Subscriber) handleNodeUncordon(ctx context.Context, data []byte) error {
+	var event NodeUncordonEvent
+	if err := json.Unmarshal(data, &event); err != nil {
+		return fmt.Errorf("failed to unmarshal NodeUncordonEvent: %w", err)
+	}
+
+	log.Printf("Handling node uncordon event: %s", event.NodeName)
+
+	// Get the node
+	node := &corev1.Node{}
+	if err := s.client.Get(ctx, types.NamespacedName{Name: event.NodeName}, node); err != nil {
+		return fmt.Errorf("failed to get node: %w", err)
+	}
+
+	// Clear unschedulable
+	node.Spec.Unschedulable = false
+	if err := s.client.Update(ctx, node); err != nil {
+		return fmt.Errorf("failed to uncordon node: %w", err)
+	}
+
+	log.Printf("Node %s uncordoned successfully", event.NodeName)
+	return nil
+}
+
+// handleNodeDrain handles node drain events.
+//
+// The node is cordoned first, then every pod scheduled on it is deleted with
+// the requested grace period (30 seconds when the event does not set one).
+// Mirror pods and pods controlled by a DaemonSet are left in place. A pod that
+// is already gone is ignored; other per-pod delete failures are logged, the
+// remaining pods are still processed, and an error summarising the number of
+// failures is returned at the end.
+func (s *Subscriber) handleNodeDrain(ctx context.Context, data []byte) error {
+	var event NodeDrainEvent
+	if err := json.Unmarshal(data, &event); err != nil {
+		return fmt.Errorf("failed to unmarshal NodeDrainEvent: %w", err)
+	}
+
+	log.Printf("Handling node drain event: %s", event.NodeName)
+
+	// First cordon the node
+	node := &corev1.Node{}
+	if err := s.client.Get(ctx, types.NamespacedName{Name: event.NodeName}, node); err != nil {
+		return fmt.Errorf("failed to get node: %w", err)
+	}
+
+	node.Spec.Unschedulable = true
+	if err := s.client.Update(ctx, node); err != nil {
+		return fmt.Errorf("failed to cordon node before drain: %w", err)
+	}
+
+	// List pods on the node
+	// NOTE(review): a cache-backed controller-runtime client only accepts this
+	// field selector when a "spec.nodeName" index has been registered on the
+	// manager's field indexer - confirm one is set up.
+	podList := &corev1.PodList{}
+	if err := s.client.List(ctx, podList, client.MatchingFields{"spec.nodeName": event.NodeName}); err != nil {
+		return fmt.Errorf("failed to list pods on node: %w", err)
+	}
+
+	gracePeriod := int64(30)
+	if event.GracePeriodSeconds != nil {
+		gracePeriod = *event.GracePeriodSeconds
+	}
+	// NOTE(review): this is a plain delete, not the Eviction API, so
+	// PodDisruptionBudgets are not consulted.
+	deleteOpts := &client.DeleteOptions{
+		GracePeriodSeconds: &gracePeriod,
+	}
+
+	failed := 0
+	for i := range podList.Items {
+		pod := &podList.Items[i]
+
+		// Skip mirror pods: they are identified by the config.mirror annotation.
+		if pod.Annotations["kubernetes.io/config.mirror"] != "" {
+			continue
+		}
+		// Skip DaemonSet pods. The check must continue the pod loop itself; a
+		// continue inside a loop over owner references would skip nothing.
+		if owner := metav1.GetControllerOf(pod); owner != nil && owner.Kind == "DaemonSet" {
+			continue
+		}
+
+		if err := s.client.Delete(ctx, pod, deleteOpts); err != nil {
+			if !errors.IsNotFound(err) {
+				failed++
+				log.Printf("Failed to evict pod %s: %v", pod.Name, err)
+			}
+			continue
+		}
+		log.Printf("Evicted pod %s from node %s", pod.Name, event.NodeName)
+	}
+
+	if failed > 0 {
+		return fmt.Errorf("failed to evict %d pod(s) from node %s", failed, event.NodeName)
+	}
+
+	log.Printf("Node %s drained successfully", event.NodeName)
+	return nil
+}
+
+// publishSessionStatus publishes a session status update.
+//
+// A SessionStatusEvent with a fresh UUID, the current time and this
+// subscriber's controller ID is sent on SubjectSessionStatus. Publishing is
+// best-effort: a failure is logged and not returned.
+func (s *Subscriber) publishSessionStatus(sessionID, status, phase, message string) {
+	event := SessionStatusEvent{
+		EventID:      uuid.New().String(),
+		Timestamp:    time.Now(),
+		SessionID:    sessionID,
+		Status:       status,
+		Phase:        phase,
+		Message:      message,
+		ControllerID: s.controllerID,
+	}
+
+	if err := s.publishStatus(SubjectSessionStatus, event); err != nil {
+		log.Printf("Failed to publish session status: %v", err)
+	}
+}
+
+// publishAppStatus publishes an app installation status update.
+//
+// An AppStatusEvent with a fresh UUID, the current time and this subscriber's
+// controller ID is sent on SubjectAppStatus. Publishing is best-effort: a
+// failure is logged and not returned.
+func (s *Subscriber) publishAppStatus(installID, status, templateName, message string) {
+	event := AppStatusEvent{
+		EventID:      uuid.New().String(),
+		Timestamp:    time.Now(),
+		InstallID:    installID,
+		Status:       status,
+		TemplateName: templateName,
+		Message:      message,
+		ControllerID: s.controllerID,
+	}
+
+	if err := s.publishStatus(SubjectAppStatus, event); err != nil {
+		log.Printf("Failed to publish app status: %v", err)
+	}
+}
diff --git a/k8s-controller/pkg/events/subscriber.go b/k8s-controller/pkg/events/subscriber.go
new file mode 100644
index 00000000..0ae0ad44
--- /dev/null
+++ b/k8s-controller/pkg/events/subscriber.go
@@ -0,0 +1,156 @@
+// Package events provides NATS event subscription for the StreamSpace controller.
+//
+// This package enables the controller to receive events from the API and perform
+// platform-specific operations (creating pods, services, PVCs, etc.).
+//
+// The subscriber listens to NATS subjects and triggers the appropriate
+// Kubernetes operations when events are received.
+package events
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"log"
+	"time"
+
+	"github.com/nats-io/nats.go"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+)
+
+// Config carries the NATS connection settings: the server URL and optional
+// user/password credentials.
+type Config struct {
+	URL      string
+	User     string
+	Password string
+}
+
+// Subscriber owns a NATS connection and dispatches incoming messages to the
+// handler registered for their subject.
+type Subscriber struct {
+	conn         *nats.Conn
+	js           nats.JetStreamContext
+	client       client.Client
+	namespace    string
+	controllerID string
+	platform     string
+	handlers     map[string]EventHandler
+}
+
+// EventHandler processes the raw payload of one event and reports failure
+// through its error result.
+type EventHandler func(ctx context.Context, data []byte) error
+
+// NewSubscriber connects to NATS and returns a Subscriber with every handler
+// registered.
+//
+// An empty cfg.URL falls back to nats.DefaultURL, and credentials are only
+// sent when cfg.User is non-empty. The connection is configured to retry
+// forever with a two second wait between attempts. An error is returned when
+// the connection or the JetStream context cannot be created; in the latter
+// case the connection is closed first.
+func NewSubscriber(cfg Config, k8sClient client.Client, namespace, controllerID string) (*Subscriber, error) {
+	serverURL := cfg.URL
+	if serverURL == "" {
+		serverURL = nats.DefaultURL
+	}
+
+	// Assemble the connection options; MaxReconnects(-1) means no limit.
+	connectOpts := make([]nats.Option, 0, 4)
+	connectOpts = append(connectOpts,
+		nats.Name("streamspace-kubernetes-controller"),
+		nats.ReconnectWait(2*time.Second),
+		nats.MaxReconnects(-1),
+	)
+	if cfg.User != "" {
+		connectOpts = append(connectOpts, nats.UserInfo(cfg.User, cfg.Password))
+	}
+
+	nc, connErr := nats.Connect(serverURL, connectOpts...)
+	if connErr != nil {
+		return nil, fmt.Errorf("failed to connect to NATS: %w", connErr)
+	}
+
+	// The JetStream context is kept on the Subscriber alongside the connection.
+	jsCtx, jsErr := nc.JetStream()
+	if jsErr != nil {
+		nc.Close()
+		return nil, fmt.Errorf("failed to create JetStream context: %w", jsErr)
+	}
+
+	sub := &Subscriber{handlers: make(map[string]EventHandler)}
+	sub.conn = nc
+	sub.js = jsCtx
+	sub.client = k8sClient
+	sub.namespace = namespace
+	sub.controllerID = controllerID
+	sub.platform = PlatformKubernetes
+
+	sub.registerHandlers()
+	return sub, nil
+}
+
+// registerHandlers fills s.handlers with one handler per supported subject:
+// session, application, template and node events.
+func (s *Subscriber) registerHandlers() {
+	defaults := map[string]EventHandler{
+		// Session events
+		SubjectSessionCreate:    s.handleSessionCreate,
+		SubjectSessionDelete:    s.handleSessionDelete,
+		SubjectSessionHibernate: s.handleSessionHibernate,
+		SubjectSessionWake:      s.handleSessionWake,
+
+		// Application events
+		SubjectAppInstall:   s.handleAppInstall,
+		SubjectAppUninstall: s.handleAppUninstall,
+
+		// Template events
+		SubjectTemplateCreate: s.handleTemplateCreate,
+		SubjectTemplateDelete: s.handleTemplateDelete,
+
+		// Node events
+		SubjectNodeCordon:   s.handleNodeCordon,
+		SubjectNodeUncordon: s.handleNodeUncordon,
+		SubjectNodeDrain:    s.handleNodeDrain,
+	}
+	for subject, handler := range defaults {
+		s.handlers[subject] = handler
+	}
+}
+
+// Start starts the subscriber and begins processing events.
+//
+// For every registered handler it subscribes to "<subject>.<platform>" and
+// then blocks until ctx is cancelled, at which point it returns nil. If any
+// subscription cannot be created an error is returned immediately. In both
+// cases every subscription created by this call is removed before returning,
+// so none is left active after a partial failure or a shutdown.
+//
+// Handler errors are logged and do not stop the subscriber.
+//
+// NOTE(review): plain Subscribe is used without a queue group, so each
+// controller instance subscribed to the same subject receives every message -
+// confirm that is intended when several replicas run.
+func (s *Subscriber) Start(ctx context.Context) error {
+	subs := make([]*nats.Subscription, 0, len(s.handlers))
+	defer func() {
+		for _, sub := range subs {
+			// A closed connection has already dropped its subscriptions.
+			if err := sub.Unsubscribe(); err != nil && err != nats.ErrConnectionClosed {
+				log.Printf("Failed to unsubscribe from %s: %v", sub.Subject, err)
+			}
+		}
+	}()
+
+	// Subscribe to all registered subjects with platform filter
+	for subject, handler := range s.handlers {
+		// Subscribe to platform-specific subject
+		platformSubject := fmt.Sprintf("%s.%s", subject, s.platform)
+
+		// The handler is captured directly; looking it up again in s.handlers
+		// inside the callback could never miss.
+		sub, err := s.conn.Subscribe(platformSubject, func(msg *nats.Msg) {
+			if err := handler(ctx, msg.Data); err != nil {
+				log.Printf("Error handling event %s: %v", subject, err)
+			}
+		})
+		if err != nil {
+			return fmt.Errorf("failed to subscribe to %s: %w", platformSubject, err)
+		}
+		subs = append(subs, sub)
+
+		log.Printf("Subscribed to NATS subject: %s", platformSubject)
+	}
+
+	// Block until context is cancelled
+	<-ctx.Done()
+	return nil
+}
+
+// Close closes the NATS connection. It is a no-op when the subscriber has no
+// connection.
+func (s *Subscriber) Close() {
+	if s.conn != nil {
+		s.conn.Close()
+	}
+}
+
+// publishStatus publishes a status update event back to NATS.
+//
+// The event is JSON-encoded and published on subject. The marshalling or
+// publish error, if any, is returned unchanged.
+func (s *Subscriber) publishStatus(subject string, event interface{}) error {
+	data, err := json.Marshal(event)
+	if err != nil {
+		return err
+	}
+	return s.conn.Publish(subject, data)
+}
diff --git a/k8s-controller/pkg/events/types.go b/k8s-controller/pkg/events/types.go
new file mode 100644
index 00000000..d1ed7f22
--- /dev/null
+++ b/k8s-controller/pkg/events/types.go
@@ -0,0 +1,182 @@
+// Package events provides NATS event types for the StreamSpace controller.
+package events
+
+import (
+	"time"
+)
+
+// NATS subject constants - must match API events package
+//
+// Subscriber.Start appends "." and the platform name to each subject that has
+// a registered handler before subscribing. SubjectSessionStatus and
+// SubjectAppStatus are the subjects the status publishers send on, without a
+// platform suffix.
+const (
+	SubjectSessionCreate    = "streamspace.session.create"
+	SubjectSessionDelete    = "streamspace.session.delete"
+	SubjectSessionHibernate = "streamspace.session.hibernate"
+	SubjectSessionWake      = "streamspace.session.wake"
+	SubjectSessionStatus    = "streamspace.session.status"
+
+	SubjectAppInstall   = "streamspace.app.install"
+	SubjectAppUninstall = "streamspace.app.uninstall"
+	SubjectAppStatus    = "streamspace.app.status"
+
+	SubjectTemplateCreate = "streamspace.template.create"
+	SubjectTemplateDelete = "streamspace.template.delete"
+
+	SubjectNodeCordon   = "streamspace.node.cordon"
+	SubjectNodeUncordon = "streamspace.node.uncordon"
+	SubjectNodeDrain    = "streamspace.node.drain"
+
+	// NOTE(review): no handler or publisher for the heartbeat subject is
+	// visible in this package's subscriber or handlers.
+	SubjectControllerHeartbeat = "streamspace.controller.heartbeat"
+)
+
+// Platform constants
+//
+// NewSubscriber sets the subscriber's platform to PlatformKubernetes; the
+// other values are not referenced by the subscriber or handlers.
+const (
+	PlatformKubernetes = "kubernetes"
+	PlatformDocker     = "docker"
+	PlatformHyperV     = "hyperv"
+	PlatformVCenter    = "vcenter"
+)
+
+// SessionCreateEvent is received when a new session should be created.
+//
+// handleSessionCreate uses SessionID as the Session name, UserID and
+// TemplateID for both labels and spec fields, Resources.Memory and
+// Resources.CPU as requests and limits, and copies PersistentHome and
+// IdleTimeout into the spec. EventID, Timestamp, Platform and Metadata are
+// decoded but not read by that handler.
+type SessionCreateEvent struct {
+	EventID        string            `json:"event_id"`
+	Timestamp      time.Time         `json:"timestamp"`
+	SessionID      string            `json:"session_id"`
+	UserID         string            `json:"user_id"`
+	TemplateID     string            `json:"template_id"`
+	Platform       string            `json:"platform"`
+	Resources      ResourceSpec      `json:"resources"`
+	PersistentHome bool              `json:"persistent_home"`
+	IdleTimeout    string            `json:"idle_timeout"`
+	Metadata       map[string]string `json:"metadata,omitempty"`
+}
+
+// SessionDeleteEvent is received when a session should be deleted.
+//
+// handleSessionDelete reads only SessionID; Force and the remaining fields are
+// decoded but not consulted by that handler.
+type SessionDeleteEvent struct {
+	EventID   string    `json:"event_id"`
+	Timestamp time.Time `json:"timestamp"`
+	SessionID string    `json:"session_id"`
+	UserID    string    `json:"user_id"`
+	Platform  string    `json:"platform"`
+	Force     bool      `json:"force"`
+}
+
+// SessionHibernateEvent is received when a session should be hibernated.
+type SessionHibernateEvent struct { + EventID string `json:"event_id"` + Timestamp time.Time `json:"timestamp"` + SessionID string `json:"session_id"` + UserID string `json:"user_id"` + Platform string `json:"platform"` +} + +// SessionWakeEvent is received when a hibernated session should be woken. +type SessionWakeEvent struct { + EventID string `json:"event_id"` + Timestamp time.Time `json:"timestamp"` + SessionID string `json:"session_id"` + UserID string `json:"user_id"` + Platform string `json:"platform"` +} + +// SessionStatusEvent is published when session status changes. +type SessionStatusEvent struct { + EventID string `json:"event_id"` + Timestamp time.Time `json:"timestamp"` + SessionID string `json:"session_id"` + Status string `json:"status"` + Phase string `json:"phase"` + URL string `json:"url,omitempty"` + PodName string `json:"pod_name,omitempty"` + Message string `json:"message,omitempty"` + ResourceUsage *ResourceSpec `json:"resource_usage,omitempty"` + ControllerID string `json:"controller_id"` +} + +// AppInstallEvent is received when an application should be installed. +type AppInstallEvent struct { + EventID string `json:"event_id"` + Timestamp time.Time `json:"timestamp"` + InstallID string `json:"install_id"` + CatalogTemplateID int `json:"catalog_template_id"` + TemplateName string `json:"template_name"` + DisplayName string `json:"display_name"` + Description string `json:"description,omitempty"` + Category string `json:"category,omitempty"` + IconURL string `json:"icon_url,omitempty"` + Manifest string `json:"manifest"` + InstalledBy string `json:"installed_by"` + Platform string `json:"platform"` +} + +// AppUninstallEvent is received when an application should be uninstalled. 
+type AppUninstallEvent struct { + EventID string `json:"event_id"` + Timestamp time.Time `json:"timestamp"` + InstallID string `json:"install_id"` + TemplateName string `json:"template_name"` + Platform string `json:"platform"` +} + +// AppStatusEvent is published when app installation status changes. +type AppStatusEvent struct { + EventID string `json:"event_id"` + Timestamp time.Time `json:"timestamp"` + InstallID string `json:"install_id"` + Status string `json:"status"` + TemplateName string `json:"template_name,omitempty"` + TemplateNamespace string `json:"template_namespace,omitempty"` + Message string `json:"message,omitempty"` + ControllerID string `json:"controller_id"` +} + +// TemplateCreateEvent is received when a template should be created. +type TemplateCreateEvent struct { + EventID string `json:"event_id"` + Timestamp time.Time `json:"timestamp"` + TemplateID string `json:"template_id"` + DisplayName string `json:"display_name"` + Category string `json:"category,omitempty"` + BaseImage string `json:"base_image,omitempty"` + Manifest string `json:"manifest,omitempty"` + Platform string `json:"platform"` + CreatedBy string `json:"created_by,omitempty"` +} + +// TemplateDeleteEvent is received when a template should be deleted. +type TemplateDeleteEvent struct { + EventID string `json:"event_id"` + Timestamp time.Time `json:"timestamp"` + TemplateName string `json:"template_name"` + TemplateID string `json:"template_id"` + Platform string `json:"platform"` +} + +// NodeCordonEvent is received when a node should be cordoned. +type NodeCordonEvent struct { + EventID string `json:"event_id"` + Timestamp time.Time `json:"timestamp"` + NodeName string `json:"node_name"` + Platform string `json:"platform"` +} + +// NodeUncordonEvent is received when a node should be uncordoned. 
+type NodeUncordonEvent struct { + EventID string `json:"event_id"` + Timestamp time.Time `json:"timestamp"` + NodeName string `json:"node_name"` + Platform string `json:"platform"` +} + +// NodeDrainEvent is received when a node should be drained. +type NodeDrainEvent struct { + EventID string `json:"event_id"` + Timestamp time.Time `json:"timestamp"` + NodeName string `json:"node_name"` + Platform string `json:"platform"` + GracePeriodSeconds *int64 `json:"grace_period_seconds,omitempty"` +} + +// ResourceSpec defines resource requirements. +type ResourceSpec struct { + Memory string `json:"memory,omitempty"` + CPU string `json:"cpu,omitempty"` +} diff --git a/controller/pkg/metrics/metrics.go b/k8s-controller/pkg/metrics/metrics.go similarity index 100% rename from controller/pkg/metrics/metrics.go rename to k8s-controller/pkg/metrics/metrics.go diff --git a/controller/scripts/README.md b/k8s-controller/scripts/README.md similarity index 100% rename from controller/scripts/README.md rename to k8s-controller/scripts/README.md diff --git a/controller/scripts/create-session.sh b/k8s-controller/scripts/create-session.sh similarity index 100% rename from controller/scripts/create-session.sh rename to k8s-controller/scripts/create-session.sh diff --git a/controller/scripts/get-metrics.sh b/k8s-controller/scripts/get-metrics.sh similarity index 100% rename from controller/scripts/get-metrics.sh rename to k8s-controller/scripts/get-metrics.sh diff --git a/controller/scripts/hibernate-session.sh b/k8s-controller/scripts/hibernate-session.sh similarity index 100% rename from controller/scripts/hibernate-session.sh rename to k8s-controller/scripts/hibernate-session.sh diff --git a/controller/scripts/list-sessions.sh b/k8s-controller/scripts/list-sessions.sh similarity index 100% rename from controller/scripts/list-sessions.sh rename to k8s-controller/scripts/list-sessions.sh diff --git a/controller/scripts/wake-session.sh b/k8s-controller/scripts/wake-session.sh similarity 
index 100% rename from controller/scripts/wake-session.sh rename to k8s-controller/scripts/wake-session.sh diff --git a/scripts/README.md b/scripts/README.md index 9d63f559..870065bf 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -41,6 +41,90 @@ helm version --short - **v3.19.0 or later**: Use `local-deploy-kubectl.sh` - **v3.18.0 or earlier**: Use `local-deploy.sh` +## Docker Compose Development (NATS-based Architecture) + +For the new event-driven multi-platform architecture, use these scripts: + +### Quick Start (Docker Compose) + +```bash +# Start development environment (PostgreSQL, NATS) +./scripts/docker-dev.sh + +# Start with Docker controller +./scripts/docker-dev.sh --with-docker + +# Start with all services (including monitoring) +./scripts/docker-dev.sh --all --logs + +# Stop environment +./scripts/docker-dev-stop.sh + +# Test NATS connectivity +./scripts/test-nats.sh +``` + +### docker-dev.sh + +Starts the complete development environment using Docker Compose with NATS and PostgreSQL. + +**Usage:** +```bash +./scripts/docker-dev.sh # Core services only +./scripts/docker-dev.sh --with-api # Include API service +./scripts/docker-dev.sh --with-docker # Include Docker controller +./scripts/docker-dev.sh --all # All services and profiles +./scripts/docker-dev.sh --logs # Start and follow logs +``` + +**Services Started:** +- PostgreSQL (localhost:5432) +- NATS with JetStream (localhost:4222, monitor: localhost:8222) + +**Optional Services:** +- API backend (--with-api) +- Docker controller (--with-docker) +- pgAdmin (--with-dev) +- Prometheus/Grafana (--with-monitor) + +### docker-dev-stop.sh + +Stops the Docker Compose development environment. + +**Usage:** +```bash +./scripts/docker-dev-stop.sh # Stop services, keep data +./scripts/docker-dev-stop.sh --clean # Stop and remove volumes +``` + +### build-docker-controller.sh + +Builds the Docker platform controller for the event-driven architecture. 
+ +**Usage:** +```bash +./scripts/build-docker-controller.sh # Build Docker image +./scripts/build-docker-controller.sh --binary # Build Go binary only +``` + +### test-nats.sh + +Tests NATS connectivity and can publish/subscribe to test events. + +**Usage:** +```bash +./scripts/test-nats.sh # Test connectivity +./scripts/test-nats.sh --publish # Publish test events +./scripts/test-nats.sh --subscribe # Subscribe to all events +./scripts/test-nats.sh --streams # List JetStream streams +``` + +--- + +## Kubernetes Deployment Scripts + +For traditional Kubernetes deployment, use these scripts: + ## Script Descriptions ### local-build.sh diff --git a/scripts/build-docker-controller.sh b/scripts/build-docker-controller.sh new file mode 100755 index 00000000..9d909a83 --- /dev/null +++ b/scripts/build-docker-controller.sh @@ -0,0 +1,234 @@ +#!/usr/bin/env bash +# +# build-docker-controller.sh - Build the StreamSpace Docker platform controller +# +# This script builds the Docker controller which handles session management +# on Docker platforms via NATS events. +# +# Usage: +# ./scripts/build-docker-controller.sh # Build Docker image +# ./scripts/build-docker-controller.sh --binary # Build binary only +# + +set -euo pipefail + +# Colors for output +COLOR_RESET='\033[0m' +COLOR_BOLD='\033[1m' +COLOR_GREEN='\033[32m' +COLOR_YELLOW='\033[33m' +COLOR_BLUE='\033[34m' +COLOR_RED='\033[31m' + +# Project configuration +PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." 
&& pwd)" +CONTROLLER_DIR="${PROJECT_ROOT}/docker-controller" +VERSION="${VERSION:-local}" +GIT_COMMIT="${GIT_COMMIT:-$(git -C "$PROJECT_ROOT" rev-parse --short HEAD 2>/dev/null || echo "unknown")}" +BUILD_DATE="$(date -u +"%Y-%m-%dT%H:%M:%SZ")" + +# Image name +DOCKER_CONTROLLER_IMAGE="streamspace/docker-controller" + +# Build mode +BUILD_BINARY_ONLY=false + +# Helper functions +log() { + echo -e "${COLOR_BOLD}==>${COLOR_RESET} $*" +} + +log_success() { + echo -e "${COLOR_GREEN}βœ“${COLOR_RESET} $*" +} + +log_error() { + echo -e "${COLOR_RED}βœ—${COLOR_RESET} $*" >&2 +} + +log_info() { + echo -e "${COLOR_BLUE}β†’${COLOR_RESET} $*" +} + +log_warning() { + echo -e "${COLOR_YELLOW}⚠${COLOR_RESET} $*" +} + +# Show usage +usage() { + cat << EOF +Usage: $(basename "$0") [OPTIONS] + +Build the StreamSpace Docker platform controller. + +Options: + --binary Build Go binary only (no Docker image) + --push Push image to registry after building + -h, --help Show this help message + +Environment Variables: + VERSION Image tag (default: local) + REGISTRY Docker registry prefix (default: none) + +Examples: + $(basename "$0") # Build Docker image + $(basename "$0") --binary # Build binary only + VERSION=v1.0.0 $(basename "$0") # Build with specific version + +EOF + exit 0 +} + +# Parse arguments +PUSH_IMAGE=false +parse_args() { + while [[ $# -gt 0 ]]; do + case $1 in + --binary) + BUILD_BINARY_ONLY=true + shift + ;; + --push) + PUSH_IMAGE=true + shift + ;; + -h|--help) + usage + ;; + *) + log_error "Unknown option: $1" + usage + ;; + esac + done +} + +# Check prerequisites +check_prerequisites() { + log "Checking prerequisites..." + + if [ ! -d "$CONTROLLER_DIR" ]; then + log_error "Docker controller directory not found: $CONTROLLER_DIR" + exit 1 + fi + + if [ "$BUILD_BINARY_ONLY" = true ]; then + if ! command -v go &> /dev/null; then + log_error "Go is not installed or not in PATH" + exit 1 + fi + log_success "Go is available: $(go version)" + else + if ! 
command -v docker &> /dev/null; then + log_error "Docker is not installed or not in PATH" + exit 1 + fi + + if ! docker info &> /dev/null; then + log_error "Docker daemon is not running" + exit 1 + fi + log_success "Docker is available" + fi +} + +# Build binary +build_binary() { + log "Building Docker controller binary..." + log_info "Version: $VERSION" + log_info "Commit: $GIT_COMMIT" + + cd "$CONTROLLER_DIR" + + # Download dependencies + log_info "Downloading dependencies..." + go mod download + + # Build binary + log_info "Compiling..." + CGO_ENABLED=0 go build \ + -ldflags "-X main.version=${VERSION} -X main.commit=${GIT_COMMIT} -X main.buildDate=${BUILD_DATE}" \ + -o bin/docker-controller \ + ./cmd/main.go + + log_success "Binary built: $CONTROLLER_DIR/bin/docker-controller" +} + +# Build Docker image +build_image() { + log "Building Docker controller image..." + log_info "Image: ${DOCKER_CONTROLLER_IMAGE}:${VERSION}" + log_info "Context: $CONTROLLER_DIR" + + docker build \ + --build-arg VERSION="${VERSION}" \ + --build-arg COMMIT="${GIT_COMMIT}" \ + --build-arg BUILD_DATE="${BUILD_DATE}" \ + -t "${DOCKER_CONTROLLER_IMAGE}:${VERSION}" \ + -t "${DOCKER_CONTROLLER_IMAGE}:latest" \ + -f "${CONTROLLER_DIR}/Dockerfile" \ + "${CONTROLLER_DIR}/" + + log_success "Docker image built successfully" + + # Show image info + echo "" + docker images --format "table {{.Repository}}\t{{.Tag}}\t{{.ID}}\t{{.Size}}" | \ + grep -E "REPOSITORY|${DOCKER_CONTROLLER_IMAGE}" || true +} + +# Push image +push_image() { + if [ "$PUSH_IMAGE" = true ]; then + log "Pushing image to registry..." 
+ docker push "${DOCKER_CONTROLLER_IMAGE}:${VERSION}" + docker push "${DOCKER_CONTROLLER_IMAGE}:latest" + log_success "Image pushed" + fi +} + +# Main execution +main() { + echo -e "${COLOR_BOLD}═══════════════════════════════════════════════════${COLOR_RESET}" + echo -e "${COLOR_BOLD} Build StreamSpace Docker Controller${COLOR_RESET}" + echo -e "${COLOR_BOLD}═══════════════════════════════════════════════════${COLOR_RESET}" + echo "" + echo -e "${COLOR_BLUE}Version:${COLOR_RESET} ${VERSION}" + echo -e "${COLOR_BLUE}Commit:${COLOR_RESET} ${GIT_COMMIT}" + echo -e "${COLOR_BLUE}Build Date:${COLOR_RESET} ${BUILD_DATE}" + echo "" + + parse_args "$@" + check_prerequisites + + if [ "$BUILD_BINARY_ONLY" = true ]; then + build_binary + else + build_image + push_image + fi + + echo "" + echo -e "${COLOR_BOLD}═══════════════════════════════════════════════════${COLOR_RESET}" + log_success "Build completed successfully!" + echo -e "${COLOR_BOLD}═══════════════════════════════════════════════════${COLOR_RESET}" + echo "" + + if [ "$BUILD_BINARY_ONLY" = true ]; then + log_info "Run the binary:" + echo " $CONTROLLER_DIR/bin/docker-controller --nats-url=nats://localhost:4222" + else + log_info "Run with docker-compose:" + echo " ./scripts/docker-dev.sh --with-docker" + echo "" + log_info "Or run standalone:" + echo " docker run -d \\" + echo " -e NATS_URL=nats://host.docker.internal:4222 \\" + echo " -v /var/run/docker.sock:/var/run/docker.sock:ro \\" + echo " ${DOCKER_CONTROLLER_IMAGE}:${VERSION}" + fi + echo "" +} + +# Run main function +main "$@" diff --git a/scripts/docker-dev-stop.sh b/scripts/docker-dev-stop.sh new file mode 100755 index 00000000..a87978c1 --- /dev/null +++ b/scripts/docker-dev-stop.sh @@ -0,0 +1,162 @@ +#!/usr/bin/env bash +# +# docker-dev-stop.sh - Stop StreamSpace development environment +# +# This script stops and optionally removes the Docker Compose development environment. 
+#
+# Usage:
+#   ./scripts/docker-dev-stop.sh           # Stop services
+#   ./scripts/docker-dev-stop.sh --clean   # Stop and remove volumes
+#
+
+set -euo pipefail
+
+# Colors for output
+COLOR_RESET='\033[0m'
+COLOR_BOLD='\033[1m'
+COLOR_GREEN='\033[32m'
+COLOR_YELLOW='\033[33m'
+COLOR_BLUE='\033[34m'
+COLOR_RED='\033[31m'
+
+# Project configuration
+PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+COMPOSE_FILE="${PROJECT_ROOT}/docker-compose.yml"
+
+# Options (set by parse_args)
+REMOVE_VOLUMES=false
+
+# Helper functions (log_error writes to stderr, the others to stdout)
+log() {
+    echo -e "${COLOR_BOLD}==>${COLOR_RESET} $*"
+}
+
+log_success() {
+    echo -e "${COLOR_GREEN}✓${COLOR_RESET} $*"
+}
+
+log_error() {
+    echo -e "${COLOR_RED}✗${COLOR_RESET} $*" >&2
+}
+
+log_info() {
+    echo -e "${COLOR_BLUE}→${COLOR_RESET} $*"
+}
+
+log_warning() {
+    echo -e "${COLOR_YELLOW}⚠${COLOR_RESET} $*"
+}
+
+# Show usage and exit with status $1 (default 0, so -h/--help still succeeds)
+usage() {
+    cat << EOF
+Usage: $(basename "$0") [OPTIONS]
+
+Stop StreamSpace development environment.
+
+Options:
+    --clean         Remove volumes (database data will be lost)
+    --remove-all    Remove everything including images
+    -h, --help      Show this help message
+
+Examples:
+    $(basename "$0")            # Stop services, keep data
+    $(basename "$0") --clean    # Stop and remove volumes
+
+EOF
+    exit "${1:-0}"
+}
+
+# Parse arguments; an unknown option prints usage and exits non-zero
+REMOVE_IMAGES=false
+parse_args() {
+    while [[ $# -gt 0 ]]; do
+        case $1 in
+            --clean)
+                REMOVE_VOLUMES=true
+                shift
+                ;;
+            --remove-all)
+                REMOVE_VOLUMES=true
+                REMOVE_IMAGES=true
+                shift
+                ;;
+            -h|--help)
+                usage
+                ;;
+            *)
+                log_error "Unknown option: $1"
+                usage 1
+                ;;
+        esac
+    done
+}
+
+# Determine docker compose command: prefer the "docker compose" plugin, else fall back to docker-compose
+get_compose_cmd() {
+    if docker compose version &> /dev/null; then
+        echo "docker compose"
+    else
+        echo "docker-compose"
+    fi
+}
+
+# Stop services across the docker, dev and monitoring profiles; with REMOVE_VOLUMES also delete volumes
+stop_services() {
+    local compose_cmd
+    compose_cmd=$(get_compose_cmd)
+
+    log "Stopping development environment..."
+
+    cd "$PROJECT_ROOT"
+
+    if [ "$REMOVE_VOLUMES" = true ]; then
+        log_warning "Removing volumes (data will be lost)..."
+        $compose_cmd -f "$COMPOSE_FILE" --profile docker --profile dev --profile monitoring down -v
+    else
+        $compose_cmd -f "$COMPOSE_FILE" --profile docker --profile dev --profile monitoring down
+    fi
+
+    log_success "Services stopped"
+}
+
+# Remove images (only with --remove-all); runs a second "down" with --rmi local
+remove_images() {
+    local compose_cmd
+    compose_cmd=$(get_compose_cmd)
+
+    if [ "$REMOVE_IMAGES" = true ]; then
+        log "Removing images..."
+        cd "$PROJECT_ROOT"
+        $compose_cmd -f "$COMPOSE_FILE" --profile docker --profile dev --profile monitoring down --rmi local
+        log_success "Images removed"
+    fi
+}
+
+# Main execution
+main() {
+    echo -e "${COLOR_BOLD}═══════════════════════════════════════════════════${COLOR_RESET}"
+    echo -e "${COLOR_BOLD}  Stop StreamSpace Development Environment${COLOR_RESET}"
+    echo -e "${COLOR_BOLD}═══════════════════════════════════════════════════${COLOR_RESET}"
+    echo ""
+
+    parse_args "$@"
+    stop_services
+    remove_images
+
+    echo ""
+    echo -e "${COLOR_BOLD}═══════════════════════════════════════════════════${COLOR_RESET}"
+    log_success "Development environment stopped"
+    echo -e "${COLOR_BOLD}═══════════════════════════════════════════════════${COLOR_RESET}"
+    echo ""
+
+    if [ "$REMOVE_VOLUMES" = true ]; then
+        log_info "Volumes removed. Database data has been cleared."
+    else
+        log_info "Volumes preserved. Restart with: ./scripts/docker-dev.sh"
+    fi
+    echo ""
+}
+
+# Run main function
+main "$@"
diff --git a/scripts/docker-dev.sh b/scripts/docker-dev.sh
new file mode 100755
index 00000000..60100158
--- /dev/null
+++ b/scripts/docker-dev.sh
@@ -0,0 +1,283 @@
+#!/usr/bin/env bash
+#
+# docker-dev.sh - Start StreamSpace development environment with Docker Compose
+#
+# This script starts the complete development environment using docker-compose,
+# including PostgreSQL, NATS with JetStream, and optionally the API and Docker controller.
+# +# Usage: +# ./scripts/docker-dev.sh # Start core services (postgres, nats) +# ./scripts/docker-dev.sh --with-api # Include API service +# ./scripts/docker-dev.sh --with-docker # Include Docker controller +# ./scripts/docker-dev.sh --all # Start all services +# ./scripts/docker-dev.sh --logs # Start and follow logs +# + +set -euo pipefail + +# Colors for output +COLOR_RESET='\033[0m' +COLOR_BOLD='\033[1m' +COLOR_GREEN='\033[32m' +COLOR_YELLOW='\033[33m' +COLOR_BLUE='\033[34m' +COLOR_RED='\033[31m' + +# Project configuration +PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +COMPOSE_FILE="${PROJECT_ROOT}/docker-compose.yml" + +# Default options +PROFILES="" +FOLLOW_LOGS=false + +# Helper functions +log() { + echo -e "${COLOR_BOLD}==>${COLOR_RESET} $*" +} + +log_success() { + echo -e "${COLOR_GREEN}βœ“${COLOR_RESET} $*" +} + +log_error() { + echo -e "${COLOR_RED}βœ—${COLOR_RESET} $*" >&2 +} + +log_info() { + echo -e "${COLOR_BLUE}β†’${COLOR_RESET} $*" +} + +log_warning() { + echo -e "${COLOR_YELLOW}⚠${COLOR_RESET} $*" +} + +# Show usage +usage() { + cat << EOF +Usage: $(basename "$0") [OPTIONS] + +Start StreamSpace development environment with Docker Compose. 
+ +Options: + --with-api Include the API service + --with-docker Include the Docker controller (profile: docker) + --with-dev Include development tools like pgAdmin (profile: dev) + --with-monitor Include monitoring stack (profile: monitoring) + --all Start all services including all profiles + --logs Follow logs after starting + -h, --help Show this help message + +Examples: + $(basename "$0") # Start core services (postgres, nats) + $(basename "$0") --with-api # Start with API + $(basename "$0") --with-docker # Start with Docker controller + $(basename "$0") --all --logs # Start all and follow logs + +Services: + Core (always started): + - postgres PostgreSQL database + - nats NATS message broker with JetStream + + API (--with-api): + - api StreamSpace API backend + + Docker Profile (--with-docker): + - docker-controller Docker platform controller + + Dev Profile (--with-dev): + - pgadmin PostgreSQL admin interface + + Monitoring Profile (--with-monitor): + - prometheus Metrics collection + - grafana Dashboards + +EOF + exit 0 +} + +# Parse arguments +parse_args() { + while [[ $# -gt 0 ]]; do + case $1 in + --with-api) + # API is part of default services, no profile needed + shift + ;; + --with-docker) + PROFILES="${PROFILES} --profile docker" + shift + ;; + --with-dev) + PROFILES="${PROFILES} --profile dev" + shift + ;; + --with-monitor|--with-monitoring) + PROFILES="${PROFILES} --profile monitoring" + shift + ;; + --all) + PROFILES="--profile docker --profile dev --profile monitoring" + shift + ;; + --logs) + FOLLOW_LOGS=true + shift + ;; + -h|--help) + usage + ;; + *) + log_error "Unknown option: $1" + usage + ;; + esac + done +} + +# Check prerequisites +check_prerequisites() { + log "Checking prerequisites..." + + if ! command -v docker &> /dev/null; then + log_error "Docker is not installed or not in PATH" + exit 1 + fi + + if ! docker info &> /dev/null; then + log_error "Docker daemon is not running" + exit 1 + fi + + if ! 
command -v docker-compose &> /dev/null && ! docker compose version &> /dev/null; then + log_error "Docker Compose is not installed" + exit 1 + fi + + if [ ! -f "$COMPOSE_FILE" ]; then + log_error "docker-compose.yml not found at: $COMPOSE_FILE" + exit 1 + fi + + log_success "Prerequisites satisfied" +} + +# Determine docker compose command +get_compose_cmd() { + if docker compose version &> /dev/null 2>&1; then + echo "docker compose" + else + echo "docker-compose" + fi +} + +# Start services +start_services() { + local compose_cmd + compose_cmd=$(get_compose_cmd) + + log "Starting development environment..." + log_info "Compose file: $COMPOSE_FILE" + + if [ -n "$PROFILES" ]; then + log_info "Profiles: $PROFILES" + fi + + cd "$PROJECT_ROOT" + + # Start services + # shellcheck disable=SC2086 + $compose_cmd -f "$COMPOSE_FILE" $PROFILES up -d + + log_success "Services started" +} + +# Show service status +show_status() { + local compose_cmd + compose_cmd=$(get_compose_cmd) + + echo "" + log "Service status:" + cd "$PROJECT_ROOT" + $compose_cmd -f "$COMPOSE_FILE" ps +} + +# Show connection info +show_connection_info() { + echo "" + log "Connection Information:" + echo "" + echo -e "${COLOR_BLUE}PostgreSQL:${COLOR_RESET}" + echo " Host: localhost:5432" + echo " User: streamspace" + echo " Password: streamspace" + echo " Database: streamspace" + echo "" + echo -e "${COLOR_BLUE}NATS:${COLOR_RESET}" + echo " Client: nats://localhost:4222" + echo " Monitor: http://localhost:8222" + echo " Cluster: localhost:6222" + echo "" + + if [[ "$PROFILES" == *"dev"* ]]; then + echo -e "${COLOR_BLUE}pgAdmin:${COLOR_RESET}" + echo " URL: http://localhost:5050" + echo " Email: admin@streamspace.local" + echo " Password: admin" + echo "" + fi + + if [[ "$PROFILES" == *"monitoring"* ]]; then + echo -e "${COLOR_BLUE}Prometheus:${COLOR_RESET}" + echo " URL: http://localhost:9090" + echo "" + echo -e "${COLOR_BLUE}Grafana:${COLOR_RESET}" + echo " URL: http://localhost:3000" + echo " User: 
admin" + echo " Password: admin" + echo "" + fi +} + +# Follow logs +follow_logs() { + local compose_cmd + compose_cmd=$(get_compose_cmd) + + log "Following logs (Ctrl+C to stop)..." + cd "$PROJECT_ROOT" + # shellcheck disable=SC2086 + $compose_cmd -f "$COMPOSE_FILE" $PROFILES logs -f +} + +# Main execution +main() { + echo -e "${COLOR_BOLD}═══════════════════════════════════════════════════${COLOR_RESET}" + echo -e "${COLOR_BOLD} StreamSpace Development Environment${COLOR_RESET}" + echo -e "${COLOR_BOLD}═══════════════════════════════════════════════════${COLOR_RESET}" + echo "" + + parse_args "$@" + check_prerequisites + start_services + show_status + show_connection_info + + echo -e "${COLOR_BOLD}═══════════════════════════════════════════════════${COLOR_RESET}" + log_success "Development environment is ready!" + echo -e "${COLOR_BOLD}═══════════════════════════════════════════════════${COLOR_RESET}" + echo "" + log_info "Quick commands:" + echo " Stop: ./scripts/docker-dev-stop.sh" + echo " Logs: docker compose logs -f" + echo " Status: docker compose ps" + echo "" + + if [ "$FOLLOW_LOGS" = true ]; then + follow_logs + fi +} + +# Run main function +main "$@" diff --git a/scripts/local-build.sh b/scripts/local-build.sh index 2ce9611e..ecd7e112 100755 --- a/scripts/local-build.sh +++ b/scripts/local-build.sh @@ -23,9 +23,10 @@ GIT_COMMIT="${GIT_COMMIT:-$(git -C "$PROJECT_ROOT" rev-parse --short HEAD 2>/dev BUILD_DATE="$(date -u +"%Y-%m-%dT%H:%M:%SZ")" # Image names (matching Helm chart expectations) -CONTROLLER_IMAGE="streamspace/streamspace-controller" +KUBERNETES_CONTROLLER_IMAGE="streamspace/streamspace-kubernetes-controller" API_IMAGE="streamspace/streamspace-api" UI_IMAGE="streamspace/streamspace-ui" +DOCKER_CONTROLLER_IMAGE="streamspace/streamspace-docker-controller" # Build arguments BUILD_ARGS="--build-arg VERSION=${VERSION} --build-arg COMMIT=${GIT_COMMIT} --build-arg BUILD_DATE=${BUILD_DATE}" @@ -68,19 +69,19 @@ check_prerequisites() { log_success 
"Docker is available and running" } -# Build controller image -build_controller() { - log "Building controller image..." - log_info "Image: ${CONTROLLER_IMAGE}:${VERSION}" - log_info "Context: ${PROJECT_ROOT}/controller" +# Build Kubernetes controller image +build_kubernetes_controller() { + log "Building Kubernetes controller image..." + log_info "Image: ${KUBERNETES_CONTROLLER_IMAGE}:${VERSION}" + log_info "Context: ${PROJECT_ROOT}/k8s-controller" docker build ${BUILD_ARGS} \ - -t "${CONTROLLER_IMAGE}:${VERSION}" \ - -t "${CONTROLLER_IMAGE}:latest" \ - -f "${PROJECT_ROOT}/controller/Dockerfile" \ - "${PROJECT_ROOT}/controller/" + -t "${KUBERNETES_CONTROLLER_IMAGE}:${VERSION}" \ + -t "${KUBERNETES_CONTROLLER_IMAGE}:latest" \ + -f "${PROJECT_ROOT}/k8s-controller/Dockerfile" \ + "${PROJECT_ROOT}/k8s-controller/" - log_success "Controller image built successfully" + log_success "Kubernetes controller image built successfully" } # Build API image @@ -113,12 +114,33 @@ build_ui() { log_success "UI image built successfully" } +# Build Docker controller image +build_docker_controller() { + log "Building Docker controller image..." + log_info "Image: ${DOCKER_CONTROLLER_IMAGE}:${VERSION}" + log_info "Context: ${PROJECT_ROOT}/docker-controller" + + # Check if docker-controller directory exists + if [ ! 
-d "${PROJECT_ROOT}/docker-controller" ]; then + log_warning "Docker controller directory not found, skipping" + return 0 + fi + + docker build ${BUILD_ARGS} \ + -t "${DOCKER_CONTROLLER_IMAGE}:${VERSION}" \ + -t "${DOCKER_CONTROLLER_IMAGE}:latest" \ + -f "${PROJECT_ROOT}/docker-controller/Dockerfile" \ + "${PROJECT_ROOT}/docker-controller/" + + log_success "Docker controller image built successfully" +} + # List built images list_images() { log "Built images:" echo "" docker images --format "table {{.Repository}}\t{{.Tag}}\t{{.ID}}\t{{.Size}}" | \ - grep -E "REPOSITORY|streamspace/streamspace-(controller|api|ui)" || true + grep -E "REPOSITORY|streamspace/streamspace-(kubernetes-controller|api|ui|docker-controller)" || true echo "" } @@ -138,15 +160,16 @@ main() { # Allow building individual components if [ $# -eq 0 ]; then # Build all components - build_controller + build_kubernetes_controller build_api build_ui + build_docker_controller else # Build specific components for component in "$@"; do case "$component" in - controller) - build_controller + controller|kubernetes-controller) + build_kubernetes_controller ;; api) build_api @@ -154,9 +177,12 @@ main() { ui) build_ui ;; + docker-controller) + build_docker_controller + ;; *) log_error "Unknown component: $component" - log_info "Valid components: controller, api, ui" + log_info "Valid components: controller, kubernetes-controller, api, ui, docker-controller" exit 1 ;; esac diff --git a/scripts/local-deploy.sh b/scripts/local-deploy.sh index 0419bf1b..1b95578a 100755 --- a/scripts/local-deploy.sh +++ b/scripts/local-deploy.sh @@ -80,7 +80,7 @@ check_images() { local missing_images=0 - for image in "streamspace/streamspace-controller" "streamspace/streamspace-api" "streamspace/streamspace-ui"; do + for image in "streamspace/streamspace-kubernetes-controller" "streamspace/streamspace-api" "streamspace/streamspace-ui"; do if docker images "${image}:${VERSION}" --format "{{.Repository}}:{{.Tag}}" | grep -q 
"${image}:${VERSION}"; then log_success "Found ${image}:${VERSION}" else diff --git a/scripts/local-port-forward.sh b/scripts/local-port-forward.sh index 10642cfc..b70dcb77 100755 --- a/scripts/local-port-forward.sh +++ b/scripts/local-port-forward.sh @@ -34,6 +34,10 @@ UI_LOCAL_PORT=3000 UI_REMOTE_PORT=80 API_LOCAL_PORT=8000 API_REMOTE_PORT=8000 +NATS_LOCAL_PORT=4222 +NATS_REMOTE_PORT=4222 +NATS_MONITOR_LOCAL_PORT=8222 +NATS_MONITOR_REMOTE_PORT=8222 # Helper functions log() { @@ -196,6 +200,14 @@ show_access_urls() { echo " Health: ${COLOR_BLUE}http://localhost:${API_LOCAL_PORT}/health${COLOR_RESET}" echo "" + # Show NATS info if available + if [ -f "${PID_DIR}/nats.pid" ] || kubectl get svc "streamspace-nats" -n "${NAMESPACE}" &> /dev/null 2>&1; then + log_info "NATS Message Queue:" + echo " Client: ${COLOR_GREEN}nats://localhost:${NATS_LOCAL_PORT}${COLOR_RESET}" + echo " Monitor: ${COLOR_BLUE}http://localhost:${NATS_MONITOR_LOCAL_PORT}${COLOR_RESET}" + echo "" + fi + log_info "Logs:" echo " UI: tail -f ${LOG_DIR}/ui.log" echo " API: tail -f ${LOG_DIR}/api.log" @@ -249,6 +261,16 @@ main() { success=$((success + 1)) fi + # Optional NATS port forwards (if NATS is deployed) + if kubectl get svc "streamspace-nats" -n "${NAMESPACE}" &> /dev/null; then + if start_port_forward "streamspace-nats" "${NATS_LOCAL_PORT}" "${NATS_REMOTE_PORT}" "nats"; then + success=$((success + 1)) + fi + if start_port_forward "streamspace-nats" "${NATS_MONITOR_LOCAL_PORT}" "${NATS_MONITOR_REMOTE_PORT}" "nats-monitor"; then + success=$((success + 1)) + fi + fi + echo "" if [ $success -gt 0 ]; then show_access_urls diff --git a/scripts/local-teardown.sh b/scripts/local-teardown.sh index c7bb68fd..fcb6352d 100755 --- a/scripts/local-teardown.sh +++ b/scripts/local-teardown.sh @@ -143,12 +143,14 @@ clean_docker_images() { # Remove StreamSpace images local images=( - "streamspace/streamspace-controller:${VERSION}" - "streamspace/streamspace-controller:latest" + 
"streamspace/streamspace-kubernetes-controller:${VERSION}" + "streamspace/streamspace-kubernetes-controller:latest" "streamspace/streamspace-api:${VERSION}" "streamspace/streamspace-api:latest" "streamspace/streamspace-ui:${VERSION}" "streamspace/streamspace-ui:latest" + "streamspace/streamspace-docker-controller:${VERSION}" + "streamspace/streamspace-docker-controller:latest" ) local removed=0 @@ -230,6 +232,13 @@ show_remaining() { else log_success "No remaining Docker images" fi + + # Check for Docker Compose development containers + local compose_containers=$(docker ps -a --filter "name=streamspace" --format "{{.Names}}" | wc -l) + if [ "$compose_containers" -gt 0 ]; then + log_warning "Found ${compose_containers} Docker Compose container(s)" + log_info "Stop with: ./scripts/docker-dev-stop.sh" + fi } # Show Docker disk usage
diff --git a/scripts/test-nats.sh b/scripts/test-nats.sh
new file mode 100755
index 00000000..be5af043
--- /dev/null
+++ b/scripts/test-nats.sh
@@ -0,0 +1,356 @@
+#!/usr/bin/env bash
+#
+# test-nats.sh - Test NATS connectivity and event publishing
+#
+# This script tests NATS server connectivity and can publish test events
+# to verify the event-driven architecture is working correctly.
+#
+# Usage:
+#   ./scripts/test-nats.sh              # Test connectivity
+#   ./scripts/test-nats.sh --publish    # Publish test events
+#   ./scripts/test-nats.sh --subscribe  # Subscribe to all events
+#
+
+set -euo pipefail
+
+# Colors for output
+COLOR_RESET='\033[0m'
+COLOR_BOLD='\033[1m'
+COLOR_GREEN='\033[32m'
+COLOR_YELLOW='\033[33m'
+COLOR_BLUE='\033[34m'
+COLOR_RED='\033[31m'
+
+# Configuration
+NATS_URL="${NATS_URL:-nats://localhost:4222}"
+NATS_MONITOR_URL="${NATS_MONITOR_URL:-http://localhost:8222}"
+
+# Helper functions
+log() {
+    echo -e "${COLOR_BOLD}==>${COLOR_RESET} $*"
+}
+
+log_success() {
+    echo -e "${COLOR_GREEN}✓${COLOR_RESET} $*"
+}
+
+log_error() {
+    echo -e "${COLOR_RED}✗${COLOR_RESET} $*" >&2
+}
+
+log_info() {
+    echo -e "${COLOR_BLUE}→${COLOR_RESET} $*"
+}
+
+log_warning() {
+    echo -e "${COLOR_YELLOW}⚠${COLOR_RESET} $*"
+}
+
+# Show usage and exit.
+#   $1 - exit status (default: 0); pass non-zero when the invocation was invalid
+usage() {
+    local exit_code="${1:-0}"
+    cat << EOF
+Usage: $(basename "$0") [OPTIONS]
+
+Test NATS connectivity and event publishing for StreamSpace.
+
+Options:
+  --status        Show NATS server status (default)
+  --publish       Publish test events
+  --subscribe     Subscribe to all StreamSpace events
+  --streams       List JetStream streams
+  --consumers     List JetStream consumers
+  --jetstream     Show JetStream summary (memory, storage, streams, consumers)
+  -h, --help      Show this help message
+
+Environment Variables:
+  NATS_URL          NATS server URL (default: nats://localhost:4222)
+  NATS_MONITOR_URL  NATS monitoring URL (default: http://localhost:8222)
+
+Examples:
+  $(basename "$0")              # Test connectivity
+  $(basename "$0") --publish    # Publish test events
+  $(basename "$0") --streams    # Show JetStream streams
+
+EOF
+    exit "$exit_code"
+}
+
+# Check if NATS CLI is installed (returns 0 when `nats` is on PATH)
+check_nats_cli() {
+    command -v nats &> /dev/null
+}
+
+# Return 0 only when an HTTP GET on $1 answers with status 200.
+# curl prints "000" and exits non-zero when it cannot connect; "|| true"
+# keeps that from aborting the script under "set -e".
+http_ok() {
+    local status
+    status=$(curl -s -o /dev/null -w "%{http_code}" "$1" || true)
+    [[ "$status" == "200" ]]
+}
+
+# Test basic connectivity via HTTP monitor.
+# Returns 1 when the monitor's /healthz endpoint does not answer 200.
+test_connectivity() {
+    log "Testing NATS connectivity..."
+    log_info "Monitor URL: $NATS_MONITOR_URL"
+
+    # Check if NATS monitor is accessible
+    if http_ok "$NATS_MONITOR_URL/healthz"; then
+        log_success "NATS server is healthy"
+    else
+        log_error "Cannot connect to NATS monitor at $NATS_MONITOR_URL"
+        log_info "Make sure NATS is running: ./scripts/docker-dev.sh"
+        return 1
+    fi
+
+    # Get server info
+    echo ""
+    log "NATS Server Information:"
+    if command -v jq &> /dev/null; then
+        curl -s "$NATS_MONITOR_URL/varz" | jq '{
+            server_id: .server_id,
+            version: .version,
+            go: .go,
+            host: .host,
+            port: .port,
+            max_connections: .max_connections,
+            connections: .connections,
+            in_msgs: .in_msgs,
+            out_msgs: .out_msgs,
+            in_bytes: .in_bytes,
+            out_bytes: .out_bytes
+        }'
+    else
+        # head may close the pipe before curl finishes; pipefail would turn that into a failure
+        curl -s "$NATS_MONITOR_URL/varz" | head -20 || true
+        log_info "Install jq for formatted output: brew install jq"
+    fi
+
+    return 0
+}
+
+# Show JetStream info from the monitor's /jsz endpoint.
+# Returns 1 when /jsz does not answer 200.
+show_jetstream_info() {
+    log "JetStream Information:"
+
+    if ! http_ok "$NATS_MONITOR_URL/jsz"; then
+        log_error "JetStream is not available"
+        return 1
+    fi
+
+    if command -v jq &> /dev/null; then
+        curl -s "$NATS_MONITOR_URL/jsz" | jq '{
+            memory: .memory,
+            storage: .storage,
+            streams: .streams,
+            consumers: .consumers,
+            messages: .messages,
+            bytes: .bytes
+        }'
+    else
+        curl -s "$NATS_MONITOR_URL/jsz"
+    fi
+
+    return 0
+}
+
+# List streams (NATS CLI when available, otherwise the HTTP monitor)
+list_streams() {
+    log "JetStream Streams:"
+
+    if check_nats_cli; then
+        nats -s "$NATS_URL" stream list
+    else
+        # Use HTTP API; "[]?" tolerates a response with no accounts or streams
+        if command -v jq &> /dev/null; then
+            curl -s "$NATS_MONITOR_URL/jsz?streams=true" | jq '.account_details[]?.stream_detail[]? | {name: .name, messages: .state.messages, bytes: .state.bytes, consumers: .state.consumer_count}'
+        else
+            curl -s "$NATS_MONITOR_URL/jsz?streams=true"
+        fi
+    fi
+}
+
+# List consumers (NATS CLI when available, otherwise raw monitor output)
+list_consumers() {
+    log "JetStream Consumers:"
+
+    if check_nats_cli; then
+        nats -s "$NATS_URL" consumer list --all
+    else
+        log_warning "Install NATS CLI for consumer listing: brew install nats-io/nats-tools/nats"
+        curl -s "$NATS_MONITOR_URL/jsz?consumers=true"
+    fi
+}
+
+# Publish one session-status and one app-status test event (requires the NATS CLI)
+publish_test_events() {
+    log "Publishing test events..."
+
+    if ! check_nats_cli; then
+        log_error "NATS CLI is required for publishing"
+        log_info "Install: brew install nats-io/nats-tools/nats"
+        log_info "Or: go install github.com/nats-io/natscli/nats@latest"
+        return 1
+    fi
+
+    # Test event payload; fall back to a timestamp-based id when no UUID source exists
+    local event_id
+    event_id=$(uuidgen 2>/dev/null || cat /proc/sys/kernel/random/uuid 2>/dev/null || echo "test-$(date +%s)")
+    local timestamp
+    timestamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
+
+    # Publish session status event
+    local session_event
+    session_event=$(cat << EOF
+{
+  "event_id": "${event_id}",
+  "timestamp": "${timestamp}",
+  "session_id": "test-session-001",
+  "status": "running",
+  "phase": "Running",
+  "url": "http://localhost:3000",
+  "pod_name": "test-pod",
+  "message": "Test session status event",
+  "controller_id": "test-controller"
+}
+EOF
+)
+
+    log_info "Publishing to streamspace.session.status..."
+    echo "$session_event" | nats -s "$NATS_URL" publish streamspace.session.status
+
+    # Publish app status event
+    local app_event
+    app_event=$(cat << EOF
+{
+  "event_id": "${event_id}-app",
+  "timestamp": "${timestamp}",
+  "install_id": "test-install-001",
+  "status": "ready",
+  "template_name": "test-template",
+  "message": "Test app status event",
+  "controller_id": "test-controller"
+}
+EOF
+)
+
+    log_info "Publishing to streamspace.app.status..."
+    echo "$app_event" | nats -s "$NATS_URL" publish streamspace.app.status
+
+    log_success "Test events published"
+    echo ""
+    log_info "Events should be received by the API subscriber"
+}
+
+# Subscribe to every subject under "streamspace." (requires the NATS CLI)
+subscribe_to_events() {
+    log "Subscribing to all StreamSpace events..."
+    log_info "Press Ctrl+C to stop"
+    echo ""
+
+    if ! check_nats_cli; then
+        log_error "NATS CLI is required for subscribing"
+        log_info "Install: brew install nats-io/nats-tools/nats"
+        return 1
+    fi
+
+    nats -s "$NATS_URL" subscribe "streamspace.>"
+}
+
+# Parse arguments; the last mode flag on the command line wins
+MODE="status"
+parse_args() {
+    while [[ $# -gt 0 ]]; do
+        case $1 in
+            --status)
+                MODE="status"
+                shift
+                ;;
+            --publish)
+                MODE="publish"
+                shift
+                ;;
+            --subscribe)
+                MODE="subscribe"
+                shift
+                ;;
+            --streams)
+                MODE="streams"
+                shift
+                ;;
+            --consumers)
+                MODE="consumers"
+                shift
+                ;;
+            --jetstream)
+                MODE="jetstream"
+                shift
+                ;;
+            -h|--help)
+                usage
+                ;;
+            *)
+                log_error "Unknown option: $1"
+                # A bad flag is a usage error: help goes to stderr and the exit status is non-zero
+                usage 1 >&2
+                ;;
+        esac
+    done
+}
+
+# Main execution
+main() {
+    echo -e "${COLOR_BOLD}═══════════════════════════════════════════════════${COLOR_RESET}"
+    echo -e "${COLOR_BOLD} StreamSpace NATS Test Utility${COLOR_RESET}"
+    echo -e "${COLOR_BOLD}═══════════════════════════════════════════════════${COLOR_RESET}"
+    echo ""
+    echo -e "${COLOR_BLUE}NATS URL:${COLOR_RESET} $NATS_URL"
+    echo -e "${COLOR_BLUE}Monitor URL:${COLOR_RESET} $NATS_MONITOR_URL"
+    echo ""
+
+    parse_args "$@"
+
+    case $MODE in
+        status)
+            test_connectivity
+            echo ""
+            show_jetstream_info
+            ;;
+        publish)
+            test_connectivity || exit 1
+            echo ""
+            publish_test_events
+            ;;
+        subscribe)
+            test_connectivity || exit 1
+            echo ""
+            subscribe_to_events
+            ;;
+        streams)
+            test_connectivity || exit 1
+            echo ""
+            list_streams
+            ;;
+        consumers)
+            test_connectivity || exit 1
+            echo ""
+            list_consumers
+            ;;
+        jetstream)
+            test_connectivity || exit 1
+            echo ""
+            show_jetstream_info
+            ;;
+    esac
+
+    echo ""
+    echo -e "${COLOR_BOLD}═══════════════════════════════════════════════════${COLOR_RESET}"
+    log_success "Test completed"
+    echo -e "${COLOR_BOLD}═══════════════════════════════════════════════════${COLOR_RESET}"
+    echo ""
+}
+
+# Run main function
+main "$@"
diff --git a/site/docs.html b/site/docs.html
index f3117a15..63c00521 100644
--- a/site/docs.html
+++ b/site/docs.html
@@ -60,15 +60,15 @@

Key Concepts

Architecture

-

StreamSpace consists of three main components:

+

StreamSpace uses a multi-platform event-driven architecture with NATS messaging:

-

1. Kubernetes Controller

-

Go-based controller using Kubebuilder framework that manages Session and Template CRDs.

+

1. Platform Controllers

+

Platform-specific controllers that manage sessions on their respective infrastructure via NATS events.

    -
  • Reconciles Session CRDs with Kubernetes resources
  • -
  • Creates Deployments, Services, Ingresses, PVCs
  • -
  • Handles state transitions and hibernation
  • -
  • Exports Prometheus metrics
  • +
  • Kubernetes Controller (k8s-controller/) - Kubebuilder-based, manages CRDs
  • +
  • Docker Controller (docker-controller/) - Manages Docker containers
  • +
  • NATS JetStream for durable event delivery
  • +
  • Prometheus metrics export

View detailed architecture →

@@ -216,13 +216,13 @@

Plugin Endpoints

Development

-

Controller Development

+

Kubernetes Controller Development

BASH
-
cd controller
+        
cd k8s-controller
 
 # Run locally
 make run
@@ -231,10 +231,28 @@ 

Controller Development

make test # Build Docker image -make docker-build IMG=myregistry/streamspace-controller:dev
+make docker-build IMG=myregistry/streamspace-kubernetes-controller:dev
-

Controller development guide →

+

Kubernetes controller development guide →

+ +

Docker Controller Development

+
+
+ BASH + +
+
cd docker-controller
+
+# Build locally
+go build -o streamspace-docker-controller
+
+# Run with Docker Compose
+./scripts/docker-dev.sh
+
+# Test NATS connectivity
+./scripts/test-nats.sh
+

API Development

diff --git a/site/features.html b/site/features.html index 7f1f9909..b8d007a5 100644 --- a/site/features.html +++ b/site/features.html @@ -75,14 +75,14 @@

Auto-Hibernation

-
☸️
-

Kubernetes Native

-

Built on Kubernetes with custom resource definitions (CRDs). Leverage your existing K8s infrastructure, monitoring, and deployment tools.

+
🖥️
+

Multi-Platform Support

+

Deploy on Kubernetes, Docker, or hybrid environments. Event-driven architecture with NATS JetStream for platform coordination.

    -
  • Session CRD for user workspaces
  • -
  • Template CRD for applications
  • -
  • Native kubectl integration
  • -
  • Helm chart for deployment
  • +
  • Kubernetes controller with CRDs
  • +
  • Docker controller for standalone hosts
  • +
  • NATS JetStream messaging
  • +
  • Helm chart for K8s deployment
@@ -263,14 +263,14 @@

Technical Capabilities

-

Controller (Go + Kubebuilder)

+

Platform Controllers (Go)

    +
  • Kubernetes controller (Kubebuilder)
  • +
  • Docker controller (standalone)
  • +
  • NATS JetStream event handling
  • Session lifecycle management
  • Automatic resource provisioning
  • -
  • State machine (running/hibernated/terminated)
  • -
  • Deployment scaling (0/1 replicas)
  • -
  • Service and Ingress creation
  • -
  • PVC provisioning for user homes
  • +
  • State machine (running/hibernated)
  • Prometheus metrics export
  • Leader election for HA
@@ -321,6 +321,12 @@

K3s (Recommended)

Optimized for k3s lightweight Kubernetes. Perfect for edge deployments and ARM64 architectures like Raspberry Pi clusters.

+
+
🐳
+

Docker Standalone

+

Deploy on a single Docker host with Docker Compose. Great for development, testing, or small teams.

+
+
☁️

Cloud Providers

diff --git a/site/index.html b/site/index.html index d3033d5a..ebc23dc8 100644 --- a/site/index.html +++ b/site/index.html @@ -62,9 +62,9 @@

Browser-Based Access

No client installation required. Access any application directly from your web browser using VNC streaming technology.

-
☸️
-

Kubernetes Native

-

Built on Kubernetes with custom CRDs. Leverage your existing K8s infrastructure and tools.

+
🖥️
+

Multi-Platform

+

Deploy on Kubernetes, Docker, or hybrid environments. Event-driven architecture with NATS messaging.

⚑
@@ -148,20 +148,19 @@

Architecture

┌─────────────┐         ┌─────────────┐         ┌──────────────┐
-│   Web UI    │────────▶│ API Backend │────────▶│ Kubernetes   │
-│  (React)    │  REST/WS│  (Go/Gin)   │  K8s API│  Controller  │
+│   Web UI    │────────▶│ API Backend │────────▶│     NATS     │
+│  (React)    │  REST/WS│  (Go/Gin)   │  Events │  JetStream   │
 └─────────────┘         └──────┬──────┘         └──────┬───────┘
-                               │                        │
-                               │                        │
-                        ┌──────┴────────┐       ┌──────┴───────┐
-                        │  PostgreSQL   │       │  Sessions    │
-                        │   Database    │       │  (CRDs)      │
-                        └───────────────┘       └──────────────┘
+                               │                   ┌───┴───┐
+                        ┌──────┴────────┐    ┌─────┴───┐   └───────┐
+                        │  PostgreSQL   │    │   K8s   │   │Docker │
+                        │   Database    │    │  Ctrl   │   │ Ctrl  │
+                        └───────────────┘    └─────────┘   └───────┘
-

Go Controller

-

Kubebuilder-based controller manages session lifecycle, hibernation, and resource provisioning.

+

Platform Controllers

+

Kubernetes and Docker controllers manage sessions on their respective platforms via NATS events.

Go API Backend