Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 43 additions & 5 deletions internal/commands/surface/exocortex/cluster_init.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,10 @@
package exocortex

import (
"castra/internal/device"
"castra/internal/exocortex"
"castra/internal/irisdb"
"context"
"flag"
"fmt"
"log"
"os"
Expand Down Expand Up @@ -73,6 +73,16 @@ func (c *ClusterInitCommand) Execute(cmdCtx *Context) error {
if err != nil {
return err
}

// Persist parent-mode device config so subsequent CLI invocations
// (memory_write, memory_search, sovereign verify) on this host know
// they are the parent and route accordingly. SetDeviceMode is non-
// destructive — it preserves device.key (and thus the castra.db HMAC
// seal) and only rewrites device.yaml. Non-fatal: the daemon is already
// running, but mode-aware consumers will misroute without this.
if err := device.SetDeviceMode(device.DeviceDir(), device.DeviceModeParent, ""); err != nil {
log.Printf("[castra] cluster_init: failed to persist parent mode in device.yaml: %v", err)
}
fmt.Printf("exocortex enrollment listener on %s\n", enrollSrv.Addr())
fmt.Printf("enrollment token (single-use, 1h TTL, bound to CN %q): %s\n", cfg.EnrollCN, tok)
fmt.Println()
Expand All @@ -83,11 +93,16 @@ func (c *ClusterInitCommand) Execute(cmdCtx *Context) error {
}

// clusterInitParseFlags pre-checks --role and parses the full daemon-flag set.
//
// We extract --role manually (extractStringFlag) instead of routing through a
// dedicated FlagSet because the dedicated-FlagSet approach errors on every
// downstream daemon flag (--listen, --enroll-addr, etc.) — flag.ContinueOnError
// returns "flag provided but not defined" before reaching the second-stage
// parser. Manual extraction is what the downstream stripFlag already does;
// reusing it here keeps the surface honest about the two-FlagSet seam.
func clusterInitParseFlags(args []string) (*exocortex.Config, error) {
fs := flag.NewFlagSet("exocortex cluster init", flag.ContinueOnError)
var role string
fs.StringVar(&role, "role", "", `Node role — only "parent" is supported`)
if err := fs.Parse(args); err != nil {
role, err := extractStringFlag(args, "role")
if err != nil {
return nil, err
}
if role != "parent" {
Expand All @@ -96,6 +111,29 @@ func clusterInitParseFlags(args []string) (*exocortex.Config, error) {
return parseExocortexDaemonFlags("exocortex cluster init", stripFlag(args, "role"))
}

// extractStringFlag scans args for the string flag called name and returns its
// value without involving a flag.FlagSet, so unrelated sibling flags in args
// never cause a parse error.
//
// Accepted spellings are `--name value`, `-name value`, `--name=value` and
// `-name=value`. The first match in argument order wins. A bare `--name` or
// `-name` as the final argument yields an error. When nothing matches — which
// includes the empty-valued form `--name=` — the result is "" with a nil error.
func extractStringFlag(args []string, name string) (string, error) {
	long, short := "--"+name, "-"+name
	for idx, arg := range args {
		// Space-separated form: the value is the following argument.
		if arg == long || arg == short {
			if idx == len(args)-1 {
				return "", fmt.Errorf("flag --%s requires a value", name)
			}
			return args[idx+1], nil
		}
		// Inline form: the value follows the '=' and must be non-empty,
		// hence the strict length comparison.
		for _, prefix := range [...]string{long + "=", short + "="} {
			if len(arg) > len(prefix) && arg[:len(prefix)] == prefix {
				return arg[len(prefix):], nil
			}
		}
	}
	return "", nil
}

// clusterInitGenerateCerts generates CA + server certs and patches cfg paths.
func clusterInitGenerateCerts(cfg *exocortex.Config) error {
certsResult, err := exocortex.GenerateClusterCerts(cfg)
Expand Down
46 changes: 46 additions & 0 deletions internal/commands/surface/exocortex/enroll.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,13 @@
package exocortex

import (
"castra/internal/device"
"castra/internal/exocortex"
"castra/internal/persona"
"context"
"errors"
"fmt"
"net/url"
"os"
"strings"
"time"
Expand Down Expand Up @@ -99,14 +101,58 @@ func (c *ClusterEnrollCommand) Execute(cmdCtx *Context) error {
return fmt.Errorf("exocortex cluster enroll: %w", err)
}

// Persist client-mode device config so subsequent CLI invocations
// (memory_write, memory_search, sovereign verify) route to the parent
// daemon via mTLS instead of falling back to the WAL queue.
parentAddr, deriveErr := deriveParentDaemonAddr(parentURL)
if deriveErr != nil {
return fmt.Errorf("enrollment succeeded but failed to derive parent daemon address from %q: %w", parentURL, deriveErr)
}
if err := device.SetDeviceMode(device.DeviceDir(), device.DeviceModeClient, parentAddr); err != nil {
return fmt.Errorf("enrollment succeeded but failed to persist client mode in device.yaml: %w", err)
}

fmt.Println("Enrollment complete.")
fmt.Printf(" cn: %s\n", res.CommonName)
fmt.Printf(" ca cert: %s\n", res.CACertPath)
fmt.Printf(" client cert: %s\n", res.ClientCertPath)
fmt.Printf(" client key: %s\n", res.ClientKeyPath)
fmt.Printf(" parent_addr: %s (recorded in device.yaml)\n", parentAddr)
return nil
}

// deriveParentDaemonAddr converts the enrollment URL the operator supplied on
// the CLI (https://host:enroll-port) into the host:port the data-plane mTLS
// daemon listens on. The enrollment listener and the mTLS daemon run on
// different ports by design (DefaultEnrollAddr=:9438 vs DefaultListenAddr=:9437),
// and the enrollment response payload does not currently carry the daemon
// address — so we derive it heuristically: take the parent host from the URL
// and attach the port found in exocortex.DefaultListenAddr. Whatever port the
// enrollment URL itself carried is discarded.
//
// IPv6 literal hosts (https://[::1]:9438) are returned in bracketed form
// ([::1]:9437) so the result is a valid host:port for dialing.
//
// This is correct for every cluster that runs the parent daemon on the default
// port. If a future iteration of cluster_init learns to bind the daemon on a
// non-default port, the enrollment response should carry the daemon addr
// explicitly and this heuristic should be replaced.
//
// It returns an error when parentURL does not parse, when it has no host
// component (for example a scheme-less "host:port"), or when
// DefaultListenAddr contains no ':' separator.
func deriveParentDaemonAddr(parentURL string) (string, error) {
	u, err := url.Parse(parentURL)
	if err != nil {
		return "", fmt.Errorf("parse url: %w", err)
	}
	host := u.Hostname()
	if host == "" {
		return "", fmt.Errorf("empty host")
	}
	// Extract the canonical daemon port from DefaultListenAddr. The daemon
	// listens on this port regardless of which port the enrollment URL used.
	idx := strings.LastIndex(exocortex.DefaultListenAddr, ":")
	if idx < 0 {
		return "", fmt.Errorf("internal: DefaultListenAddr %q has no port", exocortex.DefaultListenAddr)
	}
	daemonPort := exocortex.DefaultListenAddr[idx+1:]
	// URL.Hostname strips the square brackets from IPv6 literals. Restore
	// them, otherwise the colons inside the address make the joined
	// host:port ambiguous and undialable (same rule as net.JoinHostPort).
	if strings.Contains(host, ":") {
		host = "[" + host + "]"
	}
	return host + ":" + daemonPort, nil
}

// defaultEnrollCN returns a sane default common name: the host's local
// hostname, falling back to "castra-child" if the lookup fails. Operators
// can always override with --cn.
Expand Down
30 changes: 15 additions & 15 deletions internal/creator/creator.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ import (
"database/sql"
"log"
"os"
"path/filepath"
"time"
)

Expand Down Expand Up @@ -216,26 +215,27 @@ func VerifyClient(irisDB *sql.DB) CreatorAuthResult {
//
// Errors are audit-logged without leaking detail to callers (opaque Verified=false).
func clientAttestRoundtrip(parentAddr string, irisDB *sql.DB) CreatorAuthResult {
// Resolve client-side mTLS certificate paths. The device must have had its
// HEIST certs initialized (castra exocortex start / cluster enroll) for
// these to exist.
certDir, err := castradir.Path("certs")
if err != nil {
log.Printf("[castra] clientAttestRoundtrip: resolve home dir (audit): %v", err)
// Resolve client-side mTLS certificate paths via the mode-aware seam in
// exocortex.NewDefaultConfig — this ensures clientAttestRoundtrip and
// every other mTLS consumer (memory_write, memory_search, heartbeat) read
// from the same cert filenames (client.crt / client.key / ca.crt for
// enrolled clients). Prior to mode awareness this function hard-coded the
// parent-shape names (server.pem / server.key / ca.pem), causing the mTLS
// handshake to fail on enrolled clients and silent fall-through to public
// mode for every sovereign-verify call.
certPath, keyPath, caPath, _, resolveErr := exocortex.ResolveModeAwareClientCertPaths()
if resolveErr != nil {
log.Printf("[castra] clientAttestRoundtrip: resolve cert dir (audit): %v", resolveErr)
return CreatorAuthResult{Verified: false}
}
certPath := filepath.Join(certDir, "server.pem")
keyPath := filepath.Join(certDir, "server.key")
// Fall back to the alternate key name used by the cert-rotation tooling.
if _, statErr := os.Stat(keyPath); os.IsNotExist(statErr) {
keyPath = filepath.Join(certDir, "server-key.pem")
}
caPath := filepath.Join(certDir, "ca.pem")

// Build an exocortex.Config pointing at the parent daemon. The bus client
// (exocortex.NewClient) is responsible for cert/key/CA loading, ServerName
// derivation, and TLS handshake — we only supply file paths and the
// destination address.
// destination address. We pass parentAddr (from device.yaml) explicitly
// here rather than the resolver-returned daemon addr because VerifyClient
// already loaded it via device.LoadDeviceParentAddr and we want to keep
// the dispatch source-of-truth at the caller.
cfg := &exocortex.Config{
ListenAddr: parentAddr,
CertPath: certPath,
Expand Down
9 changes: 6 additions & 3 deletions internal/creator/creator_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -337,9 +337,12 @@ func startAttestDaemon(t *testing.T, db *sql.DB, sovereignVerifier func() bool)
}
}

copyFile(d.ClientConfig.CertPath, filepath.Join(certDir, "server.pem"))
copyFile(d.ClientConfig.KeyPath, filepath.Join(certDir, "server.key"))
copyFile(d.ClientConfig.CAPath, filepath.Join(certDir, "ca.pem"))
// Client-shape cert paths post-mode-aware-resolution (task lhg7ls2qok4pqvds):
// clientAttestRoundtrip now reads client.crt / client.key / ca.crt — the
// enrolled-client filenames — instead of the legacy server-shape names.
copyFile(d.ClientConfig.CertPath, filepath.Join(certDir, "client.crt"))
copyFile(d.ClientConfig.KeyPath, filepath.Join(certDir, "client.key"))
copyFile(d.ClientConfig.CAPath, filepath.Join(certDir, "ca.crt"))

return d, fakeHome
}
Expand Down
58 changes: 50 additions & 8 deletions internal/device/device.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,20 +70,60 @@ func IsInitialized() bool {

// InitDevice initializes the device identity in the given directory.
// Returns an error if device.key already exists (idempotency guard).
//
// Mode and ParentAddr are not set by this entry point — bootstrap surfaces
// (cluster_init / cluster_enroll) call SetDeviceMode post-success to imprint
// the role and parent address once they're known, without regenerating the
// device key. Pre-cluster local installs land here with implicit-mode
// device.yaml, preserving the prior single-node behaviour.
func InitDevice(dir string) (*DeviceIdentity, error) {
keyPath := filepath.Join(dir, deviceKeyFile)
if _, err := os.Stat(keyPath); err == nil {
return nil, fmt.Errorf("device already initialized; use --force to reinitialize")
}
return initDeviceInternal(dir)
return initDeviceInternal(dir, "", "")
}

// InitDeviceForce initializes the device identity, overwriting any existing keys.
func InitDeviceForce(dir string) (*DeviceIdentity, error) {
return initDeviceInternal(dir)
//
// mode and parentAddr are stamped into device.yaml when non-empty. Pass mode=""
// and parentAddr="" for the legacy implicit-mode behaviour.
//
// NOTE: this entry point regenerates the device keypair. It is destructive in
// the presence of an existing castra.db that is HMAC-sealed against the prior
// device.key — the DB sentinel will fail integrity check on next open. Use
// SetDeviceMode for the post-bootstrap mode/parent_addr stamp performed by
// cluster_init and cluster_enroll; that path preserves the existing key and
// only mutates device.yaml.
func InitDeviceForce(dir string, mode DeviceMode, parentAddr string) (*DeviceIdentity, error) {
return initDeviceInternal(dir, mode, parentAddr)
}

// SetDeviceMode stamps mode and parentAddr onto the device.yaml stored in dir.
//
// The existing identity is loaded with readDeviceYAML, only its Mode and
// ParentAddr fields are overwritten, and the result is written back through
// writeDeviceYAML. Every other field is written back as it was read, and this
// function generates no key material itself. cluster_init and cluster_enroll
// use it after a successful bootstrap to record the host's cluster role.
//
// Any failure to read the current device.yaml or to write the updated one is
// returned wrapped with a "SetDeviceMode:" prefix. A read failure presumably
// includes the not-yet-initialized case, in which InitDevice must run first.
func SetDeviceMode(dir string, mode DeviceMode, parentAddr string) error {
	current, readErr := readDeviceYAML(filepath.Join(dir, deviceYAMLFile))
	if readErr != nil {
		return fmt.Errorf("SetDeviceMode: read device.yaml: %w", readErr)
	}

	// Touch only the two cluster-role fields; everything else round-trips.
	current.Mode, current.ParentAddr = mode, parentAddr

	if writeErr := writeDeviceYAML(dir, current); writeErr != nil {
		return fmt.Errorf("SetDeviceMode: write device.yaml: %w", writeErr)
	}
	return nil
}

func initDeviceInternal(dir string) (*DeviceIdentity, error) {
func initDeviceInternal(dir string, mode DeviceMode, parentAddr string) (*DeviceIdentity, error) {
if err := castradir.MkdirPrivate(dir); err != nil {
return nil, fmt.Errorf("failed to create device directory: %w", err)
}
Expand All @@ -106,10 +146,12 @@ func initDeviceInternal(dir string) (*DeviceIdentity, error) {
createdAt := time.Now().UTC().Format(time.RFC3339)

identity := &DeviceIdentity{
ID: id,
Token: token,
PublicKey: pubKeyB64,
CreatedAt: createdAt,
ID: id,
Token: token,
PublicKey: pubKeyB64,
CreatedAt: createdAt,
Mode: mode,
ParentAddr: parentAddr,
}

if err := writeDeviceYAML(dir, identity); err != nil {
Expand Down
Loading
Loading