diff --git a/internal/commands/surface/exocortex/cluster_init.go b/internal/commands/surface/exocortex/cluster_init.go index 72a5f184..c587a963 100644 --- a/internal/commands/surface/exocortex/cluster_init.go +++ b/internal/commands/surface/exocortex/cluster_init.go @@ -27,10 +27,10 @@ package exocortex import ( + "castra/internal/device" "castra/internal/exocortex" "castra/internal/irisdb" "context" - "flag" "fmt" "log" "os" @@ -73,6 +73,16 @@ func (c *ClusterInitCommand) Execute(cmdCtx *Context) error { if err != nil { return err } + + // Persist parent-mode device config so subsequent CLI invocations + // (memory_write, memory_search, sovereign verify) on this host know + // they are the parent and route accordingly. SetDeviceMode is non- + // destructive — it preserves device.key (and thus the castra.db HMAC + // seal) and only rewrites device.yaml. Non-fatal: the daemon is already + // running, but mode-aware consumers will misroute without this. + if err := device.SetDeviceMode(device.DeviceDir(), device.DeviceModeParent, ""); err != nil { + log.Printf("[castra] cluster_init: failed to persist parent mode in device.yaml: %v", err) + } fmt.Printf("exocortex enrollment listener on %s\n", enrollSrv.Addr()) fmt.Printf("enrollment token (single-use, 1h TTL, bound to CN %q): %s\n", cfg.EnrollCN, tok) fmt.Println() @@ -83,11 +93,16 @@ func (c *ClusterInitCommand) Execute(cmdCtx *Context) error { } // clusterInitParseFlags pre-checks --role and parses the full daemon-flag set. +// +// We extract --role manually (extractStringFlag) instead of routing through a +// dedicated FlagSet because the dedicated-FlagSet approach errors on every +// downstream daemon flag (--listen, --enroll-addr, etc.) — flag.ContinueOnError +// returns "flag provided but not defined" before reaching the second-stage +// parser. Manual extraction is what the downstream stripFlag already does; +// reusing it here keeps the surface honest about the two-FlagSet seam. func clusterInitParseFlags(args []string) (*exocortex.Config, error) { - fs := flag.NewFlagSet("exocortex cluster init", flag.ContinueOnError) - var role string - fs.StringVar(&role, "role", "", `Node role — only "parent" is supported`) - if err := fs.Parse(args); err != nil { + role, err := extractStringFlag(args, "role") + if err != nil { return nil, err } if role != "parent" { @@ -96,6 +111,29 @@ func clusterInitParseFlags(args []string) (*exocortex.Config, error) { return parseExocortexDaemonFlags("exocortex cluster init", stripFlag(args, "role")) } +// extractStringFlag returns the value of --name from args without consuming +// the FlagSet machinery (which would error on every unknown sibling flag). +// Supports `--name value`, `--name=value`, `-name value`, `-name=value`. +// Returns "" with nil error when the flag is absent. +func extractStringFlag(args []string, name string) (string, error) { + for i := 0; i < len(args); i++ { + a := args[i] + if a == "--"+name || a == "-"+name { + if i+1 >= len(args) { + return "", fmt.Errorf("flag --%s requires a value", name) + } + return args[i+1], nil + } + if pfx := "--" + name + "="; len(a) > len(pfx) && a[:len(pfx)] == pfx { + return a[len(pfx):], nil + } + if pfx := "-" + name + "="; len(a) > len(pfx) && a[:len(pfx)] == pfx { + return a[len(pfx):], nil + } + } + return "", nil +} + // clusterInitGenerateCerts generates CA + server certs and patches cfg paths. func clusterInitGenerateCerts(cfg *exocortex.Config) error { certsResult, err := exocortex.GenerateClusterCerts(cfg) diff --git a/internal/commands/surface/exocortex/enroll.go b/internal/commands/surface/exocortex/enroll.go index c8637aef..7a110301 100644 --- a/internal/commands/surface/exocortex/enroll.go +++ b/internal/commands/surface/exocortex/enroll.go @@ -17,11 +17,13 @@ package exocortex import ( + "castra/internal/device" "castra/internal/exocortex" "castra/internal/persona" "context" "errors" "fmt" + "net/url" "os" "strings" "time" @@ -99,14 +101,58 @@ func (c *ClusterEnrollCommand) Execute(cmdCtx *Context) error { return fmt.Errorf("exocortex cluster enroll: %w", err) } + // Persist client-mode device config so subsequent CLI invocations + // (memory_write, memory_search, sovereign verify) route to the parent + // daemon via mTLS instead of falling back to the WAL queue. + parentAddr, deriveErr := deriveParentDaemonAddr(parentURL) + if deriveErr != nil { + return fmt.Errorf("enrollment succeeded but failed to derive parent daemon address from %q: %w", parentURL, deriveErr) + } + if err := device.SetDeviceMode(device.DeviceDir(), device.DeviceModeClient, parentAddr); err != nil { + return fmt.Errorf("enrollment succeeded but failed to persist client mode in device.yaml: %w", err) + } + fmt.Println("Enrollment complete.") fmt.Printf(" cn: %s\n", res.CommonName) fmt.Printf(" ca cert: %s\n", res.CACertPath) fmt.Printf(" client cert: %s\n", res.ClientCertPath) fmt.Printf(" client key: %s\n", res.ClientKeyPath) + fmt.Printf(" parent_addr: %s (recorded in device.yaml)\n", parentAddr) return nil } +// deriveParentDaemonAddr converts the enrollment URL the operator supplied on +// the CLI (https://host:enroll-port) into the host:port the data-plane mTLS +// daemon listens on. The enrollment listener and the mTLS daemon run on +// different ports by design (DefaultEnrollAddr=:9438 vs DefaultListenAddr=:9437), +// and the enrollment response payload does not currently carry the daemon +// address — so we derive it heuristically: take the parent host, attach the +// canonical DefaultListenAddr port. +// +// This is correct for every cluster that runs the parent daemon on the default +// port (the only configuration the current cluster_init surface supports). If +// a future iteration of cluster_init learns to bind the daemon on a non-default +// port, the enrollment response should carry the daemon addr explicitly and +// this heuristic should be replaced. +func deriveParentDaemonAddr(parentURL string) (string, error) { + u, err := url.Parse(parentURL) + if err != nil { + return "", fmt.Errorf("parse url: %w", err) + } + host := u.Hostname() + if host == "" { + return "", fmt.Errorf("empty host") + } + // Extract the canonical daemon port from DefaultListenAddr. The daemon + // listens on this port regardless of which port the enrollment URL used. + idx := strings.LastIndex(exocortex.DefaultListenAddr, ":") + if idx < 0 { + return "", fmt.Errorf("internal: DefaultListenAddr %q has no port", exocortex.DefaultListenAddr) + } + daemonPort := exocortex.DefaultListenAddr[idx+1:] + return host + ":" + daemonPort, nil +} + // defaultEnrollCN returns a sane default common name: the host's local // hostname, falling back to "castra-child" if the lookup fails. Operators // can always override with --cn. diff --git a/internal/creator/creator.go b/internal/creator/creator.go index 2c78952c..b0b8a200 100644 --- a/internal/creator/creator.go +++ b/internal/creator/creator.go @@ -13,7 +13,6 @@ import ( "database/sql" "log" "os" - "path/filepath" "time" ) @@ -216,26 +215,27 @@ func VerifyClient(irisDB *sql.DB) CreatorAuthResult { // // Errors are audit-logged without leaking detail to callers (opaque Verified=false). func clientAttestRoundtrip(parentAddr string, irisDB *sql.DB) CreatorAuthResult { - // Resolve client-side mTLS certificate paths. The device must have had its - // HEIST certs initialized (castra exocortex start / cluster enroll) for - // these to exist. - certDir, err := castradir.Path("certs") - if err != nil { - log.Printf("[castra] clientAttestRoundtrip: resolve home dir (audit): %v", err) + // Resolve client-side mTLS certificate paths via the mode-aware seam in + // exocortex.NewDefaultConfig — this ensures clientAttestRoundtrip and + // every other mTLS consumer (memory_write, memory_search, heartbeat) read + // from the same cert filenames (client.crt / client.key / ca.crt for + // enrolled clients). Prior to mode awareness this function hard-coded the + // parent-shape names (server.pem / server.key / ca.pem), causing the mTLS + // handshake to fail on enrolled clients and silent fall-through to public + // mode for every sovereign-verify call. + certPath, keyPath, caPath, _, resolveErr := exocortex.ResolveModeAwareClientCertPaths() + if resolveErr != nil { + log.Printf("[castra] clientAttestRoundtrip: resolve cert dir (audit): %v", resolveErr) return CreatorAuthResult{Verified: false} } - certPath := filepath.Join(certDir, "server.pem") - keyPath := filepath.Join(certDir, "server.key") - // Fall back to the alternate key name used by the cert-rotation tooling. - if _, statErr := os.Stat(keyPath); os.IsNotExist(statErr) { - keyPath = filepath.Join(certDir, "server-key.pem") - } - caPath := filepath.Join(certDir, "ca.pem") // Build an exocortex.Config pointing at the parent daemon. The bus client // (exocortex.NewClient) is responsible for cert/key/CA loading, ServerName // derivation, and TLS handshake — we only supply file paths and the - // destination address. + // destination address. We pass parentAddr (from device.yaml) explicitly + // here rather than the resolver-returned daemon addr because VerifyClient + // already loaded it via device.LoadDeviceParentAddr and we want to keep + // the dispatch source-of-truth at the caller. cfg := &exocortex.Config{ ListenAddr: parentAddr, CertPath: certPath, diff --git a/internal/creator/creator_test.go b/internal/creator/creator_test.go index 5dd8b5ea..4e993d4b 100644 --- a/internal/creator/creator_test.go +++ b/internal/creator/creator_test.go @@ -337,9 +337,12 @@ func startAttestDaemon(t *testing.T, db *sql.DB, sovereignVerifier func() bool) } } - copyFile(d.ClientConfig.CertPath, filepath.Join(certDir, "server.pem")) - copyFile(d.ClientConfig.KeyPath, filepath.Join(certDir, "server.key")) - copyFile(d.ClientConfig.CAPath, filepath.Join(certDir, "ca.pem")) + // Client-shape cert paths post-mode-aware-resolution (task lhg7ls2qok4pqvds): + // clientAttestRoundtrip now reads client.crt / client.key / ca.crt — the + // enrolled-client filenames — instead of the legacy server-shape names. + copyFile(d.ClientConfig.CertPath, filepath.Join(certDir, "client.crt")) + copyFile(d.ClientConfig.KeyPath, filepath.Join(certDir, "client.key")) + copyFile(d.ClientConfig.CAPath, filepath.Join(certDir, "ca.crt")) return d, fakeHome } diff --git a/internal/device/device.go b/internal/device/device.go index 9fa1e139..a61a0381 100644 --- a/internal/device/device.go +++ b/internal/device/device.go @@ -70,20 +70,60 @@ func IsInitialized() bool { // InitDevice initializes the device identity in the given directory. // Returns an error if device.key already exists (idempotency guard). +// +// Mode and ParentAddr are not set by this entry point — bootstrap surfaces +// (cluster_init / cluster_enroll) call InitDeviceForce post-success to imprint +// the role and parent address once they're known. Pre-cluster local installs +// land here with implicit-mode device.yaml, preserving the prior single-node +// behaviour. func InitDevice(dir string) (*DeviceIdentity, error) { keyPath := filepath.Join(dir, deviceKeyFile) if _, err := os.Stat(keyPath); err == nil { return nil, fmt.Errorf("device already initialized; use --force to reinitialize") } - return initDeviceInternal(dir) + return initDeviceInternal(dir, "", "") } // InitDeviceForce initializes the device identity, overwriting any existing keys. -func InitDeviceForce(dir string) (*DeviceIdentity, error) { - return initDeviceInternal(dir) +// +// mode and parentAddr are stamped into device.yaml when non-empty. Pass mode="" +// and parentAddr="" for the legacy implicit-mode behaviour. +// +// NOTE: this entry point regenerates the device keypair. It is destructive in +// the presence of an existing castra.db that is HMAC-sealed against the prior +// device.key — the DB sentinel will fail integrity check on next open. Use +// SetDeviceMode for the post-bootstrap mode/parent_addr stamp performed by +// cluster_init and cluster_enroll; that path preserves the existing key and +// only mutates device.yaml. +func InitDeviceForce(dir string, mode DeviceMode, parentAddr string) (*DeviceIdentity, error) { + return initDeviceInternal(dir, mode, parentAddr) +} + +// SetDeviceMode updates the mode and parent_addr fields on an already- +// initialized device.yaml, preserving the device.key, ID, token, and public +// key. This is the non-destructive seam cluster_init / cluster_enroll use +// after the cluster bootstrap succeeds to record the host's cluster role +// without invalidating the HMAC seal on castra.db. +// +// The on-disk yaml is rewritten via the existing writeDeviceYAML helper so +// the file format stays consistent (mode/parent_addr emitted only when set). +// Returns an error when device.yaml does not exist — callers must run +// InitDevice first. +func SetDeviceMode(dir string, mode DeviceMode, parentAddr string) error { + yamlPath := filepath.Join(dir, deviceYAMLFile) + identity, err := readDeviceYAML(yamlPath) + if err != nil { + return fmt.Errorf("SetDeviceMode: read device.yaml: %w", err) + } + identity.Mode = mode + identity.ParentAddr = parentAddr + if err := writeDeviceYAML(dir, identity); err != nil { + return fmt.Errorf("SetDeviceMode: write device.yaml: %w", err) + } + return nil } -func initDeviceInternal(dir string) (*DeviceIdentity, error) { +func initDeviceInternal(dir string, mode DeviceMode, parentAddr string) (*DeviceIdentity, error) { if err := castradir.MkdirPrivate(dir); err != nil { return nil, fmt.Errorf("failed to create device directory: %w", err) } @@ -106,10 +146,12 @@ func initDeviceInternal(dir string) (*DeviceIdentity, error) { createdAt := time.Now().UTC().Format(time.RFC3339) identity := &DeviceIdentity{ - ID: id, - Token: token, - PublicKey: pubKeyB64, - CreatedAt: createdAt, + ID: id, + Token: token, + PublicKey: pubKeyB64, + CreatedAt: createdAt, + Mode: mode, + ParentAddr: parentAddr, } if err := writeDeviceYAML(dir, identity); err != nil { diff --git a/internal/device/device_test.go b/internal/device/device_test.go index ab0e0617..9b5f5988 100644 --- a/internal/device/device_test.go +++ b/internal/device/device_test.go @@ -97,7 +97,7 @@ func TestInitDeviceForce_OverwritesExistingKey(t *testing.T) { } // Force re-init: must succeed and produce a NEW identity. - id2, err := InitDeviceForce(dir) + id2, err := InitDeviceForce(dir, "", "") if err != nil { t.Fatalf("InitDeviceForce: %v", err) } @@ -108,6 +108,115 @@ func TestInitDeviceForce_OverwritesExistingKey(t *testing.T) { } } +// TestSetDeviceMode_PreservesKeyAndStampsYAML verifies the non-destructive +// seam cluster_init / cluster_enroll use to record cluster role + parent +// address after a regular InitDevice has already run. The device.key on disk +// must NOT be rotated — that would invalidate the castra.db HMAC seal. +func TestSetDeviceMode_PreservesKeyAndStampsYAML(t *testing.T) { + dir := t.TempDir() + id1, err := InitDevice(dir) + if err != nil { + t.Fatalf("InitDevice: %v", err) + } + + keyPath := filepath.Join(dir, deviceKeyFile) + keyBytesBefore, err := os.ReadFile(keyPath) + if err != nil { + t.Fatalf("read device.key: %v", err) + } + + if err := SetDeviceMode(dir, DeviceModeClient, "parent.test:9437"); err != nil { + t.Fatalf("SetDeviceMode: %v", err) + } + + keyBytesAfter, err := os.ReadFile(keyPath) + if err != nil { + t.Fatalf("re-read device.key: %v", err) + } + if !bytesEqual(keyBytesBefore, keyBytesAfter) { + t.Fatal("SetDeviceMode rotated device.key — would break castra.db HMAC seal") + } + + loaded, err := LoadDevice(dir) + if err != nil { + t.Fatalf("LoadDevice: %v", err) + } + if loaded.ID != id1.ID { + t.Errorf("ID rotated: %q -> %q", id1.ID, loaded.ID) + } + if loaded.PublicKey != id1.PublicKey { + t.Errorf("PublicKey rotated: %q -> %q", id1.PublicKey, loaded.PublicKey) + } + if loaded.Mode != DeviceModeClient { + t.Errorf("Mode = %q, want client", loaded.Mode) + } + if loaded.ParentAddr != "parent.test:9437" { + t.Errorf("ParentAddr = %q, want parent.test:9437", loaded.ParentAddr) + } +} + +func bytesEqual(a, b []byte) bool { + if len(a) != len(b) { + return false + } + for i := range a { + if a[i] != b[i] { + return false + } + } + return true +} + +// TestInitDeviceForce_StampsModeAndParentAddr verifies the post-bootstrap +// wiring contract: InitDeviceForce must persist Mode and ParentAddr into +// device.yaml when supplied. This is the seam cluster_init / cluster_enroll +// rely on to durably record role + dial target so subsequent CLI invocations +// route correctly without re-running enrollment. +func TestInitDeviceForce_StampsModeAndParentAddr(t *testing.T) { + t.Run("client mode with parent_addr", func(t *testing.T) { + dir := t.TempDir() + id, err := InitDeviceForce(dir, DeviceModeClient, "parent.example:9437") + if err != nil { + t.Fatalf("InitDeviceForce: %v", err) + } + if id.Mode != DeviceModeClient { + t.Errorf("Mode = %q, want %q", id.Mode, DeviceModeClient) + } + if id.ParentAddr != "parent.example:9437" { + t.Errorf("ParentAddr = %q, want %q", id.ParentAddr, "parent.example:9437") + } + + loaded, err := LoadDevice(dir) + if err != nil { + t.Fatalf("LoadDevice: %v", err) + } + if loaded.Mode != DeviceModeClient || loaded.ParentAddr != "parent.example:9437" { + t.Errorf("LoadDevice round-trip mismatch: mode=%q parent_addr=%q", loaded.Mode, loaded.ParentAddr) + } + }) + + t.Run("parent mode no parent_addr", func(t *testing.T) { + dir := t.TempDir() + id, err := InitDeviceForce(dir, DeviceModeParent, "") + if err != nil { + t.Fatalf("InitDeviceForce: %v", err) + } + if id.Mode != DeviceModeParent { + t.Errorf("Mode = %q, want %q", id.Mode, DeviceModeParent) + } + if id.ParentAddr != "" { + t.Errorf("ParentAddr = %q, want empty", id.ParentAddr) + } + loaded, err := LoadDevice(dir) + if err != nil { + t.Fatalf("LoadDevice: %v", err) + } + if loaded.Mode != DeviceModeParent { + t.Errorf("LoadDevice: Mode = %q, want %q", loaded.Mode, DeviceModeParent) + } + }) +} + // ─── LoadDevice ───────────────────────────────────────────────────────────── func TestLoadDevice_ReadsExistingYAMLCorrectly(t *testing.T) { diff --git a/internal/exocortex/certgen.go b/internal/exocortex/certgen.go index 10d8be4f..c030e3b4 100644 --- a/internal/exocortex/certgen.go +++ b/internal/exocortex/certgen.go @@ -43,6 +43,11 @@ import ( const ( clusterCAValidity = 10 * 365 * 24 * time.Hour clusterServerValidity = 365 * 24 * time.Hour + // clusterServerLegacyCN is the pre-v5.0 hard-coded Common Name. Retained + // only as the empty-ListenAddr fallback so degraded configs still emit a + // non-empty Subject; on any well-formed --listen the CN tracks the host so + // it aligns with the client's host-derived pin policy. + clusterServerLegacyCN = "castra-cluster-parent" ) // GenerateClusterCertsResult reports the on-disk paths of the generated cert @@ -139,9 +144,21 @@ func GenerateClusterCerts(cfg *Config) (*GenerateClusterCertsResult, error) { } listenHost := extractHost(cfg.ListenAddr) + // Server-leaf CN tracks the listen host so the client's host-derived pin + // policy (cfg.pinnedCN() == serverName == host(ListenAddr)) matches the + // presented cert. Symmetric with how single-machine daemon-start cert + // generation already populates the SAN from cfg.ListenAddr. Falls back to + // the legacy constant only when listenHost is empty (degraded input, e.g. + // malformed --listen) so the server still presents *some* identity rather + // than an empty Subject; that path stays "best effort" — operators who need + // strict pin matching must pass a well-formed --listen. + serverCN := listenHost + if serverCN == "" { + serverCN = clusterServerLegacyCN + } serverTmpl := &x509.Certificate{ SerialNumber: serverSerial, - Subject: pkix.Name{CommonName: "castra-cluster-parent"}, + Subject: pkix.Name{CommonName: serverCN}, NotBefore: now.Add(-time.Minute), NotAfter: now.Add(clusterServerValidity), KeyUsage: x509.KeyUsageDigitalSignature, diff --git a/internal/exocortex/certgen_test.go b/internal/exocortex/certgen_test.go index 802ca034..71217de6 100644 --- a/internal/exocortex/certgen_test.go +++ b/internal/exocortex/certgen_test.go @@ -195,6 +195,76 @@ func TestGenerateClusterCerts_SANInjection(t *testing.T) { } } +// TestGenerateClusterCerts_CNMatchesListenHost is the regression guard for the +// pre-v5.0 bug where the server-leaf CN was a constant ("castra-cluster-parent") +// while the client's host-derived pin policy compared against host(ListenAddr). +// The mismatch broke every cluster-mode mTLS handshake including parent-self- +// dial. Fix: server CN now tracks the listen host. +func TestGenerateClusterCerts_CNMatchesListenHost(t *testing.T) { + cases := []struct { + name string + listenAddr string + wantCN string + }{ + {"loopback IP", "127.0.0.1:9437", "127.0.0.1"}, + {"hostname", "parent.example.test:9437", "parent.example.test"}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + certDir := t.TempDir() + t.Setenv("HOME", certDir) + cfg := &Config{ListenAddr: tc.listenAddr} + res, err := GenerateClusterCerts(cfg) + if err != nil { + t.Fatalf("GenerateClusterCerts: %v", err) + } + serverCertPEM, err := os.ReadFile(res.ServerCertPath) + if err != nil { + t.Fatalf("read server cert: %v", err) + } + pair, err := tls.X509KeyPair(serverCertPEM, mustReadFile(t, res.ServerKeyPath)) + if err != nil { + t.Fatalf("X509KeyPair: %v", err) + } + leaf, err := x509.ParseCertificate(pair.Certificate[0]) + if err != nil { + t.Fatalf("ParseCertificate: %v", err) + } + if leaf.Subject.CommonName != tc.wantCN { + t.Errorf("server CN = %q, want %q (must match client host-derived pin)", leaf.Subject.CommonName, tc.wantCN) + } + }) + } +} + +// TestGenerateClusterCerts_LegacyCNFallback verifies that a degraded +// ListenAddr (empty / unparsable) still yields a non-empty Subject CN — the +// legacy constant — so the server cert remains structurally valid even when +// host extraction fails. Strict pin matching obviously cannot work in this +// degraded mode; operators who need it must pass a well-formed --listen. +func TestGenerateClusterCerts_LegacyCNFallback(t *testing.T) { + certDir := t.TempDir() + t.Setenv("HOME", certDir) + // Empty ListenAddr fails Validate() but GenerateClusterCerts itself does + // not call Validate; extractHost returns "" → CN falls back to legacy. + cfg := &Config{ListenAddr: ""} + res, err := GenerateClusterCerts(cfg) + if err != nil { + t.Fatalf("GenerateClusterCerts: %v", err) + } + pair, err := tls.X509KeyPair(mustReadFile(t, res.ServerCertPath), mustReadFile(t, res.ServerKeyPath)) + if err != nil { + t.Fatalf("X509KeyPair: %v", err) + } + leaf, err := x509.ParseCertificate(pair.Certificate[0]) + if err != nil { + t.Fatalf("ParseCertificate: %v", err) + } + if leaf.Subject.CommonName != "castra-cluster-parent" { + t.Errorf("degraded-config CN = %q, want %q legacy fallback", leaf.Subject.CommonName, "castra-cluster-parent") + } +} + func mustReadFile(t *testing.T, path string) []byte { t.Helper() b, err := os.ReadFile(path) diff --git a/internal/exocortex/client_test.go b/internal/exocortex/client_test.go index 4dceb6d6..bd707ec0 100644 --- a/internal/exocortex/client_test.go +++ b/internal/exocortex/client_test.go @@ -375,6 +375,62 @@ func TestClient_CertPinningRejectsWrongCN(t *testing.T) { } } +// TestClusterCertsAcceptedByClientPin is the end-to-end regression for the +// pre-v5.0 CN-vs-pin mismatch bug. It runs the real GenerateClusterCerts + +// the real NewClient/Dial path against a fixture tls.Server presenting the +// generated server cert, and asserts the pinning callback accepts. Before the +// fix this test would fail with `cert pinning: server CN "castra-cluster-parent" +// does not match pinned CN "127.0.0.1"`. +func TestClusterCertsAcceptedByClientPin(t *testing.T) { + certDir := t.TempDir() + t.Setenv("HOME", certDir) + + cfg := &exocortex.Config{ListenAddr: "127.0.0.1:9437"} + res, err := exocortex.GenerateClusterCerts(cfg) + if err != nil { + t.Fatalf("GenerateClusterCerts: %v", err) + } + + caPEM, err := os.ReadFile(res.CACertPath) + if err != nil { + t.Fatalf("read CA: %v", err) + } + caPool := x509.NewCertPool() + if !caPool.AppendCertsFromPEM(caPEM) { + t.Fatal("AppendCertsFromPEM: no certs") + } + serverCertPEM, err := os.ReadFile(res.ServerCertPath) + if err != nil { + t.Fatalf("read server cert: %v", err) + } + serverKeyPEM, err := os.ReadFile(res.ServerKeyPath) + if err != nil { + t.Fatalf("read server key: %v", err) + } + + // Fixture server presents the cluster-generated cert and requires a client + // cert signed by the same CA. The server cert reuses ServerCertPath as the + // client identity in this round-trip (symmetric with how the client wires + // CertPath in single-machine mode). + srvTLS := serverTLSConfig(t, serverCertPEM, serverKeyPEM, caPool) + addr := startFixtureServer(t, srvTLS) + + // The client must use the same loopback host that the server cert was + // issued for so the host-derived pin matches the server CN. + clientCfg := makeCfg(res.ServerCertPath, res.ServerKeyPath, res.CACertPath, addr) + client, err := exocortex.NewClient(clientCfg) + if err != nil { + t.Fatalf("NewClient: %v", err) + } + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + if err := client.Dial(ctx); err != nil { + t.Fatalf("cluster cert -> client pin round-trip failed: %v", err) + } + _ = client.Close() +} + // TestClient_CloseIdempotent verifies that Close() can be called twice without // returning an error the second time. func TestClient_CloseIdempotent(t *testing.T) { diff --git a/internal/exocortex/config.go b/internal/exocortex/config.go index e9e6f66f..4a950fcd 100644 --- a/internal/exocortex/config.go +++ b/internal/exocortex/config.go @@ -10,7 +10,9 @@ package exocortex import ( "castra/internal/castradir" + "castra/internal/device" "fmt" + "log" "os" "path/filepath" ) @@ -143,31 +145,32 @@ const DefaultEnrollAddr = "127.0.0.1:9438" // NewDefaultConfig returns a Config populated entirely from the compiled // defaults. Callers override individual fields from CLI flags before handing // the struct to the daemon runtime (task 2+). +// +// Mode awareness: NewDefaultConfig consults device.LoadDeviceMode() and +// device.LoadDeviceParentAddr() so client-mode hosts get client-shape cert +// paths (client.crt / client.key / ca.crt) and dial the parent's daemon +// instead of returning parent-shape paths that don't exist on a child node. +// Parent-mode and unset-mode hosts continue to return the original parent- +// shape paths plus DefaultListenAddr — preserving the pre-cluster single- +// node behaviour exactly. func NewDefaultConfig() (*Config, error) { cfgPath, err := DefaultConfigPath() if err != nil { return nil, err } - certPath, err := DefaultCertPath() - if err != nil { - return nil, err - } - keyPath, err := DefaultKeyPath() - if err != nil { - return nil, err - } - caPath, err := DefaultCAPath() + caKeyPath, err := DefaultCAKeyPath() if err != nil { return nil, err } - caKeyPath, err := DefaultCAKeyPath() + + certPath, keyPath, caPath, listenAddr, err := resolveModeAwareCertPaths() if err != nil { return nil, err } return &Config{ ConfigPath: cfgPath, - ListenAddr: DefaultListenAddr, + ListenAddr: listenAddr, CertPath: certPath, KeyPath: keyPath, CAPath: caPath, @@ -176,6 +179,60 @@ func NewDefaultConfig() (*Config, error) { }, nil } +// resolveModeAwareCertPaths returns the cert / key / CA / listen-addr quad +// appropriate for the current device mode. Centralised here so the creator +// package and any future consumer of "default client-shape paths" can call +// the same logic instead of re-deriving cert filenames inline. +func resolveModeAwareCertPaths() (certPath, keyPath, caPath, listenAddr string, err error) { + dir, err := DefaultCertDir() + if err != nil { + return "", "", "", "", err + } + mode := device.LoadDeviceMode() + switch mode { + case device.DeviceModeClient: + certPath = filepath.Join(dir, "client.crt") + keyPath = filepath.Join(dir, "client.key") + caPath = filepath.Join(dir, "ca.crt") + listenAddr = device.LoadDeviceParentAddr() + if listenAddr == "" { + log.Println("[castra] NewDefaultConfig: client mode but parent_addr unset — falling back to localhost daemon address") + listenAddr = DefaultListenAddr + } + default: // DeviceModeParent or unset implicit-mode + certPath, err = DefaultCertPath() + if err != nil { + return "", "", "", "", err + } + keyPath, err = DefaultKeyPath() + if err != nil { + return "", "", "", "", err + } + caPath, err = DefaultCAPath() + if err != nil { + return "", "", "", "", err + } + listenAddr = DefaultListenAddr + } + return certPath, keyPath, caPath, listenAddr, nil +} + +// ResolveModeAwareClientCertPaths is the exported seam consumers outside this +// package use to obtain the cert / key / CA / daemon-addr quad matching the +// current device mode. Used by internal/creator/clientAttestRoundtrip to +// avoid duplicating the mode-dispatch + filename logic in that file. +// +// Returns the parent daemon dial address as the fourth value (named listenAddr +// for symmetry with Config.ListenAddr — that field doubles as the client's +// dial target since the client never "listens" itself). +func ResolveModeAwareClientCertPaths() (certPath, keyPath, caPath, daemonAddr string, err error) { + return resolveModeAwareCertPaths() +} + +// Compile-time sanity that DefaultListenAddr remains non-empty so the +// fallback branch in resolveModeAwareCertPaths cannot land on "". +var _ = DefaultListenAddr + // Validate performs flag-level sanity checks only. File-system and network // validation is deferred to the server task — this is scaffolding. func (c *Config) Validate() error {