From 7f1c072f52a9aeb702e01ed2148a961fd0956a88 Mon Sep 17 00:00:00 2001 From: Adil Date: Sat, 6 Jun 2026 20:44:08 +0500 Subject: [PATCH 01/13] feat: containerd runtime (nerdctl ExecSet) Add containerd as a third runtime alongside bare and apple. It drives containerd through nerdctl (its docker-compatible CLI), so containers run with no Docker daemon on top. Fits the four-string ExecSet contract today: pre_start nerdctl pull start nerdctl run --name - --init [flags] [cmd] stop nerdctl stop - post_stop nerdctl rm -f - Full spec coverage (env, env-files, volumes, publish, memory, cpus, user, workdir, entrypoint, cmd) and host-mode passthrough, same shape as apple's CLI path. check() gates on nerdctl/containerd reachability, so it errors cleanly off-Linux. A future mode-2 will drive containerd's gRPC API in process; this CLI path is the v1. orchd --runtime containerd grow --- src/runtime/containerd.rs | 221 ++++++++++++++++++++++++++++++++++++++ src/runtime/mod.rs | 4 +- 2 files changed, 224 insertions(+), 1 deletion(-) create mode 100644 src/runtime/containerd.rs diff --git a/src/runtime/containerd.rs b/src/runtime/containerd.rs new file mode 100644 index 0000000..7ff7420 --- /dev/null +++ b/src/runtime/containerd.rs @@ -0,0 +1,221 @@ +//! containerd runtime: an ExecSet over `nerdctl` (containerd's docker-compatible +//! CLI). Linux. +//! +//! nerdctl drives containerd directly, so this is "containerd as the runtime" +//! with no Docker daemon on top. It fits the four-string ExecSet contract today +//! (the pragmatic wiring). A future mode-2 will drive containerd's gRPC API in +//! process; this CLI path is the v1. +//! +//! Container lifecycle, supervised by orchdi/launchd/systemd: +//! pre_start nerdctl pull +//! start nerdctl run --name - --init [flags] [cmd] +//! stop nerdctl stop - +//! 
post_stop nerdctl rm -f - + +use std::process::{Command, Stdio}; + +use crate::config::Config; +use crate::exec::ExecSet; +use crate::runtime::{Runtime, RuntimeError}; +use crate::types::Service; + +pub struct ContainerdRuntime { + namespace: String, + data_dir: std::path::PathBuf, +} + +impl ContainerdRuntime { + pub fn new(config: &Config) -> Self { + ContainerdRuntime { + namespace: config.namespace.clone(), + data_dir: config.data_dir.clone(), + } + } + + fn container_name(&self, service: &Service) -> String { + format!("{}-{}", self.namespace, service.name) + } + + fn require_image<'a>(&self, service: &'a Service) -> Result<&'a str, RuntimeError> { + service.image.as_deref().ok_or_else(|| { + RuntimeError::Other(format!( + "service '{}' is container-mode but has no FROM image", + service.name + )) + }) + } +} + +impl Runtime for ContainerdRuntime { + fn name(&self) -> &str { + "containerd" + } + + fn check(&self) -> Result<(), RuntimeError> { + let status = Command::new("nerdctl") + .arg("version") + .stdout(Stdio::null()) + .stderr(Stdio::null()) + .status() + .map_err(|e| { + RuntimeError::BinaryNotFound(format!("could not spawn 'nerdctl': {e}")) + })?; + if status.success() { + Ok(()) + } else { + Err(RuntimeError::Other( + "containerd not reachable via nerdctl (is containerd running?)".to_string(), + )) + } + } + + fn prepare(&self, service: &Service) -> Result<(), RuntimeError> { + if service.is_host() { + let dir = self.data_dir.join(&service.name); + std::fs::create_dir_all(&dir).map_err(|e| { + RuntimeError::Other(format!( + "failed to create data directory '{}': {e}", + dir.display() + )) + })?; + return Ok(()); + } + // Image pull is deferred to pre_start (no I/O at prepare time). + Ok(()) + } + + fn exec_set(&self, service: &Service) -> Result { + if service.is_host() { + // Host-mode services pass through as plain programs, same as bare. 
+ let start = service.run_command.clone().ok_or_else(|| { + RuntimeError::Other(format!( + "service '{}' is host-mode but has no RUN command", + service.name + )) + })?; + return Ok(ExecSet { + start, + pre_start: None, + stop: service.stop_command.clone(), + post_stop: None, + }); + } + + let image = self.require_image(service)?; + let name = self.container_name(service); + + let pre_start = format!("nerdctl pull {image}"); + + // --init forwards signals and reaps zombies inside the container. + let mut start = format!("nerdctl run --name {name} --init"); + + let mut envs: Vec<(&String, &String)> = service.env.iter().collect(); + envs.sort_by(|a, b| a.0.cmp(b.0)); + for (k, v) in envs { + start.push_str(&format!(" --env {k}={v}")); + } + for ef in &service.env_files { + start.push_str(&format!(" --env-file {ef}")); + } + for vol in &service.volumes { + start.push_str(&format!(" --volume {}:{}", vol.source, vol.destination)); + } + for p in &service.publish { + match &p.address { + Some(addr) => { + start.push_str(&format!(" --publish {addr}:{}:{}", p.host, p.container)) + } + None => start.push_str(&format!(" --publish {}:{}", p.host, p.container)), + } + } + if let Some(mem) = &service.resources.memory { + start.push_str(&format!(" --memory {mem}")); + } + if let Some(cpus) = service.resources.cpus { + if cpus.fract() == 0.0 { + start.push_str(&format!(" --cpus {}", cpus as u64)); + } else { + start.push_str(&format!(" --cpus {cpus}")); + } + } + if let Some(user) = &service.user { + start.push_str(&format!(" --user {user}")); + } + if let Some(wd) = &service.workdir { + start.push_str(&format!(" --workdir {wd}")); + } + if let Some(ep) = &service.entrypoint { + start.push_str(&format!(" --entrypoint {ep}")); + } + start.push_str(&format!(" {image}")); + if let Some(cmd) = &service.cmd { + start.push_str(&format!(" {cmd}")); + } + + Ok(ExecSet { + start, + pre_start: Some(pre_start), + stop: Some(format!("nerdctl stop {name}")), + post_stop: Some(format!("nerdctl 
rm -f {name}")), + }) + } +} + +#[cfg(test)] +#[allow(non_snake_case)] +mod tests { + use super::*; + + fn runtime() -> ContainerdRuntime { + ContainerdRuntime { + namespace: "orch".to_string(), + data_dir: std::env::temp_dir().join("orchd-containerd-test"), + } + } + + #[test] + fn test_exec_set__container_maps_to_nerdctl() { + let rt = runtime(); + let mut svc: Service = serde_json::from_str(STUB_CONTAINER).unwrap(); + svc.name = "web".into(); + svc.image = Some("nginx:alpine".into()); + svc.env.insert("FOO".into(), "bar".into()); + svc.publish = vec![crate::types::PortMapping { + address: None, + host: 8080, + container: 80, + }]; + svc.resources.memory = Some("512M".into()); + + let exec = rt.exec_set(&svc).expect("exec_set"); + assert_eq!(exec.pre_start.as_deref(), Some("nerdctl pull nginx:alpine")); + assert!(exec.start.starts_with("nerdctl run --name orch-web --init")); + assert!(exec.start.contains(" --env FOO=bar")); + assert!(exec.start.contains(" --publish 8080:80")); + assert!(exec.start.contains(" --memory 512M")); + assert!(exec.start.ends_with(" nginx:alpine")); + assert_eq!(exec.stop.as_deref(), Some("nerdctl stop orch-web")); + assert_eq!(exec.post_stop.as_deref(), Some("nerdctl rm -f orch-web")); + } + + #[test] + fn test_exec_set__host_passthrough() { + let rt = runtime(); + let mut svc: Service = serde_json::from_str(STUB_HOST).unwrap(); + svc.run_command = Some("/usr/bin/redis-server".into()); + let exec = rt.exec_set(&svc).expect("host passthrough"); + assert_eq!(exec.start, "/usr/bin/redis-server"); + assert!(exec.pre_start.is_none()); + } + + const STUB_CONTAINER: &str = r#"{ + "name":"x","mode":"container","image":"x", + "oneshot":false,"disabled":false,"recreate":"never", + "restart":{"policy":"no"},"timeouts":{},"resources":{},"logging":{} + }"#; + const STUB_HOST: &str = r#"{ + "name":"x","mode":"host","run_command":"x", + "oneshot":false,"disabled":false,"recreate":"never", + 
"restart":{"policy":"no"},"timeouts":{},"resources":{},"logging":{} + }"#; +} diff --git a/src/runtime/mod.rs b/src/runtime/mod.rs index 9158261..4373b45 100644 --- a/src/runtime/mod.rs +++ b/src/runtime/mod.rs @@ -1,5 +1,6 @@ pub mod apple; pub mod bare; +pub mod containerd; use crate::config::Config; use crate::exec::ExecSet; @@ -45,8 +46,9 @@ pub fn create_runtime(name: &str, config: &Config) -> Result, R match name { "bare" => Ok(Box::new(bare::BareRuntime::new(config.data_dir.clone()))), "apple" => Ok(Box::new(apple::AppleRuntime::new(config))), + "containerd" => Ok(Box::new(containerd::ContainerdRuntime::new(config))), _ => Err(RuntimeError::Other(format!( - "unknown runtime '{}'. Available: bare, apple", + "unknown runtime '{}'. Available: bare, apple, containerd", name ))), } From 63a7e143a4a57aea7917b0c3bf4cd85c52113a0c Mon Sep 17 00:00:00 2001 From: Adil Date: Sat, 6 Jun 2026 21:12:05 +0500 Subject: [PATCH 02/13] build: cross-compile a static aarch64-linux orchd (just build-linux) First ingredient of the containerd dogfood harness: a Linux orchd to run inside an orchd-osx VM. cargo-zigbuild + rustup musl target produce a 1.4M statically-linked ELF with no runtime deps, droppable into any Linux guest. --- justfile | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/justfile b/justfile index 006170e..26e14a2 100644 --- a/justfile +++ b/justfile @@ -11,6 +11,7 @@ prefix := env_var_or_default("PREFIX", "/usr/local") kernel := env_var_or_default("HOME", "") / ".orch/osx/kernel/vmlinux" kata_ver := "3.31.0" opt := "ReleaseSafe" +linux_tgt := "aarch64-unknown-linux-musl" # List recipes. default: @@ -26,6 +27,19 @@ build: build-orchd: cargo build --release +# Cross-compile a static aarch64-linux orchd: drops into any Linux VM with no +# runtime deps. Feeds the dogfood harness (orchd-osx boots a Linux guest that +# runs this orchd against containerd). 
On a Linux host, `just build-orchd` is +# already a Linux build; this recipe cross-builds from macOS. Needs rustup +# (`brew install rustup`) and cargo-zigbuild (`cargo install cargo-zigbuild`). +build-linux: + #!/usr/bin/env bash + set -euo pipefail + export PATH="/opt/homebrew/opt/rustup/bin:$PATH" + rustup target add {{linux_tgt}} >/dev/null 2>&1 || true + cargo zigbuild --target {{linux_tgt}} --release + echo "linux orchd -> target/{{linux_tgt}}/release/orchd ($(file -b target/{{linux_tgt}}/release/orchd | cut -d, -f1-2))" + [macos] build-apple: cd orchd-apple && zig build -Doptimize={{opt}} From 89723057a9fde3c7c4eff8ab507d87a8a52b28ee Mon Sep 17 00:00:00 2001 From: Adil Date: Sat, 6 Jun 2026 21:14:15 +0500 Subject: [PATCH 03/13] feat(orchd-osx): boot a local OCI image layout offline resolve() now routes a reference that names a local path (starts with '/' or '.') to the existing unpackLayout(), skipping the registry pull. This is the harness enabler: a locally-assembled image (containerd + runc + orchd) boots with no registry push. orchd-osx run /path/to/oci-layout --- orchd-osx/src/oci.zig | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/orchd-osx/src/oci.zig b/orchd-osx/src/oci.zig index 10ca9c1..8e6923b 100644 --- a/orchd-osx/src/oci.zig +++ b/orchd-osx/src/oci.zig @@ -585,6 +585,14 @@ pub fn resolve( work_dir: []const u8, reference: []const u8, ) !Image { + // Offline path: a reference naming a local OCI image layout on disk (starts + // with '/' or '.', which no registry reference ever does) is unpacked + // directly, no network. This is how the dogfood harness boots a locally + // assembled image (containerd + runc + orchd) with no registry push. 
+ if (reference.len > 0 and (reference[0] == '/' or reference[0] == '.')) { + return unpackLayout(allocator, io, reference, work_dir); + } + const ref = try parseReference(allocator, reference); defer ref.deinit(allocator); From 75f40be325f392a82eb506a612723a78f295e07e Mon Sep 17 00:00:00 2001 From: Adil Date: Sat, 6 Jun 2026 21:32:50 +0500 Subject: [PATCH 04/13] fix(orchd-osx): follow nested image-index in local OCI layouts container image save emits index.json -> image-index -> arm64 manifest. unpackLayout picked the first index.json entry and parsed it as a manifest, failing on the nested index. Now it follows nested indexes (bounded depth) until it reaches a manifest with layers. This is what lets orchd-osx boot an image built by the container CLI and exported to a local OCI layout. --- orchd-osx/src/oci.zig | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/orchd-osx/src/oci.zig b/orchd-osx/src/oci.zig index 8e6923b..7af4b40 100644 --- a/orchd-osx/src/oci.zig +++ b/orchd-osx/src/oci.zig @@ -514,8 +514,32 @@ pub fn unpackLayout( const chosen_digest = pickLayoutManifest(index.value.manifests, "arm64", "linux") orelse return Error.NoMatchingPlatform; - // 2. image manifest -> config + layers. - const manifest_body = try readBlob(allocator, io, oci_layout_dir, chosen_digest); + // 2. Resolve to a concrete image manifest. The chosen descriptor may be an + // image manifest directly, or a nested image-index pointing at per-platform + // manifests (what `container image save` emits: index.json -> image-index -> + // arm64 manifest). Follow nested indexes until we reach a manifest with + // layers. 
+ var manifest_body = try readBlob(allocator, io, oci_layout_dir, chosen_digest); + var nest_guard: u8 = 0; + while (looksLikeIndex(manifest_body)) { + nest_guard += 1; + if (nest_guard > 8) { + allocator.free(manifest_body); + return Error.ManifestError; + } + const nested = std.json.parseFromSlice(LayoutIndexDoc, allocator, manifest_body, .{ .ignore_unknown_fields = true }) catch { + allocator.free(manifest_body); + return Error.ManifestError; + }; + defer nested.deinit(); + const next_digest = pickLayoutManifest(nested.value.manifests, "arm64", "linux") orelse { + allocator.free(manifest_body); + return Error.NoMatchingPlatform; + }; + const next_body = try readBlob(allocator, io, oci_layout_dir, next_digest); + allocator.free(manifest_body); + manifest_body = next_body; + } defer allocator.free(manifest_body); const manifest = try std.json.parseFromSlice(ManifestDoc, allocator, manifest_body, .{ .ignore_unknown_fields = true }); defer manifest.deinit(); From 2c820b5c12721ab6d834ca647f97485c5b12f6d0 Mon Sep 17 00:00:00 2001 From: Adil Date: Sat, 6 Jun 2026 23:11:45 +0500 Subject: [PATCH 05/13] =?UTF-8?q?examples:=20inception=20=E2=80=94=20orchd?= =?UTF-8?q?=20running=20containerd=20inside=20a=20VM=20orchd=20booted?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A nested composability + robustness example: orchd --runtime apple (osx mode) boots a Linux microVM, sized and mounted from the Orchfile spec, and inside it orchd --runtime containerd drives containerd to run a container. orchd orchestrating orchd, two runtimes deep. It only runs if the full spec is honored end to end (the VM's memory/cpus come from the Orchfile; the containerd toolchain is mounted as a volume rather than baked into an image that could never fit the in-RAM initramfs), so it doubles as the spec-alignment proof. setup.sh stages the ~600MB toolchain (not committed); needs ~3 GiB free RAM to boot the VM. 
--- examples/inception/.gitignore | 4 +++ examples/inception/Orchfile | 13 +++++++ examples/inception/README.md | 56 +++++++++++++++++++++++++++++++ examples/inception/inner-Orchfile | 5 +++ examples/inception/run-test.sh | 45 +++++++++++++++++++++++++ examples/inception/setup.sh | 42 +++++++++++++++++++++++ 6 files changed, 165 insertions(+) create mode 100644 examples/inception/.gitignore create mode 100644 examples/inception/Orchfile create mode 100644 examples/inception/README.md create mode 100644 examples/inception/inner-Orchfile create mode 100755 examples/inception/run-test.sh create mode 100755 examples/inception/setup.sh diff --git a/examples/inception/.gitignore b/examples/inception/.gitignore new file mode 100644 index 0000000..272eecd --- /dev/null +++ b/examples/inception/.gitignore @@ -0,0 +1,4 @@ +# Fetched/generated by setup.sh, not committed. +tools/ +Orchfile.run +state/ diff --git a/examples/inception/Orchfile b/examples/inception/Orchfile new file mode 100644 index 0000000..41b3159 --- /dev/null +++ b/examples/inception/Orchfile @@ -0,0 +1,13 @@ +# Inception: orchd boots a Linux VM with its own apple runtime (osx mode), sizes +# it from this spec, mounts the containerd toolchain in, and runs the nested +# containerd test inside it. See README.md. +# +# setup.sh rewrites __TOOLS__ to the absolute path of ./tools and writes the +# runnable copy as Orchfile.run. + +SERVICE ctd +FROM docker.io/library/debian:bookworm-slim +MEMORY 3G +CPUS 3 +VOLUME __TOOLS__:/opt/tools +CMD /bin/sh /opt/tools/run-test.sh diff --git a/examples/inception/README.md b/examples/inception/README.md new file mode 100644 index 0000000..81a7cda --- /dev/null +++ b/examples/inception/README.md @@ -0,0 +1,56 @@ +# Inception: orchd running containerd, inside a container orchd booted + +A composability test, and the most demanding one in this repo. 
It exercises +**both** of orchd's runtimes at once, nested: + +``` + macOS host + └─ orchd --runtime apple (osx mode) <- boots a Linux microVM, no daemon + └─ Debian VM (sized + mounted from the Orchfile spec) + └─ orchd --runtime containerd <- drives containerd inside the VM + └─ containerd -> a container (alpine) +``` + +orchd orchestrates orchd, two runtimes deep, in a box orchd created. If the +spec isn't honored end to end, this doesn't run: the outer VM needs the +**memory** and **cpus** from the Orchfile, and the containerd toolchain is +**mounted** in as a volume (not baked into an image, which would never fit the +in-RAM initramfs). So this doubles as the proof that orchd-osx honors the full +service spec (memory / cpus / volumes). + +## What's here + +| file | role | +|------|------| +| `Orchfile` | the **outer** unit: boot a Debian VM, sized, with the toolchain mounted, running the driver | +| `run-test.sh` | runs **inside** the VM: starts containerd, then has the inner orchd drive it | +| `inner-Orchfile` | the **inner** workload the containerd runtime runs (an alpine container) | +| `setup.sh` | stages `tools/` (builds the Linux orchd, fetches the containerd toolchain) and writes a runnable `Orchfile.run` | + +`tools/` (the ~600 MB containerd toolchain + the Linux orchd) is fetched by +`setup.sh`, not committed. + +## Run it + +```sh +cd examples/inception +./setup.sh # builds the linux orchd, fetches nerdctl-full, stages tools/ +ORCHD_APPLE_MODE=osx \ + orchd --orchfile Orchfile.run --runtime apple --platform orchdi \ + --state-dir ./state grow +# watch the nested test: +tail -f ./state/logs/orch.ctd.log +``` + +You should see, from inside the VM: containerd come up, then +`orchd --runtime containerd grow` pull and run the inner alpine container, +and `nerdctl ps` list it. + +## Requirements + +- macOS on Apple silicon, the orchd-osx runtime built + signed (`just build-osx`) + and the kernel fetched (`just kernel`). 
+- **~3 GiB of free RAM.** The Orchfile asks for a 3 GiB / 3 cpu VM; containerd + plus a nested container needs the room. On an 8 GiB machine, close other VMs + and memory-heavy apps first (`colima stop`, `container system stop`, browsers) + or the VM start fails with `BootFailed` (the host simply can't spare the RAM). diff --git a/examples/inception/inner-Orchfile b/examples/inception/inner-Orchfile new file mode 100644 index 0000000..e935f3c --- /dev/null +++ b/examples/inception/inner-Orchfile @@ -0,0 +1,5 @@ +# Inner workload: what orchd-inside-the-VM drives through containerd. +# orchd's containerd runtime turns this into nerdctl pull + nerdctl run. +SERVICE web +FROM docker.io/library/alpine:latest +CMD sleep 300 diff --git a/examples/inception/run-test.sh b/examples/inception/run-test.sh new file mode 100755 index 0000000..a4abc6f --- /dev/null +++ b/examples/inception/run-test.sh @@ -0,0 +1,45 @@ +#!/bin/sh +# Runs INSIDE the orchd-osx VM (Debian). Brings up containerd from the mounted +# nerdctl-full toolchain, then has our Linux orchd drive it via the containerd +# runtime. Verbose so the detached supervisor's logfile tells the whole story. +set -u +log(){ echo "[inception] $*"; } +export PATH=/opt/tools/bin:$PATH + +log "STAGE 0: $(uname -m); cgroup=$(stat -fc %T /sys/fs/cgroup 2>/dev/null); ip=$(ip -4 addr show 2>/dev/null | awk '/inet /{print $2}' | grep -v 127 | head -1)" + +# nerdctl looks for CNI plugins at /opt/cni/bin by default; point it at the +# mounted plugins so container networking works. +mkdir -p /opt/cni +ln -sf /opt/tools/libexec/cni /opt/cni/bin + +# nerdctl's default bridge needs iptables; the toolchain has none and the VM has +# outbound network, so pull it at runtime. +if ! command -v iptables >/dev/null 2>&1; then + log "installing iptables..." 
+ if apt-get update -qq >/dev/null 2>&1 && apt-get install -y -qq iptables >/dev/null 2>&1; then + log "iptables ok" + else + log "iptables install FAILED (container networking may not work)" + fi +fi + +log "STAGE 1: start containerd" +mkdir -p /run/containerd /var/lib/containerd +containerd >/var/log/containerd.log 2>&1 & +for i in $(seq 1 20); do ctr version >/dev/null 2>&1 && break; sleep 1; done +if ! ctr version >/dev/null 2>&1; then + log "containerd did NOT come up:"; tail -25 /var/log/containerd.log; exit 1 +fi +log "containerd up: $(ctr --version 2>/dev/null)" + +log "STAGE 2: orchd drives containerd (the runtime under test)" +mkdir -p /run/orchd +orchd --orchfile /opt/tools/inner-Orchfile --runtime containerd --platform orchdi --state-dir /run/orchd grow +log "orchd grow rc=$?" +sleep 8 +log "--- orchd survey ---"; orchd --platform orchdi --state-dir /run/orchd survey +log "--- nerdctl ps -a (what orchd started via containerd) ---"; nerdctl ps -a 2>&1 +log "--- supervisor logs ---"; tail -25 /run/orchd/logs/*.log 2>/dev/null +log "=== DONE ===" +sleep 5 diff --git a/examples/inception/setup.sh b/examples/inception/setup.sh new file mode 100755 index 0000000..a26a377 --- /dev/null +++ b/examples/inception/setup.sh @@ -0,0 +1,42 @@ +#!/usr/bin/env bash +# Stage the inception example: build the static Linux orchd, fetch the +# containerd toolchain (nerdctl-full: containerd + runc + cni + nerdctl), and +# lay out tools/ exactly as the Orchfile mounts it. Idempotent. +set -euo pipefail + +here="$(cd "$(dirname "$0")" && pwd)" +repo="$(cd "$here/../.." && pwd)" +tools="$here/tools" + +mkdir -p "$tools/bin" + +echo "==> building static aarch64-linux orchd" +( cd "$repo" && just build-linux >/dev/null ) +cp "$repo/target/aarch64-unknown-linux-musl/release/orchd" "$tools/bin/orchd" + +echo "==> fetching nerdctl-full (containerd + runc + cni + nerdctl)" +if [ ! 
-e "$tools/bin/containerd" ]; then + url="$(gh api repos/containerd/nerdctl/releases/latest \ + --jq '.assets[] | select(.name | test("nerdctl-full-.*-linux-arm64.tar.gz$")) | .browser_download_url' 2>/dev/null \ + || curl -fsSL https://api.github.com/repos/containerd/nerdctl/releases/latest \ + | grep -o 'https://[^"]*nerdctl-full-[^"]*-linux-arm64.tar.gz' | head -1)" + echo " $url" + curl -fsSL "$url" | tar -xz -C "$tools" +fi + +echo "==> copying the in-VM driver + inner workload into tools/" +cp "$here/run-test.sh" "$tools/run-test.sh" +cp "$here/inner-Orchfile" "$tools/inner-Orchfile" + +echo "==> writing runnable Orchfile.run (absolute volume path)" +sed "s|__TOOLS__|$tools|" "$here/Orchfile" > "$here/Orchfile.run" + +cat < $tools ($(du -sh "$tools" | cut -f1)) + +Run it: + ORCHD_APPLE_MODE=osx orchd --orchfile "$here/Orchfile.run" \\ + --runtime apple --platform orchdi --state-dir "$here/state" grow + tail -f "$here/state/logs/orch.ctd.log" +EOF From b7626f8ab3f8ae011c2d59fb11e269f2241f7fda Mon Sep 17 00:00:00 2001 From: Adil Date: Sat, 6 Jun 2026 23:30:59 +0500 Subject: [PATCH 06/13] examples/inception: stage a CA bundle so the in-VM registry pull works debian-slim ships no CA bundle; containerd's TLS could not verify the registry (x509: unknown authority). setup.sh now stages the host CA bundle into tools/, and run-test.sh points both Go (SSL_CERT_FILE) and apt/openssl (/etc/ssl/certs) at it. The alpine pull now completes; the remaining gap is nerdctl's CNI bridge requiring iptables (the case for driving containerd directly instead). 
--- examples/inception/run-test.sh | 8 ++++++++ examples/inception/setup.sh | 7 +++++++ 2 files changed, 15 insertions(+) diff --git a/examples/inception/run-test.sh b/examples/inception/run-test.sh index a4abc6f..74743f8 100755 --- a/examples/inception/run-test.sh +++ b/examples/inception/run-test.sh @@ -5,9 +5,17 @@ set -u log(){ echo "[inception] $*"; } export PATH=/opt/tools/bin:$PATH +# debian-slim ships no CA bundle, so containerd's TLS can't verify the registry. +# Point Go's TLS at the host CA bundle we mounted in. +export SSL_CERT_FILE=/opt/tools/ca-bundle.crt log "STAGE 0: $(uname -m); cgroup=$(stat -fc %T /sys/fs/cgroup 2>/dev/null); ip=$(ip -4 addr show 2>/dev/null | awk '/inet /{print $2}' | grep -v 127 | head -1)" +# Put the CA bundle where apt/openssl look too (not just Go's SSL_CERT_FILE), +# so apt-get over https works to install iptables below. +mkdir -p /etc/ssl/certs +cp /opt/tools/ca-bundle.crt /etc/ssl/certs/ca-certificates.crt + # nerdctl looks for CNI plugins at /opt/cni/bin by default; point it at the # mounted plugins so container networking works. 
mkdir -p /opt/cni diff --git a/examples/inception/setup.sh b/examples/inception/setup.sh index a26a377..094a977 100755 --- a/examples/inception/setup.sh +++ b/examples/inception/setup.sh @@ -28,6 +28,13 @@ echo "==> copying the in-VM driver + inner workload into tools/" cp "$here/run-test.sh" "$tools/run-test.sh" cp "$here/inner-Orchfile" "$tools/inner-Orchfile" +echo "==> staging a CA bundle (debian-slim has none; containerd needs it for registry TLS)" +if [ -f /etc/ssl/cert.pem ]; then + cp /etc/ssl/cert.pem "$tools/ca-bundle.crt" +else + curl -fsSL https://curl.se/ca/cacert.pem -o "$tools/ca-bundle.crt" +fi + echo "==> writing runnable Orchfile.run (absolute volume path)" sed "s|__TOOLS__|$tools|" "$here/Orchfile" > "$here/Orchfile.run" From 032afd0521c48c831571c9dfbf23a7c3d1d2a8f3 Mon Sep 17 00:00:00 2001 From: Adil Date: Sun, 7 Jun 2026 00:11:29 +0500 Subject: [PATCH 07/13] =?UTF-8?q?feat:=20containerd=20runtime=20mode-2=20?= =?UTF-8?q?=E2=80=94=20drive=20containerd's=20gRPC=20API=20in=20process?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the nerdctl ExecSet with an in-process containerd client. The runtime now emits a single stateless foreground command, `orchd containerd-run --spec `, which talks to containerd's gRPC socket directly: Transfer-service pull -> chainID snapshot -> create container -> create/start task (host netns, no CNI/iptables) -> wait -> SIGTERM kill+delete. No nerdctl, no ctr, no Docker. - src/runtime/containerd/{mod.rs,run.rs}: runtime + the containerd-run leaf, moved into a folder; apple.rs likewise moved to apple/mod.rs. - Feature-gated behind `containerd` (containerd-client/tonic/tokio + host protoc), so the default build stays lean (verified: zero heavy deps pulled). - no_pivot_root=true in the runc options: orchd-osx boots the VM as an initramfs where pivot_root fails; runc uses MS_MOVE+chroot instead. - just build-linux now builds --features containerd. 
Validated end to end via examples/inception: orchd boots a Debian VM (apple-osx), containerd runs inside, orchd drives it over gRPC, alpine task RUNNING (ctr confirms PID). The iptables wall is gone (host netns). --- Cargo.lock | 1010 ++++++++++++++++++++++++ Cargo.toml | 16 + examples/inception/run-test.sh | 45 +- justfile | 14 +- src/cli.rs | 8 + src/main.rs | 6 + src/runtime/{apple.rs => apple/mod.rs} | 0 src/runtime/containerd.rs | 221 ------ src/runtime/containerd/mod.rs | 229 ++++++ src/runtime/containerd/run.rs | 551 +++++++++++++ 10 files changed, 1844 insertions(+), 256 deletions(-) rename src/runtime/{apple.rs => apple/mod.rs} (100%) delete mode 100644 src/runtime/containerd.rs create mode 100644 src/runtime/containerd/mod.rs create mode 100644 src/runtime/containerd/run.rs diff --git a/Cargo.lock b/Cargo.lock index 86960d4..8852e77 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,15 @@ # It is not intended for manual editing. version = 4 +[[package]] +name = "aho-corasick" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301" +dependencies = [ + "memchr", +] + [[package]] name = "anstream" version = "0.6.21" @@ -52,6 +61,62 @@ dependencies = [ "windows-sys", ] +[[package]] +name = "anyhow" +version = "1.0.102" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" + +[[package]] +name = "async-trait" +version = "0.1.89" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "atomic-waker" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" + +[[package]] +name = "base64" 
+version = "0.22.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" + +[[package]] +name = "bitflags" +version = "2.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4388bee8683e3d04af747c73422af53102d2bd24d9eadb6cbc100baef4b43f8" + +[[package]] +name = "block-buffer" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" +dependencies = [ + "generic-array", +] + +[[package]] +name = "bytes" +version = "1.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33" + +[[package]] +name = "cfg-if" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" + [[package]] name = "clap" version = "4.5.60" @@ -98,36 +163,390 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" +[[package]] +name = "containerd-client" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "814eedf2860b6df6e8002f917a0fbabf53bace3d3d9d2c2022661ae55a6ab6e4" +dependencies = [ + "hyper-util", + "prost", + "prost-types", + "tokio", + "tonic", + "tonic-prost", + "tonic-prost-build", + "tower", +] + +[[package]] +name = "cpufeatures" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" +dependencies = [ + "libc", +] + +[[package]] +name = "crypto-common" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a" +dependencies = [ + "generic-array", + "typenum", +] + +[[package]] +name = "digest" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +dependencies = [ + "block-buffer", + "crypto-common", +] + +[[package]] +name = "either" +version = "1.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91622ff5e7162018101f2fea40d6ebf4a78bbe5a49736a2020649edf9693679e" + +[[package]] +name = "equivalent" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" + +[[package]] +name = "errno" +version = "0.3.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" +dependencies = [ + "libc", + "windows-sys", +] + +[[package]] +name = "fastrand" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6" + +[[package]] +name = "fixedbitset" +version = "0.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d674e81391d1e1ab681a28d99df07927c6d4aa5b027d7da16ba32d1d21ecd99" + +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + +[[package]] +name = "foldhash" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" + +[[package]] +name = "futures-channel" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"07bbe89c50d7a535e539b8c17bc0b49bdb77747034daa8087407d655f3f7cc1d" +dependencies = [ + "futures-core", +] + +[[package]] +name = "futures-core" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e3450815272ef58cec6d564423f6e755e25379b217b0bc688e295ba24df6b1d" + +[[package]] +name = "futures-sink" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c39754e157331b013978ec91992bde1ac089843443c49cbc7f46150b0fad0893" + +[[package]] +name = "futures-task" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "037711b3d59c33004d3856fbdc83b99d4ff37a24768fa1be9ce3538a1cde4393" + +[[package]] +name = "futures-util" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "389ca41296e6190b48053de0321d02a77f32f8a5d2461dd38762c0593805c6d6" +dependencies = [ + "futures-core", + "futures-task", + "pin-project-lite", + "slab", +] + +[[package]] +name = "generic-array" +version = "0.14.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +dependencies = [ + "typenum", + "version_check", +] + +[[package]] +name = "getrandom" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555" +dependencies = [ + "cfg-if", + "libc", + "r-efi", + "wasip2", + "wasip3", +] + +[[package]] +name = "h2" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "171fefbc92fe4a4de27e0698d6a5b392d6a0e333506bc49133760b3bcf948733" +dependencies = [ + "atomic-waker", + "bytes", + "fnv", + "futures-core", + "futures-sink", + "http", + "indexmap", + "slab", + "tokio", + "tokio-util", + "tracing", +] + +[[package]] +name = "hashbrown" +version = "0.15.5" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" +dependencies = [ + "foldhash", +] + +[[package]] +name = "hashbrown" +version = "0.17.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed5909b6e89a2db4456e54cd5f673791d7eca6732202bbf2a9cc504fe2f9b84a" + [[package]] name = "heck" version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" +[[package]] +name = "hex" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" + +[[package]] +name = "http" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8be7462df143984c4598a256ef469b251d7d7f9e271135073e78fc535414f3d0" +dependencies = [ + "bytes", + "itoa", +] + +[[package]] +name = "http-body" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184" +dependencies = [ + "bytes", + "http", +] + +[[package]] +name = "http-body-util" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b021d93e26becf5dc7e1b75b1bed1fd93124b374ceb73f43d4d4eafec896a64a" +dependencies = [ + "bytes", + "futures-core", + "http", + "http-body", + "pin-project-lite", +] + +[[package]] +name = "httparse" +version = "1.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87" + +[[package]] +name = "hyper" +version = "1.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55281c53a1894c864990125767da440a4e630446785086f52523b20033b74498" +dependencies = [ + "atomic-waker", + "bytes", + "futures-channel", + 
"futures-core", + "h2", + "http", + "http-body", + "httparse", + "itoa", + "pin-project-lite", + "smallvec", + "tokio", + "want", +] + +[[package]] +name = "hyper-timeout" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b90d566bffbce6a75bd8b09a05aa8c2cb1fabb6cb348f8840c9e4c90a0d83b0" +dependencies = [ + "hyper", + "hyper-util", + "pin-project-lite", + "tokio", + "tower-service", +] + +[[package]] +name = "hyper-util" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96547c2556ec9d12fb1578c4eaf448b04993e7fb79cbaad930a656880a6bdfa0" +dependencies = [ + "bytes", + "futures-channel", + "futures-util", + "http", + "http-body", + "hyper", + "libc", + "pin-project-lite", + "socket2", + "tokio", + "tower-service", + "tracing", +] + +[[package]] +name = "id-arena" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954" + +[[package]] +name = "indexmap" +version = "2.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9" +dependencies = [ + "equivalent", + "hashbrown 0.17.1", + "serde", + "serde_core", +] + [[package]] name = "is_terminal_polyfill" version = "1.70.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" +[[package]] +name = "itertools" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "1.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2" +[[package]] +name = "leb128fmt" +version = "0.1.0" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" + [[package]] name = "libc" version = "0.2.186" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66" +[[package]] +name = "linux-raw-sys" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53" + +[[package]] +name = "log" +version = "0.4.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "953f07c43838f8e6f9758cab68bf5bed85465e7587ebe0b823f1bcd81978ad3a" + [[package]] name = "memchr" version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" +[[package]] +name = "mio" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02bd0af71c67b473010cbbc60715ee815645a4dc942899111f494b4b737d6fda" +dependencies = [ + "libc", + "wasi", + "windows-sys", +] + +[[package]] +name = "multimap" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d87ecb2933e8aeadb3e3a02b828fed80a7528047e68b4f424523a0981a3a084" + +[[package]] +name = "once_cell" +version = "1.21.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50" + [[package]] name = "once_cell_polyfill" version = "1.70.2" @@ -147,12 +566,73 @@ dependencies = [ name = "orchd" version = "0.3.1" dependencies = [ + "anyhow", + "base64", "clap", + "containerd-client", + "hex", "libc", "orch", + "prost-types", "serde", "serde_json", + "sha2", "thiserror", + "tokio", + "tonic", +] + +[[package]] +name = "percent-encoding" +version = "2.3.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220" + +[[package]] +name = "petgraph" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8701b58ea97060d5e5b155d383a69952a60943f0e6dfe30b04c287beb0b27455" +dependencies = [ + "fixedbitset", + "hashbrown 0.15.5", + "indexmap", +] + +[[package]] +name = "pin-project" +version = "1.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2466b2336ed02bcdca6b294417127b90ec92038d1d5c4fbeac971a922e0e0924" +dependencies = [ + "pin-project-internal", +] + +[[package]] +name = "pin-project-internal" +version = "1.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c96395f0a926bc13b1c17622aaddda1ecb55d49c8f1bf9777e4d877800a43f8b" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "pin-project-lite" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd" + +[[package]] +name = "prettyplease" +version = "0.2.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" +dependencies = [ + "proc-macro2", + "syn", ] [[package]] @@ -164,6 +644,57 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "prost" +version = "0.14.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2ea70524a2f82d518bce41317d0fae74151505651af45faf1ffbd6fd33f0568" +dependencies = [ + "bytes", + "prost-derive", +] + +[[package]] +name = "prost-build" +version = "0.14.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "343d3bd7056eda839b03204e68deff7d1b13aba7af2b2fd16890697274262ee7" +dependencies = [ + "heck", + "itertools", + "log", + "multimap", + "petgraph", + "prettyplease", + 
"prost", + "prost-types", + "regex", + "syn", + "tempfile", +] + +[[package]] +name = "prost-derive" +version = "0.14.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "27c6023962132f4b30eb4c172c91ce92d933da334c59c23cddee82358ddafb0b" +dependencies = [ + "anyhow", + "itertools", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "prost-types" +version = "0.14.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8991c4cbdb8bc5b11f0b074ffe286c30e523de90fee5ba8132f1399f23cb3dd7" +dependencies = [ + "prost", +] + [[package]] name = "quote" version = "1.0.45" @@ -173,6 +704,60 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "r-efi" +version = "6.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf" + +[[package]] +name = "regex" +version = "1.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" + +[[package]] +name = "rustix" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190" +dependencies = [ + "bitflags", + "errno", + "libc", + "linux-raw-sys", + "windows-sys", +] + +[[package]] +name = "semver" +version = "1.0.28" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a7852d02fc848982e0c167ef163aaff9cd91dc640ba85e263cb1ce46fae51cd" + [[package]] name = "serde" version = "1.0.228" @@ -216,6 +801,49 @@ dependencies = [ "zmij", ] +[[package]] +name = "sha2" +version = "0.10.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + +[[package]] +name = "signal-hook-registry" +version = "1.4.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4db69cba1110affc0e9f7bcd48bbf87b3f4fc7c61fc9155afd4c469eb3d6c1b" +dependencies = [ + "errno", + "libc", +] + +[[package]] +name = "slab" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5" + +[[package]] +name = "smallvec" +version = "1.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" + +[[package]] +name = "socket2" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52d1cfed4120b4d927bf7c0f86d2087a4a7d6027c906d9f9d525a80573b9be51" +dependencies = [ + "libc", + "windows-sys", +] + [[package]] name = "strsim" version = "0.11.1" @@ -233,6 +861,25 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "sync_wrapper" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bf256ce5efdfa370213c1dabab5935a12e49f2c58d15e9eac2870d3b4f27263" + +[[package]] +name = "tempfile" +version = "3.27.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd" +dependencies = [ + "fastrand", + "getrandom", + "once_cell", + "rustix", + "windows-sys", +] + [[package]] name = 
"thiserror" version = "2.0.18" @@ -253,18 +900,287 @@ dependencies = [ "syn", ] +[[package]] +name = "tokio" +version = "1.52.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fc7f01b389ac15039e4dc9531aa973a135d7a4135281b12d7c1bc79fd57fffe" +dependencies = [ + "bytes", + "libc", + "mio", + "pin-project-lite", + "signal-hook-registry", + "socket2", + "tokio-macros", + "windows-sys", +] + +[[package]] +name = "tokio-macros" +version = "2.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "385a6cb71ab9ab790c5fe8d67f1645e6c450a7ce006a33de03daa956cf70a496" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tokio-stream" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32da49809aab5c3bc678af03902d4ccddea2a87d028d86392a4b1560c6906c70" +dependencies = [ + "futures-core", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "tokio-util" +version = "0.7.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ae9cec805b01e8fc3fd2fe289f89149a9b66dd16786abd8b19cfa7b48cb0098" +dependencies = [ + "bytes", + "futures-core", + "futures-sink", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "tonic" +version = "0.14.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac2a5518c70fa84342385732db33fb3f44bc4cc748936eb5833d2df34d6445ef" +dependencies = [ + "async-trait", + "base64", + "bytes", + "http", + "http-body", + "http-body-util", + "hyper", + "hyper-timeout", + "hyper-util", + "percent-encoding", + "pin-project", + "sync_wrapper", + "tokio", + "tokio-stream", + "tower", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "tonic-build" +version = "0.14.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c68f61875ac5293cf72e6c8cf0158086428c82c37229e98c840878f1706b0322" +dependencies = [ + "prettyplease", + 
"proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tonic-prost" +version = "0.14.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50849f68853be452acf590cde0b146665b8d507b3b8af17261df47e02c209ea0" +dependencies = [ + "bytes", + "prost", + "tonic", +] + +[[package]] +name = "tonic-prost-build" +version = "0.14.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "654e5643eff75d7f8c99197ce1440ed19a3474eada74c12bbac488b2cafdae27" +dependencies = [ + "prettyplease", + "proc-macro2", + "prost-build", + "prost-types", + "quote", + "syn", + "tempfile", + "tonic-build", +] + +[[package]] +name = "tower" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebe5ef63511595f1344e2d5cfa636d973292adc0eec1f0ad45fae9f0851ab1d4" +dependencies = [ + "futures-core", + "futures-util", + "indexmap", + "pin-project-lite", + "slab", + "sync_wrapper", + "tokio", + "tokio-util", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "tower-layer" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e" + +[[package]] +name = "tower-service" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3" + +[[package]] +name = "tracing" +version = "0.1.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "63e71662fa4b2a2c3a26f570f037eb95bb1f85397f3cd8076caed2f026a6d100" +dependencies = [ + "pin-project-lite", + "tracing-attributes", + "tracing-core", +] + +[[package]] +name = "tracing-attributes" +version = "0.1.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + 
+[[package]] +name = "tracing-core" +version = "0.1.36" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db97caf9d906fbde555dd62fa95ddba9eecfd14cb388e4f491a66d74cd5fb79a" +dependencies = [ + "once_cell", +] + +[[package]] +name = "try-lock" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" + +[[package]] +name = "typenum" +version = "1.20.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6f5e870be6c3b371b77fe0ee0bafb859fa4964b4404c27de1d380043c4dda20" + [[package]] name = "unicode-ident" version = "1.0.24" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" +[[package]] +name = "unicode-xid" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" + [[package]] name = "utf8parse" version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + +[[package]] +name = "want" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa7760aed19e106de2c7c0b581b509f2f25d3dacaf737cb82ac61bc6d760b0e" +dependencies = [ + "try-lock", +] + +[[package]] +name = "wasi" +version = "0.11.1+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" + +[[package]] +name = "wasip2" +version = "1.0.3+wasi-0.2.9" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "20064672db26d7cdc89c7798c48a0fdfac8213434a1186e5ef29fd560ae223d6" +dependencies = [ + "wit-bindgen 0.57.1", +] + +[[package]] +name = "wasip3" +version = "0.4.0+wasi-0.3.0-rc-2026-01-06" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5" +dependencies = [ + "wit-bindgen 0.51.0", +] + +[[package]] +name = "wasm-encoder" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "990065f2fe63003fe337b932cfb5e3b80e0b4d0f5ff650e6985b1048f62c8319" +dependencies = [ + "leb128fmt", + "wasmparser", +] + +[[package]] +name = "wasm-metadata" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909" +dependencies = [ + "anyhow", + "indexmap", + "wasm-encoder", + "wasmparser", +] + +[[package]] +name = "wasmparser" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" +dependencies = [ + "bitflags", + "hashbrown 0.15.5", + "indexmap", + "semver", +] + [[package]] name = "windows-link" version = "0.2.1" @@ -280,6 +1196,100 @@ dependencies = [ "windows-link", ] +[[package]] +name = "wit-bindgen" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" +dependencies = [ + "wit-bindgen-rust-macro", +] + +[[package]] +name = "wit-bindgen" +version = "0.57.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ebf944e87a7c253233ad6766e082e3cd714b5d03812acc24c318f549614536e" + +[[package]] +name = "wit-bindgen-core" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc" +dependencies = [ + "anyhow", + "heck", + "wit-parser", +] + +[[package]] +name = "wit-bindgen-rust" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21" +dependencies = [ + "anyhow", + "heck", + "indexmap", + "prettyplease", + "syn", + "wasm-metadata", + "wit-bindgen-core", + "wit-component", +] + +[[package]] +name = "wit-bindgen-rust-macro" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c0f9bfd77e6a48eccf51359e3ae77140a7f50b1e2ebfe62422d8afdaffab17a" +dependencies = [ + "anyhow", + "prettyplease", + "proc-macro2", + "quote", + "syn", + "wit-bindgen-core", + "wit-bindgen-rust", +] + +[[package]] +name = "wit-component" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" +dependencies = [ + "anyhow", + "bitflags", + "indexmap", + "log", + "serde", + "serde_derive", + "serde_json", + "wasm-encoder", + "wasm-metadata", + "wasmparser", + "wit-parser", +] + +[[package]] +name = "wit-parser" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736" +dependencies = [ + "anyhow", + "id-arena", + "indexmap", + "log", + "semver", + "serde", + "serde_derive", + "serde_json", + "unicode-xid", + "wasmparser", +] + [[package]] name = "zmij" version = "1.0.21" diff --git a/Cargo.toml b/Cargo.toml index 9848ef3..18c0eaa 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,3 +14,19 @@ serde_json = "1" clap = { version = "4", features = ["derive"] } thiserror = "2" libc = "0.2" +base64 = "0.22" + +# containerd mode-2 (feature "containerd"): drive containerd's gRPC API in +# process. 
Optional so the default build stays lean and needs no protoc. +containerd-client = { version = "0.9", optional = true } +tonic = { version = "0.14", default-features = false, features = ["codegen", "channel"], optional = true } +prost-types = { version = "0.14", optional = true } +tokio = { version = "1", features = ["rt", "rt-multi-thread", "macros", "net", "time", "signal"], optional = true } +sha2 = { version = "0.10", optional = true } +hex = { version = "0.4", optional = true } +anyhow = { version = "1", optional = true } + +[features] +default = [] +# Compile the in-process containerd backend (pulls tonic/tokio; needs host protoc). +containerd = ["dep:containerd-client", "dep:tonic", "dep:prost-types", "dep:tokio", "dep:sha2", "dep:hex", "dep:anyhow"] diff --git a/examples/inception/run-test.sh b/examples/inception/run-test.sh index 74743f8..edd7816 100755 --- a/examples/inception/run-test.sh +++ b/examples/inception/run-test.sh @@ -1,36 +1,19 @@ #!/bin/sh -# Runs INSIDE the orchd-osx VM (Debian). Brings up containerd from the mounted -# nerdctl-full toolchain, then has our Linux orchd drive it via the containerd -# runtime. Verbose so the detached supervisor's logfile tells the whole story. +# Runs INSIDE the orchd-osx VM (Debian). Starts containerd from the mounted +# toolchain, then has our Linux orchd drive it via the in-process containerd +# backend (mode-2 gRPC) — no nerdctl, no CNI, no iptables. Verbose so the +# detached supervisor's logfile tells the whole story. set -u log(){ echo "[inception] $*"; } export PATH=/opt/tools/bin:$PATH -# debian-slim ships no CA bundle, so containerd's TLS can't verify the registry. -# Point Go's TLS at the host CA bundle we mounted in. 
-export SSL_CERT_FILE=/opt/tools/ca-bundle.crt -log "STAGE 0: $(uname -m); cgroup=$(stat -fc %T /sys/fs/cgroup 2>/dev/null); ip=$(ip -4 addr show 2>/dev/null | awk '/inet /{print $2}' | grep -v 127 | head -1)" - -# Put the CA bundle where apt/openssl look too (not just Go's SSL_CERT_FILE), -# so apt-get over https works to install iptables below. +# containerd pulls images itself; debian-slim ships no CA bundle, so give it one +# (system path + the env Go reads) for registry TLS. mkdir -p /etc/ssl/certs cp /opt/tools/ca-bundle.crt /etc/ssl/certs/ca-certificates.crt +export SSL_CERT_FILE=/etc/ssl/certs/ca-certificates.crt -# nerdctl looks for CNI plugins at /opt/cni/bin by default; point it at the -# mounted plugins so container networking works. -mkdir -p /opt/cni -ln -sf /opt/tools/libexec/cni /opt/cni/bin - -# nerdctl's default bridge needs iptables; the toolchain has none and the VM has -# outbound network, so pull it at runtime. -if ! command -v iptables >/dev/null 2>&1; then - log "installing iptables..." - if apt-get update -qq >/dev/null 2>&1 && apt-get install -y -qq iptables >/dev/null 2>&1; then - log "iptables ok" - else - log "iptables install FAILED (container networking may not work)" - fi -fi +log "STAGE 0: $(uname -m); cgroup=$(stat -fc %T /sys/fs/cgroup 2>/dev/null)" log "STAGE 1: start containerd" mkdir -p /run/containerd /var/lib/containerd @@ -41,13 +24,17 @@ if ! ctr version >/dev/null 2>&1; then fi log "containerd up: $(ctr --version 2>/dev/null)" -log "STAGE 2: orchd drives containerd (the runtime under test)" +log "STAGE 2: orchd drives containerd via its gRPC API (mode-2, no nerdctl)" mkdir -p /run/orchd orchd --orchfile /opt/tools/inner-Orchfile --runtime containerd --platform orchdi --state-dir /run/orchd grow log "orchd grow rc=$?" 
sleep 8 -log "--- orchd survey ---"; orchd --platform orchdi --state-dir /run/orchd survey -log "--- nerdctl ps -a (what orchd started via containerd) ---"; nerdctl ps -a 2>&1 -log "--- supervisor logs ---"; tail -25 /run/orchd/logs/*.log 2>/dev/null +log "--- orchd survey (what orchd supervises) ---" +orchd --platform orchdi --state-dir /run/orchd survey +log "--- containerd's own view (ctr), proving the task is real ---" +for n in $(ctr namespaces ls -q 2>/dev/null); do + log "namespace=$n"; ctr -n "$n" tasks ls 2>&1; ctr -n "$n" containers ls 2>&1 +done +log "--- supervisor log ---"; tail -20 /run/orchd/logs/*.log 2>/dev/null log "=== DONE ===" sleep 5 diff --git a/justfile b/justfile index 26e14a2..6705cb5 100644 --- a/justfile +++ b/justfile @@ -27,17 +27,19 @@ build: build-orchd: cargo build --release -# Cross-compile a static aarch64-linux orchd: drops into any Linux VM with no -# runtime deps. Feeds the dogfood harness (orchd-osx boots a Linux guest that -# runs this orchd against containerd). On a Linux host, `just build-orchd` is -# already a Linux build; this recipe cross-builds from macOS. Needs rustup -# (`brew install rustup`) and cargo-zigbuild (`cargo install cargo-zigbuild`). +# Cross-compile a static aarch64-linux orchd with the containerd backend: drops +# into a Linux VM and drives containerd in process. Feeds examples/inception +# (orchd-osx boots a Linux guest that runs this orchd against containerd). On a +# Linux host, build natively with `--features containerd`; this recipe +# cross-builds from macOS. Needs rustup (`brew install rustup`), cargo-zigbuild +# (`cargo install cargo-zigbuild`), and protoc (`brew install protobuf`). 
build-linux: #!/usr/bin/env bash set -euo pipefail export PATH="/opt/homebrew/opt/rustup/bin:$PATH" + export PROTOC="$(command -v protoc)" rustup target add {{linux_tgt}} >/dev/null 2>&1 || true - cargo zigbuild --target {{linux_tgt}} --release + cargo zigbuild --target {{linux_tgt}} --release --features containerd echo "linux orchd -> target/{{linux_tgt}}/release/orchd ($(file -b target/{{linux_tgt}}/release/orchd | cut -d, -f1-2))" [macos] diff --git a/src/cli.rs b/src/cli.rs index 44d416f..b50b190 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -122,4 +122,12 @@ pub enum Commands { #[arg(long)] spec: PathBuf, }, + + /// Run one container over containerd's gRPC API (invoked by the supervisor; internal). + #[command(hide = true)] + ContainerdRun { + /// Base64-encoded ContainerdRunSpec JSON. + #[arg(long)] + spec: String, + }, } diff --git a/src/main.rs b/src/main.rs index f93bb07..52b340c 100644 --- a/src/main.rs +++ b/src/main.rs @@ -20,6 +20,11 @@ fn main() { if let Commands::Supervise { spec } = &cli.command { std::process::exit(orchdi::run(spec)); } + // `containerd-run` is likewise a leaf the supervisor execs: it talks to + // containerd directly and runs the container in the foreground. + if let Commands::ContainerdRun { spec } = &cli.command { + std::process::exit(runtime::containerd::run::run(spec)); + } let config = Config::load(&cli); @@ -38,6 +43,7 @@ fn main() { Commands::Plant {} => orchard::plant(&config).map_err(boxed_orchard), Commands::Tend { no_start } => orchard::tend(&config, !*no_start).map_err(boxed_orchard), Commands::Supervise { .. } => unreachable!("handled above"), + Commands::ContainerdRun { .. 
} => unreachable!("handled above"), }; if let Err(e) = result { diff --git a/src/runtime/apple.rs b/src/runtime/apple/mod.rs similarity index 100% rename from src/runtime/apple.rs rename to src/runtime/apple/mod.rs diff --git a/src/runtime/containerd.rs b/src/runtime/containerd.rs deleted file mode 100644 index 7ff7420..0000000 --- a/src/runtime/containerd.rs +++ /dev/null @@ -1,221 +0,0 @@ -//! containerd runtime: an ExecSet over `nerdctl` (containerd's docker-compatible -//! CLI). Linux. -//! -//! nerdctl drives containerd directly, so this is "containerd as the runtime" -//! with no Docker daemon on top. It fits the four-string ExecSet contract today -//! (the pragmatic wiring). A future mode-2 will drive containerd's gRPC API in -//! process; this CLI path is the v1. -//! -//! Container lifecycle, supervised by orchdi/launchd/systemd: -//! pre_start nerdctl pull -//! start nerdctl run --name - --init [flags] [cmd] -//! stop nerdctl stop - -//! post_stop nerdctl rm -f - - -use std::process::{Command, Stdio}; - -use crate::config::Config; -use crate::exec::ExecSet; -use crate::runtime::{Runtime, RuntimeError}; -use crate::types::Service; - -pub struct ContainerdRuntime { - namespace: String, - data_dir: std::path::PathBuf, -} - -impl ContainerdRuntime { - pub fn new(config: &Config) -> Self { - ContainerdRuntime { - namespace: config.namespace.clone(), - data_dir: config.data_dir.clone(), - } - } - - fn container_name(&self, service: &Service) -> String { - format!("{}-{}", self.namespace, service.name) - } - - fn require_image<'a>(&self, service: &'a Service) -> Result<&'a str, RuntimeError> { - service.image.as_deref().ok_or_else(|| { - RuntimeError::Other(format!( - "service '{}' is container-mode but has no FROM image", - service.name - )) - }) - } -} - -impl Runtime for ContainerdRuntime { - fn name(&self) -> &str { - "containerd" - } - - fn check(&self) -> Result<(), RuntimeError> { - let status = Command::new("nerdctl") - .arg("version") - 
.stdout(Stdio::null()) - .stderr(Stdio::null()) - .status() - .map_err(|e| { - RuntimeError::BinaryNotFound(format!("could not spawn 'nerdctl': {e}")) - })?; - if status.success() { - Ok(()) - } else { - Err(RuntimeError::Other( - "containerd not reachable via nerdctl (is containerd running?)".to_string(), - )) - } - } - - fn prepare(&self, service: &Service) -> Result<(), RuntimeError> { - if service.is_host() { - let dir = self.data_dir.join(&service.name); - std::fs::create_dir_all(&dir).map_err(|e| { - RuntimeError::Other(format!( - "failed to create data directory '{}': {e}", - dir.display() - )) - })?; - return Ok(()); - } - // Image pull is deferred to pre_start (no I/O at prepare time). - Ok(()) - } - - fn exec_set(&self, service: &Service) -> Result { - if service.is_host() { - // Host-mode services pass through as plain programs, same as bare. - let start = service.run_command.clone().ok_or_else(|| { - RuntimeError::Other(format!( - "service '{}' is host-mode but has no RUN command", - service.name - )) - })?; - return Ok(ExecSet { - start, - pre_start: None, - stop: service.stop_command.clone(), - post_stop: None, - }); - } - - let image = self.require_image(service)?; - let name = self.container_name(service); - - let pre_start = format!("nerdctl pull {image}"); - - // --init forwards signals and reaps zombies inside the container. 
- let mut start = format!("nerdctl run --name {name} --init"); - - let mut envs: Vec<(&String, &String)> = service.env.iter().collect(); - envs.sort_by(|a, b| a.0.cmp(b.0)); - for (k, v) in envs { - start.push_str(&format!(" --env {k}={v}")); - } - for ef in &service.env_files { - start.push_str(&format!(" --env-file {ef}")); - } - for vol in &service.volumes { - start.push_str(&format!(" --volume {}:{}", vol.source, vol.destination)); - } - for p in &service.publish { - match &p.address { - Some(addr) => { - start.push_str(&format!(" --publish {addr}:{}:{}", p.host, p.container)) - } - None => start.push_str(&format!(" --publish {}:{}", p.host, p.container)), - } - } - if let Some(mem) = &service.resources.memory { - start.push_str(&format!(" --memory {mem}")); - } - if let Some(cpus) = service.resources.cpus { - if cpus.fract() == 0.0 { - start.push_str(&format!(" --cpus {}", cpus as u64)); - } else { - start.push_str(&format!(" --cpus {cpus}")); - } - } - if let Some(user) = &service.user { - start.push_str(&format!(" --user {user}")); - } - if let Some(wd) = &service.workdir { - start.push_str(&format!(" --workdir {wd}")); - } - if let Some(ep) = &service.entrypoint { - start.push_str(&format!(" --entrypoint {ep}")); - } - start.push_str(&format!(" {image}")); - if let Some(cmd) = &service.cmd { - start.push_str(&format!(" {cmd}")); - } - - Ok(ExecSet { - start, - pre_start: Some(pre_start), - stop: Some(format!("nerdctl stop {name}")), - post_stop: Some(format!("nerdctl rm -f {name}")), - }) - } -} - -#[cfg(test)] -#[allow(non_snake_case)] -mod tests { - use super::*; - - fn runtime() -> ContainerdRuntime { - ContainerdRuntime { - namespace: "orch".to_string(), - data_dir: std::env::temp_dir().join("orchd-containerd-test"), - } - } - - #[test] - fn test_exec_set__container_maps_to_nerdctl() { - let rt = runtime(); - let mut svc: Service = serde_json::from_str(STUB_CONTAINER).unwrap(); - svc.name = "web".into(); - svc.image = Some("nginx:alpine".into()); - 
svc.env.insert("FOO".into(), "bar".into()); - svc.publish = vec![crate::types::PortMapping { - address: None, - host: 8080, - container: 80, - }]; - svc.resources.memory = Some("512M".into()); - - let exec = rt.exec_set(&svc).expect("exec_set"); - assert_eq!(exec.pre_start.as_deref(), Some("nerdctl pull nginx:alpine")); - assert!(exec.start.starts_with("nerdctl run --name orch-web --init")); - assert!(exec.start.contains(" --env FOO=bar")); - assert!(exec.start.contains(" --publish 8080:80")); - assert!(exec.start.contains(" --memory 512M")); - assert!(exec.start.ends_with(" nginx:alpine")); - assert_eq!(exec.stop.as_deref(), Some("nerdctl stop orch-web")); - assert_eq!(exec.post_stop.as_deref(), Some("nerdctl rm -f orch-web")); - } - - #[test] - fn test_exec_set__host_passthrough() { - let rt = runtime(); - let mut svc: Service = serde_json::from_str(STUB_HOST).unwrap(); - svc.run_command = Some("/usr/bin/redis-server".into()); - let exec = rt.exec_set(&svc).expect("host passthrough"); - assert_eq!(exec.start, "/usr/bin/redis-server"); - assert!(exec.pre_start.is_none()); - } - - const STUB_CONTAINER: &str = r#"{ - "name":"x","mode":"container","image":"x", - "oneshot":false,"disabled":false,"recreate":"never", - "restart":{"policy":"no"},"timeouts":{},"resources":{},"logging":{} - }"#; - const STUB_HOST: &str = r#"{ - "name":"x","mode":"host","run_command":"x", - "oneshot":false,"disabled":false,"recreate":"never", - "restart":{"policy":"no"},"timeouts":{},"resources":{},"logging":{} - }"#; -} diff --git a/src/runtime/containerd/mod.rs b/src/runtime/containerd/mod.rs new file mode 100644 index 0000000..5239722 --- /dev/null +++ b/src/runtime/containerd/mod.rs @@ -0,0 +1,229 @@ +//! containerd runtime (mode-2): orchd drives containerd's gRPC API directly, +//! in process. No nerdctl, no ctr, no Docker. Linux. +//! +//! The exec_set for a container is a single stateless foreground command, +//! `orchd containerd-run --spec ` (see `run`), which the supervisor +//! 
(orchdi/launchd/systemd) tracks. That process pulls the image, creates and +//! starts the container task over the containerd socket in the host network +//! namespace (so there is no CNI/iptables dependency), waits for it to exit, +//! and on SIGTERM kills + deletes it. One command owns the whole lifecycle, so +//! there is no separate pre_start/stop/post_stop. + +use std::path::Path; + +use crate::config::Config; +use crate::exec::ExecSet; +use crate::runtime::{Runtime, RuntimeError}; +use crate::types::Service; + +pub mod run; +use run::{encode_spec, ContainerdRunSpec}; + +const DEFAULT_SOCKET: &str = "/run/containerd/containerd.sock"; + +pub struct ContainerdRuntime { + namespace: String, + data_dir: std::path::PathBuf, + socket: String, +} + +impl ContainerdRuntime { + pub fn new(config: &Config) -> Self { + ContainerdRuntime { + namespace: config.namespace.clone(), + data_dir: config.data_dir.clone(), + socket: std::env::var("ORCHD_CONTAINERD_SOCKET") + .unwrap_or_else(|_| DEFAULT_SOCKET.to_string()), + } + } + + fn container_name(&self, service: &Service) -> String { + format!("{}-{}", self.namespace, service.name) + } + + fn require_image<'a>(&self, service: &'a Service) -> Result<&'a str, RuntimeError> { + service.image.as_deref().ok_or_else(|| { + RuntimeError::Other(format!( + "service '{}' is container-mode but has no FROM image", + service.name + )) + }) + } + + /// Path to the orchd binary to invoke for `containerd-run` (this same exe). + fn orchd_exe() -> String { + std::env::current_exe() + .ok() + .and_then(|p| p.to_str().map(String::from)) + .unwrap_or_else(|| "orchd".to_string()) + } +} + +impl Runtime for ContainerdRuntime { + fn name(&self) -> &str { + "containerd" + } + + fn check(&self) -> Result<(), RuntimeError> { + // We talk to containerd directly over its gRPC socket; the actual + // connection happens in `containerd-run`. Here we just confirm the + // socket exists, which fails cleanly off-Linux (no containerd). 
+ if Path::new(&self.socket).exists() { + Ok(()) + } else { + Err(RuntimeError::Other(format!( + "containerd socket '{}' not found (is containerd running?)", + self.socket + ))) + } + } + + fn prepare(&self, service: &Service) -> Result<(), RuntimeError> { + if service.is_host() { + let dir = self.data_dir.join(&service.name); + std::fs::create_dir_all(&dir).map_err(|e| { + RuntimeError::Other(format!( + "failed to create data directory '{}': {e}", + dir.display() + )) + })?; + return Ok(()); + } + // Image pull is deferred to pre_start (no I/O at prepare time). + Ok(()) + } + + fn exec_set(&self, service: &Service) -> Result { + if service.is_host() { + // Host-mode services pass through as plain programs, same as bare. + let start = service.run_command.clone().ok_or_else(|| { + RuntimeError::Other(format!( + "service '{}' is host-mode but has no RUN command", + service.name + )) + })?; + return Ok(ExecSet { + start, + pre_start: None, + stop: service.stop_command.clone(), + post_stop: None, + }); + } + + let image = self.require_image(service)?; + let name = self.container_name(service); + + // Resolve argv from the service's ENTRYPOINT + CMD (space-split). Empty + // means containerd-run falls back to the image config's Entrypoint+Cmd. 
+ let mut args: Vec = Vec::new(); + if let Some(ep) = &service.entrypoint { + args.extend(ep.split_whitespace().map(String::from)); + } + if let Some(cmd) = &service.cmd { + args.extend(cmd.split_whitespace().map(String::from)); + } + + let mut env: Vec = service + .env + .iter() + .map(|(k, v)| format!("{k}={v}")) + .collect(); + env.sort(); + + let spec = ContainerdRunSpec { + socket: self.socket.clone(), + namespace: self.namespace.clone(), + image: image.to_string(), + container_id: name, + args, + env, + cwd: service.workdir.clone().unwrap_or_default(), + user: service.user.clone(), + }; + + // start is a single foreground process the supervisor tracks: it pulls + // (if needed), runs the container task over containerd's gRPC socket, + // and on SIGTERM kills + deletes it. No nerdctl, no CNI, no iptables. + let start = format!( + "{} containerd-run --spec {}", + Self::orchd_exe(), + encode_spec(&spec) + ); + + Ok(ExecSet { + start, + pre_start: None, + stop: None, + post_stop: None, + }) + } +} + +#[cfg(test)] +#[allow(non_snake_case)] +mod tests { + use super::*; + + fn runtime() -> ContainerdRuntime { + ContainerdRuntime { + namespace: "orch".to_string(), + data_dir: std::env::temp_dir().join("orchd-containerd-test"), + socket: super::DEFAULT_SOCKET.to_string(), + } + } + + fn decode(b64: &str) -> ContainerdRunSpec { + use base64::Engine; + let json = base64::engine::general_purpose::STANDARD + .decode(b64) + .unwrap(); + serde_json::from_slice(&json).unwrap() + } + + #[test] + fn test_exec_set__container_emits_containerd_run() { + let rt = runtime(); + let mut svc: Service = serde_json::from_str(STUB_CONTAINER).unwrap(); + svc.name = "web".into(); + svc.image = Some("nginx:alpine".into()); + svc.env.insert("FOO".into(), "bar".into()); + svc.cmd = Some("sleep 300".into()); + + let exec = rt.exec_set(&svc).expect("exec_set"); + // start is ` containerd-run --spec `; no nerdctl, no separate + // pull/stop/post_stop (containerd-run owns the whole lifecycle). 
+ assert!(exec.start.contains(" containerd-run --spec ")); + assert!(exec.pre_start.is_none()); + assert!(exec.stop.is_none()); + assert!(exec.post_stop.is_none()); + + let b64 = exec.start.rsplit(' ').next().unwrap(); + let spec = decode(b64); + assert_eq!(spec.image, "nginx:alpine"); + assert_eq!(spec.container_id, "orch-web"); + assert_eq!(spec.namespace, "orch"); + assert_eq!(spec.args, vec!["sleep".to_string(), "300".to_string()]); + assert_eq!(spec.env, vec!["FOO=bar".to_string()]); + } + + #[test] + fn test_exec_set__host_passthrough() { + let rt = runtime(); + let mut svc: Service = serde_json::from_str(STUB_HOST).unwrap(); + svc.run_command = Some("/usr/bin/redis-server".into()); + let exec = rt.exec_set(&svc).expect("host passthrough"); + assert_eq!(exec.start, "/usr/bin/redis-server"); + assert!(exec.pre_start.is_none()); + } + + const STUB_CONTAINER: &str = r#"{ + "name":"x","mode":"container","image":"x", + "oneshot":false,"disabled":false,"recreate":"never", + "restart":{"policy":"no"},"timeouts":{},"resources":{},"logging":{} + }"#; + const STUB_HOST: &str = r#"{ + "name":"x","mode":"host","run_command":"x", + "oneshot":false,"disabled":false,"recreate":"never", + "restart":{"policy":"no"},"timeouts":{},"resources":{},"logging":{} + }"#; +} diff --git a/src/runtime/containerd/run.rs b/src/runtime/containerd/run.rs new file mode 100644 index 0000000..bacf489 --- /dev/null +++ b/src/runtime/containerd/run.rs @@ -0,0 +1,551 @@ +//! containerd_run: orchd's in-process containerd client (mode-2). +//! +//! `orchd containerd-run --spec ` is the foreground process the +//! supervisor tracks for a containerd-backed service. It pulls the image (via +//! containerd's Transfer service), prepares a writable snapshot, creates and +//! starts the container task over containerd's gRPC socket, waits for it to +//! exit, and on SIGTERM kills + deletes it. No nerdctl, no ctr, no Docker. +//! +//! 
The container runs in the HOST network namespace (the OCI spec omits a new +//! network namespace), so there is no CNI/iptables dependency. +//! +//! The gRPC backend (tonic + containerd-client) lives behind the `containerd` +//! cargo feature, so the default orchd build stays lean and needs no protoc. +//! The spec-building half (used by the runtime's exec_set) is always compiled. + +use serde::{Deserialize, Serialize}; + +/// Everything `containerd-run` needs to pull and run one container. Built by the +/// containerd runtime's exec_set, consumed here. +#[derive(Serialize, Deserialize, Debug, Clone)] +pub struct ContainerdRunSpec { + /// containerd gRPC unix socket (e.g. /run/containerd/containerd.sock). + pub socket: String, + /// containerd namespace (e.g. "default"). + pub namespace: String, + /// Image reference to pull/run. + pub image: String, + /// Container id in containerd (e.g. "orch-web"). + pub container_id: String, + /// argv. Empty -> use the image config's Entrypoint ++ Cmd. + #[serde(default)] + pub args: Vec, + /// "KEY=VALUE" entries, merged after the image env. + #[serde(default)] + pub env: Vec, + /// Working directory. Empty -> the image config's WorkingDir (or "/"). + #[serde(default)] + pub cwd: String, + /// uid[:gid] (numeric). None -> the image config's User (or root). + #[serde(default)] + pub user: Option, +} + +/// Encode a spec as a shell-safe base64 arg for the ExecSet start command. +pub fn encode_spec(spec: &ContainerdRunSpec) -> String { + use base64::Engine; + let json = serde_json::to_vec(spec).expect("ContainerdRunSpec serializes"); + base64::engine::general_purpose::STANDARD.encode(json) +} + +fn decode_spec(b64: &str) -> Result { + use base64::Engine; + let json = base64::engine::general_purpose::STANDARD + .decode(b64.trim()) + .map_err(|e| format!("base64: {e}"))?; + serde_json::from_slice(&json).map_err(|e| format!("json: {e}")) +} + +/// Decode the base64 spec and run the container to completion. 
Returns the +/// container's exit code (or a non-zero orchd error code). +pub fn run(spec_b64: &str) -> i32 { + let spec = match decode_spec(spec_b64) { + Ok(s) => s, + Err(e) => { + eprintln!("containerd-run: bad --spec: {e}"); + return 1; + } + }; + + #[cfg(feature = "containerd")] + { + match backend::run(spec) { + Ok(code) => code, + Err(e) => { + eprintln!("containerd-run: {e:#}"); + 1 + } + } + } + #[cfg(not(feature = "containerd"))] + { + let _ = spec; + eprintln!("containerd-run: this orchd was built without the 'containerd' feature"); + 1 + } +} + +#[cfg(feature = "containerd")] +mod backend { + use super::ContainerdRunSpec; + use std::collections::HashMap; + use std::env::consts; + + use anyhow::{anyhow, Context, Result}; + use containerd_client::{ + services::v1::{ + container::Runtime, + snapshots::{PrepareSnapshotRequest, RemoveSnapshotRequest}, + Container, CreateContainerRequest, CreateTaskRequest, DeleteContainerRequest, + DeleteTaskRequest, GetImageRequest, KillRequest, ReadContentRequest, StartRequest, + TransferOptions, TransferRequest, WaitRequest, + }, + to_any, + types::{ + transfer::{ImageStore, OciRegistry, UnpackConfiguration}, + Platform, + }, + with_namespace, Client, + }; + use sha2::{Digest, Sha256}; + use tokio::signal::unix::{signal, SignalKind}; + use tonic::Request; + + const SIGTERM: u32 = 15; + const SNAPSHOTTER: &str = "overlayfs"; + + /// containerd's GOARCH string for this host. + fn goarch() -> &'static str { + match consts::ARCH { + "x86_64" => "amd64", + "aarch64" => "arm64", + other => other, + } + } + + /// Process defaults read from the image config. 
+ #[derive(Default)] + struct ImageConfig { + entrypoint: Vec, + cmd: Vec, + env: Vec, + working_dir: String, + user: String, + } + + pub fn run(spec: ContainerdRunSpec) -> Result { + let rt = tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build() + .context("build tokio runtime")?; + rt.block_on(run_async(spec)) + } + + async fn run_async(spec: ContainerdRunSpec) -> Result { + let ns = &spec.namespace; + let id = &spec.container_id; + let client = Client::from_path(&spec.socket) + .await + .with_context(|| format!("connect to containerd at {}", spec.socket))?; + + // Idempotent: clear any leftover container/task/snapshot from a prior run. + teardown(&client, ns, id).await; + + // Pull (Transfer service also unpacks into the snapshotter). + pull(&client, ns, &spec.image).await?; + + // Resolve the rootfs chainID + the image's process defaults. + let (diff_ids, cfg) = read_image(&client, ns, &spec.image).await?; + let chain = chain_id(&diff_ids); + let mut snapshots = client.snapshots(); + let mounts = snapshots + .prepare(with_namespace!( + PrepareSnapshotRequest { + snapshotter: SNAPSHOTTER.to_string(), + key: id.to_string(), + parent: chain, + labels: HashMap::new(), + }, + ns + )) + .await + .context("snapshots.prepare")? + .into_inner() + .mounts; + + // Layer the service spec over the image defaults. 
+ let argv = if !spec.args.is_empty() { + spec.args.clone() + } else { + let mut a = cfg.entrypoint.clone(); + a.extend(cfg.cmd.clone()); + a + }; + if argv.is_empty() { + return Err(anyhow!( + "no argv: image has no entrypoint/cmd and none was provided" + )); + } + let mut env = cfg.env.clone(); + env.extend(spec.env.clone()); + let cwd = if !spec.cwd.is_empty() { + spec.cwd.clone() + } else if !cfg.working_dir.is_empty() { + cfg.working_dir.clone() + } else { + "/".to_string() + }; + let user = spec.user.as_deref().or(if cfg.user.is_empty() { + None + } else { + Some(cfg.user.as_str()) + }); + let (uid, gid) = parse_user(user); + + let spec_json = oci_spec_json(id, &argv, &env, &cwd, uid, gid); + + // Create the container record, referencing the snapshot. + client + .containers() + .create(with_namespace!( + CreateContainerRequest { + container: Some(Container { + id: id.to_string(), + image: spec.image.clone(), + runtime: Some(Runtime { + name: "io.containerd.runc.v2".to_string(), + // no_pivot_root=true: orchd-osx boots the VM as an + // initramfs (ramfs root) where runc's pivot_root + // fails (EINVAL); this makes runc use MS_MOVE+chroot + // instead. Harmless on a normal disk-rooted host. + // Any of containerd.runc.v1.Options{ no_pivot_root: + // true } = field 1, varint true = bytes 08 01. + options: Some(prost_types::Any { + type_url: "containerd.runc.v1.Options".to_string(), + value: vec![0x08, 0x01], + }), + }), + spec: Some(prost_types::Any { + type_url: + "types.containerd.io/opencontainers/runtime-spec/1/Spec" + .to_string(), + value: spec_json.into_bytes(), + }), + snapshotter: SNAPSHOTTER.to_string(), + snapshot_key: id.to_string(), + ..Default::default() + }) + }, + ns + )) + .await + .context("containers.create")?; + + // Create + start the task with the snapshot mounts as its rootfs. 
+ let mut tasks = client.tasks(); + tasks + .create(with_namespace!( + CreateTaskRequest { + container_id: id.to_string(), + rootfs: mounts, + ..Default::default() + }, + ns + )) + .await + .context("tasks.create")?; + tasks + .start(with_namespace!( + StartRequest { + container_id: id.to_string(), + ..Default::default() + }, + ns + )) + .await + .context("tasks.start")?; + eprintln!("containerd-run: started {id} ({})", spec.image); + + // Wait for the task to exit, OR for the supervisor to SIGTERM us. + let mut sigterm = signal(SignalKind::terminate()).context("install SIGTERM handler")?; + let mut sigint = signal(SignalKind::interrupt()).context("install SIGINT handler")?; + let mut waiter = client.tasks(); + let code = tokio::select! { + w = waiter.wait(with_namespace!( + WaitRequest { container_id: id.to_string(), ..Default::default() }, ns)) => { + match w { + Ok(r) => r.into_inner().exit_status as i32, + Err(e) => { eprintln!("containerd-run: wait: {e}"); 1 } + } + } + _ = sigterm.recv() => { eprintln!("containerd-run: SIGTERM, stopping {id}"); 143 } + _ = sigint.recv() => { eprintln!("containerd-run: SIGINT, stopping {id}"); 130 } + }; + + // Always tear the container down on the way out. + teardown(&client, ns, id).await; + Ok(code) + } + + /// Pull `image` via the Transfer service, unpacking into the snapshotter. 
+ async fn pull(client: &Client, ns: &str, image: &str) -> Result<()> { + let platform = Platform { + os: "linux".to_string(), + architecture: goarch().to_string(), + variant: String::new(), + os_version: String::new(), + }; + let source = OciRegistry { + reference: image.to_string(), + resolver: Default::default(), + }; + let destination = ImageStore { + name: image.to_string(), + platforms: vec![platform.clone()], + unpacks: vec![UnpackConfiguration { + platform: Some(platform), + snapshotter: SNAPSHOTTER.to_string(), + }], + ..Default::default() + }; + client + .transfer() + .transfer(with_namespace!( + TransferRequest { + source: Some(to_any(&source)), + destination: Some(to_any(&destination)), + options: Some(TransferOptions::default()), + }, + ns + )) + .await + .context("transfer (pull) image")?; + Ok(()) + } + + /// Read a content blob (full) by digest. + async fn read_content(client: &Client, ns: &str, digest: &str) -> Result> { + let mut stream = client + .content() + .read(with_namespace!( + ReadContentRequest { + digest: digest.to_string(), + offset: 0, + size: 0, + }, + ns + )) + .await + .with_context(|| format!("content.read {digest}"))? + .into_inner(); + let mut buf = Vec::new(); + while let Some(chunk) = stream.message().await.context("read content chunk")? { + buf.extend_from_slice(&chunk.data); + } + Ok(buf) + } + + /// Resolve the image's rootfs diff_ids and process config (descending an + /// index by platform if present). + async fn read_image( + client: &Client, + ns: &str, + image: &str, + ) -> Result<(Vec, ImageConfig)> { + let target = client + .images() + .get(with_namespace!( + GetImageRequest { name: image.to_string() }, + ns + )) + .await + .context("images.get")? 
+ .into_inner() + .image + .and_then(|i| i.target) + .ok_or_else(|| anyhow!("image has no target descriptor"))?; + + let blob = read_content(client, ns, &target.digest).await?; + let json: serde_json::Value = + serde_json::from_slice(&blob).context("parse manifest/index json")?; + + let manifest = if json.get("manifests").is_some() { + let arch = goarch(); + let manifests = json["manifests"].as_array().cloned().unwrap_or_default(); + let chosen = manifests + .iter() + .find(|m| m["platform"]["os"] == "linux" && m["platform"]["architecture"] == arch) + .or_else(|| manifests.first()) + .ok_or_else(|| anyhow!("no manifest in index"))?; + let mdigest = chosen["digest"] + .as_str() + .ok_or_else(|| anyhow!("manifest entry missing digest"))?; + let mblob = read_content(client, ns, mdigest).await?; + serde_json::from_slice::(&mblob).context("parse manifest json")? + } else { + json + }; + + let config_digest = manifest["config"]["digest"] + .as_str() + .ok_or_else(|| anyhow!("manifest missing config.digest"))?; + let config: serde_json::Value = + serde_json::from_slice(&read_content(client, ns, config_digest).await?) + .context("parse image config json")?; + + let diff_ids = config["rootfs"]["diff_ids"] + .as_array() + .ok_or_else(|| anyhow!("config missing rootfs.diff_ids"))? 
+ .iter() + .filter_map(|v| v.as_str().map(String::from)) + .collect::>(); + if diff_ids.is_empty() { + return Err(anyhow!("empty diff_ids")); + } + + let str_list = |v: &serde_json::Value| -> Vec { + v.as_array() + .map(|a| { + a.iter() + .filter_map(|x| x.as_str().map(String::from)) + .collect() + }) + .unwrap_or_default() + }; + let cfg = ImageConfig { + entrypoint: str_list(&config["config"]["Entrypoint"]), + cmd: str_list(&config["config"]["Cmd"]), + env: str_list(&config["config"]["Env"]), + working_dir: config["config"]["WorkingDir"] + .as_str() + .unwrap_or("") + .to_string(), + user: config["config"]["User"].as_str().unwrap_or("").to_string(), + }; + Ok((diff_ids, cfg)) + } + + /// Fold diff_ids into the rootfs chainID (containerd identity.ChainID). + fn chain_id(diff_ids: &[String]) -> String { + let mut chain = diff_ids[0].clone(); + for next in &diff_ids[1..] { + let mut h = Sha256::new(); + h.update(format!("{chain} {next}").as_bytes()); + chain = format!("sha256:{}", hex::encode(h.finalize())); + } + chain + } + + /// Parse a numeric uid[:gid] (names are not resolvable here -> root). + fn parse_user(user: Option<&str>) -> (u32, u32) { + let Some(u) = user.map(str::trim).filter(|s| !s.is_empty()) else { + return (0, 0); + }; + let (uid_s, gid_s) = match u.split_once(':') { + Some((a, b)) => (a, Some(b)), + None => (u, None), + }; + let uid = uid_s.parse::().unwrap_or(0); + let gid = gid_s.and_then(|g| g.parse::().ok()).unwrap_or(uid); + (uid, gid) + } + + /// OCI runtime spec JSON. Namespaces omit "network" => host netns, no CNI. 
+ fn oci_spec_json( + id: &str, + argv: &[String], + env: &[String], + cwd: &str, + uid: u32, + gid: u32, + ) -> String { + serde_json::json!({ + "ociVersion": "1.1.0", + "process": { + "terminal": false, + "user": { "uid": uid, "gid": gid }, + "args": argv, + "env": env, + "cwd": if cwd.is_empty() { "/" } else { cwd }, + "capabilities": { + "bounding": ["CAP_NET_RAW","CAP_CHOWN","CAP_DAC_OVERRIDE","CAP_SETUID","CAP_SETGID","CAP_NET_BIND_SERVICE"], + "effective": ["CAP_NET_RAW","CAP_CHOWN","CAP_DAC_OVERRIDE","CAP_SETUID","CAP_SETGID","CAP_NET_BIND_SERVICE"], + "permitted": ["CAP_NET_RAW","CAP_CHOWN","CAP_DAC_OVERRIDE","CAP_SETUID","CAP_SETGID","CAP_NET_BIND_SERVICE"] + }, + "rlimits": [{ "type": "RLIMIT_NOFILE", "hard": 1024, "soft": 1024 }], + "noNewPrivileges": true + }, + "root": { "path": "rootfs", "readonly": false }, + "hostname": id, + "mounts": [ + { "destination": "/proc", "type": "proc", "source": "proc" }, + { "destination": "/dev", "type": "tmpfs", "source": "tmpfs", + "options": ["nosuid","strictatime","mode=755","size=65536k"] }, + { "destination": "/dev/pts", "type": "devpts", "source": "devpts", + "options": ["nosuid","noexec","newinstance","ptmxmode=0666","mode=0620","gid=5"] }, + { "destination": "/dev/shm", "type": "tmpfs", "source": "shm", + "options": ["nosuid","noexec","nodev","mode=1777","size=65536k"] }, + { "destination": "/dev/mqueue", "type": "mqueue", "source": "mqueue", + "options": ["nosuid","noexec","nodev"] }, + { "destination": "/sys", "type": "sysfs", "source": "sysfs", + "options": ["nosuid","noexec","nodev","ro"] }, + { "destination": "/etc/resolv.conf", "type": "bind", "source": "/etc/resolv.conf", + "options": ["rbind","ro"] } + ], + "linux": { + "namespaces": [ + { "type": "pid" }, { "type": "ipc" }, { "type": "uts" }, { "type": "mount" } + ], + "maskedPaths": [ + "/proc/kcore","/proc/latency_stats","/proc/timer_list", + "/proc/timer_stats","/proc/sched_debug","/sys/firmware" + ], + "readonlyPaths": [ + 
"/proc/asound","/proc/bus","/proc/fs","/proc/irq", + "/proc/sys","/proc/sysrq-trigger" + ] + } + }) + .to_string() + } + + /// Best-effort: SIGTERM + delete task, delete container, remove snapshot. + /// Every step tolerates "not found" so it is safe to call before and after. + async fn teardown(client: &Client, ns: &str, id: &str) { + let mut tasks = client.tasks(); + let _ = tasks + .kill(with_namespace!( + KillRequest { + container_id: id.to_string(), + exec_id: String::new(), + signal: SIGTERM, + all: true, + }, + ns + )) + .await; + let _ = tasks + .delete(with_namespace!( + DeleteTaskRequest { container_id: id.to_string() }, + ns + )) + .await; + let _ = client + .containers() + .delete(with_namespace!( + DeleteContainerRequest { id: id.to_string() }, + ns + )) + .await; + let _ = client + .snapshots() + .remove(with_namespace!( + RemoveSnapshotRequest { + snapshotter: SNAPSHOTTER.to_string(), + key: id.to_string(), + }, + ns + )) + .await; + } +} From 6ca65a3430d1ac5d54511927a2a75cbdaf0f3a38 Mon Sep 17 00:00:00 2001 From: Adil Date: Sun, 7 Jun 2026 00:26:25 +0500 Subject: [PATCH 08/13] docs/examples: present containerd as the runtime, not 'the nerdctl replacement' Drop the nerdctl framing from the living surfaces (module docs, comments, the inception example): describe the containerd runtime by what it is (drives containerd's gRPC API in process, container in host netns) rather than by what it is not. The inception toolchain now fetches containerd + runc directly (~106 MB) instead of the nerdctl-full bundle (~612 MB); leaner and nerdctl-free. History keeps the earlier commits intact. Re-validated end to end: orch-web RUNNING under containerd via gRPC. 
--- examples/inception/README.md | 8 ++++---- examples/inception/inner-Orchfile | 2 +- examples/inception/run-test.sh | 6 +++--- examples/inception/setup.sh | 25 +++++++++++++++---------- src/runtime/containerd/mod.rs | 10 +++++----- src/runtime/containerd/run.rs | 4 ++-- 6 files changed, 30 insertions(+), 25 deletions(-) diff --git a/examples/inception/README.md b/examples/inception/README.md index 81a7cda..f2a9295 100644 --- a/examples/inception/README.md +++ b/examples/inception/README.md @@ -27,14 +27,14 @@ service spec (memory / cpus / volumes). | `inner-Orchfile` | the **inner** workload the containerd runtime runs (an alpine container) | | `setup.sh` | stages `tools/` (builds the Linux orchd, fetches the containerd toolchain) and writes a runnable `Orchfile.run` | -`tools/` (the ~600 MB containerd toolchain + the Linux orchd) is fetched by +`tools/` (containerd + runc + the Linux orchd) is fetched/built by `setup.sh`, not committed. ## Run it ```sh cd examples/inception -./setup.sh # builds the linux orchd, fetches nerdctl-full, stages tools/ +./setup.sh # builds the linux orchd, fetches containerd + runc, stages tools/ ORCHD_APPLE_MODE=osx \ orchd --orchfile Orchfile.run --runtime apple --platform orchdi \ --state-dir ./state grow @@ -43,8 +43,8 @@ tail -f ./state/logs/orch.ctd.log ``` You should see, from inside the VM: containerd come up, then -`orchd --runtime containerd grow` pull and run the inner alpine container, -and `nerdctl ps` list it. +`orchd --runtime containerd grow` pull and run the inner alpine container, and +containerd's own `ctr tasks ls` report it RUNNING. ## Requirements diff --git a/examples/inception/inner-Orchfile b/examples/inception/inner-Orchfile index e935f3c..9f14918 100644 --- a/examples/inception/inner-Orchfile +++ b/examples/inception/inner-Orchfile @@ -1,5 +1,5 @@ # Inner workload: what orchd-inside-the-VM drives through containerd. -# orchd's containerd runtime turns this into nerdctl pull + nerdctl run. 
+# orchd's containerd runtime pulls and runs it over containerd's gRPC API. SERVICE web FROM docker.io/library/alpine:latest CMD sleep 300 diff --git a/examples/inception/run-test.sh b/examples/inception/run-test.sh index edd7816..aa8cf68 100755 --- a/examples/inception/run-test.sh +++ b/examples/inception/run-test.sh @@ -1,8 +1,8 @@ #!/bin/sh # Runs INSIDE the orchd-osx VM (Debian). Starts containerd from the mounted # toolchain, then has our Linux orchd drive it via the in-process containerd -# backend (mode-2 gRPC) — no nerdctl, no CNI, no iptables. Verbose so the -# detached supervisor's logfile tells the whole story. +# backend (the container runs in the host netns, so no CNI/iptables). Verbose so +# the detached supervisor's logfile tells the whole story. set -u log(){ echo "[inception] $*"; } export PATH=/opt/tools/bin:$PATH @@ -24,7 +24,7 @@ if ! ctr version >/dev/null 2>&1; then fi log "containerd up: $(ctr --version 2>/dev/null)" -log "STAGE 2: orchd drives containerd via its gRPC API (mode-2, no nerdctl)" +log "STAGE 2: orchd drives containerd via its gRPC API" mkdir -p /run/orchd orchd --orchfile /opt/tools/inner-Orchfile --runtime containerd --platform orchdi --state-dir /run/orchd grow log "orchd grow rc=$?" diff --git a/examples/inception/setup.sh b/examples/inception/setup.sh index 094a977..7b41866 100755 --- a/examples/inception/setup.sh +++ b/examples/inception/setup.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash -# Stage the inception example: build the static Linux orchd, fetch the -# containerd toolchain (nerdctl-full: containerd + runc + cni + nerdctl), and -# lay out tools/ exactly as the Orchfile mounts it. Idempotent. +# Stage the inception example: build the static Linux orchd, fetch the container +# runtime (containerd + runc), and lay out tools/ exactly as the Orchfile mounts +# it. Idempotent. 
set -euo pipefail here="$(cd "$(dirname "$0")" && pwd)" @@ -14,14 +14,19 @@ echo "==> building static aarch64-linux orchd" ( cd "$repo" && just build-linux >/dev/null ) cp "$repo/target/aarch64-unknown-linux-musl/release/orchd" "$tools/bin/orchd" -echo "==> fetching nerdctl-full (containerd + runc + cni + nerdctl)" +echo "==> fetching containerd + runc (the runtime)" if [ ! -e "$tools/bin/containerd" ]; then - url="$(gh api repos/containerd/nerdctl/releases/latest \ - --jq '.assets[] | select(.name | test("nerdctl-full-.*-linux-arm64.tar.gz$")) | .browser_download_url' 2>/dev/null \ - || curl -fsSL https://api.github.com/repos/containerd/nerdctl/releases/latest \ - | grep -o 'https://[^"]*nerdctl-full-[^"]*-linux-arm64.tar.gz' | head -1)" - echo " $url" - curl -fsSL "$url" | tar -xz -C "$tools" + cver="$(gh api repos/containerd/containerd/releases/latest --jq '.tag_name' | sed 's/^v//')" + echo " containerd ${cver}" + curl -fsSL "https://github.com/containerd/containerd/releases/download/v${cver}/containerd-${cver}-linux-arm64.tar.gz" \ + | tar -xz -C "$tools" # -> bin/containerd, bin/ctr, bin/containerd-shim-runc-v2 +fi +if [ ! -e "$tools/bin/runc" ]; then + rurl="$(gh api repos/opencontainers/runc/releases/latest \ + --jq '.assets[] | select(.name=="runc.arm64") | .browser_download_url')" + echo " $rurl" + curl -fsSL "$rurl" -o "$tools/bin/runc" + chmod +x "$tools/bin/runc" fi echo "==> copying the in-VM driver + inner workload into tools/" diff --git a/src/runtime/containerd/mod.rs b/src/runtime/containerd/mod.rs index 5239722..525ef4f 100644 --- a/src/runtime/containerd/mod.rs +++ b/src/runtime/containerd/mod.rs @@ -1,5 +1,5 @@ -//! containerd runtime (mode-2): orchd drives containerd's gRPC API directly, -//! in process. No nerdctl, no ctr, no Docker. Linux. +//! containerd runtime: orchd drives containerd's gRPC API directly, in process. +//! Linux. //! //! The exec_set for a container is a single stateless foreground command, //! 
`orchd containerd-run --spec ` (see `run`), which the supervisor @@ -142,8 +142,8 @@ impl Runtime for ContainerdRuntime { }; // start is a single foreground process the supervisor tracks: it pulls - // (if needed), runs the container task over containerd's gRPC socket, - // and on SIGTERM kills + deletes it. No nerdctl, no CNI, no iptables. + // (if needed), runs the container task over containerd's gRPC socket in + // the host network namespace, and on SIGTERM kills + deletes it. let start = format!( "{} containerd-run --spec {}", Self::orchd_exe(), @@ -190,7 +190,7 @@ mod tests { svc.cmd = Some("sleep 300".into()); let exec = rt.exec_set(&svc).expect("exec_set"); - // start is ` containerd-run --spec `; no nerdctl, no separate + // start is ` containerd-run --spec `; no separate // pull/stop/post_stop (containerd-run owns the whole lifecycle). assert!(exec.start.contains(" containerd-run --spec ")); assert!(exec.pre_start.is_none()); diff --git a/src/runtime/containerd/run.rs b/src/runtime/containerd/run.rs index bacf489..baf2e60 100644 --- a/src/runtime/containerd/run.rs +++ b/src/runtime/containerd/run.rs @@ -1,10 +1,10 @@ -//! containerd_run: orchd's in-process containerd client (mode-2). +//! containerd_run: orchd's in-process containerd client. //! //! `orchd containerd-run --spec ` is the foreground process the //! supervisor tracks for a containerd-backed service. It pulls the image (via //! containerd's Transfer service), prepares a writable snapshot, creates and //! starts the container task over containerd's gRPC socket, waits for it to -//! exit, and on SIGTERM kills + deletes it. No nerdctl, no ctr, no Docker. +//! exit, and on SIGTERM kills + deletes it. //! //! The container runs in the HOST network namespace (the OCI spec omits a new //! network namespace), so there is no CNI/iptables dependency. 
From ccd165b2d278510c59be0923af3c379d9e2c072c Mon Sep 17 00:00:00 2001 From: Adil Date: Sun, 7 Jun 2026 01:31:08 +0500 Subject: [PATCH 09/13] fix(containerd): teardown waits for the task to die before deleting Stress testing (repeated grow/fell over multiple containers) surfaced a leak: teardown killed the task then immediately deleted the task/container/snapshot, but the container's PID 1 ignores SIGTERM (kernel shields namespace init), so the task survived the delete and leaked. Teardown now kills (SIGTERM), waits for the task to actually exit, SIGKILLs if it overruns the grace, and only then deletes. Verified: repeated cycles leave containerd with zero leaked tasks/containers/snapshots. --- src/runtime/containerd/run.rs | 47 ++++++++++++++++++++++++++++++++--- 1 file changed, 44 insertions(+), 3 deletions(-) diff --git a/src/runtime/containerd/run.rs b/src/runtime/containerd/run.rs index baf2e60..fab2f53 100644 --- a/src/runtime/containerd/run.rs +++ b/src/runtime/containerd/run.rs @@ -112,6 +112,7 @@ mod backend { use tonic::Request; const SIGTERM: u32 = 15; + const SIGKILL: u32 = 9; const SNAPSHOTTER: &str = "overlayfs"; /// containerd's GOARCH string for this host. @@ -509,11 +510,15 @@ mod backend { .to_string() } - /// Best-effort: SIGTERM + delete task, delete container, remove snapshot. - /// Every step tolerates "not found" so it is safe to call before and after. + /// Best-effort, idempotent cleanup, safe to call before and after a run. + /// The container task runs under containerd's shim (not our process group), + /// so we must stop it via the API AND wait for it to actually exit before + /// deleting the task/container/snapshot — otherwise the live task leaks. async fn teardown(client: &Client, ns: &str, id: &str) { let mut tasks = client.tasks(); - let _ = tasks + + // Ask the task to stop. kill succeeds only if a running task exists. 
+ let had_task = tasks .kill(with_namespace!( KillRequest { container_id: id.to_string(), @@ -523,7 +528,43 @@ mod backend { }, ns )) + .await + .is_ok(); + + if had_task { + // Wait for it to actually exit; SIGKILL if it overruns the grace. + let graceful = tokio::time::timeout( + std::time::Duration::from_secs(8), + tasks.wait(with_namespace!( + WaitRequest { container_id: id.to_string(), ..Default::default() }, + ns + )), + ) .await; + if graceful.is_err() { + let _ = tasks + .kill(with_namespace!( + KillRequest { + container_id: id.to_string(), + exec_id: String::new(), + signal: SIGKILL, + all: true, + }, + ns + )) + .await; + let _ = tokio::time::timeout( + std::time::Duration::from_secs(3), + tasks.wait(with_namespace!( + WaitRequest { container_id: id.to_string(), ..Default::default() }, + ns + )), + ) + .await; + } + } + + // Task is dead (or never existed): now safe to delete records. let _ = tasks .delete(with_namespace!( DeleteTaskRequest { container_id: id.to_string() }, From 07130506a4ddf2d192ee68280ca0ce765176849f Mon Sep 17 00:00:00 2001 From: Adil Date: Sun, 7 Jun 2026 01:31:08 +0500 Subject: [PATCH 10/13] examples/inception: add a stress harness (repeated cycles, leak-checked) stress.sh runs N grow/fell cycles over several containers and asserts containerd is left with zero leaked tasks/containers/snapshots after each teardown (waiting out the SIGTERM grace before measuring). setup.sh stages it and writes Orchfile.stress. Result on the dev box: PASS. 
--- examples/inception/.gitignore | 1 + examples/inception/README.md | 20 ++++++++++- examples/inception/inner-stress-Orchfile | 12 +++++++ examples/inception/setup.sh | 9 +++-- examples/inception/stress.sh | 45 ++++++++++++++++++++++++ 5 files changed, 84 insertions(+), 3 deletions(-) create mode 100644 examples/inception/inner-stress-Orchfile create mode 100644 examples/inception/stress.sh diff --git a/examples/inception/.gitignore b/examples/inception/.gitignore index 272eecd..9afb326 100644 --- a/examples/inception/.gitignore +++ b/examples/inception/.gitignore @@ -1,4 +1,5 @@ # Fetched/generated by setup.sh, not committed. tools/ Orchfile.run +Orchfile.stress state/ diff --git a/examples/inception/README.md b/examples/inception/README.md index f2a9295..7ee28fe 100644 --- a/examples/inception/README.md +++ b/examples/inception/README.md @@ -25,7 +25,8 @@ service spec (memory / cpus / volumes). | `Orchfile` | the **outer** unit: boot a Debian VM, sized, with the toolchain mounted, running the driver | | `run-test.sh` | runs **inside** the VM: starts containerd, then has the inner orchd drive it | | `inner-Orchfile` | the **inner** workload the containerd runtime runs (an alpine container) | -| `setup.sh` | stages `tools/` (builds the Linux orchd, fetches the containerd toolchain) and writes a runnable `Orchfile.run` | +| `stress.sh` / `inner-stress-Orchfile` | the stress variant: repeated grow/fell cycles over several containers, leak-checked against containerd's own state | +| `setup.sh` | stages `tools/` (builds the Linux orchd, fetches containerd + runc) and writes runnable `Orchfile.run` / `Orchfile.stress` | `tools/` (containerd + runc + the Linux orchd) is fetched/built by `setup.sh`, not committed. @@ -46,6 +47,23 @@ You should see, from inside the VM: containerd come up, then `orchd --runtime containerd grow` pull and run the inner alpine container, and containerd's own `ctr tasks ls` report it RUNNING. 
+## Stress it + +Same VM, but the driver runs repeated grow/fell cycles over several containers +and asserts containerd is left with zero leaked tasks/containers/snapshots +after each teardown: + +```sh +ORCHD_APPLE_MODE=osx \ + orchd --orchfile Orchfile.stress --runtime apple --platform orchdi \ + --state-dir ./state grow +tail -f ./state/logs/orch.ctd.log # ends with RESULT: PASS +``` + +Note: graceful stop allows a grace period before SIGKILL, so each `fell` takes +several seconds to settle if the container's PID 1 ignores SIGTERM (same as +`docker stop`'s default). The cycle test waits that out before leak-checking. + ## Requirements - macOS on Apple silicon, the orchd-osx runtime built + signed (`just build-osx`) diff --git a/examples/inception/inner-stress-Orchfile b/examples/inception/inner-stress-Orchfile new file mode 100644 index 0000000..4f4a782 --- /dev/null +++ b/examples/inception/inner-stress-Orchfile @@ -0,0 +1,12 @@ +# Stress workload: several containers (mixed images) for the cycle test. +SERVICE a +FROM docker.io/library/alpine:latest +CMD sleep 600 + +SERVICE b +FROM docker.io/library/alpine:latest +CMD sleep 600 + +SERVICE c +FROM docker.io/library/busybox:latest +CMD sleep 600 diff --git a/examples/inception/setup.sh b/examples/inception/setup.sh index 7b41866..558ad76 100755 --- a/examples/inception/setup.sh +++ b/examples/inception/setup.sh @@ -29,9 +29,11 @@ if [ ! 
-e "$tools/bin/runc" ]; then chmod +x "$tools/bin/runc" fi -echo "==> copying the in-VM driver + inner workload into tools/" +echo "==> copying the in-VM driver + inner workloads into tools/" cp "$here/run-test.sh" "$tools/run-test.sh" cp "$here/inner-Orchfile" "$tools/inner-Orchfile" +cp "$here/stress.sh" "$tools/stress.sh" +cp "$here/inner-stress-Orchfile" "$tools/inner-stress-Orchfile" echo "==> staging a CA bundle (debian-slim has none; containerd needs it for registry TLS)" if [ -f /etc/ssl/cert.pem ]; then @@ -40,8 +42,11 @@ else curl -fsSL https://curl.se/ca/cacert.pem -o "$tools/ca-bundle.crt" fi -echo "==> writing runnable Orchfile.run (absolute volume path)" +echo "==> writing runnable Orchfiles (absolute volume path)" sed "s|__TOOLS__|$tools|" "$here/Orchfile" > "$here/Orchfile.run" +# stress variant: same VM, CMD runs the grow/fell cycle test. +sed "s|__TOOLS__|$tools|; s|/opt/tools/run-test.sh|/opt/tools/stress.sh|" \ + "$here/Orchfile" > "$here/Orchfile.stress" cat </var/log/containerd.log 2>&1 & +for i in $(seq 1 20); do ctr version >/dev/null 2>&1 && break; sleep 1; done +ctr version >/dev/null 2>&1 || { log "containerd FAILED"; exit 1; } +log "containerd up" + +running(){ ctr -n $NS tasks ls 2>/dev/null | grep -c RUNNING; } +containers(){ ctr -n $NS containers ls -q 2>/dev/null | grep -c . ; } +leaked_snaps(){ ctr -n $NS snapshots ls 2>/dev/null | grep -cE "^orch-[abc] "; } +crun_alive(){ ps -eo args 2>/dev/null | grep -q "[c]ontainerd-run"; } + +FAIL=0 +for cycle in 1 2; do + log "===== CYCLE $cycle =====" + rm -rf /run/orchd; mkdir -p /run/orchd + orchd --orchfile /opt/tools/inner-stress-Orchfile --runtime containerd --platform orchdi --state-dir /run/orchd grow >/dev/null 2>&1 + sleep 8 + r=$(running); log "grow -> RUNNING=$r (expect 3)" + [ "$r" = "3" ] || FAIL=1 + + orchd --platform orchdi --state-dir /run/orchd fell >/dev/null 2>&1 + log "fell issued; waiting 16s for teardown grace to settle..." 
+ sleep 16 + rt=$(running); ct=$(containers); sn=$(leaked_snaps) + log "settled -> running=$rt containers=$ct snaps=$sn (expect 0/0/0)" + [ "$rt" = "0" ] && [ "$ct" = "0" ] && [ "$sn" = "0" ] || FAIL=1 +done + +log "===== FINAL =====" +log "tasks:"; ctr -n $NS tasks ls 2>&1 +log "RESULT: $([ $FAIL = 0 ] && echo PASS || echo FAIL)" +sleep 3 From 84186f985762f6e98e693f712035b5b01a14bf93 Mon Sep 17 00:00:00 2001 From: Adil Date: Sun, 7 Jun 2026 16:16:06 +0500 Subject: [PATCH 11/13] fix(orchdi): honor restart policy, oneshot, start-limit, and logging The orchdi supervisor parsed services and then ran each exactly once: RESTART, RESTART_DELAY, START_LIMIT_*, ONESHOT, and STDOUT/STDERR were all dropped. Now the supervise loop honors them: restart on policy (no/on-failure/always) after a delay, give up past the start-limit burst/interval, never restart a oneshot, and redirect the service's stdout/stderr to the configured paths. Verified: 5 restarts observed for a crash-looping service, frozen by fell. --- src/orchdi.rs | 130 +++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 124 insertions(+), 6 deletions(-) diff --git a/src/orchdi.rs b/src/orchdi.rs index cae3b0b..1f0aa60 100644 --- a/src/orchdi.rs +++ b/src/orchdi.rs @@ -41,7 +41,7 @@ pub struct DepSpec { /// Everything the supervisor needs, built from a `Service` + its `ExecSet`. /// Runtime-agnostic: only command strings, never runtime identity. -#[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize, Default)] pub struct SuperviseSpec { pub label: String, pub pre_start: Option, @@ -52,6 +52,27 @@ pub struct SuperviseSpec { pub deps: Vec, /// Seconds to wait for graceful stop before SIGKILLing the process group. pub stop_timeout_secs: u32, + /// Restart policy honored by the orchdi supervisor: "no" | "on-failure" | + /// "always". (launchd/systemd honor their own native equivalents instead.) 
+ #[serde(default)] + pub restart_policy: String, + /// Delay before a restart (clamped to >= 1s to avoid tight crash loops). + #[serde(default)] + pub restart_delay_secs: u32, + /// Oneshot services run once and are never restarted. + #[serde(default)] + pub oneshot: bool, + /// Restart rate limit: give up after this many restarts within + /// restart_interval_secs (0 = no limit). Mirrors systemd StartLimitBurst. + #[serde(default)] + pub restart_burst: u32, + #[serde(default)] + pub restart_interval_secs: u32, + /// Where the service's stdout/stderr go (None -> the supervisor's logfile). + #[serde(default)] + pub stdout: Option, + #[serde(default)] + pub stderr: Option, } static TERM: AtomicBool = AtomicBool::new(false); @@ -105,16 +126,18 @@ pub fn run(spec_path: &Path) -> i32 { // 3. Signal handler + spawn start in its own process group. install_signal_handlers(); - let mut child = match spawn_in_group(&spec.start) { + let mut child = match spawn_in_group(&spec.start, spec.stdout.as_deref(), spec.stderr.as_deref()) { Ok(c) => c, Err(e) => { eprintln!("supervise[{}]: failed to start: {e}", spec.label); return 1; } }; - let pgid = child.id() as i32; // == pid, since the child is its own group leader + let mut pgid = child.id() as i32; // == pid, since the child is its own group leader - // 4. Supervise loop. + // 4. Supervise loop (honors the restart policy + rate limit). + let mut restarts: u32 = 0; + let mut restart_times: Vec = Vec::new(); loop { if TERM.load(Ordering::SeqCst) { teardown(&spec, &mut child, pgid); @@ -123,7 +146,51 @@ pub fn run(spec_path: &Path) -> i32 { match child.try_wait() { Ok(Some(status)) => { run_optional(&spec.post_stop); - return status.code().unwrap_or(0); + let code = status.code().unwrap_or(0); + if !TERM.load(Ordering::SeqCst) && should_restart(&spec, code) { + // Rate limit: give up if we're restarting too fast. 
+ let now = Instant::now(); + restart_times.push(now); + if spec.restart_burst > 0 { + let window = Duration::from_secs(spec.restart_interval_secs.max(1) as u64); + restart_times.retain(|t| now.duration_since(*t) <= window); + if restart_times.len() as u32 > spec.restart_burst { + eprintln!( + "supervise[{}]: restart rate exceeded ({} within {}s); giving up", + spec.label, + spec.restart_burst, + spec.restart_interval_secs.max(1) + ); + return code; + } + } + restarts += 1; + let delay = spec.restart_delay_secs.max(1); + eprintln!( + "supervise[{}]: exited {code}; restart #{restarts} (policy={}) in {delay}s", + spec.label, spec.restart_policy + ); + // Wait out the delay, but wake promptly if asked to stop. + let until = Instant::now() + Duration::from_secs(delay as u64); + while Instant::now() < until { + if TERM.load(Ordering::SeqCst) { + return 0; + } + std::thread::sleep(Duration::from_millis(100)); + } + match spawn_in_group(&spec.start, spec.stdout.as_deref(), spec.stderr.as_deref()) { + Ok(c) => { + child = c; + pgid = child.id() as i32; + continue; + } + Err(e) => { + eprintln!("supervise[{}]: restart spawn failed: {e}", spec.label); + return 1; + } + } + } + return code; } Ok(None) => std::thread::sleep(Duration::from_millis(100)), Err(e) => { @@ -134,6 +201,19 @@ pub fn run(spec_path: &Path) -> i32 { } } +/// Whether to restart after the start process exited with `exit_code`. +/// Oneshot services never restart; otherwise the policy decides. +fn should_restart(spec: &SuperviseSpec, exit_code: i32) -> bool { + if spec.oneshot { + return false; + } + match spec.restart_policy.as_str() { + "always" => true, + "on-failure" => exit_code != 0, + _ => false, + } +} + /// Graceful teardown: stop (or signal the group), bounded wait, SIGKILL the /// group if it overruns, then post_stop. 
fn teardown(spec: &SuperviseSpec, child: &mut Child, pgid: i32) { @@ -178,10 +258,17 @@ fn run_optional(cmd: &Option) { /// Spawn `cmd` via `/bin/sh -c` in a fresh process group so the whole tree can /// be signalled together. macOS has no PR_SET_PDEATHSIG, so the group is how we /// guarantee no orphans on teardown. -fn spawn_in_group(cmd: &str) -> std::io::Result { +fn spawn_in_group(cmd: &str, stdout: Option<&str>, stderr: Option<&str>) -> std::io::Result { use std::os::unix::process::CommandExt; let mut c = Command::new("/bin/sh"); c.arg("-c").arg(cmd); + // Honor logging.stdout/stderr; otherwise inherit the supervisor's logfile. + if let Some(p) = stdout { + c.stdout(std::process::Stdio::from(open_append(p)?)); + } + if let Some(p) = stderr { + c.stderr(std::process::Stdio::from(open_append(p)?)); + } unsafe { c.pre_exec(|| { // Become group leader: new pgid == pid. @@ -194,6 +281,14 @@ fn spawn_in_group(cmd: &str) -> std::io::Result { c.spawn() } +/// Open a log path for appending, creating it (and parents) if needed. 
+fn open_append(path: &str) -> std::io::Result { + if let Some(parent) = std::path::Path::new(path).parent() { + let _ = std::fs::create_dir_all(parent); + } + std::fs::OpenOptions::new().create(true).append(true).open(path) +} + fn install_signal_handlers() { unsafe { libc::signal(libc::SIGTERM, on_term as *const () as libc::sighandler_t); @@ -304,6 +399,28 @@ pub fn build_supervise_spec( }) .collect(), stop_timeout_secs: stop_timeout, + restart_policy: match service.restart.policy { + crate::types::RestartPolicy::Always => "always", + crate::types::RestartPolicy::OnFailure => "on-failure", + crate::types::RestartPolicy::No => "no", + } + .to_string(), + restart_delay_secs: service + .restart + .delay + .as_deref() + .and_then(parse_duration_secs) + .unwrap_or(1), + oneshot: service.oneshot, + restart_burst: service.restart.start_limit_burst.unwrap_or(0), + restart_interval_secs: service + .restart + .start_limit_interval + .as_deref() + .and_then(parse_duration_secs) + .unwrap_or(10), + stdout: service.logging.stdout.clone(), + stderr: service.logging.stderr.clone(), } } @@ -348,6 +465,7 @@ mod tests { post_stop: Some("echo delete".into()), deps: vec![DepSpec { poll_cmd: "true".into(), timeout_secs: 5, required: true }], stop_timeout_secs: 30, + ..Default::default() }; let json = serde_json::to_string(&spec).unwrap(); let back: SuperviseSpec = serde_json::from_str(&json).unwrap(); From 7112adcb3a51679bdcadfbe3de7b56548693d2f0 Mon Sep 17 00:00:00 2001 From: Adil Date: Sun, 7 Jun 2026 16:16:06 +0500 Subject: [PATCH 12/13] fix(containerd): honor env_files, volumes, and resources The containerd runtime dropped env_files and volumes and ignored all of resources (the OCI spec even hardcoded RLIMIT_NOFILE 1024). Now the spec carries them: env_files are merged into the env, volumes become rw bind mounts, and resources map to the OCI spec's cgroup block (memory.limit, cpu quota/period, pids.limit, blockIO.weight) and rlimits (nofile/nproc from the spec). 
A username (vs numeric uid) is warned about rather than silently run as root. Verified in-container: VOLUME bind mount, ENV, and cgroup memory.max=64MiB. --- src/runtime/containerd/mod.rs | 60 ++++++++++++- src/runtime/containerd/run.rs | 154 +++++++++++++++++++++++++++------- 2 files changed, 182 insertions(+), 32 deletions(-) diff --git a/src/runtime/containerd/mod.rs b/src/runtime/containerd/mod.rs index 525ef4f..c3518cf 100644 --- a/src/runtime/containerd/mod.rs +++ b/src/runtime/containerd/mod.rs @@ -17,7 +17,7 @@ use crate::runtime::{Runtime, RuntimeError}; use crate::types::Service; pub mod run; -use run::{encode_spec, ContainerdRunSpec}; +use run::{encode_spec, ContainerdRunSpec, Resources, VolumeMount}; const DEFAULT_SOCKET: &str = "/run/containerd/containerd.sock"; @@ -130,6 +130,26 @@ impl Runtime for ContainerdRuntime { .collect(); env.sort(); + // Resource caps: cpus -> cgroup cpu.max (quota = cpus * period); else a + // CPU_QUOTA percentage. memory/pids/io/rlimits map straight through. 
+ let r = &service.resources; + let (cpu_quota_us, cpu_period_us) = if let Some(c) = r.cpus.filter(|c| *c > 0.0) { + (Some((c * 100_000.0) as u64), Some(100_000u64)) + } else if let Some(q) = r.cpu_quota.as_deref().and_then(parse_cpu_quota_pct) { + (Some(q), Some(100_000u64)) + } else { + (None, None) + }; + let resources = Resources { + memory_bytes: r.memory.as_deref().and_then(parse_memory_bytes), + cpu_quota_us, + cpu_period_us, + pids_max: r.tasks_max.or(r.limit_nproc), + nofile: r.limit_nofile, + nproc: r.limit_nproc, + io_weight: r.io_weight, + }; + let spec = ContainerdRunSpec { socket: self.socket.clone(), namespace: self.namespace.clone(), @@ -139,6 +159,16 @@ impl Runtime for ContainerdRuntime { env, cwd: service.workdir.clone().unwrap_or_default(), user: service.user.clone(), + env_files: service.env_files.clone(), + volumes: service + .volumes + .iter() + .map(|v| VolumeMount { + source: v.source.clone(), + destination: v.destination.clone(), + }) + .collect(), + resources, }; // start is a single foreground process the supervisor tracks: it pulls @@ -159,6 +189,34 @@ impl Runtime for ContainerdRuntime { } } +/// Parse a memory size ("512M", "2G", "1Gi", "1073741824") into bytes. k/m/g +/// (case-insensitive) are 1024-based; a bare number is bytes. None if invalid. +fn parse_memory_bytes(s: &str) -> Option { + let t = s.trim(); + let end = t.find(|c: char| !c.is_ascii_digit()).unwrap_or(t.len()); + if end == 0 { + return None; + } + let num: u64 = t[..end].parse().ok()?; + let mult = match t[end..].trim().chars().next().map(|c| c.to_ascii_lowercase()) { + Some('k') => 1024, + Some('m') => 1024 * 1024, + Some('g') => 1024 * 1024 * 1024, + _ => 1, + }; + Some(num.saturating_mul(mult)) +} + +/// Parse a CPU quota percentage ("50%" or "50") into a cgroup cpu.max quota in +/// microseconds (period 100000). None if invalid/zero. 
+fn parse_cpu_quota_pct(s: &str) -> Option { + let pct: u64 = s.trim().trim_end_matches('%').trim().parse().ok()?; + if pct == 0 { + return None; + } + Some(pct * 100_000 / 100) +} + #[cfg(test)] #[allow(non_snake_case)] mod tests { diff --git a/src/runtime/containerd/run.rs b/src/runtime/containerd/run.rs index fab2f53..edb0d35 100644 --- a/src/runtime/containerd/run.rs +++ b/src/runtime/containerd/run.rs @@ -39,6 +39,35 @@ pub struct ContainerdRunSpec { /// uid[:gid] (numeric). None -> the image config's User (or root). #[serde(default)] pub user: Option, + /// Env files (paths) read and merged after the image env, before `env`. + #[serde(default)] + pub env_files: Vec, + /// Host directories bind-mounted into the container. + #[serde(default)] + pub volumes: Vec, + /// cgroup / rlimit caps applied to the container (from resources.*). + #[serde(default)] + pub resources: Resources, +} + +/// A host path bind-mounted into the container. +#[derive(Serialize, Deserialize, Debug, Clone, Default)] +pub struct VolumeMount { + pub source: String, + pub destination: String, +} + +/// Resolved resource caps. 0/None means "unset". Memory in bytes, cpu as a +/// cgroup v2 cpu.max (quota,period) in microseconds. +#[derive(Serialize, Deserialize, Debug, Clone, Default)] +pub struct Resources { + pub memory_bytes: Option, + pub cpu_quota_us: Option, + pub cpu_period_us: Option, + pub pids_max: Option, + pub nofile: Option, + pub nproc: Option, + pub io_weight: Option, } /// Encode a spec as a shell-safe base64 arg for the ExecSet start command. 
@@ -87,7 +116,7 @@ pub fn run(spec_b64: &str) -> i32 { #[cfg(feature = "containerd")] mod backend { - use super::ContainerdRunSpec; + use super::{ContainerdRunSpec, Resources, VolumeMount}; use std::collections::HashMap; use std::env::consts; @@ -188,6 +217,17 @@ mod backend { )); } let mut env = cfg.env.clone(); + for f in &spec.env_files { + if let Ok(data) = std::fs::read_to_string(f) { + for line in data.lines() { + let t = line.trim(); + if t.is_empty() || t.starts_with('#') || !t.contains('=') { + continue; + } + env.push(t.to_string()); + } + } + } env.extend(spec.env.clone()); let cwd = if !spec.cwd.is_empty() { spec.cwd.clone() @@ -201,9 +241,19 @@ mod backend { } else { Some(cfg.user.as_str()) }); + // Only numeric uid[:gid] is resolved here; a username would need the + // image's /etc/passwd. Surface that rather than silently running as root. + if let Some(u) = user { + let uid_part = u.split(':').next().unwrap_or(u); + if !uid_part.is_empty() && uid_part.parse::().is_err() { + eprintln!( + "containerd-run: USER '{u}' is a name; running as root (numeric uid[:gid] only)" + ); + } + } let (uid, gid) = parse_user(user); - let spec_json = oci_spec_json(id, &argv, &env, &cwd, uid, gid); + let spec_json = oci_spec_json(id, &argv, &env, &cwd, uid, gid, &spec.resources, &spec.volumes); // Create the container record, referencing the snapshot. client @@ -452,6 +502,7 @@ mod backend { } /// OCI runtime spec JSON. Namespaces omit "network" => host netns, no CNI. + /// rlimits, cgroup resources, and bind mounts come from the spec. fn oci_spec_json( id: &str, argv: &[String], @@ -459,7 +510,74 @@ mod backend { cwd: &str, uid: u32, gid: u32, + res: &Resources, + vols: &[VolumeMount], ) -> String { + // rlimits: nofile (from spec or default 1024) + optional nproc. 
+ let nofile = res.nofile.unwrap_or(1024); + let mut rlimits = vec![serde_json::json!( + { "type": "RLIMIT_NOFILE", "hard": nofile, "soft": nofile } + )]; + if let Some(n) = res.nproc { + rlimits.push(serde_json::json!({ "type": "RLIMIT_NPROC", "hard": n, "soft": n })); + } + + // mounts: the standard set, plus a rw bind mount per volume. + let mut mounts = vec![ + serde_json::json!({ "destination": "/proc", "type": "proc", "source": "proc" }), + serde_json::json!({ "destination": "/dev", "type": "tmpfs", "source": "tmpfs", + "options": ["nosuid","strictatime","mode=755","size=65536k"] }), + serde_json::json!({ "destination": "/dev/pts", "type": "devpts", "source": "devpts", + "options": ["nosuid","noexec","newinstance","ptmxmode=0666","mode=0620","gid=5"] }), + serde_json::json!({ "destination": "/dev/shm", "type": "tmpfs", "source": "shm", + "options": ["nosuid","noexec","nodev","mode=1777","size=65536k"] }), + serde_json::json!({ "destination": "/dev/mqueue", "type": "mqueue", "source": "mqueue", + "options": ["nosuid","noexec","nodev"] }), + serde_json::json!({ "destination": "/sys", "type": "sysfs", "source": "sysfs", + "options": ["nosuid","noexec","nodev","ro"] }), + serde_json::json!({ "destination": "/etc/resolv.conf", "type": "bind", "source": "/etc/resolv.conf", + "options": ["rbind","ro"] }), + ]; + for v in vols { + mounts.push(serde_json::json!({ + "destination": v.destination, "type": "bind", "source": v.source, + "options": ["rbind","rw"] + })); + } + + // cgroup v2 resource caps. 
+ let mut resources = serde_json::Map::new(); + if let Some(m) = res.memory_bytes { + resources.insert("memory".into(), serde_json::json!({ "limit": m })); + } + if let Some(q) = res.cpu_quota_us { + let period = res.cpu_period_us.unwrap_or(100000); + resources.insert("cpu".into(), serde_json::json!({ "quota": q, "period": period })); + } + if let Some(p) = res.pids_max { + resources.insert("pids".into(), serde_json::json!({ "limit": p })); + } + if let Some(w) = res.io_weight { + resources.insert("blockIO".into(), serde_json::json!({ "weight": w })); + } + + let mut linux = serde_json::json!({ + "namespaces": [ + { "type": "pid" }, { "type": "ipc" }, { "type": "uts" }, { "type": "mount" } + ], + "maskedPaths": [ + "/proc/kcore","/proc/latency_stats","/proc/timer_list", + "/proc/timer_stats","/proc/sched_debug","/sys/firmware" + ], + "readonlyPaths": [ + "/proc/asound","/proc/bus","/proc/fs","/proc/irq", + "/proc/sys","/proc/sysrq-trigger" + ] + }); + if !resources.is_empty() { + linux["resources"] = serde_json::Value::Object(resources); + } + serde_json::json!({ "ociVersion": "1.1.0", "process": { @@ -473,39 +591,13 @@ mod backend { "effective": ["CAP_NET_RAW","CAP_CHOWN","CAP_DAC_OVERRIDE","CAP_SETUID","CAP_SETGID","CAP_NET_BIND_SERVICE"], "permitted": ["CAP_NET_RAW","CAP_CHOWN","CAP_DAC_OVERRIDE","CAP_SETUID","CAP_SETGID","CAP_NET_BIND_SERVICE"] }, - "rlimits": [{ "type": "RLIMIT_NOFILE", "hard": 1024, "soft": 1024 }], + "rlimits": rlimits, "noNewPrivileges": true }, "root": { "path": "rootfs", "readonly": false }, "hostname": id, - "mounts": [ - { "destination": "/proc", "type": "proc", "source": "proc" }, - { "destination": "/dev", "type": "tmpfs", "source": "tmpfs", - "options": ["nosuid","strictatime","mode=755","size=65536k"] }, - { "destination": "/dev/pts", "type": "devpts", "source": "devpts", - "options": ["nosuid","noexec","newinstance","ptmxmode=0666","mode=0620","gid=5"] }, - { "destination": "/dev/shm", "type": "tmpfs", "source": "shm", - "options": 
["nosuid","noexec","nodev","mode=1777","size=65536k"] }, - { "destination": "/dev/mqueue", "type": "mqueue", "source": "mqueue", - "options": ["nosuid","noexec","nodev"] }, - { "destination": "/sys", "type": "sysfs", "source": "sysfs", - "options": ["nosuid","noexec","nodev","ro"] }, - { "destination": "/etc/resolv.conf", "type": "bind", "source": "/etc/resolv.conf", - "options": ["rbind","ro"] } - ], - "linux": { - "namespaces": [ - { "type": "pid" }, { "type": "ipc" }, { "type": "uts" }, { "type": "mount" } - ], - "maskedPaths": [ - "/proc/kcore","/proc/latency_stats","/proc/timer_list", - "/proc/timer_stats","/proc/sched_debug","/sys/firmware" - ], - "readonlyPaths": [ - "/proc/asound","/proc/bus","/proc/fs","/proc/irq", - "/proc/sys","/proc/sysrq-trigger" - ] - } + "mounts": mounts, + "linux": linux }) .to_string() } From 06af65a7982ef3be49b41c8dc84a13cf07f6553b Mon Sep 17 00:00:00 2001 From: Adil Date: Sun, 7 Jun 2026 16:16:06 +0500 Subject: [PATCH 13/13] examples/inception: extended stress (fan-out, oneshot, crash/restart, spec) stress2.sh exercises 6-container fan-out, a oneshot (must not restart), crash/restart under RESTART on-failure (and fell freezing the loop), and a spec-alignment check that reads the volume mount, env, and cgroup memory.max from inside a real container. Result: PASS. 
---
 examples/inception/stress2.sh | 89 +++++++++++++++++++++++++++++++++++
 1 file changed, 89 insertions(+)
 create mode 100644 examples/inception/stress2.sh

diff --git a/examples/inception/stress2.sh b/examples/inception/stress2.sh
new file mode 100644
index 0000000..60e2b34
--- /dev/null
+++ b/examples/inception/stress2.sh
@@ -0,0 +1,89 @@
+#!/bin/sh
+# Extended stress for the containerd runtime + orchdi supervisor:
+#   TEST 1 fan-out: 6 containers up, clean teardown
+#   TEST 2 oneshot: a container that exits is not restarted and is cleaned up
+#   TEST 3 crash/restart: RESTART on-failure actually restarts, and fell stops it
+#   TEST 4 spec alignment: volume mount, env, and cgroup memory.max are honored
+# One VM boot; leak-checked against containerd's own state.
+# Prints "RESULT: PASS" only if every check held; any failed check sets FAIL=1.
+set -u
+# log MSG...: print a line tagged with the script name.
+log(){ echo "[stress2] $*"; }
+export PATH=/opt/tools/bin:$PATH
+mkdir -p /etc/ssl/certs && cp /opt/tools/ca-bundle.crt /etc/ssl/certs/ca-certificates.crt
+export SSL_CERT_FILE=/etc/ssl/certs/ca-certificates.crt
+NS=orch
+# running: count of `ctr tasks ls` lines marked RUNNING in namespace $NS.
+running(){ ctr -n $NS tasks ls 2>/dev/null | grep -c RUNNING; }
+# ctrs: count of containers (running or not) still listed in namespace $NS.
+ctrs(){ ctr -n $NS containers ls -q 2>/dev/null | grep -c . ; }
+# sd ARGS...: run orchd on the orchdi platform with a fixed state dir.
+sd(){ orchd --platform orchdi --state-dir /run/orchd "$@"; }
+# reset: start each test from an empty orchd state dir.
+reset(){ rm -rf /run/orchd; mkdir -p /run/orchd; }
+# restart_count: total "restart #" lines across all orchd logs. The logs are
+# concatenated first so the result is one sum, not one count per log file.
+restart_count(){ cat /run/orchd/logs/*.log 2>/dev/null | grep -c "restart #"; }
+
+# Start containerd in the background and poll up to 20 times for it to answer.
+mkdir -p /run/containerd /var/lib/containerd
+containerd >/var/log/containerd.log 2>&1 &
+for i in $(seq 1 20); do ctr version >/dev/null 2>&1 && break; sleep 1; done
+ctr version >/dev/null 2>&1 || { log "containerd FAILED"; exit 1; }
+log "containerd up"
+FAIL=0
+
+log "===== TEST 1: fan-out (6 containers) ====="
+reset
+: > /run/fan.Orchfile
+for n in a b c d e f; do
+  printf 'SERVICE %s\nFROM docker.io/library/alpine:latest\nCMD sleep 600\n' "$n" >> /run/fan.Orchfile
+done
+sd --orchfile /run/fan.Orchfile --runtime containerd grow >/dev/null 2>&1
+sleep 16
+r=$(running); log "running=$r (expect 6)"; [ "$r" = "6" ] || FAIL=1
+sd fell >/dev/null 2>&1; log "fell; settling 16s..."; sleep 16
+r=$(running); c=$(ctrs); log "after fell running=$r containers=$c (expect 0/0)"; { [ "$r" = "0" ] && [ "$c" = "0" ]; } || FAIL=1
+
+log "===== TEST 2: oneshot (exits, must NOT restart) ====="
+reset
+printf 'SERVICE once\nFROM docker.io/library/alpine:latest\nCMD true\nONESHOT true\n' > /run/once.Orchfile
+sd --orchfile /run/once.Orchfile --runtime containerd grow >/dev/null 2>&1
+sleep 10
+r=$(running); c=$(ctrs); log "running=$r containers=$c (expect 0/0 — ran once and cleaned up)"; { [ "$r" = "0" ] && [ "$c" = "0" ]; } || FAIL=1
+log "survey (oneshot should not be running):"; sd survey
+sd fell >/dev/null 2>&1; sleep 3
+
+log "===== TEST 3: crash/restart (RESTART on-failure) ====="
+reset
+printf 'SERVICE crash\nFROM docker.io/library/alpine:latest\nCMD false\nRESTART on-failure\nRESTART_DELAY 2s\n' > /run/crash.Orchfile
+sd --orchfile /run/crash.Orchfile --runtime containerd grow >/dev/null 2>&1
+sleep 20
+restarts=$(restart_count)
+log "restarts observed in ~20s: ${restarts:-0} (expect >= 2)"; [ "${restarts:-0}" -ge 2 ] || FAIL=1
+before=$(restart_count)
+sd fell >/dev/null 2>&1; log "fell; checking the restart loop stops..."; sleep 10
+after=$(restart_count)
+c=$(ctrs); log "after fell: restarts froze (${before:-0} -> ${after:-0}), containers=$c (expect frozen, 0)"
+{ [ "${before:-0}" = "${after:-0}" ] && [ "$c" = "0" ]; } || FAIL=1
+
+log "===== TEST 4: spec alignment (volume + env + memory cgroup honored) ====="
+reset
+mkdir -p /run/vol; echo "VOLUME-OK" > /run/vol/marker
+printf 'SERVICE sa\nFROM docker.io/library/alpine:latest\nCMD sleep 600\nMEMORY 64M\nENV FOO=bar\nVOLUME /run/vol:/mnt\n' > /run/sa.Orchfile
+sd --orchfile /run/sa.Orchfile --runtime containerd grow >/dev/null 2>&1
+sleep 8
+r=$(running); log "running=$r (container with MEMORY+VOLUME+ENV; expect 1)"; [ "$r" = "1" ] || FAIL=1
+vol=$(ctr -n $NS tasks exec --exec-id v orch-sa cat /mnt/marker 2>/dev/null | tr -d '\r')
+log "volume /mnt/marker = '$vol' (expect VOLUME-OK)"; [ "$vol" = "VOLUME-OK" ] || FAIL=1
+e=$(ctr -n $NS tasks exec --exec-id e orch-sa printenv FOO 2>/dev/null | tr -d '\r')
+log "env FOO = '$e' (expect bar)"; [ "$e" = "bar" ] || FAIL=1
+# Map the task's PID (as listed by ctr) to its cgroup path, then read the limit from cgroupfs.
+pid=$(ctr -n $NS tasks ls 2>/dev/null | grep orch-sa | awk '{print $2}')
+cg=$(awk -F: '{print $3}' /proc/"$pid"/cgroup 2>/dev/null)
+mem=$(cat /sys/fs/cgroup"$cg"/memory.max 2>/dev/null)
+log "cgroup memory.max = '$mem' (expect 67108864 = 64M)"; [ "$mem" = "67108864" ] || FAIL=1
+sd fell >/dev/null 2>&1; sleep 14
+
+log "RESULT: $([ $FAIL = 0 ] && echo PASS || echo FAIL)"
+sleep 3