diff --git a/init.go b/init.go index 335f80a7..ffd20f78 100644 --- a/init.go +++ b/init.go @@ -39,12 +39,18 @@ var initCommand = cli.Command{ Name: "init", Usage: `initialize the namespaces and launch the process (do not call it outside of sysbox-runc)`, Action: func(context *cli.Context) error { + initType := os.Getenv("_LIBCONTAINER_INITTYPE") + factory, _ := libcontainer.New("") if err := factory.StartInitialization(); err != nil { - // as the error is sent back to the parent there is no need to log - // or write it to stderr because the parent process will handle this os.Exit(1) } + + // initMount helpers return here without exec; that's normal. + if initType == "mount" { + os.Exit(0) + } + panic("libcontainer: container init failed to exec") }, } diff --git a/libcontainer/configs/validate/validator.go b/libcontainer/configs/validate/validator.go index 38309325..8cb61bab 100644 --- a/libcontainer/configs/validate/validator.go +++ b/libcontainer/configs/validate/validator.go @@ -141,6 +141,10 @@ func (v *ConfigValidator) timenamespace(config *configs.Config) error { if _, err := os.Stat("/proc/self/timens_offsets"); os.IsNotExist(err) { return errors.New("time namespaces aren't enabled in the kernel") } + } else { + if config.TimeOffsets != nil { + return errors.New("time namespace offsets specified, but time namespace isn't enabled in the config") + } } return nil } diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index bc759fa0..5474f51c 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -2374,7 +2374,7 @@ func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.Na } // Write boottime and monotonic time namespace offsets. - if c.config.Namespaces.Contains(configs.NEWTIME) && c.config.TimeOffsets != nil { + if c.config.TimeOffsets != nil { var offsetSpec bytes.Buffer for clock, offset := range c.config.TimeOffsets { fmt.Fprintf(&offsetSpec, "%s %d %d\n", clock, offset.Secs, offset.Nanosecs) @@ -2444,7 +2444,7 @@ func (c *linuxContainer) handleReqOp(childPid int, reqs []opReq) error { op := reqs[0].Op switch op { - case bind, chown, mkdir, overlay, rootfsIDMap, switchDockerDns: + case bind, chown, mkdir, overlay, rootfsIDMap, switchDockerDns, sysfs: return c.handleOp(op, childPid, reqs) default: return newSystemError(fmt.Errorf("invalid opReq type %d", int(op))) @@ -2490,7 +2490,7 @@ func (c *linuxContainer) handleOp(op opReqType, childPid int, reqs []opReq) erro namespaces := []string{} switch op { - case bind, chown, mkdir, overlay, rootfsIDMap: + case bind, chown, mkdir, overlay, rootfsIDMap, sysfs: namespaces = append(namespaces, fmt.Sprintf("mnt:/proc/%d/ns/mnt", childPid), fmt.Sprintf("pid:/proc/%d/ns/pid", childPid), diff --git a/libcontainer/factory_linux.go b/libcontainer/factory_linux.go index 8e5e1f9e..57195bc2 100644 --- a/libcontainer/factory_linux.go +++ b/libcontainer/factory_linux.go @@ -402,6 +402,12 @@ func (l *LinuxFactory) StartInitialization() (err error) { defer func() { // We have an error during the initialization of the container's init, // send it back to the parent process in the form of an initError. + // For initMount helpers, Init() returns nil without exec'ing; + // sending procError with a nil payload would cause the parent's + // parseSync to panic ("No error following JSON procError payload"). + if err == nil { + return + } if werr := utils.WriteJSON(pipe, syncT{procError}); werr != nil { fmt.Fprintln(os.Stderr, err) return diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c index 1e7bf795..f77294c2 100644 --- a/libcontainer/nsenter/nsexec.c +++ b/libcontainer/nsenter/nsexec.c @@ -371,7 +371,7 @@ static void update_oom_score_adj(char *data, size_t len) * our user-ns) don't match for the owner-permission check. Set the * dumpable bit across the write and restore the previous value. */ -static void update_timens(char *map, size_t map_len) +static void update_timens_offsets(char *map, size_t map_len) { if (map == NULL || map_len == 0) return; @@ -642,11 +642,20 @@ void join_namespaces(char *nslist) } while ((namespace = strtok_r(NULL, ",", &saveptr)) != NULL); /* - * The ordering in which we join namespaces is important. We should - * always join the user namespace *first*. This is all guaranteed - * from the container_linux.go side of this, so we're just going to - * follow the order given to us. + * The ordering in which we join namespaces is important. Upstream runc + * normally joins the user namespace first. Sysbox containers have a uid + * map that does not include host uid 0 though, so doing that first turns + * this process into overflow uid 65534 and drops the privilege needed to + * join the sandbox's network namespace on newer kernels. */ + for (i = 0; i < num; i++) { + if (namespaces[i].ns == CLONE_NEWNET && i > 0) { + struct namespace_t netns = namespaces[i]; + memmove(&namespaces[1], &namespaces[0], i * sizeof(struct namespace_t)); + namespaces[0] = netns; + break; + } + } for (i = 0; i < num; i++) { struct namespace_t ns = namespaces[i]; @@ -1128,7 +1137,7 @@ void nsexec(void) * thread exists yet in the new timens. */ if (unshared_timens) - update_timens(config.timensoffset, config.timensoffset_len); + update_timens_offsets(config.timensoffset, config.timensoffset_len); /* * TODO: What about non-namespace clone flags that we're dropping here? diff --git a/libcontainer/rootfs_init_linux.go b/libcontainer/rootfs_init_linux.go index cf55bc0d..be889af0 100644 --- a/libcontainer/rootfs_init_linux.go +++ b/libcontainer/rootfs_init_linux.go @@ -545,6 +545,31 @@ func (l *linuxRootfsInit) Init() error { } } + case sysfs: + rootfs := l.reqs[0].Rootfs + m := &l.reqs[0].Mount + + if err := unix.Chdir(rootfs); err != nil { + return newSystemErrorWithCausef(err, "chdir to rootfs %s", rootfs) + } + + if err := libcontainerUtils.WithProcfd(".", m.Destination, func(procfd string) error { + return unix.Mount(m.Source, procfd, m.Device, uintptr(m.Flags), m.Data) + }); err != nil { + return newSystemErrorWithCausef(err, "sysfs mount through procfd to %s", m.Destination) + } + + if err := libcontainerUtils.WithProcfd(".", m.Destination, func(procfd string) error { + for _, pflag := range m.PropagationFlags { + if err := unix.Mount("", procfd, "", uintptr(pflag), ""); err != nil { + return err + } + } + return nil + }); err != nil { + return newSystemErrorWithCausef(err, "change sysfs mount propagation through procfd to %s", m.Destination) + } + case switchDockerDns: oldDns := l.reqs[0].OldDns newDns := l.reqs[0].NewDns @@ -582,6 +607,10 @@ func (l *linuxRootfsInit) Init() error { rootfs := l.reqs[0].Rootfs for _, req := range l.reqs { + if strings.Contains(req.Path, "/proc/sys/fs/binfmt_misc") { + continue + } + path, err := securejoin.SecureJoin(rootfs, req.Path) if err != nil { return newSystemErrorWithCausef(err, "secure join of %s and %s failed: %s", rootfs, req.Path, err) @@ -603,6 +632,9 @@ func (l *linuxRootfsInit) Init() error { if err != nil { return newSystemErrorWithCausef(err, "secure join of %s and %s failed: %s", rootfs, req.Path, err) } + if strings.Contains(path, "/proc/sys/fs/binfmt_misc") { + continue + } mode := req.Mode uid := req.Uid diff --git a/libcontainer/rootfs_linux.go b/libcontainer/rootfs_linux.go index a630ae0f..30f73c32 100644 --- a/libcontainer/rootfs_linux.go +++ b/libcontainer/rootfs_linux.go @@ -396,6 +396,9 @@ func mkdirall(path string, mode os.FileMode, config *configs.Config, pipe io.Rea } if err := os.MkdirAll(path, mode); err != nil { + if errors.Is(err, unix.EOVERFLOW) && strings.Contains(path, "/proc/sys/fs/binfmt_misc") { + return nil + } // In some cases the container's init process process won't have // permission to perform the mkdir (e.g., if the parent directory in the @@ -454,6 +457,17 @@ func mountToRootfs(m *configs.Mount, config *configs.Config, enableCgroupns bool if err := mkdirall(dest, 0755, config, pipe); err != nil { return fmt.Errorf("failed to created dir for %s mount: %v", m.Device, err) } + if m.Device == "sysfs" { + req := opReq{ + Op: sysfs, + Rootfs: config.Rootfs, + Mount: *m, + } + if err := syncParentDoOp([]opReq{req}, pipe); err != nil { + return newSystemErrorWithCause(err, "syncing with parent runc to perform sysfs mount") + } + return nil + } // Selinux kernels do not support labeling of /proc or /sys return mountPropagate(m, ".", "") case "mqueue": @@ -1197,7 +1211,11 @@ func maskPaths(paths []string, mountLabel string) error { // For e.g. net.ipv4.ip_forward translated to /proc/sys/net/ipv4/ip_forward. func writeSystemProperty(key, value string) error { keyPath := strings.Replace(key, ".", "/", -1) - return ioutil.WriteFile(path.Join("/proc/sys", keyPath), []byte(value), 0644) + err := ioutil.WriteFile(path.Join("/proc/sys", keyPath), []byte(value), 0644) + if os.IsNotExist(err) && (key == "net.ipv4.ip_unprivileged_port_start" || key == "net.ipv4.ping_group_range") { + return nil + } + return err } func remount(m *configs.Mount) error { @@ -1238,6 +1256,9 @@ func mountPropagate(m *configs.Mount, rootfs string, mountLabel string) error { if err := utils.WithProcfd(rootfs, m.Destination, func(procfd string) error { return unix.Mount(m.Source, procfd, m.Device, uintptr(flags), data) }); err != nil { + if m.Device == "binfmt_misc" && os.IsNotExist(err) { + return nil + } return fmt.Errorf("mount through procfd: %w", err) } // We have to apply mount propagation flags in a separate WithProcfd() call @@ -1290,6 +1311,16 @@ func doMounts(config *configs.Config, pipe io.ReadWriter, doSysboxfsOvermountsOn continue } + // Skip binfmt_misc overmounts on top of sysbox-fs: on containerd 2.x + // with kernel 6.8+, the binfmt_misc mount target (/proc/sys/fs/binfmt_misc) + // does not exist under the sysbox-fs emulated /proc/sys tree and cannot + // be created there. binfmt_misc registration is a host-level kernel + // feature that is not functional inside sysbox containers, so skipping + // it is safe. + if doSysboxfsOvermountsOnly && m.Device == "binfmt_misc" { + continue + } + if m.Device != "bind" { if err := mountToRootfs(m, config, true, pipe); err != nil { return newSystemErrorWithCausef(err, "mounting %q to rootfs %q at %q; mount = %+v", diff --git a/libcontainer/specconv/spec_linux.go b/libcontainer/specconv/spec_linux.go index e19dc62b..4074960b 100644 --- a/libcontainer/specconv/spec_linux.go +++ b/libcontainer/specconv/spec_linux.go @@ -438,6 +438,7 @@ func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) { config.ReadonlyPaths = spec.Linux.ReadonlyPaths config.MountLabel = spec.Linux.MountLabel config.Sysctl = spec.Linux.Sysctl + config.TimeOffsets = spec.Linux.TimeOffsets if spec.Linux.Seccomp != nil { seccomp, err := SetupSeccomp(spec.Linux.Seccomp) if err != nil { @@ -454,9 +455,6 @@ func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) { config.IntelRdt.MemBwSchema = spec.Linux.IntelRdt.MemBwSchema } } - - // Propagate time namespace clock offsets. - config.TimeOffsets = spec.Linux.TimeOffsets } if spec.Process != nil { config.OomScoreAdj = spec.Process.OOMScoreAdj diff --git a/libcontainer/standard_init_linux.go b/libcontainer/standard_init_linux.go index 865aa819..6c0ecb14 100644 --- a/libcontainer/standard_init_linux.go +++ b/libcontainer/standard_init_linux.go @@ -38,6 +38,7 @@ const ( mkdir rootfsIDMap overlay + sysfs ) type opReq struct { diff --git a/libsysbox/syscont/spec.go b/libsysbox/syscont/spec.go index 9b4dc133..f64ca319 100644 --- a/libsysbox/syscont/spec.go +++ b/libsysbox/syscont/spec.go @@ -287,7 +287,14 @@ func cfgNamespaces(sysMgr *sysbox.Mgr, spec *specs.Spec) error { // user-ns and cgroup-ns are not required per the OCI spec, but we will add // them to the system container spec. - var allNs = []string{"pid", "ipc", "uts", "mount", "network", "user", "cgroup"} + // + // time-ns is also added by default (with no offsets relative to the host) + // so that the container gets its own time namespace owned by the + // container's user-ns. Without this, tools inside the container that + // setns() into the host time-ns (e.g. `nsenter -a` from newer util-linux) + // fail with EPERM, since the host time-ns is owned by init_user_ns. If + // the OCI spec already specifies a time namespace, it is honored as-is. + var allNs = []string{"pid", "ipc", "uts", "mount", "network", "user", "cgroup", "time"} var reqNs = []string{"pid", "ipc", "uts", "mount", "network"} allNsSet := mapset.NewSet[string]()