Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions init.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,12 +39,18 @@ var initCommand = cli.Command{
Name: "init",
Usage: `initialize the namespaces and launch the process (do not call it outside of sysbox-runc)`,
Action: func(context *cli.Context) error {
initType := os.Getenv("_LIBCONTAINER_INITTYPE")

factory, _ := libcontainer.New("")
if err := factory.StartInitialization(); err != nil {
// as the error is sent back to the parent there is no need to log
// or write it to stderr because the parent process will handle this
os.Exit(1)
}

// initMount helpers return here without exec; that's normal.
if initType == "mount" {
os.Exit(0)
}

panic("libcontainer: container init failed to exec")
},
}
4 changes: 4 additions & 0 deletions libcontainer/configs/validate/validator.go
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,10 @@ func (v *ConfigValidator) timenamespace(config *configs.Config) error {
if _, err := os.Stat("/proc/self/timens_offsets"); os.IsNotExist(err) {
return errors.New("time namespaces aren't enabled in the kernel")
}
} else {
if config.TimeOffsets != nil {
return errors.New("time namespace offsets specified, but time namespace isn't enabled in the config")
}
}
return nil
}
Expand Down
6 changes: 3 additions & 3 deletions libcontainer/container_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -2374,7 +2374,7 @@ func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.Na
}

// Write boottime and monotonic time namespace offsets.
if c.config.Namespaces.Contains(configs.NEWTIME) && c.config.TimeOffsets != nil {
if c.config.TimeOffsets != nil {
var offsetSpec bytes.Buffer
for clock, offset := range c.config.TimeOffsets {
fmt.Fprintf(&offsetSpec, "%s %d %d\n", clock, offset.Secs, offset.Nanosecs)
Expand Down Expand Up @@ -2444,7 +2444,7 @@ func (c *linuxContainer) handleReqOp(childPid int, reqs []opReq) error {
op := reqs[0].Op

switch op {
case bind, chown, mkdir, overlay, rootfsIDMap, switchDockerDns:
case bind, chown, mkdir, overlay, rootfsIDMap, switchDockerDns, sysfs:
return c.handleOp(op, childPid, reqs)
default:
return newSystemError(fmt.Errorf("invalid opReq type %d", int(op)))
Expand Down Expand Up @@ -2490,7 +2490,7 @@ func (c *linuxContainer) handleOp(op opReqType, childPid int, reqs []opReq) erro
namespaces := []string{}

switch op {
case bind, chown, mkdir, overlay, rootfsIDMap:
case bind, chown, mkdir, overlay, rootfsIDMap, sysfs:
namespaces = append(namespaces,
fmt.Sprintf("mnt:/proc/%d/ns/mnt", childPid),
fmt.Sprintf("pid:/proc/%d/ns/pid", childPid),
Expand Down
6 changes: 6 additions & 0 deletions libcontainer/factory_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -402,6 +402,12 @@ func (l *LinuxFactory) StartInitialization() (err error) {
defer func() {
// We have an error during the initialization of the container's init,
// send it back to the parent process in the form of an initError.
// For initMount helpers, Init() returns nil without exec'ing;
// sending procError with a nil payload would cause the parent's
// parseSync to panic ("No error following JSON procError payload").
if err == nil {
return
}
if werr := utils.WriteJSON(pipe, syncT{procError}); werr != nil {
fmt.Fprintln(os.Stderr, err)
return
Expand Down
21 changes: 15 additions & 6 deletions libcontainer/nsenter/nsexec.c
Original file line number Diff line number Diff line change
Expand Up @@ -371,7 +371,7 @@ static void update_oom_score_adj(char *data, size_t len)
* our user-ns) don't match for the owner-permission check. Set the
* dumpable bit across the write and restore the previous value.
*/
static void update_timens(char *map, size_t map_len)
static void update_timens_offsets(char *map, size_t map_len)
{
if (map == NULL || map_len == 0)
return;
Expand Down Expand Up @@ -642,11 +642,20 @@ void join_namespaces(char *nslist)
} while ((namespace = strtok_r(NULL, ",", &saveptr)) != NULL);

/*
* The ordering in which we join namespaces is important. We should
* always join the user namespace *first*. This is all guaranteed
* from the container_linux.go side of this, so we're just going to
* follow the order given to us.
* The ordering in which we join namespaces is important. Upstream runc
* normally joins the user namespace first. Sysbox containers have a uid
* map that does not include host uid 0 though, so doing that first turns
* this process into overflow uid 65534 and drops the privilege needed to
* join the sandbox's network namespace on newer kernels.
*/
for (i = 0; i < num; i++) {
if (namespaces[i].ns == CLONE_NEWNET && i > 0) {
struct namespace_t netns = namespaces[i];
memmove(&namespaces[1], &namespaces[0], i * sizeof(struct namespace_t));
namespaces[0] = netns;
break;
}
}

for (i = 0; i < num; i++) {
struct namespace_t ns = namespaces[i];
Expand Down Expand Up @@ -1128,7 +1137,7 @@ void nsexec(void)
* thread exists yet in the new timens.
*/
if (unshared_timens)
update_timens(config.timensoffset, config.timensoffset_len);
update_timens_offsets(config.timensoffset, config.timensoffset_len);

/*
* TODO: What about non-namespace clone flags that we're dropping here?
Expand Down
32 changes: 32 additions & 0 deletions libcontainer/rootfs_init_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -545,6 +545,31 @@ func (l *linuxRootfsInit) Init() error {
}
}

case sysfs:
rootfs := l.reqs[0].Rootfs
m := &l.reqs[0].Mount

if err := unix.Chdir(rootfs); err != nil {
return newSystemErrorWithCausef(err, "chdir to rootfs %s", rootfs)
}

if err := libcontainerUtils.WithProcfd(".", m.Destination, func(procfd string) error {
return unix.Mount(m.Source, procfd, m.Device, uintptr(m.Flags), m.Data)
}); err != nil {
return newSystemErrorWithCausef(err, "sysfs mount through procfd to %s", m.Destination)
}

if err := libcontainerUtils.WithProcfd(".", m.Destination, func(procfd string) error {
for _, pflag := range m.PropagationFlags {
if err := unix.Mount("", procfd, "", uintptr(pflag), ""); err != nil {
return err
}
}
return nil
}); err != nil {
return newSystemErrorWithCausef(err, "change sysfs mount propagation through procfd to %s", m.Destination)
}

case switchDockerDns:
oldDns := l.reqs[0].OldDns
newDns := l.reqs[0].NewDns
Expand Down Expand Up @@ -582,6 +607,10 @@ func (l *linuxRootfsInit) Init() error {
rootfs := l.reqs[0].Rootfs

for _, req := range l.reqs {
if strings.Contains(req.Path, "/proc/sys/fs/binfmt_misc") {
continue
}

path, err := securejoin.SecureJoin(rootfs, req.Path)
if err != nil {
return newSystemErrorWithCausef(err, "secure join of %s and %s failed: %s", rootfs, req.Path, err)
Expand All @@ -603,6 +632,9 @@ func (l *linuxRootfsInit) Init() error {
if err != nil {
return newSystemErrorWithCausef(err, "secure join of %s and %s failed: %s", rootfs, req.Path, err)
}
if strings.Contains(path, "/proc/sys/fs/binfmt_misc") {
continue
}

mode := req.Mode
uid := req.Uid
Expand Down
33 changes: 32 additions & 1 deletion libcontainer/rootfs_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -396,6 +396,9 @@ func mkdirall(path string, mode os.FileMode, config *configs.Config, pipe io.Rea
}

if err := os.MkdirAll(path, mode); err != nil {
if errors.Is(err, unix.EOVERFLOW) && strings.Contains(path, "/proc/sys/fs/binfmt_misc") {
return nil
}

// In some cases the container's init process won't have
// permission to perform the mkdir (e.g., if the parent directory in the
Expand Down Expand Up @@ -454,6 +457,17 @@ func mountToRootfs(m *configs.Mount, config *configs.Config, enableCgroupns bool
if err := mkdirall(dest, 0755, config, pipe); err != nil {
return fmt.Errorf("failed to created dir for %s mount: %v", m.Device, err)
}
if m.Device == "sysfs" {
req := opReq{
Op: sysfs,
Rootfs: config.Rootfs,
Mount: *m,
}
if err := syncParentDoOp([]opReq{req}, pipe); err != nil {
return newSystemErrorWithCause(err, "syncing with parent runc to perform sysfs mount")
}
return nil
}
// Selinux kernels do not support labeling of /proc or /sys
return mountPropagate(m, ".", "")
case "mqueue":
Expand Down Expand Up @@ -1197,7 +1211,11 @@ func maskPaths(paths []string, mountLabel string) error {
// For example, net.ipv4.ip_forward is translated to /proc/sys/net/ipv4/ip_forward.
func writeSystemProperty(key, value string) error {
keyPath := strings.Replace(key, ".", "/", -1)
return ioutil.WriteFile(path.Join("/proc/sys", keyPath), []byte(value), 0644)
err := ioutil.WriteFile(path.Join("/proc/sys", keyPath), []byte(value), 0644)
if os.IsNotExist(err) && (key == "net.ipv4.ip_unprivileged_port_start" || key == "net.ipv4.ping_group_range") {
return nil
}
return err
}

func remount(m *configs.Mount) error {
Expand Down Expand Up @@ -1238,6 +1256,9 @@ func mountPropagate(m *configs.Mount, rootfs string, mountLabel string) error {
if err := utils.WithProcfd(rootfs, m.Destination, func(procfd string) error {
return unix.Mount(m.Source, procfd, m.Device, uintptr(flags), data)
}); err != nil {
if m.Device == "binfmt_misc" && os.IsNotExist(err) {
return nil
}
return fmt.Errorf("mount through procfd: %w", err)
}
// We have to apply mount propagation flags in a separate WithProcfd() call
Expand Down Expand Up @@ -1290,6 +1311,16 @@ func doMounts(config *configs.Config, pipe io.ReadWriter, doSysboxfsOvermountsOn
continue
}

// Skip binfmt_misc overmounts on top of sysbox-fs: on containerd 2.x
// with kernel 6.8+, the binfmt_misc mount target (/proc/sys/fs/binfmt_misc)
// does not exist under the sysbox-fs emulated /proc/sys tree and cannot
// be created there. binfmt_misc registration is a host-level kernel
// feature that is not functional inside sysbox containers, so skipping
// it is safe.
if doSysboxfsOvermountsOnly && m.Device == "binfmt_misc" {
continue
}

if m.Device != "bind" {
if err := mountToRootfs(m, config, true, pipe); err != nil {
return newSystemErrorWithCausef(err, "mounting %q to rootfs %q at %q; mount = %+v",
Expand Down
4 changes: 1 addition & 3 deletions libcontainer/specconv/spec_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -438,6 +438,7 @@ func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) {
config.ReadonlyPaths = spec.Linux.ReadonlyPaths
config.MountLabel = spec.Linux.MountLabel
config.Sysctl = spec.Linux.Sysctl
config.TimeOffsets = spec.Linux.TimeOffsets
if spec.Linux.Seccomp != nil {
seccomp, err := SetupSeccomp(spec.Linux.Seccomp)
if err != nil {
Expand All @@ -454,9 +455,6 @@ func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) {
config.IntelRdt.MemBwSchema = spec.Linux.IntelRdt.MemBwSchema
}
}

// Propagate time namespace clock offsets.
config.TimeOffsets = spec.Linux.TimeOffsets
}
if spec.Process != nil {
config.OomScoreAdj = spec.Process.OOMScoreAdj
Expand Down
1 change: 1 addition & 0 deletions libcontainer/standard_init_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ const (
mkdir
rootfsIDMap
overlay
sysfs
)

type opReq struct {
Expand Down
9 changes: 8 additions & 1 deletion libsysbox/syscont/spec.go
Original file line number Diff line number Diff line change
Expand Up @@ -287,7 +287,14 @@ func cfgNamespaces(sysMgr *sysbox.Mgr, spec *specs.Spec) error {

// user-ns and cgroup-ns are not required per the OCI spec, but we will add
// them to the system container spec.
var allNs = []string{"pid", "ipc", "uts", "mount", "network", "user", "cgroup"}
//
// time-ns is also added by default (with no offsets relative to the host)
// so that the container gets its own time namespace owned by the
// container's user-ns. Without this, tools inside the container that
// setns() into the host time-ns (e.g. `nsenter -a` from newer util-linux)
// fail with EPERM, since the host time-ns is owned by init_user_ns. If
// the OCI spec already specifies a time namespace, it is honored as-is.
var allNs = []string{"pid", "ipc", "uts", "mount", "network", "user", "cgroup", "time"}
var reqNs = []string{"pid", "ipc", "uts", "mount", "network"}

allNsSet := mapset.NewSet[string]()
Expand Down