From e39e0afac76d47521dd6ad665ea367736e78e84b Mon Sep 17 00:00:00 2001 From: Colin Walters Date: Tue, 2 Jun 2026 18:33:42 -0400 Subject: [PATCH 1/2] =?UTF-8?q?fuse:=20Update=20fuser=20dependency=200.15.?= =?UTF-8?q?1=20=E2=86=92=200.17.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fuser 0.17 is needed to support multithreaded FUSE sessions: the new API requires `Filesystem: Send + Sync + 'static`, which forces proper Arc-based ownership of the filesystem state and makes it possible to safely hand the implementation to multiple worker threads. The breaking API changes and how they are addressed: - `&self` instead of `&mut self` on all trait methods: the only mutable state (open file handles) is now protected by a Mutex. - New newtypes (INodeNo, FileHandle, LockOwner, Generation) and bitflags (OpenFlags, FopenFlags) — updated at call sites. - readdir/read offsets changed from i64 to u64. - Session::from_fd now takes SessionACL + Config separately. - Session::run() is no longer public; replaced by spawn().join(). - reply.error() takes fuser::Errno instead of raw i32. To satisfy the `'static` bound, serve_tree_fuse() now takes `Arc` and `Arc`. A pre-built flat Vec (indexed by ino-1) replaces the old HashMap>, removing the lifetime that was incompatible with `'static`. An InodeLookup index (path→ino for dirs, LeafId→ino for leaves) handles child ino resolution without raw pointers. Assisted-by: OpenCode (claude-sonnet-4-6) Signed-off-by: Colin Walters --- crates/composefs-fuse/Cargo.toml | 2 +- crates/composefs-fuse/src/lib.rs | 813 +++++++++++++++++++------------ 2 files changed, 506 insertions(+), 309 deletions(-) diff --git a/crates/composefs-fuse/Cargo.toml b/crates/composefs-fuse/Cargo.toml index 05f6fcfe..c62929e6 100644 --- a/crates/composefs-fuse/Cargo.toml +++ b/crates/composefs-fuse/Cargo.toml @@ -13,6 +13,6 @@ version.workspace = true [dependencies] anyhow = { version = "1.0.98", default-features = false } composefs = { workspace = true } -fuser = { version = "0.15.1", default-features = false, features = ["abi-7-31"] } +fuser = { version = "0.17.0", default-features = false } log = { version = "0.4.8", default-features = false } rustix = { version = "1.0.0", default-features = false, features = ["fs", "mount"] } diff --git a/crates/composefs-fuse/src/lib.rs b/crates/composefs-fuse/src/lib.rs index 20d108f3..773119c1 100644 --- a/crates/composefs-fuse/src/lib.rs +++ b/crates/composefs-fuse/src/lib.rs @@ -13,18 +13,19 @@ use std::{ fd::{AsFd, AsRawFd, OwnedFd}, unix::ffi::OsStrExt, }, + sync::{Arc, Mutex}, time::{Duration, SystemTime}, }; use anyhow::Context; use fuser::{ - FileAttr, FileType, Filesystem, ReplyAttr, ReplyData, ReplyDirectory, ReplyEntry, ReplyOpen, - Request, Session, SessionACL, + Config, FileAttr, FileHandle, FileType, Filesystem, FopenFlags, Generation, INodeNo, OpenFlags, + ReplyAttr, ReplyData, ReplyDirectory, ReplyEntry, ReplyOpen, Request, Session, SessionACL, }; use rustix::{ buffer::spare_capacity, fs::{Mode, OFlags, open}, - io::{Errno, pread}, + io::pread, mount::{ FsMountFlags, MountAttrFlags, fsconfig_create, fsconfig_set_flag, fsconfig_set_string, fsmount, @@ -48,279 +49,456 @@ const TTL: Duration = Duration::from_secs(1_000_000); /// concern and not exposed in the public API. type Ino = u64; -/// Precomputed inode number assignments for the entire filesystem tree. +/// Pre-built static data for one inode, computed at mount time. /// -/// Directories are identified by pointer (stable because the tree is -/// borrowed immutably for the lifetime of the FUSE session). Leaves -/// are identified by `LeafId`. -#[derive(Debug)] -struct InodeMap { - /// Directory pointer → inode number. - dir_inos: HashMap<*const Directory, Ino>, - /// LeafId → inode number. Indexed by `LeafId.0`. - /// Hardlinked leaves (same `LeafId`) naturally get the same ino. - leaf_inos: Vec, +/// Indexed by `(ino - 1)` for O(1) attribute lookup. Directory inodes +/// store their path from the root so we can resolve them via +/// [`Directory::get_directory`] without raw pointers. +#[derive(Debug, Clone)] +enum InodeData { + /// A directory inode. + Dir { + /// Path from filesystem root (empty bytes for the root itself). + path: Box, + /// Inode number of the parent directory. + parent_ino: Ino, + /// Pre-computed file attributes. + attrs: FileAttr, + }, + /// A leaf (regular file, symlink, device, etc.) inode. + Leaf { + /// Index into the filesystem's leaf table. + leaf_id: LeafId, + /// Pre-computed file attributes. + attrs: FileAttr, + }, } -impl InodeMap { - /// Walk the tree and assign sequential inode numbers. - fn build(fs: &FileSystem) -> Self { - let mut next_ino: Ino = 1; // root = 1 - let mut dir_inos = HashMap::new(); - let mut leaf_inos = vec![0u64; fs.leaves.len()]; - - fn walk( - dir: &Directory, - next_ino: &mut Ino, - dir_inos: &mut HashMap<*const Directory, Ino>, - leaf_inos: &mut [Ino], - ) { - let ino = *next_ino; - *next_ino += 1; - dir_inos.insert(dir as *const _, ino); - - for (_, inode) in dir.entries() { - match inode { - Inode::Directory(subdir) => walk(subdir, next_ino, dir_inos, leaf_inos), - Inode::Leaf(id, _) => { - if leaf_inos[id.0] == 0 { - leaf_inos[id.0] = *next_ino; - *next_ino += 1; - } - // Hardlinks: same LeafId keeps the same ino. - } - } - } - } - - walk(&fs.root, &mut next_ino, &mut dir_inos, &mut leaf_inos); - InodeMap { - dir_inos, - leaf_inos, +impl InodeData { + /// Return the pre-computed [`FileAttr`] for this inode. + fn attrs(&self) -> &FileAttr { + match self { + InodeData::Dir { attrs, .. } | InodeData::Leaf { attrs, .. } => attrs, } } +} + +/// A lookup table mapping directory children to their inode numbers. +/// +/// Built once at mount time from the full DFS walk. Directories are keyed by +/// their path (as a `Box`) and leaves by their `LeafId`. +/// This is used during `lookup` and `readdir` to map a child inode found by +/// tree traversal back to its assigned inode number. +#[derive(Debug)] +struct InodeLookup { + /// Maps a directory's root-relative path to its inode number. + dir_inos: HashMap, Ino>, + /// Maps `LeafId.0` to its inode number. Hardlinks share the same ino. + leaf_inos: Vec, +} - fn dir_ino(&self, dir: &Directory) -> Ino { - self.dir_inos[&(dir as *const _)] +impl InodeLookup { + fn dir_ino(&self, path: &OsStr) -> Option { + self.dir_inos.get(path).copied() } fn leaf_ino(&self, id: LeafId) -> Ino { self.leaf_inos[id.0] } +} - fn inode_ino(&self, inode: &Inode) -> Ino { - match inode { - Inode::Directory(dir) => self.dir_ino(dir), - Inode::Leaf(id, _) => self.leaf_ino(*id), - } +/// Helpers to compute attributes from the composefs tree types. +fn leaf_kind(leaf: &Leaf) -> FileType { + match leaf.content { + LeafContent::BlockDevice(..) => FileType::BlockDevice, + LeafContent::CharacterDevice(..) => FileType::CharDevice, + LeafContent::Fifo => FileType::NamedPipe, + LeafContent::Regular(..) => FileType::RegularFile, + LeafContent::Socket => FileType::Socket, + LeafContent::Symlink(..) => FileType::Symlink, } } -/// A reference to a filesystem node, used for FUSE inode lookup. -#[derive(Debug, Clone)] -enum InodeRef<'a, ObjectID: FsVerityHashValue> { - Directory(&'a Directory, Ino), - Leaf(LeafId, &'a Leaf), +fn leaf_rdev(leaf: &Leaf) -> u32 { + match &leaf.content { + LeafContent::BlockDevice(rdev) | LeafContent::CharacterDevice(rdev) => *rdev as u32, + _ => 0, + } } -impl<'a, ObjectID: FsVerityHashValue> InodeRef<'a, ObjectID> { - fn nlink(&self, nlink_map: &[u32]) -> u32 { - (match self { - InodeRef::Directory(dir, ..) => { - 2 + dir - .inodes() - .filter(|i| matches!(i, Inode::Directory(..))) - .count() - } - InodeRef::Leaf(leaf_id, _) => nlink_map[leaf_id.0] as usize, - }) as u32 +fn leaf_size(leaf: &Leaf) -> u64 { + match &leaf.content { + LeafContent::Regular(RegularFile::Inline(data)) => data.len() as u64, + LeafContent::Regular(RegularFile::External(.., size)) => *size, + _ => 0, } +} - fn rdev(&self) -> u32 { - (match self { - InodeRef::Directory(..) => 0, - InodeRef::Leaf(_, leaf) => match &leaf.content { - LeafContent::BlockDevice(rdev) | LeafContent::CharacterDevice(rdev) => *rdev, - _ => 0, - }, - }) as u32 - } +fn stat_mtime(stat: &Stat) -> SystemTime { + SystemTime::UNIX_EPOCH + Duration::from_secs(stat.st_mtim_sec as u64) +} - fn kind(&self) -> FileType { - match self { - InodeRef::Directory(..) => FileType::Directory, - InodeRef::Leaf(_, leaf) => match leaf.content { - LeafContent::BlockDevice(..) => FileType::BlockDevice, - LeafContent::CharacterDevice(..) => FileType::CharDevice, - LeafContent::Fifo => FileType::NamedPipe, - LeafContent::Regular(..) => FileType::RegularFile, - LeafContent::Socket => FileType::Socket, - LeafContent::Symlink(..) => FileType::Symlink, - }, - } +fn dir_fileattr(dir: &Directory, ino: Ino, nlinks: u32) -> FileAttr { + let mtime = stat_mtime(&dir.stat); + FileAttr { + ino: INodeNo(ino), + size: 0, + blocks: 1, + atime: mtime, + mtime, + ctime: mtime, + crtime: mtime, + kind: FileType::Directory, + perm: dir.stat.st_mode as u16, + nlink: nlinks, + uid: dir.stat.st_uid, + gid: dir.stat.st_gid, + rdev: 0, + blksize: 4096, + flags: 0, } +} - fn stat(&self) -> &'a Stat { - match self { - InodeRef::Directory(dir, ..) => &dir.stat, - InodeRef::Leaf(_, leaf) => &leaf.stat, - } +fn leaf_fileattr(leaf: &Leaf, ino: Ino, nlink: u32) -> FileAttr { + let mtime = stat_mtime(&leaf.stat); + FileAttr { + ino: INodeNo(ino), + size: leaf_size(leaf), + blocks: 1, + atime: mtime, + mtime, + ctime: mtime, + crtime: mtime, + kind: leaf_kind(leaf), + perm: leaf.stat.st_mode as u16, + nlink, + uid: leaf.stat.st_uid, + gid: leaf.stat.st_gid, + rdev: leaf_rdev(leaf), + blksize: 4096, + flags: 0, } +} - fn size(&self) -> u64 { - match self { - InodeRef::Directory(..) => 0, - InodeRef::Leaf(_, leaf) => match &leaf.content { - LeafContent::Regular(RegularFile::Inline(data)) => data.len() as u64, - LeafContent::Regular(RegularFile::External(.., size)) => *size, - _ => 0, - }, - } - } +/// Result of [`build_inode_table`]: the pre-built table plus the lookup index. +struct InodeTable { + /// Flat vector indexed by `(ino - 1)`. + data: Vec, + /// Lookup index: path → ino for dirs, leaf_id → ino for leaves. + lookup: InodeLookup, +} - fn fileattr(&self, ino: Ino, nlink_map: &[u32]) -> FileAttr { - let stat = self.stat(); - let mtime = SystemTime::UNIX_EPOCH + Duration::from_secs(stat.st_mtim_sec as u64); +/// Mutable accumulator used during the DFS walk in [`build_inode_table`]. +struct InodeWalker<'a, O: FsVerityHashValue> { + next_ino: Ino, + dir_inos: HashMap, Ino>, + leaf_inos: Vec, + entries: Vec<(Ino, InodeData)>, + nlink_map: &'a [u32], + leaves: &'a [Leaf], +} - FileAttr { +impl InodeWalker<'_, O> { + fn walk(&mut self, dir: &Directory, path: &OsStr, parent_ino: Ino, ino: Ino) { + self.dir_inos.insert(path.into(), ino); + let nlinks = 2 + dir + .inodes() + .filter(|i| matches!(i, Inode::Directory(..))) + .count() as u32; + let attrs = dir_fileattr(dir, ino, nlinks); + self.entries.push(( ino, - size: self.size(), - blocks: 1, - atime: mtime, - mtime, - ctime: mtime, - crtime: mtime, - kind: self.kind(), - perm: stat.st_mode as u16, - nlink: self.nlink(nlink_map), - uid: stat.st_uid, - gid: stat.st_gid, - rdev: self.rdev(), - blksize: 4096, - flags: 0, + InodeData::Dir { + path: path.into(), + parent_ino, + attrs, + }, + )); + + for (name, inode) in dir.entries() { + match inode { + Inode::Directory(subdir) => { + self.next_ino += 1; + let child_ino = self.next_ino; + let child_path = child_path_from(path, name); + self.walk(subdir, &child_path, ino, child_ino); + } + Inode::Leaf(leaf_id, _) => { + if self.leaf_inos[leaf_id.0] == 0 { + self.next_ino += 1; + let leaf_ino = self.next_ino; + self.leaf_inos[leaf_id.0] = leaf_ino; + let leaf = &self.leaves[leaf_id.0]; + let nlink = self.nlink_map[leaf_id.0]; + let attrs = leaf_fileattr(leaf, leaf_ino, nlink); + self.entries.push(( + leaf_ino, + InodeData::Leaf { + leaf_id: *leaf_id, + attrs, + }, + )); + } + // Hardlinks: same LeafId → same ino, no new entry needed. + } + } } } } +/// Build the flat inode table and lookup index from the filesystem tree. +/// +/// The table is indexed by `(ino - 1)` so index 0 = inode 1 (root). +/// Sequential inode numbers are assigned via DFS, matching what the kernel +/// expects for a stable, mountable filesystem. +fn build_inode_table(fs: &FileSystem) -> InodeTable { + let nlink_map = fs.nlinks(); + let root_ino: Ino = 1; + let mut walker = InodeWalker { + next_ino: root_ino, + dir_inos: HashMap::new(), + leaf_inos: vec![0; fs.leaves.len()], + entries: Vec::new(), + nlink_map: &nlink_map, + leaves: &fs.leaves, + }; + walker.walk(&fs.root, OsStr::new(""), root_ino, root_ino); + + let InodeWalker { + dir_inos, + leaf_inos, + mut entries, + .. + } = walker; + + // Sort by ino (ascending) and build the flat Vec indexed by (ino - 1). + entries.sort_unstable_by_key(|(ino, _)| *ino); + let max_ino = entries.last().map(|(ino, _)| *ino).unwrap_or(1); + let mut data: Vec> = vec![None; max_ino as usize]; + for (ino, entry) in entries { + data[(ino as usize) - 1] = Some(entry); + } + let data: Vec = data + .into_iter() + .enumerate() + .map(|(i, opt)| opt.unwrap_or_else(|| panic!("inode table slot {i} was never filled"))) + .collect(); + + InodeTable { + data, + lookup: InodeLookup { + dir_inos, + leaf_inos, + }, + } +} + +/// An open file handle: either a real fd (for external objects) or inline data. #[derive(Debug)] enum OpenHandle { Fd(OwnedFd), Data(Box<[u8]>), } -#[derive(Debug)] -struct TreeFuse<'a, ObjectID: FsVerityHashValue> { - repo: &'a Repository, - fs: &'a FileSystem, - inode_map: InodeMap, - nlink_map: Vec, - inodes: HashMap>, - attrs: HashMap, +/// Mutable runtime state: only tracks open file handles. +#[derive(Debug, Default)] +struct FuseHandles { handles: HashMap, next_fh: u64, } -impl<'a, ObjectID: FsVerityHashValue> TreeFuse<'a, ObjectID> { - fn register_inode(&mut self, inode: &'a Inode, parent: Ino) -> (Ino, FileType) { - let ino = self.inode_map.inode_ino(inode); - let iref = match inode { - Inode::Directory(dir) => InodeRef::Directory(dir, parent), - Inode::Leaf(leaf_id, _) => InodeRef::Leaf(*leaf_id, self.fs.leaf(*leaf_id)), +/// The main FUSE filesystem implementation. +/// +/// Holds the composefs repository and tree by `Arc`, plus a pre-built inode +/// table (built at mount time). The only mutable state is the open-file-handle +/// map, protected by a `Mutex` to satisfy `Filesystem: Send + Sync + 'static`. +#[derive(Debug)] +struct TreeFuse { + repo: Arc>, + fs: Arc>, + /// Pre-built, static inode data indexed by `(ino - 1)`. + inode_data: Vec, + /// Lookup index for resolving child inode numbers. + lookup: InodeLookup, + /// Mutable handle state, protected for thread safety. + handles: Mutex, +} + +impl TreeFuse { + fn get_data(&self, ino: Ino) -> Option<&InodeData> { + let idx = (ino as usize).checked_sub(1)?; + self.inode_data.get(idx) + } + + /// Resolve a directory inode to a `&Directory` by walking the stored path. + fn resolve_dir(&self, ino: Ino) -> Option<&Directory> { + let InodeData::Dir { path, .. } = self.get_data(ino)? else { + return None; + }; + if path.is_empty() { + Some(&self.fs.root) + } else { + self.fs.root.get_directory(path).ok() + } + } + + /// Resolve a leaf inode to its `&Leaf`. + fn resolve_leaf(&self, ino: Ino) -> Option<&Leaf> { + let InodeData::Leaf { leaf_id, .. } = self.get_data(ino)? else { + return None; }; - let kind = iref.kind(); - self.attrs.insert(ino, iref.fileattr(ino, &self.nlink_map)); - self.inodes.insert(ino, iref); - (ino, kind) + Some(self.fs.leaf(*leaf_id)) + } + + /// Resolve the [`Stat`] for an inode. + /// + /// Returns `None` if the inode doesn't exist, `Some(Err(()))` on an internal + /// path resolution error, and `Some(Ok(&Stat))` on success. + fn resolve_stat(&self, ino: Ino) -> Option> { + match self.get_data(ino)? { + InodeData::Dir { path, .. } => { + let dir = if path.is_empty() { + &self.fs.root + } else { + self.fs.root.get_directory(path).ok()? + }; + Some(Ok(&dir.stat)) + } + InodeData::Leaf { leaf_id, .. } => Some(Ok(&self.fs.leaf(*leaf_id).stat)), + } + } + + /// Given a child `Inode` (from a directory lookup), return its ino number. + fn child_ino(&self, inode: &Inode, child_path: &OsStr) -> Option { + match inode { + Inode::Directory(_) => self.lookup.dir_ino(child_path), + Inode::Leaf(id, _) => Some(self.lookup.leaf_ino(*id)), + } } } -impl Filesystem for TreeFuse<'_, ObjectID> { - fn statfs(&mut self, _req: &Request<'_>, _ino: u64, reply: fuser::ReplyStatfs) { +impl Filesystem for TreeFuse { + fn statfs(&self, _req: &Request, _ino: INodeNo, reply: fuser::ReplyStatfs) { reply.statfs(0, 0, 0, 0, 0, 4096, 255, 4096); } - fn lookup(&mut self, _req: &Request, parent: u64, name: &OsStr, reply: ReplyEntry) { + fn lookup(&self, _req: &Request, parent: INodeNo, name: &OsStr, reply: ReplyEntry) { + let parent = parent.0; log::trace!("lookup {parent} {name:?}"); - let Some(InodeRef::Directory(dir, ..)) = self.inodes.get(&parent) else { - log::error!("lookup({parent}, {name:?}) parent does not exist"); - return reply.error(Errno::BADF.raw_os_error()); + + let Some(InodeData::Dir { + path: parent_path, .. + }) = self.get_data(parent) + else { + log::error!("lookup({parent}, {name:?}): parent is not a directory"); + return reply.error(fuser::Errno::EBADF); + }; + let parent_path = parent_path.clone(); + + let Some(dir) = self.resolve_dir(parent) else { + log::error!("lookup({parent}, {name:?}): failed to resolve parent directory"); + return reply.error(fuser::Errno::EIO); + }; + + let child_path: Box = if parent_path.is_empty() { + name.into() + } else { + let mut p = parent_path.as_bytes().to_vec(); + p.push(b'/'); + p.extend_from_slice(name.as_bytes()); + OsStr::from_bytes(&p).into() }; - let dir = *dir; match dir.lookup(name) { - Some(inode) => { - let (ino, _) = self.register_inode(inode, parent); - reply.entry(&TTL, self.attrs.get(&ino).unwrap(), 0); - } - None => reply.error(Errno::NOENT.raw_os_error()), + Some(inode) => match self.child_ino(inode, &child_path) { + Some(ino) => { + let attrs = self.inode_data[(ino as usize) - 1].attrs(); + reply.entry(&TTL, attrs, Generation(0)); + } + None => { + log::error!("lookup({parent}, {name:?}): child inode not in table"); + reply.error(fuser::Errno::EIO); + } + }, + None => reply.error(fuser::Errno::ENOENT), } } - fn getattr(&mut self, _req: &Request, ino: u64, _fh: Option, reply: ReplyAttr) { - if let Some(attrs) = self.attrs.get(&ino) { - return reply.attr(&TTL, attrs); + fn getattr(&self, _req: &Request, ino: INodeNo, _fh: Option, reply: ReplyAttr) { + match self.get_data(ino.0) { + Some(data) => reply.attr(&TTL, data.attrs()), + None => { + log::error!("getattr({ino}): inode does not exist"); + reply.error(fuser::Errno::EBADF); + } } - - let Some(iref) = self.inodes.get(&ino) else { - log::error!("getattr({ino}) inode does not exist"); - return reply.error(Errno::BADF.raw_os_error()); - }; - let iref = iref.clone(); - - let attr = iref.fileattr(ino, &self.nlink_map); - self.attrs.insert(ino, attr); - reply.attr(&TTL, self.attrs.get(&ino).unwrap()); } - fn readlink(&mut self, _req: &Request<'_>, ino: u64, reply: ReplyData) { - let Some(InodeRef::Leaf(_, leaf)) = self.inodes.get(&ino) else { - return reply.error(Errno::INVAL.raw_os_error()); + fn readlink(&self, _req: &Request, ino: INodeNo, reply: ReplyData) { + let Some(leaf) = self.resolve_leaf(ino.0) else { + return reply.error(fuser::Errno::EINVAL); }; - let LeafContent::Symlink(target) = &leaf.content else { - return reply.error(Errno::INVAL.raw_os_error()); + return reply.error(fuser::Errno::EINVAL); }; - reply.data(target.as_bytes()); } - fn opendir(&mut self, _req: &Request<'_>, _ino: u64, _flags: i32, reply: ReplyOpen) { - reply.opened(0, 0); + fn opendir(&self, _req: &Request, _ino: INodeNo, _flags: OpenFlags, reply: ReplyOpen) { + reply.opened(FileHandle(0), FopenFlags::empty()); } fn readdir( - &mut self, + &self, _req: &Request, - ino: u64, - _fh: u64, - mut offset: i64, + ino: INodeNo, + _fh: FileHandle, + offset: u64, mut reply: ReplyDirectory, ) { - let Some(InodeRef::Directory(dir, parent)) = self.inodes.get(&ino) else { - log::error!("readdir({ino}) inode is not a directory"); - return reply.error(Errno::BADF.raw_os_error()); + let ino = ino.0; + let Some(InodeData::Dir { + parent_ino, + path: dir_path, + .. + }) = self.get_data(ino) + else { + log::error!("readdir({ino}): inode is not a directory"); + return reply.error(fuser::Errno::EBADF); + }; + let parent_ino = *parent_ino; + let dir_path = dir_path.clone(); + + let Some(dir) = self.resolve_dir(ino) else { + log::error!("readdir({ino}): failed to resolve directory"); + return reply.error(fuser::Errno::EIO); }; - let (dir, parent) = (*dir, *parent); - if offset == 0 { - offset += 1; - if reply.add(ino, offset, FileType::Directory, ".") { + let mut cur_offset = offset; + + if cur_offset == 0 { + cur_offset += 1; + if reply.add(INodeNo(ino), cur_offset, FileType::Directory, ".") { return reply.ok(); } } - if offset == 1 { - offset += 1; - if reply.add(parent, offset, FileType::Directory, "..") { + if cur_offset == 1 { + cur_offset += 1; + if reply.add(INodeNo(parent_ino), cur_offset, FileType::Directory, "..") { return reply.ok(); } } - for (name, inode) in dir.sorted_entries().skip(offset as usize - 2) { - let (child_ino, kind) = self.register_inode(inode, ino); - - offset += 1; - if reply.add(child_ino, offset, kind, name) { + for (name, inode) in dir.sorted_entries().skip((cur_offset as usize) - 2) { + let child_path = child_path_from(&dir_path, name); + let Some(child_ino) = self.child_ino(inode, &child_path) else { + log::error!("readdir({ino}): child {name:?} not in inode table"); + continue; + }; + let kind = self.inode_data[(child_ino as usize) - 1].attrs().kind; + cur_offset += 1; + if reply.add(INodeNo(child_ino), cur_offset, kind, name) { break; } } @@ -329,159 +507,180 @@ impl Filesystem for TreeFuse<'_, ObjectID> { } fn releasedir( - &mut self, - _req: &Request<'_>, - _ino: u64, - _fh: u64, - _flags: i32, + &self, + _req: &Request, + _ino: INodeNo, + _fh: FileHandle, + _flags: OpenFlags, reply: fuser::ReplyEmpty, ) { reply.ok(); } fn getxattr( - &mut self, - _req: &Request<'_>, - ino: u64, + &self, + _req: &Request, + ino: INodeNo, name: &OsStr, size: u32, reply: fuser::ReplyXattr, ) { - let Some(iref) = self.inodes.get(&ino) else { - log::error!("getxattr({ino}, {name:?}, {size}) inode does not exist"); - return reply.error(Errno::BADF.raw_os_error()); - }; - - let xattrs = &iref.stat().xattrs; - let Some(value) = xattrs.get(name) else { - return reply.error(Errno::NODATA.raw_os_error()); - }; - - if size == 0 { - return reply.size(value.len() as u32); - } else if value.len() > size as usize { - return reply.error(Errno::RANGE.raw_os_error()); + let ino = ino.0; + match self.resolve_stat(ino) { + None => { + log::error!("getxattr({ino}, {name:?}, {size}): inode does not exist"); + reply.error(fuser::Errno::EBADF); + } + Some(Err(())) => reply.error(fuser::Errno::EIO), + Some(Ok(stat)) => match stat.xattrs.get(name) { + None => reply.error(fuser::Errno::ENODATA), + Some(value) => { + if size == 0 { + reply.size(value.len() as u32); + } else if value.len() > size as usize { + reply.error(fuser::Errno::ERANGE); + } else { + reply.data(value); + } + } + }, } - - reply.data(value); } - fn listxattr(&mut self, _req: &Request<'_>, ino: u64, size: u32, reply: fuser::ReplyXattr) { - let Some(iref) = self.inodes.get(&ino) else { - log::error!("listxattr({ino}, {size}) inode does not exist"); - return reply.error(Errno::BADF.raw_os_error()); - }; - - let mut list = vec![]; - for name in iref.stat().xattrs.keys() { - list.extend_from_slice(name.as_bytes()); - list.push(b'\0'); - } - - if size == 0 { - return reply.size(list.len() as u32); - } else if list.len() > size as usize { - return reply.error(Errno::RANGE.raw_os_error()); + fn listxattr(&self, _req: &Request, ino: INodeNo, size: u32, reply: fuser::ReplyXattr) { + let ino = ino.0; + match self.resolve_stat(ino) { + None => { + log::error!("listxattr({ino}, {size}): inode does not exist"); + reply.error(fuser::Errno::EBADF); + } + Some(Err(())) => reply.error(fuser::Errno::EIO), + Some(Ok(stat)) => { + let mut list = vec![]; + for k in stat.xattrs.keys() { + list.extend_from_slice(k.as_bytes()); + list.push(b'\0'); + } + if size == 0 { + reply.size(list.len() as u32); + } else if list.len() > size as usize { + reply.error(fuser::Errno::ERANGE); + } else { + reply.data(&list); + } + } } - - reply.data(&list); } - fn open(&mut self, _req: &Request<'_>, ino: u64, _flags: i32, reply: ReplyOpen) { + fn open(&self, _req: &Request, ino: INodeNo, _flags: OpenFlags, reply: ReplyOpen) { + let ino = ino.0; log::trace!("open({ino})"); - let Some(iref) = self.inodes.get(&ino) else { - log::error!("open({ino}) inode does not exist"); - return reply.error(Errno::BADF.raw_os_error()); - }; - let InodeRef::Leaf(_, leaf) = iref else { - log::error!("open({ino}) inode is a directory"); - return reply.error(Errno::BADF.raw_os_error()); + let Some(InodeData::Leaf { leaf_id, .. }) = self.get_data(ino) else { + log::error!("open({ino}): inode is not a regular file"); + return reply.error(fuser::Errno::EBADF); }; + let leaf = self.fs.leaf(*leaf_id); let handle = match &leaf.content { LeafContent::Regular(RegularFile::External(id, ..)) => { let Ok(fd) = self.repo.open_object(id) else { - log::error!("open({ino}) open object failed"); - return reply.error(Errno::INVAL.raw_os_error()); + log::error!("open({ino}): failed to open object"); + return reply.error(fuser::Errno::EIO); }; OpenHandle::Fd(fd) } LeafContent::Regular(RegularFile::Inline(data)) => OpenHandle::Data(data.clone()), _ => { - log::error!("open({ino}) non-regular file"); - return reply.error(Errno::BADF.raw_os_error()); + log::error!("open({ino}): not a regular file"); + return reply.error(fuser::Errno::EBADF); } }; - let fh = self.next_fh; - self.next_fh += 1; - log::debug!("self.handles.insert({fh}, {handle:?})"); - self.handles.insert(fh, handle); - reply.opened(fh, 0); + let mut state = self.handles.lock().expect("fuse handles mutex poisoned"); + let fh = state.next_fh; + state.next_fh += 1; + log::debug!("open({ino}): inserted handle {fh}"); + state.handles.insert(fh, handle); + reply.opened(FileHandle(fh), FopenFlags::empty()); } fn read( - &mut self, - _req: &Request<'_>, - _ino: u64, - fh: u64, - offset: i64, + &self, + _req: &Request, + _ino: INodeNo, + fh: FileHandle, + offset: u64, size: u32, - _flags: i32, - _lock_owner: Option, - reply: fuser::ReplyData, + _flags: OpenFlags, + _lock_owner: Option, + reply: ReplyData, ) { - match self.handles.get(&fh) { + let state = self.handles.lock().expect("fuse handles mutex poisoned"); + match state.handles.get(&fh.0) { Some(OpenHandle::Fd(fd)) => { let mut data = Vec::with_capacity(size as usize); - match pread(fd, spare_capacity(&mut data), offset as u64) { + match pread(fd, spare_capacity(&mut data), offset) { Ok(_) => reply.data(&data), - Err(errno) => reply.error(errno.raw_os_error()), + Err(errno) => { + reply.error(errno_to_fuser(errno)); + } } } Some(OpenHandle::Data(data)) => { - if offset as usize > data.len() { - reply.data(b""); - } else { - let mut data = &data[offset as usize..]; - if data.len() > size as usize { - data = &data[..size as usize]; - } - reply.data(data); - } + let start = (offset as usize).min(data.len()); + let end = (start + size as usize).min(data.len()); + reply.data(&data[start..end]); } None => { - log::error!("Handle doesn't exist: pread({fh}, {size}, {offset})"); - reply.error(Errno::BADF.raw_os_error()); + log::error!("read(fh={fh}): handle does not exist"); + reply.error(fuser::Errno::EBADF); } } } fn release( - &mut self, - _req: &Request<'_>, - _ino: u64, - fh: u64, - _flags: i32, - _lock_owner: Option, + &self, + _req: &Request, + _ino: INodeNo, + fh: FileHandle, + _flags: OpenFlags, + _lock_owner: Option, _flush: bool, reply: fuser::ReplyEmpty, ) { - match self.handles.remove(&fh) { + let mut state = self.handles.lock().expect("fuse handles mutex poisoned"); + match state.handles.remove(&fh.0) { Some(_) => reply.ok(), None => { - log::error!("Handle doesn't exist: close({fh})"); - reply.error(Errno::BADF.raw_os_error()) + log::error!("release(fh={fh}): handle does not exist"); + reply.error(fuser::Errno::EBADF); } } } } +/// Construct the child path given the parent's path and the entry name. +fn child_path_from(parent_path: &OsStr, name: &OsStr) -> Box { + if parent_path.is_empty() { + name.into() + } else { + let mut p = parent_path.as_bytes().to_vec(); + p.push(b'/'); + p.extend_from_slice(name.as_bytes()); + OsStr::from_bytes(&p).into() + } +} + +/// Convert a `rustix::io::Errno` to the corresponding `fuser::Errno`. +fn errno_to_fuser(errno: rustix::io::Errno) -> fuser::Errno { + fuser::Errno::from(std::io::Error::from_raw_os_error(errno.raw_os_error())) +} + /// Opens /dev/fuse. /// -/// After you do this, you can mount it using mount_fuse() and then start serving requests using -/// serve_tree_fuse(). You might want to do this in different threads, which is why these +/// After you do this, you can mount it using [`mount_fuse`] and then start serving requests using +/// [`serve_tree_fuse`]. You might want to do this in different threads, which is why these /// operations are defined separately. pub fn open_fuse() -> anyhow::Result { open("/dev/fuse", OFlags::RDWR | OFlags::CLOEXEC, Mode::empty()) @@ -509,9 +708,9 @@ impl FuseMountOptions { /// Mounts a FUSE filesystem with the given /dev/fuse fd. /// -/// This does the necessary dance of creating the mount object, given a /dev/fuse device node. In -/// order for this to be useful, you'll also need to call serve_tree_fuse() to actually satisfy the -/// requests for data. +/// This does the necessary dance of creating the mount object, given a /dev/fuse device node. In +/// order for this to be useful, you'll also need to call [`serve_tree_fuse`] to actually satisfy +/// the requests for data. pub fn mount_fuse(dev_fuse: impl AsFd, options: &FuseMountOptions) -> anyhow::Result { let fusefs = FsHandle::open("fuse")?; fsconfig_set_flag(fusefs.as_fd(), "ro")?; @@ -538,28 +737,26 @@ pub fn mount_fuse(dev_fuse: impl AsFd, options: &FuseMountOptions) -> anyhow::Re /// Serves a FUSE filesystem exposing the content of `filesystem`, backed by `repo`. /// -/// You should have called mount_fuse() on the dev_fuse fd to establish a mount point. -pub fn serve_tree_fuse<'a, ObjectID: FsVerityHashValue>( +/// You should have called [`mount_fuse`] on the `dev_fuse` fd to establish a mount point. +/// The function blocks until the FUSE session ends. +pub fn serve_tree_fuse( dev_fuse: OwnedFd, - filesystem: &'a FileSystem, - repo: &'a Repository, + filesystem: Arc>, + repo: Arc>, ) -> std::io::Result<()> { - let inode_map = InodeMap::build(filesystem); - let nlink_map = filesystem.nlinks(); - - let root_ino = inode_map.dir_ino(&filesystem.root); - let root_ref = InodeRef::Directory(&filesystem.root, root_ino); - let root_attr = root_ref.fileattr(root_ino, &nlink_map); + let InodeTable { + data: inode_data, + lookup, + } = build_inode_table(&filesystem); let tf = TreeFuse:: { repo, fs: filesystem, - inode_map, - nlink_map, - inodes: HashMap::from([(root_ino, root_ref)]), - attrs: HashMap::from([(root_ino, root_attr)]), - handles: Default::default(), - next_fh: 1, + inode_data, + lookup, + handles: Mutex::new(FuseHandles::default()), }; - Session::from_fd(tf, dev_fuse, SessionACL::All).run() + Session::from_fd(tf, dev_fuse, SessionACL::All, Config::default())? + .spawn()? + .join() } From 02ccc6d81ae8cb98f88674aa756ed88a8f68ce2b Mon Sep 17 00:00:00 2001 From: Colin Walters Date: Tue, 2 Jun 2026 19:01:09 -0400 Subject: [PATCH 2/2] fuse: Add FUSE serving via CLI and varlink RPC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wire the composefs-fuse crate into cfsctl behind a new `fuse` cargo feature (on by default). The FUSE implementation supports readdirplus, multithreaded serving via FUSE_DEV_IOC_CLONE, FOPEN_KEEP_CACHE, and optional passthrough (Linux 6.9+, root only) for kernel-bypass reads on external object files. CLI surface: - `cfsctl mount --fuse[=passthrough] [--raw-image]` serves an image over FUSE instead of doing a kernel composefs mount. `--raw-image` reads a bare EROFS file from disk rather than looking up a repo image. This replaces the old `fuse-serve` subcommand; all mount paths now live under `mount`. - `cfsctl oci mount --fuse[=passthrough]` likewise for OCI images. Varlink surface — the single `Mount`/`OciMount` methods now always return a detached mount fd via SCM_RIGHTS, for both kernel and FUSE backends. A new `MountPath`/`OciMountPath` pair handles the attach-to- a-path case server-side (blocking until the FUSE session ends unless `wait=false`). This makes FUSE mounts a proper first-class path: the caller gets the fd and can move_mount it wherever it likes, with the FUSE server running in the background for as long as the fd is held. Also adds `erofs_fd_to_filesystem` in composefs-fuse to replace the open-coded open_image → read_to_end → erofs_to_filesystem pattern that appeared at every call site. The privileged_fuse_dumpfile_roundtrip integration test spawns `cfsctl mount --raw-image --fuse`, polls for mount readiness via st_dev change, and compares the dumpfile produced over the FUSE mount against the expected output from write_dumpfile. Assisted-by: OpenCode (claude-sonnet-4-6) Signed-off-by: Colin Walters --- crates/composefs-ctl/Cargo.toml | 4 +- crates/composefs-ctl/src/lib.rs | 160 ++- crates/composefs-ctl/src/varlink.rs | 1254 ++++++++++++++--- crates/composefs-fuse/src/lib.rs | 377 ++++- crates/composefs-integration-tests/Cargo.toml | 2 +- .../src/tests/privileged.rs | 388 +++++ 6 files changed, 1954 insertions(+), 231 deletions(-) diff --git a/crates/composefs-ctl/Cargo.toml b/crates/composefs-ctl/Cargo.toml index 361b5f0e..3529f0d5 100644 --- a/crates/composefs-ctl/Cargo.toml +++ b/crates/composefs-ctl/Cargo.toml @@ -17,7 +17,8 @@ name = "cfsctl" path = "src/main.rs" [features] -default = ['pre-6.15', 'oci', 'containers-storage'] +default = ['pre-6.15', 'oci', 'containers-storage', 'fuse'] +fuse = ['dep:composefs-fuse'] http = ['composefs-http'] oci = ['composefs-oci', 'composefs-oci/varlink'] containers-storage = ['composefs-oci/containers-storage', 'cstorage'] @@ -31,6 +32,7 @@ clap = { version = "4.5.0", default-features = false, features = ["std", "help", comfy-table = { version = "7.1", default-features = false } composefs = { workspace = true, features = ["varlink"] } composefs-boot = { workspace = true } +composefs-fuse = { path = "../composefs-fuse", version = "0.4.0", optional = true } composefs-oci = { workspace = true, optional = true, features = ["boot"] } composefs-http = { workspace = true, optional = true } cstorage = { package = "composefs-storage", path = "../composefs-storage", version = "0.4.0", features = ["userns-helper"], optional = true } diff --git a/crates/composefs-ctl/src/lib.rs b/crates/composefs-ctl/src/lib.rs index 548ae23c..0322f89a 100644 --- a/crates/composefs-ctl/src/lib.rs +++ b/crates/composefs-ctl/src/lib.rs @@ -318,6 +318,35 @@ impl From for composefs_oci::LocalFetchOpt { } } +/// Options accepted by `--fuse[=]` on `mount` and `oci mount`. +/// +/// Pass bare `--fuse` to FUSE-mount with defaults, or `--fuse=passthrough` +/// to also enable kernel-bypass reads for external files. +/// +/// Multiple options are comma-separated: `--fuse=passthrough,option2` +/// (only `passthrough` is defined today). +#[cfg(feature = "fuse")] +#[derive(Debug, Default, Clone)] +struct FuseOptions { + passthrough: bool, +} + +#[cfg(feature = "fuse")] +impl std::str::FromStr for FuseOptions { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + let mut opts = FuseOptions::default(); + for token in s.split(',').map(str::trim).filter(|t| !t.is_empty()) { + match token { + "passthrough" => opts.passthrough = true, + other => anyhow::bail!("unknown fuse option: {other:?} (known: passthrough)"), + } + } + Ok(opts) + } +} + /// Common options for operations using OCI config manifest streams that may transform the image rootfs #[cfg(feature = "oci")] #[derive(Debug, Parser)] @@ -447,6 +476,17 @@ enum OciCommand { /// Mount read-write (requires --upperdir) #[arg(long, requires = "upperdir")] read_write: bool, + /// Serve the EROFS image over FUSE instead of using a kernel composefs mount. + /// Requires /dev/fuse and blocks until the mount is detached or the process + /// is killed. Does not require fs-verity on the backing store. + /// + /// Accepts an optional comma-separated list of options: + /// --fuse basic FUSE mount + /// --fuse=passthrough also enable kernel-bypass reads (Linux 6.9+, root, non-tmpfs) + #[cfg(feature = "fuse")] + #[arg(long, num_args = 0..=1, require_equals = false, value_name = "OPTS", + default_missing_value = "")] + fuse: Option, }, /// Compute the composefs image ID of a stored OCI image's rootfs /// @@ -568,13 +608,23 @@ enum Command { #[clap(subcommand)] cmd: OciCommand, }, - /// Mounts a composefs image, possibly enforcing fsverity of the image + /// Mounts a composefs image, possibly enforcing fsverity of the image. + /// + /// By default the image is identified by its repo name (an fs-verity hash + /// or a `ref/` tag). Pass `--raw-image` to supply a path to a bare EROFS + /// file instead, which skips the repository image store lookup and is + /// required when using `--fuse` without a full repository setup. Mount { - /// the name of the image to mount, either an fs-verity hash or prefixed with 'ref/' + /// The image to mount: a repo name (fs-verity hash or `ref/`) by + /// default, or a filesystem path when `--raw-image` is given. name: String, /// the mountpoint mountpoint: String, - /// Writable upper layer directory for overlayfs + /// Treat as a path to a raw EROFS image file instead of a repo + /// image name. + #[arg(long)] + raw_image: bool, + /// Writable upper layer directory for overlayfs (kernel mount only) #[arg(long, requires = "workdir")] upperdir: Option, /// Work directory for overlayfs (required with --upperdir) @@ -583,6 +633,15 @@ enum Command { /// Mount read-write (requires --upperdir) #[arg(long, requires = "upperdir")] read_write: bool, + /// Serve the image over FUSE instead of a kernel composefs mount. + /// + /// Accepts an optional comma-separated list of options: + /// --fuse basic FUSE mount + /// --fuse=passthrough also enable kernel-bypass reads (Linux 6.9+, root, non-tmpfs) + #[cfg(feature = "fuse")] + #[arg(long, num_args = 0..=1, require_equals = false, value_name = "OPTS", + default_missing_value = "")] + fuse: Option, }, /// Read rootfs located at a path, add all files to the repo, then create the composefs image of the rootfs, /// commit it to the repo, and print its image object ID @@ -1264,9 +1323,9 @@ where ref upperdir, ref workdir, read_write, + #[cfg(feature = "fuse")] + fuse, } => { - let mount_options = - get_mount_options(upperdir.as_deref(), workdir.as_deref(), read_write)?; let img = if image.starts_with("sha256:") { let digest: composefs_oci::OciDigest = image.parse().context("Parsing manifest digest")?; @@ -1289,7 +1348,45 @@ where ), } }; - repo.mount_at(&erofs_id.to_hex(), mountpoint.as_str(), &mount_options)?; + #[cfg(feature = "fuse")] + if let Some(fuse_opts) = fuse { + use composefs_fuse::{ + FuseConfig, erofs_fd_to_filesystem, mount_fuse, open_fuse, + serve_tree_fuse_fd, + }; + + let (image_fd, _verified) = repo.open_image(&erofs_id.to_hex())?; + let filesystem = erofs_fd_to_filesystem::(image_fd)?; + + let dev_fuse = open_fuse()?; + let mnt_fd = mount_fuse(&dev_fuse, &Default::default())?; + composefs::mount::mount_at(&mnt_fd, CWD, mountpoint.as_str()) + .with_context(|| format!("attaching FUSE mount at {mountpoint}"))?; + + // Hold mnt_fd alive for the session duration — it pins the FUSE + // superblock so the connection stays alive while we serve. + let _mnt_fd = mnt_fd; + + serve_tree_fuse_fd( + dev_fuse, + Arc::new(filesystem), + Arc::clone(&repo), + FuseConfig { + passthrough: fuse_opts.passthrough, + }, + ) + .context("FUSE session error")?; + } else { + let mount_options = + get_mount_options(upperdir.as_deref(), workdir.as_deref(), read_write)?; + repo.mount_at(&erofs_id.to_hex(), mountpoint.as_str(), &mount_options)?; + } + #[cfg(not(feature = "fuse"))] + { + let mount_options = + get_mount_options(upperdir.as_deref(), workdir.as_deref(), read_write)?; + repo.mount_at(&erofs_id.to_hex(), mountpoint.as_str(), &mount_options)?; + } } OciCommand::ComputeId { config_opts } => { let mut fs = load_filesystem_from_oci_image(&repo, config_opts)?; @@ -1513,13 +1610,58 @@ where Command::Mount { name, mountpoint, + raw_image, ref upperdir, ref workdir, read_write, + #[cfg(feature = "fuse")] + fuse, } => { - let mount_options = - get_mount_options(upperdir.as_deref(), workdir.as_deref(), read_write)?; - repo.mount_at(&name, &mountpoint, &mount_options)?; + #[cfg(feature = "fuse")] + if let Some(fuse_opts) = fuse { + use composefs_fuse::{ + FuseConfig, erofs_fd_to_filesystem, mount_fuse, open_fuse, serve_tree_fuse_fd, + }; + + let filesystem = if raw_image { + let bytes = std::fs::read(&name).with_context(|| format!("reading {name}"))?; + erofs_to_filesystem::(&bytes).context("parsing EROFS image")? + } else { + let (image_fd, _verified) = repo.open_image(&name)?; + erofs_fd_to_filesystem::(image_fd)? + }; + + let dev_fuse = open_fuse()?; + let mnt_fd = mount_fuse(&dev_fuse, &Default::default())?; + composefs::mount::mount_at(&mnt_fd, CWD, mountpoint.as_str()) + .with_context(|| format!("attaching FUSE mount at {mountpoint}"))?; + + // Hold mnt_fd alive for the session duration — it pins the FUSE + // superblock so the connection stays alive while we serve. + let _mnt_fd = mnt_fd; + + serve_tree_fuse_fd( + dev_fuse, + Arc::new(filesystem), + Arc::clone(&repo), + FuseConfig { + passthrough: fuse_opts.passthrough, + }, + ) + .context("FUSE session error")?; + } else { + anyhow::ensure!(!raw_image, "--raw-image requires --fuse"); + let mount_options = + get_mount_options(upperdir.as_deref(), workdir.as_deref(), read_write)?; + repo.mount_at(&name, &mountpoint, &mount_options)?; + } + #[cfg(not(feature = "fuse"))] + { + anyhow::ensure!(!raw_image, "--raw-image requires --fuse"); + let mount_options = + get_mount_options(upperdir.as_deref(), workdir.as_deref(), read_write)?; + repo.mount_at(&name, &mountpoint, &mount_options)?; + } } Command::ImageObjects { name } => { let objects = repo.objects_for_image(&name)?; diff --git a/crates/composefs-ctl/src/varlink.rs b/crates/composefs-ctl/src/varlink.rs index dc003fcb..dcd2a47a 100644 --- a/crates/composefs-ctl/src/varlink.rs +++ b/crates/composefs-ctl/src/varlink.rs @@ -417,41 +417,37 @@ pub struct MountParams { pub overlay: Option, /// Whether to mount read-write (only meaningful with overlay). pub read_write: Option, + /// When true, serve via FUSE and return a detached FUSE mount fd. + /// A background task serves the FUSE session for as long as the returned + /// fd (or any mount derived from it) remains alive — no explicit wait is + /// needed or meaningful. + pub fuse: Option, + /// Enable FUSE passthrough (Linux 6.9+; requires root). Only meaningful with fuse=true. + pub passthrough: Option, } -impl MountParams { - /// Build [`MountOptions`] from these params, consuming the expected fds. - fn to_mount_options( - &self, - fds: Vec, - ) -> std::result::Result { - let overlay = self.overlay.unwrap_or(false); - - let mut expected_fds = 0; - if overlay { - expected_fds += 2; - } - - if fds.len() != expected_fds { - return Err(RepositoryError::InvalidSpec { - message: format!( - "Mount expects {expected_fds} fds for the requested options, got {}", - fds.len() - ), - }); - } - - let mut options = composefs::mount::MountOptions::default(); - let mut fd_iter = fds.into_iter(); - if overlay { - let upperdir = fd_iter.next().unwrap(); - let workdir = fd_iter.next().unwrap(); - options.set_overlay(upperdir, workdir); - } - options.set_read_write(self.read_write.unwrap_or(false)); - - Ok(options) - } +/// Parameters for a `MountPath` call (attaches the mount at a server-side path). +/// +/// Unlike [`MountParams`], this method performs the `move_mount` to `mountpoint` +/// on the server side and returns no fd. The `wait` field is only meaningful for +/// FUSE mounts: when true (the default) the call blocks until the FUSE session +/// ends; when false the session is detached to a background task and the call +/// returns immediately. +#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize, zlink::introspect::Type)] +pub struct MountPathParams { + /// Whether to set up an overlayfs upper layer. + /// When true, the fd array must contain two fds: upperdir and workdir. + pub overlay: Option, + /// Whether to mount read-write (only meaningful with overlay). + pub read_write: Option, + /// When true, serve via FUSE instead of using a kernel composefs mount. + pub fuse: Option, + /// Enable FUSE passthrough (Linux 6.9+; requires root). Only meaningful with fuse=true. + pub passthrough: Option, + /// FUSE only: block until the session ends (default true). When false, detach + /// the FUSE session into a background task and return immediately. + /// Ignored for kernel mounts (they are always non-blocking). + pub wait: Option, } /// Reply for a `Mount` call — just an fd_index referencing the mount fd. @@ -461,32 +457,180 @@ pub struct MountReply { pub fd_index: u32, } -fn run_mount( - repo: &Repository, - name: &str, - params: &MountParams, +/// Reply for a `MountPath` call — the mount has been attached server-side. +#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize, zlink::introspect::Type)] +pub struct MountPathReply {} + +/// Build a [`composefs::mount::MountOptions`] from overlay/read-write flags, +/// consuming the expected fds. Shared by both `Mount` and `MountPath`. +fn build_mount_options( + overlay: bool, + read_write: bool, + fds: Vec, +) -> std::result::Result { + let expected_fds = if overlay { 2 } else { 0 }; + if fds.len() != expected_fds { + return Err(RepositoryError::InvalidSpec { + message: format!( + "Mount expects {expected_fds} fds for the requested options, got {}", + fds.len() + ), + }); + } + let mut options = composefs::mount::MountOptions::default(); + let mut fd_iter = fds.into_iter(); + if overlay { + let upperdir = fd_iter.next().unwrap(); + let workdir = fd_iter.next().unwrap(); + options.set_overlay(upperdir, workdir); + } + options.set_read_write(read_write); + Ok(options) +} + +/// Prepare a FUSE session: read the EROFS image, open /dev/fuse, and call +/// `mount_fuse` to get a detached mount fd. Returns `(dev_fuse, mnt_fd, fs)`. +/// +/// `erofs_name` is the image name passed to `repo.open_image`. On error the +/// `anyhow::Error` should be wrapped into the caller's error type. +#[cfg(feature = "fuse")] +fn prepare_fuse( + repo: &Arc>, + erofs_name: &str, +) -> anyhow::Result<( + std::os::fd::OwnedFd, + std::os::fd::OwnedFd, + composefs::generic_tree::FileSystem>, +)> { + use composefs_fuse::{erofs_fd_to_filesystem, mount_fuse, open_fuse}; + + let (image_fd, _verified) = repo + .open_image(erofs_name) + .with_context(|| format!("opening repo image {erofs_name}"))?; + let filesystem = erofs_fd_to_filesystem::(image_fd) + .with_context(|| format!("parsing repo image {erofs_name}"))?; + + let dev_fuse = open_fuse().context("opening /dev/fuse")?; + let mnt_fd = mount_fuse(&dev_fuse, &Default::default()).context("mounting FUSE")?; + Ok((dev_fuse, mnt_fd, filesystem)) +} + +/// Create a detached mount fd for `name` and return it to the caller. +/// +/// For kernel mounts this is a composefs overlay/erofs fd. For FUSE mounts +/// (when `params.fuse` is true) this is the detached FUSE mount fd; a +/// background task serves the session for as long as the fd (or any mount +/// derived from it) is alive. +async fn run_mount( + repo: Arc>, + name: String, + params: MountParams, fds: Vec, ) -> std::result::Result<(MountReply, Vec), RepositoryError> { - let options = params.to_mount_options(fds)?; + #[cfg(feature = "fuse")] + if params.fuse.unwrap_or(false) { + use composefs_fuse::{FuseConfig, serve_tree_fuse_fd}; + + let passthrough = params.passthrough.unwrap_or(false); + let (dev_fuse, mnt_fd, filesystem) = + prepare_fuse(&repo, &name).map_err(|e| RepositoryError::InternalError { + message: format!("{e:#}"), + })?; + + // Serve in the background. The session stays alive because the caller + // holds the returned mnt_fd; it self-terminates when the fd (and any + // attached mounts derived from it) are dropped. + let fs = Arc::new(filesystem); + let _task = tokio::task::spawn_blocking(move || { + if let Err(e) = serve_tree_fuse_fd(dev_fuse, fs, repo, FuseConfig { passthrough }) { + log::warn!("FUSE session ended with error: {e:#}"); + } + }); + return Ok((MountReply { fd_index: 0 }, vec![mnt_fd])); + } + let overlay = params.overlay.unwrap_or(false); + let options = build_mount_options(overlay, params.read_write.unwrap_or(false), fds)?; let mount_fd = - repo.mount_with_options(name, &options) + repo.mount_with_options(&name, &options) .map_err(|e| RepositoryError::InternalError { message: format!("{e:#}"), })?; - Ok((MountReply { fd_index: 0 }, vec![mount_fd])) } +/// Attach a mount of `name` at `mountpoint` on the server side. +/// +/// For kernel mounts this completes immediately. For FUSE mounts (when +/// `params.fuse` is true) the call either blocks until the session ends +/// (`wait=true`, the default) or returns immediately after attaching +/// (`wait=false`). +async fn run_mount_path( + repo: Arc>, + name: String, + mountpoint: String, + params: MountPathParams, + fds: Vec, +) -> std::result::Result { + #[cfg(feature = "fuse")] + if params.fuse.unwrap_or(false) { + use composefs_fuse::{FuseConfig, serve_tree_fuse_fd}; + use rustix::fs::CWD; + + let passthrough = params.passthrough.unwrap_or(false); + let wait = params.wait.unwrap_or(true); + let (dev_fuse, mnt_fd, filesystem) = + prepare_fuse(&repo, &name).map_err(|e| RepositoryError::InternalError { + message: format!("{e:#}"), + })?; + composefs::mount::mount_at(&mnt_fd, CWD, &mountpoint).map_err(|e| { + RepositoryError::InternalError { + message: format!("attaching FUSE mount at {mountpoint}: {e:#}"), + } + })?; + + let fs = Arc::new(filesystem); + if wait { + let _mnt_fd = mnt_fd; + tokio::task::spawn_blocking(move || { + serve_tree_fuse_fd(dev_fuse, fs, repo, FuseConfig { passthrough }) + }) + .await + .map_err(|e| RepositoryError::InternalError { + message: format!("FUSE task panicked: {e}"), + })? + .map_err(|e| RepositoryError::InternalError { + message: format!("FUSE session error: {e:#}"), + })?; + } else { + // The attached mountpoint pins the superblock; we can drop mnt_fd. + let _task = tokio::task::spawn_blocking(move || { + let _mnt_fd = mnt_fd; + if let Err(e) = serve_tree_fuse_fd(dev_fuse, fs, repo, FuseConfig { passthrough }) { + log::warn!("FUSE session ended with error: {e:#}"); + } + }); + } + return Ok(MountPathReply {}); + } + + let overlay = params.overlay.unwrap_or(false); + let options = build_mount_options(overlay, params.read_write.unwrap_or(false), fds)?; + repo.mount_at(&name, &mountpoint, &options) + .map_err(|e| RepositoryError::InternalError { + message: format!("{e:#}"), + })?; + Ok(MountPathReply {}) +} + +/// Resolve an OCI image reference to its EROFS image name string. #[cfg(feature = "oci")] -fn run_oci_mount( - repo: &Repository, +fn resolve_oci_erofs_name( + repo: &Arc>, image: &str, bootable: bool, - params: &MountParams, - fds: Vec, -) -> std::result::Result<(MountReply, Vec), oci::OciError> { - let img = if image.starts_with("sha256:") { +) -> std::result::Result { + let img = if image.starts_with("sha256:") || image.starts_with("sha512:") { let digest: composefs_oci::OciDigest = image.parse().map_err(|e| oci::OciError::InternalError { message: format!("Invalid manifest digest: {e}"), @@ -511,21 +655,124 @@ fn run_oci_mount( "No composefs EROFS image linked".into() }, })?; + Ok(erofs_id.to_hex()) +} - let options = params - .to_mount_options(fds) - .map_err(|e| oci::OciError::InternalError { - message: format!("{e:?}"), +/// Mount an OCI image and return a detached mount fd. +/// +/// For FUSE mounts (when `params.fuse` is true) a background task serves the +/// session for as long as the returned fd (or any derived mount) is alive. +#[cfg(feature = "oci")] +async fn run_oci_mount( + repo: Arc>, + image: String, + bootable: bool, + params: MountParams, + fds: Vec, +) -> std::result::Result<(MountReply, Vec), oci::OciError> { + let erofs_name = resolve_oci_erofs_name(&repo, &image, bootable)?; + + #[cfg(feature = "fuse")] + if params.fuse.unwrap_or(false) { + use composefs_fuse::{FuseConfig, serve_tree_fuse_fd}; + + let passthrough = params.passthrough.unwrap_or(false); + let (dev_fuse, mnt_fd, filesystem) = + prepare_fuse(&repo, &erofs_name).map_err(|e| oci::OciError::InternalError { + message: format!("{e:#}"), + })?; + + let fs = Arc::new(filesystem); + let _task = tokio::task::spawn_blocking(move || { + if let Err(e) = serve_tree_fuse_fd(dev_fuse, fs, repo, FuseConfig { passthrough }) { + log::warn!("FUSE session ended with error: {e:#}"); + } + }); + return Ok((MountReply { fd_index: 0 }, vec![mnt_fd])); + } + + let overlay = params.overlay.unwrap_or(false); + let options = + build_mount_options(overlay, params.read_write.unwrap_or(false), fds).map_err(|e| { + oci::OciError::InternalError { + message: format!("{e:?}"), + } })?; let mount_fd = repo - .mount_with_options(&erofs_id.to_hex(), &options) + .mount_with_options(&erofs_name, &options) .map_err(|e| oci::OciError::InternalError { message: format!("{e:#}"), })?; - Ok((MountReply { fd_index: 0 }, vec![mount_fd])) } +/// Attach an OCI image mount at `mountpoint` on the server side. +#[cfg(feature = "oci")] +async fn run_oci_mount_path( + repo: Arc>, + image: String, + bootable: bool, + mountpoint: String, + params: MountPathParams, + fds: Vec, +) -> std::result::Result { + let erofs_name = resolve_oci_erofs_name(&repo, &image, bootable)?; + + #[cfg(feature = "fuse")] + if params.fuse.unwrap_or(false) { + use composefs_fuse::{FuseConfig, serve_tree_fuse_fd}; + use rustix::fs::CWD; + + let passthrough = params.passthrough.unwrap_or(false); + let wait = params.wait.unwrap_or(true); + let (dev_fuse, mnt_fd, filesystem) = + prepare_fuse(&repo, &erofs_name).map_err(|e| oci::OciError::InternalError { + message: format!("{e:#}"), + })?; + composefs::mount::mount_at(&mnt_fd, CWD, &mountpoint).map_err(|e| { + oci::OciError::InternalError { + message: format!("attaching FUSE mount at {mountpoint}: {e:#}"), + } + })?; + + let fs = Arc::new(filesystem); + if wait { + let _mnt_fd = mnt_fd; + tokio::task::spawn_blocking(move || { + serve_tree_fuse_fd(dev_fuse, fs, repo, FuseConfig { passthrough }) + }) + .await + .map_err(|e| oci::OciError::InternalError { + message: format!("FUSE task panicked: {e}"), + })? + .map_err(|e| oci::OciError::InternalError { + message: format!("FUSE session error: {e:#}"), + })?; + } else { + let _task = tokio::task::spawn_blocking(move || { + let _mnt_fd = mnt_fd; + if let Err(e) = serve_tree_fuse_fd(dev_fuse, fs, repo, FuseConfig { passthrough }) { + log::warn!("FUSE session ended with error: {e:#}"); + } + }); + } + return Ok(MountPathReply {}); + } + + let overlay = params.overlay.unwrap_or(false); + let options = + build_mount_options(overlay, params.read_write.unwrap_or(false), fds).map_err(|e| { + oci::OciError::InternalError { + message: format!("{e:?}"), + } + })?; + repo.mount_at(&erofs_name, &mountpoint, &options) + .map_err(|e| oci::OciError::InternalError { + message: format!("{e:#}"), + })?; + Ok(MountPathReply {}) +} + /// Initialize (or verify) a repository at `path` with the given algorithm. /// /// Creates parent directories if needed, then delegates to @@ -738,14 +985,21 @@ async fn run_compute_id( // `interface = "org.composefs.Oci"` the macro keeps using it for subsequent // methods until changed. The Repository methods come first and inherit the // seeded `org.composefs.Repository` interface. -#[cfg(not(feature = "oci"))] +// Repository-only variants (no OCI), split further by the `fuse` feature. +// The #[zlink::service] macro generates a dispatch enum keyed on the wire +// method name, so methods cannot be individually cfg-gated inside one impl +// block — the macro sees all methods at expansion time. We therefore keep +// separate impl blocks for each (oci, fuse) combination. + +#[cfg(all(not(feature = "oci"), feature = "fuse"))] mod service_impl { #![allow(missing_docs)] use super::{ CfsctlService, FsckReply, GcReply, ImageObjectsReply, InitRepositoryReply, MountParams, - MountReply, OpenRepo, OpenRepositoryReply, RepositoryError, run_fsck, run_gc, - run_image_objects, run_init_repository, run_mount, + MountPathParams, MountPathReply, MountReply, OpenRepo, OpenRepositoryReply, + RepositoryError, run_fsck, run_gc, run_image_objects, run_init_repository, run_mount, + run_mount_path, }; use composefs::fsverity::{Algorithm, Sha256HashValue, Sha512HashValue}; @@ -856,7 +1110,9 @@ mod service_impl { /// /// If overlay upper/work directories are needed, pass them as two fds /// (upperdir, workdir) via SCM_RIGHTS. The returned fd is a detached - /// mount that the caller can attach with `move_mount()`. + /// mount that the caller can attach with `move_mount()`. When + /// `options.fuse` is true, a detached FUSE mount fd is returned and a + /// background task serves the session for as long as it is held. #[zlink(return_fds)] async fn mount( &self, @@ -869,11 +1125,11 @@ mod service_impl { Vec, ) { let result = match self.lookup_repo(handle) { - Ok(OpenRepo::Sha256(ref r)) => { - run_mount::(r, &name, &options, fds) + Ok(OpenRepo::Sha256(r)) => { + run_mount::(r, name, options, fds).await } - Ok(OpenRepo::Sha512(ref r)) => { - run_mount::(r, &name, &options, fds) + Ok(OpenRepo::Sha512(r)) => { + run_mount::(r, name, options, fds).await } Err(e) => Err(e), }; @@ -882,26 +1138,42 @@ mod service_impl { Err(e) => (Err(e), vec![]), } } + + /// Mount an image and attach it at `mountpoint` on the server side. + /// + /// For kernel mounts this returns immediately. For FUSE mounts + /// (`options.fuse=true`) the call blocks until the session ends + /// unless `options.wait=false`. + async fn mount_path( + &self, + handle: u64, + name: String, + mountpoint: String, + options: MountPathParams, + #[zlink(fds)] fds: Vec, + ) -> std::result::Result { + match self.lookup_repo(handle) { + Ok(OpenRepo::Sha256(r)) => { + run_mount_path::(r, name, mountpoint, options, fds).await + } + Ok(OpenRepo::Sha512(r)) => { + run_mount_path::(r, name, mountpoint, options, fds).await + } + Err(e) => Err(e), + } + } } } -// Combined variant: hosts BOTH the `org.composefs.Repository` and -// `org.composefs.Oci` interfaces from a single impl block on `CfsctlService`, -// so one service answers both interfaces on one socket. See the comment above -// for why this can't be cfg-gated method-by-method. -#[cfg(feature = "oci")] +#[cfg(all(not(feature = "oci"), not(feature = "fuse")))] mod service_impl { #![allow(missing_docs)] - use super::oci::{ - ListImagesReply, OciComputeIdReply, OciError, OciFsckReply, OciInspectReply, PullProgress, - parse_local_fetch, pull_stream, - }; use super::{ CfsctlService, FsckReply, GcReply, ImageObjectsReply, InitRepositoryReply, MountParams, - MountReply, OpenRepo, OpenRepositoryReply, RepositoryError, run_compute_id, run_fsck, - run_gc, run_image_objects, run_init_repository, run_inspect, run_list_images, run_mount, - run_oci_fsck, run_oci_mount, run_tag, run_untag, + MountPathParams, MountPathReply, MountReply, OpenRepo, OpenRepositoryReply, + RepositoryError, run_fsck, run_gc, run_image_objects, run_init_repository, run_mount, + run_mount_path, }; use composefs::fsverity::{Algorithm, Sha256HashValue, Sha512HashValue}; @@ -913,8 +1185,6 @@ mod service_impl { url = "https://github.com/composefs/composefs-rs" )] impl CfsctlService { - // --- org.composefs.Repository (inherits the seeded interface) --- - /// Initialize a new repository at the given path, or verify that an /// existing one matches the requested algorithm (idempotent). /// @@ -1027,11 +1297,11 @@ mod service_impl { Vec, ) { let result = match self.lookup_repo(handle) { - Ok(OpenRepo::Sha256(ref r)) => { - run_mount::(r, &name, &options, fds) + Ok(OpenRepo::Sha256(r)) => { + run_mount::(r, name, options, fds).await } - Ok(OpenRepo::Sha512(ref r)) => { - run_mount::(r, &name, &options, fds) + Ok(OpenRepo::Sha512(r)) => { + run_mount::(r, name, options, fds).await } Err(e) => Err(e), }; @@ -1041,114 +1311,708 @@ mod service_impl { } } - // --- org.composefs.Oci --- - // - // The first OCI method sets `interface = "org.composefs.Oci"`; the - // macro then keeps that interface sticky for subsequent methods. Each - // OCI method is still annotated explicitly for clarity. - - /// List tagged OCI images in the repository. - /// - /// When `filter` is given, only images whose name contains that - /// substring are returned. - #[zlink(interface = "org.composefs.Oci")] - async fn list_images( - &self, - handle: u64, - filter: Option, - ) -> std::result::Result { - let images = match self.lookup_oci(handle)? { - OpenRepo::Sha256(ref r) => run_list_images::(r, filter).await, - OpenRepo::Sha512(ref r) => run_list_images::(r, filter).await, - }?; - Ok(ListImagesReply { images }) - } - - /// Run an OCI-aware consistency check on the repository. + /// Mount an image and attach it at `mountpoint` on the server side. /// - /// Renamed on the wire to `Check` so it does not collide with the - /// repository-level `Fsck` method (the dispatch enum keys on the wire - /// method name, which must be globally unique across both interfaces). - #[zlink(interface = "org.composefs.Oci", rename = "Check")] - async fn oci_fsck( + /// For kernel mounts this returns immediately. For FUSE mounts + /// (`options.fuse=true`) the call blocks until the session ends + /// unless `options.wait=false`. + async fn mount_path( &self, handle: u64, - image: Option, - ) -> std::result::Result { - match self.lookup_oci(handle)? { - OpenRepo::Sha256(ref r) => run_oci_fsck::(r, image).await, - OpenRepo::Sha512(ref r) => run_oci_fsck::(r, image).await, - } - } - - /// Inspect a single OCI image. - #[zlink(interface = "org.composefs.Oci")] - async fn inspect( - &self, - handle: u64, - image: String, - ) -> std::result::Result { - match self.lookup_oci(handle)? { - OpenRepo::Sha256(ref r) => run_inspect::(r, image).await, - OpenRepo::Sha512(ref r) => run_inspect::(r, image).await, - } - } - - /// Tag a manifest digest with a name. - #[zlink(interface = "org.composefs.Oci")] - async fn tag( - &self, - handle: u64, - manifest_digest: String, name: String, - ) -> std::result::Result<(), OciError> { - match self.lookup_oci(handle)? { - OpenRepo::Sha256(ref r) => { - run_tag::(r, manifest_digest, name).await + mountpoint: String, + options: MountPathParams, + #[zlink(fds)] fds: Vec, + ) -> std::result::Result { + match self.lookup_repo(handle) { + Ok(OpenRepo::Sha256(r)) => { + run_mount_path::(r, name, mountpoint, options, fds).await } - OpenRepo::Sha512(ref r) => { - run_tag::(r, manifest_digest, name).await + Ok(OpenRepo::Sha512(r)) => { + run_mount_path::(r, name, mountpoint, options, fds).await } + Err(e) => Err(e), } } + } +} - /// Remove a tag. - #[zlink(interface = "org.composefs.Oci")] - async fn untag(&self, handle: u64, name: String) -> std::result::Result<(), OciError> { - match self.lookup_oci(handle)? { - OpenRepo::Sha256(ref r) => run_untag::(r, name).await, - OpenRepo::Sha512(ref r) => run_untag::(r, name).await, - } - } +// Combined variant: hosts BOTH the `org.composefs.Repository` and +// `org.composefs.Oci` interfaces from a single impl block on `CfsctlService`, +// so one service answers both interfaces on one socket. The #[zlink::service] +// macro generates a dispatch enum keyed on the wire method name, so methods +// cannot be individually cfg-gated inside one impl block — the macro sees all +// methods at expansion time. We therefore keep separate impl blocks for each +// (oci, fuse) combination. - /// Compute the composefs image ID for an OCI image. - #[zlink(interface = "org.composefs.Oci")] - async fn compute_id( - &self, - handle: u64, - image: String, - verity: Option, - bootable: bool, - ) -> std::result::Result { - match self.lookup_oci(handle)? { - OpenRepo::Sha256(ref r) => { - run_compute_id::(r, image, verity, bootable).await - } - OpenRepo::Sha512(ref r) => { - run_compute_id::(r, image, verity, bootable).await - } - } - } +#[cfg(all(feature = "oci", feature = "fuse"))] +mod service_impl { + #![allow(missing_docs)] - /// Pull an OCI image into the repository, streaming progress. + use super::oci::{ + ListImagesReply, OciComputeIdReply, OciError, OciFsckReply, OciInspectReply, PullProgress, + parse_local_fetch, pull_stream, + }; + use super::{ + CfsctlService, FsckReply, GcReply, ImageObjectsReply, InitRepositoryReply, MountParams, + MountPathParams, MountPathReply, MountReply, OpenRepo, OpenRepositoryReply, + RepositoryError, run_compute_id, run_fsck, run_gc, run_image_objects, run_init_repository, + run_inspect, run_list_images, run_mount, run_mount_path, run_oci_fsck, run_oci_mount, + run_oci_mount_path, run_tag, run_untag, + }; + use composefs::fsverity::{Algorithm, Sha256HashValue, Sha512HashValue}; + + #[zlink::service( + interface = "org.composefs.Repository", + vendor = "org.composefs", + product = "cfsctl", + version = env!("CARGO_PKG_VERSION"), + url = "https://github.com/composefs/composefs-rs" + )] + impl CfsctlService { + // --- org.composefs.Repository (inherits the seeded interface) --- + + /// Initialize a new repository at the given path, or verify that an + /// existing one matches the requested algorithm (idempotent). /// - /// Emits zero or more intermediate [`PullProgress`] frames describing - /// fetch progress (only when `more` is true), followed by exactly one - /// terminal frame whose `completed` field is set, carrying the pull result. - #[zlink(interface = "org.composefs.Oci", more)] - #[allow(clippy::too_many_arguments)] - async fn pull( - &self, + /// Creates the directory (and any parents) if they do not exist. + /// `algorithm` must be a valid fs-verity algorithm string such as + /// `"fsverity-sha512-12"` (the default) or `"fsverity-sha256-12"`. + /// When omitted the service default (`fsverity-sha512-12`) is used. + /// The `insecure` flag mirrors `cfsctl init --insecure`: when `true`, + /// fs-verity is not required on `meta.json`. + async fn init_repository( + &mut self, + path: String, + algorithm: Option, + insecure: Option, + ) -> std::result::Result { + let algorithm: Algorithm = algorithm + .as_deref() + .unwrap_or("fsverity-sha512-12") + .parse() + .map_err(|e| RepositoryError::InvalidSpec { + message: format!("invalid algorithm: {e}"), + })?; + let insecure = insecure.unwrap_or(self.open_opts.insecure); + run_init_repository(std::path::Path::new(&path), algorithm, insecure) + } + + /// Open and validate a repository, returning an opaque handle. + /// + /// Exactly one of `path`, `user`, `system` must be set. + async fn open_repository( + &mut self, + path: Option, + user: Option, + system: Option, + #[zlink(connection)] conn: &mut zlink::Connection, + ) -> std::result::Result { + let selected = Self::resolve_selector(path, user, system)?; + let handle = self.do_open(&selected, Some(conn.id()))?; + Ok(OpenRepositoryReply { handle }) + } + + /// Close a previously opened repository handle. + async fn close_repository( + &mut self, + handle: u64, + ) -> std::result::Result<(), RepositoryError> { + self.repos + .remove(&handle) + .map(|_| ()) + .ok_or(RepositoryError::InvalidHandle { handle }) + } + + /// Check repository integrity and return the structured result. + /// + /// When `metadata_only` is true, the expensive per-object fs-verity + /// verification is skipped; only metadata and symlink structure are + /// checked. + async fn fsck( + &self, + handle: u64, + metadata_only: Option, + ) -> std::result::Result { + let metadata_only = metadata_only.unwrap_or(false); + let result = match self.lookup_repo(handle)? { + OpenRepo::Sha256(ref r) => run_fsck::(r, metadata_only).await, + OpenRepo::Sha512(ref r) => run_fsck::(r, metadata_only).await, + }?; + Ok(FsckReply::from(&result)) + } + + /// Run garbage collection (or a dry run) and return what was removed. + async fn gc( + &self, + handle: u64, + dry_run: bool, + roots: Vec, + ) -> std::result::Result { + match self.lookup_repo(handle)? { + OpenRepo::Sha256(ref r) => run_gc::(r, dry_run, roots).await, + OpenRepo::Sha512(ref r) => run_gc::(r, dry_run, roots).await, + } + } + + /// List the objects referenced by a single image. + async fn image_objects( + &self, + handle: u64, + name: String, + ) -> std::result::Result { + match self.lookup_repo(handle)? { + OpenRepo::Sha256(ref r) => run_image_objects::(r, name).await, + OpenRepo::Sha512(ref r) => run_image_objects::(r, name).await, + } + } + + /// Create a detached mount of an image and return the mount fd. + /// + /// If overlay upper/work directories are needed, pass them as two fds + /// (upperdir, workdir) via SCM_RIGHTS. The returned fd is a detached + /// mount that the caller can attach with `move_mount()`. When + /// `options.fuse` is true, a detached FUSE mount fd is returned and a + /// background task serves the session for as long as it is held. + #[zlink(return_fds)] + async fn mount( + &self, + handle: u64, + name: String, + options: MountParams, + #[zlink(fds)] fds: Vec, + ) -> ( + std::result::Result, + Vec, + ) { + let result = match self.lookup_repo(handle) { + Ok(OpenRepo::Sha256(r)) => { + run_mount::(r, name, options, fds).await + } + Ok(OpenRepo::Sha512(r)) => { + run_mount::(r, name, options, fds).await + } + Err(e) => Err(e), + }; + match result { + Ok((reply, fds)) => (Ok(reply), fds), + Err(e) => (Err(e), vec![]), + } + } + + /// Mount an image and attach it at `mountpoint` on the server side. + /// + /// For kernel mounts this returns immediately. For FUSE mounts + /// (`options.fuse=true`) the call blocks until the session ends + /// unless `options.wait=false`. + async fn mount_path( + &self, + handle: u64, + name: String, + mountpoint: String, + options: MountPathParams, + #[zlink(fds)] fds: Vec, + ) -> std::result::Result { + match self.lookup_repo(handle) { + Ok(OpenRepo::Sha256(r)) => { + run_mount_path::(r, name, mountpoint, options, fds).await + } + Ok(OpenRepo::Sha512(r)) => { + run_mount_path::(r, name, mountpoint, options, fds).await + } + Err(e) => Err(e), + } + } + + // --- org.composefs.Oci --- + // + // The first OCI method sets `interface = "org.composefs.Oci"`; the + // macro then keeps that interface sticky for subsequent methods. Each + // OCI method is still annotated explicitly for clarity. + + /// List tagged OCI images in the repository. + /// + /// When `filter` is given, only images whose name contains that + /// substring are returned. + #[zlink(interface = "org.composefs.Oci")] + async fn list_images( + &self, + handle: u64, + filter: Option, + ) -> std::result::Result { + let images = match self.lookup_oci(handle)? { + OpenRepo::Sha256(ref r) => run_list_images::(r, filter).await, + OpenRepo::Sha512(ref r) => run_list_images::(r, filter).await, + }?; + Ok(ListImagesReply { images }) + } + + /// Run an OCI-aware consistency check on the repository. + /// + /// Renamed on the wire to `Check` so it does not collide with the + /// repository-level `Fsck` method (the dispatch enum keys on the wire + /// method name, which must be globally unique across both interfaces). + #[zlink(interface = "org.composefs.Oci", rename = "Check")] + async fn oci_fsck( + &self, + handle: u64, + image: Option, + ) -> std::result::Result { + match self.lookup_oci(handle)? { + OpenRepo::Sha256(ref r) => run_oci_fsck::(r, image).await, + OpenRepo::Sha512(ref r) => run_oci_fsck::(r, image).await, + } + } + + /// Inspect a single OCI image. + #[zlink(interface = "org.composefs.Oci")] + async fn inspect( + &self, + handle: u64, + image: String, + ) -> std::result::Result { + match self.lookup_oci(handle)? { + OpenRepo::Sha256(ref r) => run_inspect::(r, image).await, + OpenRepo::Sha512(ref r) => run_inspect::(r, image).await, + } + } + + /// Tag a manifest digest with a name. + #[zlink(interface = "org.composefs.Oci")] + async fn tag( + &self, + handle: u64, + manifest_digest: String, + name: String, + ) -> std::result::Result<(), OciError> { + match self.lookup_oci(handle)? { + OpenRepo::Sha256(ref r) => { + run_tag::(r, manifest_digest, name).await + } + OpenRepo::Sha512(ref r) => { + run_tag::(r, manifest_digest, name).await + } + } + } + + /// Remove a tag. + #[zlink(interface = "org.composefs.Oci")] + async fn untag(&self, handle: u64, name: String) -> std::result::Result<(), OciError> { + match self.lookup_oci(handle)? { + OpenRepo::Sha256(ref r) => run_untag::(r, name).await, + OpenRepo::Sha512(ref r) => run_untag::(r, name).await, + } + } + + /// Compute the composefs image ID for an OCI image. + #[zlink(interface = "org.composefs.Oci")] + async fn compute_id( + &self, + handle: u64, + image: String, + verity: Option, + bootable: bool, + ) -> std::result::Result { + match self.lookup_oci(handle)? { + OpenRepo::Sha256(ref r) => { + run_compute_id::(r, image, verity, bootable).await + } + OpenRepo::Sha512(ref r) => { + run_compute_id::(r, image, verity, bootable).await + } + } + } + + /// Mount an OCI image and return the detached mount fd. + /// + /// Resolves the image by ref name or `sha256:`/`sha512:` digest, finds + /// its EROFS image (or boot variant if `bootable` is true), and creates + /// a composefs mount. If `options.overlay` is true, the fd array must + /// contain upperdir and workdir fds. When `options.fuse` is true, a + /// detached FUSE mount fd is returned and a background task serves the + /// session for as long as it is held. + #[zlink(interface = "org.composefs.Oci", return_fds)] + async fn oci_mount( + &self, + handle: u64, + image: String, + bootable: bool, + options: MountParams, + #[zlink(fds)] fds: Vec, + ) -> ( + std::result::Result, + Vec, + ) { + let result = match self.lookup_oci(handle) { + Ok(OpenRepo::Sha256(r)) => { + run_oci_mount::(r, image, bootable, options, fds).await + } + Ok(OpenRepo::Sha512(r)) => { + run_oci_mount::(r, image, bootable, options, fds).await + } + Err(e) => Err(e), + }; + match result { + Ok((reply, fds)) => (Ok(reply), fds), + Err(e) => (Err(e), vec![]), + } + } + + /// Mount an OCI image and attach it at `mountpoint` on the server side. + /// + /// For kernel mounts this returns immediately. For FUSE mounts + /// (`options.fuse=true`) the call blocks until the session ends + /// unless `options.wait=false`. + #[zlink(interface = "org.composefs.Oci")] + async fn oci_mount_path( + &self, + handle: u64, + image: String, + bootable: bool, + mountpoint: String, + options: MountPathParams, + #[zlink(fds)] fds: Vec, + ) -> std::result::Result { + match self.lookup_oci(handle) { + Ok(OpenRepo::Sha256(r)) => { + run_oci_mount_path::( + r, image, bootable, mountpoint, options, fds, + ) + .await + } + Ok(OpenRepo::Sha512(r)) => { + run_oci_mount_path::( + r, image, bootable, mountpoint, options, fds, + ) + .await + } + Err(e) => Err(e), + } + } + + /// Pull an OCI image into the repository, streaming progress. + /// + /// Emits zero or more intermediate [`PullProgress`] frames describing + /// fetch progress (only when `more` is true), followed by exactly one + /// terminal frame whose `completed` field is set, carrying the pull result. + #[zlink(interface = "org.composefs.Oci", more)] + #[allow(clippy::too_many_arguments)] + async fn pull( + &self, + more: bool, + handle: u64, + image: String, + name: Option, + local_fetch: String, + storage_root: Option, + bootable: bool, + ) -> impl zlink::futures_util::Stream< + Item = std::result::Result, OciError>, + > + Send { + let lf = parse_local_fetch(&local_fetch); + let sr = storage_root.map(std::path::PathBuf::from); + // Resolve the handle synchronously and clone an owned Arc out so the + // returned stream owns everything it needs ('static). On a missing + // handle, yield a one-shot error stream (`pull_stream` and the + // error path share the same boxed-trait-object return type). + match self.repos.get(&handle).map(|entry| &entry.repo) { + Some(OpenRepo::Sha256(r)) => { + pull_stream::(r.clone(), image, name, lf, sr, bootable, more) + } + Some(OpenRepo::Sha512(r)) => { + pull_stream::(r.clone(), image, name, lf, sr, bootable, more) + } + None => { + use zlink::futures_util::stream; + Box::pin(stream::once(async move { + Err(OciError::InvalidHandle { handle }) + })) + } + } + } + } +} + +// OCI enabled, FUSE disabled. +#[cfg(all(feature = "oci", not(feature = "fuse")))] +mod service_impl { + #![allow(missing_docs)] + + use super::oci::{ + ListImagesReply, OciComputeIdReply, OciError, OciFsckReply, OciInspectReply, PullProgress, + parse_local_fetch, pull_stream, + }; + use super::{ + CfsctlService, FsckReply, GcReply, ImageObjectsReply, InitRepositoryReply, MountParams, + MountPathParams, MountPathReply, MountReply, OpenRepo, OpenRepositoryReply, + RepositoryError, run_compute_id, run_fsck, run_gc, run_image_objects, run_init_repository, + run_inspect, run_list_images, run_mount, run_mount_path, run_oci_fsck, run_oci_mount, + run_oci_mount_path, run_tag, run_untag, + }; + use composefs::fsverity::{Algorithm, Sha256HashValue, Sha512HashValue}; + + #[zlink::service( + interface = "org.composefs.Repository", + vendor = "org.composefs", + product = "cfsctl", + version = env!("CARGO_PKG_VERSION"), + url = "https://github.com/composefs/composefs-rs" + )] + impl CfsctlService { + // --- org.composefs.Repository --- + + /// Initialize a new repository at the given path, or verify that an + /// existing one matches the requested algorithm (idempotent). + /// + /// Creates the directory (and any parents) if they do not exist. + /// `algorithm` must be a valid fs-verity algorithm string such as + /// `"fsverity-sha512-12"` (the default) or `"fsverity-sha256-12"`. + /// When omitted the service default (`fsverity-sha512-12`) is used. + /// The `insecure` flag mirrors `cfsctl init --insecure`: when `true`, + /// fs-verity is not required on `meta.json`. + async fn init_repository( + &mut self, + path: String, + algorithm: Option, + insecure: Option, + ) -> std::result::Result { + let algorithm: Algorithm = algorithm + .as_deref() + .unwrap_or("fsverity-sha512-12") + .parse() + .map_err(|e| RepositoryError::InvalidSpec { + message: format!("invalid algorithm: {e}"), + })?; + let insecure = insecure.unwrap_or(self.open_opts.insecure); + run_init_repository(std::path::Path::new(&path), algorithm, insecure) + } + + /// Open and validate a repository, returning an opaque handle. + /// + /// Exactly one of `path`, `user`, `system` must be set. + async fn open_repository( + &mut self, + path: Option, + user: Option, + system: Option, + #[zlink(connection)] conn: &mut zlink::Connection, + ) -> std::result::Result { + let selected = Self::resolve_selector(path, user, system)?; + let handle = self.do_open(&selected, Some(conn.id()))?; + Ok(OpenRepositoryReply { handle }) + } + + /// Close a previously opened repository handle. + async fn close_repository( + &mut self, + handle: u64, + ) -> std::result::Result<(), RepositoryError> { + self.repos + .remove(&handle) + .map(|_| ()) + .ok_or(RepositoryError::InvalidHandle { handle }) + } + + /// Check repository integrity and return the structured result. + /// + /// When `metadata_only` is true, the expensive per-object fs-verity + /// verification is skipped; only metadata and symlink structure are + /// checked. + async fn fsck( + &self, + handle: u64, + metadata_only: Option, + ) -> std::result::Result { + let metadata_only = metadata_only.unwrap_or(false); + let result = match self.lookup_repo(handle)? { + OpenRepo::Sha256(ref r) => run_fsck::(r, metadata_only).await, + OpenRepo::Sha512(ref r) => run_fsck::(r, metadata_only).await, + }?; + Ok(FsckReply::from(&result)) + } + + /// Run garbage collection (or a dry run) and return what was removed. + async fn gc( + &self, + handle: u64, + dry_run: bool, + roots: Vec, + ) -> std::result::Result { + match self.lookup_repo(handle)? { + OpenRepo::Sha256(ref r) => run_gc::(r, dry_run, roots).await, + OpenRepo::Sha512(ref r) => run_gc::(r, dry_run, roots).await, + } + } + + /// List the objects referenced by a single image. + async fn image_objects( + &self, + handle: u64, + name: String, + ) -> std::result::Result { + match self.lookup_repo(handle)? { + OpenRepo::Sha256(ref r) => run_image_objects::(r, name).await, + OpenRepo::Sha512(ref r) => run_image_objects::(r, name).await, + } + } + + /// Create a detached mount of an image and return the mount fd. + /// + /// If overlay upper/work directories are needed, pass them as two fds + /// (upperdir, workdir) via SCM_RIGHTS. The returned fd is a detached + /// mount that the caller can attach with `move_mount()`. + #[zlink(return_fds)] + async fn mount( + &self, + handle: u64, + name: String, + options: MountParams, + #[zlink(fds)] fds: Vec, + ) -> ( + std::result::Result, + Vec, + ) { + let result = match self.lookup_repo(handle) { + Ok(OpenRepo::Sha256(r)) => { + run_mount::(r, name, options, fds).await + } + Ok(OpenRepo::Sha512(r)) => { + run_mount::(r, name, options, fds).await + } + Err(e) => Err(e), + }; + match result { + Ok((reply, fds)) => (Ok(reply), fds), + Err(e) => (Err(e), vec![]), + } + } + + /// Mount an image and attach it at `mountpoint` on the server side. + /// + /// For kernel mounts this returns immediately. + async fn mount_path( + &self, + handle: u64, + name: String, + mountpoint: String, + options: MountPathParams, + #[zlink(fds)] fds: Vec, + ) -> std::result::Result { + match self.lookup_repo(handle) { + Ok(OpenRepo::Sha256(r)) => { + run_mount_path::(r, name, mountpoint, options, fds).await + } + Ok(OpenRepo::Sha512(r)) => { + run_mount_path::(r, name, mountpoint, options, fds).await + } + Err(e) => Err(e), + } + } + + // --- org.composefs.Oci --- + + /// List tagged OCI images in the repository. + /// + /// When `filter` is given, only images whose name contains that + /// substring are returned. + #[zlink(interface = "org.composefs.Oci")] + async fn list_images( + &self, + handle: u64, + filter: Option, + ) -> std::result::Result { + let images = match self.lookup_oci(handle)? { + OpenRepo::Sha256(ref r) => run_list_images::(r, filter).await, + OpenRepo::Sha512(ref r) => run_list_images::(r, filter).await, + }?; + Ok(ListImagesReply { images }) + } + + /// Run an OCI-aware consistency check on the repository. + /// + /// Renamed on the wire to `Check` so it does not collide with the + /// repository-level `Fsck` method (the dispatch enum keys on the wire + /// method name, which must be globally unique across both interfaces). + #[zlink(interface = "org.composefs.Oci", rename = "Check")] + async fn oci_fsck( + &self, + handle: u64, + image: Option, + ) -> std::result::Result { + match self.lookup_oci(handle)? { + OpenRepo::Sha256(ref r) => run_oci_fsck::(r, image).await, + OpenRepo::Sha512(ref r) => run_oci_fsck::(r, image).await, + } + } + + /// Inspect a single OCI image. + #[zlink(interface = "org.composefs.Oci")] + async fn inspect( + &self, + handle: u64, + image: String, + ) -> std::result::Result { + match self.lookup_oci(handle)? { + OpenRepo::Sha256(ref r) => run_inspect::(r, image).await, + OpenRepo::Sha512(ref r) => run_inspect::(r, image).await, + } + } + + /// Tag a manifest digest with a name. + #[zlink(interface = "org.composefs.Oci")] + async fn tag( + &self, + handle: u64, + manifest_digest: String, + name: String, + ) -> std::result::Result<(), OciError> { + match self.lookup_oci(handle)? { + OpenRepo::Sha256(ref r) => { + run_tag::(r, manifest_digest, name).await + } + OpenRepo::Sha512(ref r) => { + run_tag::(r, manifest_digest, name).await + } + } + } + + /// Remove a tag. + #[zlink(interface = "org.composefs.Oci")] + async fn untag(&self, handle: u64, name: String) -> std::result::Result<(), OciError> { + match self.lookup_oci(handle)? { + OpenRepo::Sha256(ref r) => run_untag::(r, name).await, + OpenRepo::Sha512(ref r) => run_untag::(r, name).await, + } + } + + /// Compute the composefs image ID for an OCI image. + #[zlink(interface = "org.composefs.Oci")] + async fn compute_id( + &self, + handle: u64, + image: String, + verity: Option, + bootable: bool, + ) -> std::result::Result { + match self.lookup_oci(handle)? { + OpenRepo::Sha256(ref r) => { + run_compute_id::(r, image, verity, bootable).await + } + OpenRepo::Sha512(ref r) => { + run_compute_id::(r, image, verity, bootable).await + } + } + } + + /// Pull an OCI image into the repository, streaming progress. + /// + /// Emits zero or more intermediate [`PullProgress`] frames describing + /// fetch progress (only when `more` is true), followed by exactly one + /// terminal frame whose `completed` field is set, carrying the pull result. + #[zlink(interface = "org.composefs.Oci", more)] + #[allow(clippy::too_many_arguments)] + async fn pull( + &self, more: bool, handle: u64, image: String, @@ -1161,10 +2025,6 @@ mod service_impl { > + Send { let lf = parse_local_fetch(&local_fetch); let sr = storage_root.map(std::path::PathBuf::from); - // Resolve the handle synchronously and clone an owned Arc out so the - // returned stream owns everything it needs ('static). On a missing - // handle, yield a one-shot error stream (`pull_stream` and the - // error path share the same boxed-trait-object return type). match self.repos.get(&handle).map(|entry| &entry.repo) { Some(OpenRepo::Sha256(r)) => { pull_stream::(r.clone(), image, name, lf, sr, bootable, more) @@ -1200,11 +2060,11 @@ mod service_impl { Vec, ) { let result = match self.lookup_oci(handle) { - Ok(OpenRepo::Sha256(ref r)) => { - run_oci_mount::(r, &image, bootable, &options, fds) + Ok(OpenRepo::Sha256(r)) => { + run_oci_mount::(r, image, bootable, options, fds).await } - Ok(OpenRepo::Sha512(ref r)) => { - run_oci_mount::(r, &image, bootable, &options, fds) + Ok(OpenRepo::Sha512(r)) => { + run_oci_mount::(r, image, bootable, options, fds).await } Err(e) => Err(e), }; @@ -1213,6 +2073,36 @@ mod service_impl { Err(e) => (Err(e), vec![]), } } + + /// Mount an OCI image and attach it at `mountpoint` on the server side. + /// + /// For kernel mounts this returns immediately. + #[zlink(interface = "org.composefs.Oci")] + async fn oci_mount_path( + &self, + handle: u64, + image: String, + bootable: bool, + mountpoint: String, + options: MountPathParams, + #[zlink(fds)] fds: Vec, + ) -> std::result::Result { + match self.lookup_oci(handle) { + Ok(OpenRepo::Sha256(r)) => { + run_oci_mount_path::( + r, image, bootable, mountpoint, options, fds, + ) + .await + } + Ok(OpenRepo::Sha512(r)) => { + run_oci_mount_path::( + r, image, bootable, mountpoint, options, fds, + ) + .await + } + Err(e) => Err(e), + } + } } } diff --git a/crates/composefs-fuse/src/lib.rs b/crates/composefs-fuse/src/lib.rs index 773119c1..a5e3bcf9 100644 --- a/crates/composefs-fuse/src/lib.rs +++ b/crates/composefs-fuse/src/lib.rs @@ -9,33 +9,26 @@ use std::{ collections::HashMap, ffi::OsStr, + num::NonZeroUsize, os::{ fd::{AsFd, AsRawFd, OwnedFd}, unix::ffi::OsStrExt, }, + path::Path, sync::{Arc, Mutex}, time::{Duration, SystemTime}, }; -use anyhow::Context; use fuser::{ - Config, FileAttr, FileHandle, FileType, Filesystem, FopenFlags, Generation, INodeNo, OpenFlags, - ReplyAttr, ReplyData, ReplyDirectory, ReplyEntry, ReplyOpen, Request, Session, SessionACL, -}; -use rustix::{ - buffer::spare_capacity, - fs::{Mode, OFlags, open}, - io::pread, - mount::{ - FsMountFlags, MountAttrFlags, fsconfig_create, fsconfig_set_flag, fsconfig_set_string, - fsmount, - }, + BackingId, Config, FileAttr, FileHandle, FileType, Filesystem, FopenFlags, Generation, INodeNo, + InitFlags, KernelConfig, MountOption, OpenFlags, ReplyAttr, ReplyData, ReplyDirectory, + ReplyDirectoryPlus, ReplyEntry, ReplyOpen, Request, Session, SessionACL, }; +use rustix::{buffer::spare_capacity, io::pread}; use composefs::{ fsverity::FsVerityHashValue, generic_tree::LeafId, - mount::FsHandle, repository::Repository, tree::{Directory, FileSystem, Inode, Leaf, LeafContent, RegularFile, Stat}, }; @@ -135,7 +128,10 @@ fn leaf_size(leaf: &Leaf) -> u64 { } fn stat_mtime(stat: &Stat) -> SystemTime { - SystemTime::UNIX_EPOCH + Duration::from_secs(stat.st_mtim_sec as u64) + // Container image timestamps are virtually always post-epoch (positive), + // so we treat negative values as epoch rather than wrapping to the far future. + let secs = stat.st_mtim_sec.max(0) as u64; + SystemTime::UNIX_EPOCH + Duration::from_secs(secs) } fn dir_fileattr(dir: &Directory, ino: Ino, nlinks: u32) -> FileAttr { @@ -294,10 +290,26 @@ fn build_inode_table(fs: &FileSystem) -> } /// An open file handle: either a real fd (for external objects) or inline data. -#[derive(Debug)] +/// +/// Both variants are wrapped in `Arc` so that `read()` can clone the handle +/// cheaply and drop the `FuseHandles` lock before issuing the actual I/O. +/// Without this, all `pread` calls would be serialised on the single mutex. +#[derive(Debug, Clone)] enum OpenHandle { - Fd(OwnedFd), - Data(Box<[u8]>), + /// An `OwnedFd` shared via `Arc` so threads can read concurrently. + Fd(Arc), + /// Immutable inline bytes, shared via `Arc` for cheap clone-and-read. + Data(Arc<[u8]>), + /// A FUSE passthrough backing id. The kernel reads directly from the + /// backing fd; userspace read() is never called for this handle. + /// Both fields must be kept alive until release(): the `OwnedFd` is the + /// file the kernel reads through, and dropping the `BackingId` sends + /// `FUSE_DEV_IOC_BACKING_CLOSE` to deregister it. + #[allow(dead_code)] + Passthrough { + backing_id: Arc, + fd: Arc, + }, } /// Mutable runtime state: only tracks open file handles. @@ -322,6 +334,11 @@ struct TreeFuse { lookup: InodeLookup, /// Mutable handle state, protected for thread safety. handles: Mutex, + /// Whether the caller requested FUSE passthrough. Only negotiated with the + /// kernel when this is true; always false until the caller opts in. + passthrough_requested: bool, + /// Whether FUSE passthrough was successfully negotiated with the kernel. + passthrough_enabled: std::sync::atomic::AtomicBool, } impl TreeFuse { @@ -331,6 +348,12 @@ impl TreeFuse { } /// Resolve a directory inode to a `&Directory` by walking the stored path. + /// + /// The path walk is O(depth × log(entries_per_dir)) which is acceptable for + /// typical container image trees (depth < 20). A future optimisation could + /// store a pre-built `Vec<*const Directory>` indexed by `(ino-1)` for + /// O(1) resolution, but that would require either `unsafe` raw pointers or + /// a significant redesign of ownership. fn resolve_dir(&self, ino: Ino) -> Option<&Directory> { let InodeData::Dir { path, .. } = self.get_data(ino)? else { return None; @@ -378,10 +401,38 @@ impl TreeFuse { } impl Filesystem for TreeFuse { + fn init(&mut self, _req: &Request, config: &mut KernelConfig) -> std::io::Result<()> { + if self.passthrough_requested + && config.capabilities().contains(InitFlags::FUSE_PASSTHROUGH) + && config.add_capabilities(InitFlags::FUSE_PASSTHROUGH).is_ok() + { + match config.set_max_stack_depth(2) { + Ok(_) => { + self.passthrough_enabled + .store(true, std::sync::atomic::Ordering::Relaxed); + log::debug!("FUSE passthrough enabled"); + } + Err(current) => { + log::warn!( + "FUSE passthrough: set_max_stack_depth(2) failed \ + (current={current}), disabling passthrough" + ); + } + } + } + Ok(()) + } + fn statfs(&self, _req: &Request, _ino: INodeNo, reply: fuser::ReplyStatfs) { reply.statfs(0, 0, 0, 0, 0, 4096, 255, 4096); } + /// Forget is a no-op: the inode table is fully pre-built at mount time + /// and lives for the entire session, so there is nothing to free per-inode. + fn forget(&self, _req: &Request, _ino: INodeNo, _nlookup: u64) { + // nothing to do + } + fn lookup(&self, _req: &Request, parent: INodeNo, name: &OsStr, reply: ReplyEntry) { let parent = parent.0; log::trace!("lookup {parent} {name:?}"); @@ -506,6 +557,92 @@ impl Filesystem for TreeFuse { reply.ok(); } + fn readdirplus( + &self, + _req: &Request, + ino: INodeNo, + _fh: FileHandle, + offset: u64, + mut reply: ReplyDirectoryPlus, + ) { + let ino = ino.0; + let Some(InodeData::Dir { + parent_ino, + path: dir_path, + attrs: dir_attrs, + .. + }) = self.get_data(ino) + else { + log::error!("readdirplus({ino}): inode is not a directory"); + return reply.error(fuser::Errno::EBADF); + }; + let parent_ino = *parent_ino; + let dir_path = dir_path.clone(); + let dir_attrs = *dir_attrs; + + let parent_attrs = self + .get_data(parent_ino) + .map(|d| *d.attrs()) + .unwrap_or(dir_attrs); + + let Some(dir) = self.resolve_dir(ino) else { + log::error!("readdirplus({ino}): failed to resolve directory"); + return reply.error(fuser::Errno::EIO); + }; + + let mut cur_offset = offset; + + if cur_offset == 0 { + cur_offset += 1; + if reply.add( + INodeNo(ino), + cur_offset, + ".", + &TTL, + &dir_attrs, + Generation(0), + ) { + return reply.ok(); + } + } + + if cur_offset == 1 { + cur_offset += 1; + if reply.add( + INodeNo(parent_ino), + cur_offset, + "..", + &TTL, + &parent_attrs, + Generation(0), + ) { + return reply.ok(); + } + } + + for (name, inode) in dir.sorted_entries().skip((cur_offset as usize) - 2) { + let child_path = child_path_from(&dir_path, name); + let Some(child_ino) = self.child_ino(inode, &child_path) else { + log::error!("readdirplus({ino}): child {name:?} not in inode table"); + continue; + }; + let child_attrs = self.inode_data[(child_ino as usize) - 1].attrs(); + cur_offset += 1; + if reply.add( + INodeNo(child_ino), + cur_offset, + name, + &TTL, + child_attrs, + Generation(0), + ) { + break; + } + } + + reply.ok(); + } + fn releasedir( &self, _req: &Request, @@ -588,9 +725,53 @@ impl Filesystem for TreeFuse { log::error!("open({ino}): failed to open object"); return reply.error(fuser::Errno::EIO); }; - OpenHandle::Fd(fd) + // If passthrough is enabled, try to register the fd with the + // kernel so reads bypass the userspace path entirely. + if self + .passthrough_enabled + .load(std::sync::atomic::Ordering::Relaxed) + { + let fd = Arc::new(fd); + match reply.open_backing(fd.as_fd()) { + Ok(backing_id) => { + let mut state = + self.handles.lock().expect("fuse handles mutex poisoned"); + let fh = state.next_fh; + state.next_fh += 1; + let backing_id = Arc::new(backing_id); + log::debug!("open({ino}): inserted passthrough handle {fh}"); + state.handles.insert( + fh, + OpenHandle::Passthrough { + backing_id: Arc::clone(&backing_id), + fd: Arc::clone(&fd), + }, + ); + return reply.opened_passthrough( + FileHandle(fh), + FopenFlags::FOPEN_KEEP_CACHE, + &backing_id, + ); + } + Err(err) => { + log::warn!( + "open({ino}): open_backing failed ({err}), disabling passthrough" + ); + self.passthrough_enabled + .store(false, std::sync::atomic::Ordering::Relaxed); + // fall through to userspace-read path below + } + } + // Fallback: unwrap Arc (only one reference at this point) + let fd = Arc::try_unwrap(fd).expect("no other Arc refs"); + OpenHandle::Fd(Arc::new(fd)) + } else { + OpenHandle::Fd(Arc::new(fd)) + } + } + LeafContent::Regular(RegularFile::Inline(data)) => { + OpenHandle::Data(Arc::from(data.as_ref())) } - LeafContent::Regular(RegularFile::Inline(data)) => OpenHandle::Data(data.clone()), _ => { log::error!("open({ino}): not a regular file"); return reply.error(fuser::Errno::EBADF); @@ -602,7 +783,9 @@ impl Filesystem for TreeFuse { state.next_fh += 1; log::debug!("open({ino}): inserted handle {fh}"); state.handles.insert(fh, handle); - reply.opened(FileHandle(fh), FopenFlags::empty()); + // FOPEN_KEEP_CACHE tells the kernel it may reuse cached pages across + // open/close cycles. This is always safe for our read-only filesystem. + reply.opened(FileHandle(fh), FopenFlags::FOPEN_KEEP_CACHE); } fn read( @@ -616,11 +799,17 @@ impl Filesystem for TreeFuse { _lock_owner: Option, reply: ReplyData, ) { - let state = self.handles.lock().expect("fuse handles mutex poisoned"); - match state.handles.get(&fh.0) { + // Clone the Arc handle so we can release the lock before doing I/O. + // Holding the mutex across pread() would serialise all concurrent reads + // onto a single lock, negating the benefit of multithreaded sessions. + let handle = { + let state = self.handles.lock().expect("fuse handles mutex poisoned"); + state.handles.get(&fh.0).cloned() + }; + match handle { Some(OpenHandle::Fd(fd)) => { let mut data = Vec::with_capacity(size as usize); - match pread(fd, spare_capacity(&mut data), offset) { + match pread(&*fd, spare_capacity(&mut data), offset) { Ok(_) => reply.data(&data), Err(errno) => { reply.error(errno_to_fuser(errno)); @@ -632,6 +821,12 @@ impl Filesystem for TreeFuse { let end = (start + size as usize).min(data.len()); reply.data(&data[start..end]); } + Some(OpenHandle::Passthrough { .. }) => { + // The kernel should never call read() on a passthrough handle; + // it reads directly from the backing fd. Handle defensively. + log::error!("read(fh={fh}): unexpected read on passthrough handle"); + reply.error(fuser::Errno::EBADF); + } None => { log::error!("read(fh={fh}): handle does not exist"); reply.error(fuser::Errno::EBADF); @@ -677,16 +872,52 @@ fn errno_to_fuser(errno: rustix::io::Errno) -> fuser::Errno { fuser::Errno::from(std::io::Error::from_raw_os_error(errno.raw_os_error())) } -/// Opens /dev/fuse. +/// Configuration for [`serve_tree_fuse`]. +#[derive(Debug, Default)] +pub struct FuseConfig { + /// Enable FUSE passthrough for external files (Linux 6.9+, requires root + /// and a backing filesystem that supports passthrough reads). + /// + /// When true and the kernel supports `FUSE_PASSTHROUGH`, external file + /// reads are routed directly in-kernel to the repository object fds, + /// eliminating userspace context-switch overhead. + /// + /// Defaults to `false`. Set to `true` only when you know the backing + /// filesystem supports passthrough (e.g. ext4, xfs — not tmpfs). + pub passthrough: bool, +} + +/// Opens `/dev/fuse`, returning the device fd. /// -/// After you do this, you can mount it using [`mount_fuse`] and then start serving requests using -/// [`serve_tree_fuse`]. You might want to do this in different threads, which is why these -/// operations are defined separately. +/// The returned fd should be passed to [`mount_fuse`] (to create a detached mount object +/// via `fsopen`/`fsmount`) and then to [`serve_tree_fuse_fd`] to start serving requests. +/// Splitting open/mount/serve into three steps lets callers attach the mount fd to an +/// arbitrary path — or move it into a container namespace — between `mount_fuse` and +/// `serve_tree_fuse_fd`. pub fn open_fuse() -> anyhow::Result { + use anyhow::Context as _; + use rustix::fs::{Mode, OFlags, open}; open("/dev/fuse", OFlags::RDWR | OFlags::CLOEXEC, Mode::empty()) .context("Unable to open fuse device /dev/fuse") } +/// Read and parse an EROFS image fd into a [`composefs::tree::FileSystem`]. +/// +/// The file is read into a temporary buffer; the returned `FileSystem` is +/// fully owned and does not borrow from the fd or the buffer. +pub fn erofs_fd_to_filesystem( + fd: OwnedFd, +) -> anyhow::Result>> { + use anyhow::Context as _; + use composefs::erofs::reader::erofs_to_filesystem; + use std::io::Read as _; + let mut buf = Vec::new(); + std::fs::File::from(fd) + .read_to_end(&mut buf) + .context("reading EROFS image")?; + erofs_to_filesystem::(&buf).context("parsing EROFS image") +} + /// Options controlling how a FUSE filesystem is mounted. #[derive(Debug, Default)] #[non_exhaustive] @@ -706,12 +937,20 @@ impl FuseMountOptions { } } -/// Mounts a FUSE filesystem with the given /dev/fuse fd. +/// Creates a detached FUSE mount object for `dev_fuse` via the `fsopen`/`fsmount` API. +/// +/// Returns an unattached mount fd. The caller must use [`composefs::mount::mount_at`] (or +/// `move_mount`) to attach it to a path before calling [`serve_tree_fuse_fd`]. /// -/// This does the necessary dance of creating the mount object, given a /dev/fuse device node. In -/// order for this to be useful, you'll also need to call [`serve_tree_fuse`] to actually satisfy -/// the requests for data. +/// This path requires `CAP_SYS_ADMIN` (Linux kernel ≥ 5.2). For unprivileged mounts, +/// use the high-level [`serve_tree_fuse`] instead, which falls back to the `fusermount3` +/// setuid helper automatically. pub fn mount_fuse(dev_fuse: impl AsFd, options: &FuseMountOptions) -> anyhow::Result { + use composefs::mount::FsHandle; + use rustix::mount::{ + FsMountFlags, MountAttrFlags, fsconfig_create, fsconfig_set_flag, fsconfig_set_string, + fsmount, + }; let fusefs = FsHandle::open("fuse")?; fsconfig_set_flag(fusefs.as_fd(), "ro")?; fsconfig_set_flag(fusefs.as_fd(), "default_permissions")?; @@ -735,15 +974,16 @@ pub fn mount_fuse(dev_fuse: impl AsFd, options: &FuseMountOptions) -> anyhow::Re )?) } -/// Serves a FUSE filesystem exposing the content of `filesystem`, backed by `repo`. +/// Build the `TreeFuse` filesystem object and the fuser `Config` from the given inputs. /// -/// You should have called [`mount_fuse`] on the `dev_fuse` fd to establish a mount point. -/// The function blocks until the FUSE session ends. -pub fn serve_tree_fuse( - dev_fuse: OwnedFd, +/// This shared helper factors out the construction work that is common to both +/// [`serve_tree_fuse`] (high-level, path-based) and [`serve_tree_fuse_fd`] +/// (low-level, pre-mounted fd). +fn build_fuse_session_parts( filesystem: Arc>, repo: Arc>, -) -> std::io::Result<()> { + config: FuseConfig, +) -> (TreeFuse, Config) { let InodeTable { data: inode_data, lookup, @@ -755,8 +995,69 @@ pub fn serve_tree_fuse( inode_data, lookup, handles: Mutex::new(FuseHandles::default()), + passthrough_requested: config.passthrough, + passthrough_enabled: std::sync::atomic::AtomicBool::new(false), }; - Session::from_fd(tf, dev_fuse, SessionACL::All, Config::default())? + + let n_threads: usize = std::thread::available_parallelism() + .unwrap_or(NonZeroUsize::new(1).unwrap()) + .get(); + let mut session_config = Config::default(); + session_config.n_threads = Some(n_threads); + // clone_fd gives each worker thread its own /dev/fuse fd via FUSE_DEV_IOC_CLONE, + // avoiding per-request lock contention on the shared channel (Linux 4.5+). + session_config.clone_fd = true; + + (tf, session_config) +} + +/// Mounts and serves a FUSE filesystem exposing the content of `filesystem`, backed by `repo`. +/// +/// Mounts at `mountpoint` and blocks until the session ends (i.e. until the mountpoint is +/// unmounted). Uses `Session::new` which tries the new `fsopen`/`fsmount` kernel API first and +/// automatically falls back to the `fusermount3` setuid helper, so unprivileged callers work +/// without any extra setup. +/// +/// FUSE passthrough I/O is opt-in via [`FuseConfig::passthrough`]. When enabled, the +/// kernel reads object data directly from the backing fd, bypassing userspace entirely +/// for external files. This requires root (`CAP_SYS_ADMIN`) **and** a backing filesystem +/// that supports passthrough reads (e.g. ext4, xfs — not tmpfs). +/// +/// Uses one worker thread per logical CPU with per-thread fd cloning +/// (`FUSE_DEV_IOC_CLONE`) to avoid kernel channel-lock contention under load. +/// This is safe because [`TreeFuse`] is `Send + Sync` and the filesystem is +/// read-only. +pub fn serve_tree_fuse( + mountpoint: impl AsRef, + filesystem: Arc>, + repo: Arc>, + config: FuseConfig, +) -> std::io::Result<()> { + let (tf, mut session_config) = build_fuse_session_parts(filesystem, repo, config); + session_config.mount_options = vec![MountOption::RO, MountOption::DefaultPermissions]; + Session::new(tf, mountpoint.as_ref(), &session_config)? + .spawn()? + .join() +} + +/// Serves a FUSE filesystem over a caller-supplied, pre-mounted `/dev/fuse` fd. +/// +/// Use this together with [`open_fuse`] and [`mount_fuse`] when you need control over +/// the mount lifecycle — for example, to attach the mount fd to a path inside a container +/// namespace with `move_mount` before handing it to the FUSE server. The caller is +/// responsible for attaching the mount fd (via [`composefs::mount::mount_at`]) before +/// calling this function, and for keeping the mount fd alive for the duration of the +/// session. +/// +/// This function blocks until the FUSE session ends. +pub fn serve_tree_fuse_fd( + dev_fuse: OwnedFd, + filesystem: Arc>, + repo: Arc>, + config: FuseConfig, +) -> std::io::Result<()> { + let (tf, session_config) = build_fuse_session_parts(filesystem, repo, config); + Session::from_fd(tf, dev_fuse, SessionACL::All, session_config)? .spawn()? .join() } diff --git a/crates/composefs-integration-tests/Cargo.toml b/crates/composefs-integration-tests/Cargo.toml index b7bfaa75..cceda52d 100644 --- a/crates/composefs-integration-tests/Cargo.toml +++ b/crates/composefs-integration-tests/Cargo.toml @@ -42,13 +42,13 @@ composefs-oci = { workspace = true, features = ["test", "boot", "containers-stor composefs-ctl = { workspace = true, features = ["oci"] } hex = "0.4" libtest-mimic = "0.8" +similar-asserts = "1" linkme = "0.3" ocidir = { workspace = true } paste = "1" rustix = { version = "1", features = ["fs", "process"] } serde = { version = "1", features = ["derive"] } serde_json = "1" -similar-asserts = "1" tar = "0.4" tempfile = "3" tokio = { version = "1", features = ["rt-multi-thread", "macros"] } diff --git a/crates/composefs-integration-tests/src/tests/privileged.rs b/crates/composefs-integration-tests/src/tests/privileged.rs index a5c5f922..6aade53b 100644 --- a/crates/composefs-integration-tests/src/tests/privileged.rs +++ b/crates/composefs-integration-tests/src/tests/privileged.rs @@ -968,3 +968,391 @@ fn privileged_cstor_import_xfs_reflink() -> Result<()> { Ok(()) } integration_test!(privileged_cstor_import_xfs_reflink); + +// ============================================================================ +// FUSE integration test +// ============================================================================ + +/// RAII guard that tears down a `cfsctl mount --fuse` subprocess and its FUSE +/// mount, even if the test panics. +/// +/// The subprocess owns the `/dev/fuse` fd and the `fsmount()` fd that pin the +/// FUSE superblock. Killing it closes those fds, the kernel aborts the +/// connection, then a lazy (`DETACH`) unmount removes the dead mount from +/// the directory tree. +struct MountGuard { + mountpoint: PathBuf, + child: Option, +} + +impl Drop for MountGuard { + fn drop(&mut self) { + if let Some(mut child) = self.child.take() { + let _ = child.kill(); + let _ = child.wait(); + } + let _ = rustix::mount::unmount(&self.mountpoint, rustix::mount::UnmountFlags::DETACH); + } +} + +/// Content for the external files used by [`build_test_filesystem`]. +/// +/// These are defined at module level so the FUSE content-read verification +/// in [`privileged_fuse_dumpfile_roundtrip`] can reconstruct the same bytes +/// without having to pass them out of `build_test_filesystem`. +fn bigfile_content() -> Vec { + // 600 bytes of 'A' — well above MAX_INLINE_CONTENT (512) + vec![b'A'; 600] +} + +fn biglib_content() -> Vec { + // 800 bytes cycling 0..=255 — different pattern, different hash + (0u8..=255).cycle().take(800).collect() +} + +/// Build a synthetic [`FileSystem`] with diverse content: +/// directories, inline regular files (≤64 bytes), external regular files +/// (>512 bytes), symlinks, xattrs, hardlinks, a FIFO, and a character device. +/// +/// The `repo` argument is used to store the external file objects and obtain +/// their fsverity hashes; it must already be initialised and set insecure. +fn build_test_filesystem( + repo: &Repository, +) -> Result> { + use std::collections::BTreeMap; + use std::ffi::OsStr; + + use composefs_oci::composefs::generic_tree::{LeafId, Stat}; + use composefs_oci::composefs::tree::{ + Directory, FileSystem, Inode, Leaf, LeafContent, RegularFile, + }; + + fn dir_stat(mode: u32, uid: u32, gid: u32, mtime: i64) -> Stat { + Stat { + st_mode: mode, + st_uid: uid, + st_gid: gid, + st_mtim_sec: mtime, + st_mtim_nsec: 0, + xattrs: BTreeMap::new(), + } + } + + fn leaf_stat(mode: u32, uid: u32, gid: u32, mtime: i64) -> Stat { + Stat { + st_mode: mode, + st_uid: uid, + st_gid: gid, + st_mtim_sec: mtime, + st_mtim_nsec: 0, + xattrs: BTreeMap::new(), + } + } + + fn leaf_stat_xattr( + mode: u32, + uid: u32, + gid: u32, + mtime: i64, + xattrs: &[(&str, &[u8])], + ) -> Stat { + let mut map = BTreeMap::new(); + for (k, v) in xattrs { + map.insert(OsStr::new(k).into(), Box::from(*v)); + } + Stat { + st_mode: mode, + st_uid: uid, + st_gid: gid, + st_mtim_sec: mtime, + st_mtim_nsec: 0, + xattrs: map, + } + } + + // Root directory stat (with an xattr) + let mut root_xattrs = BTreeMap::new(); + root_xattrs.insert( + OsStr::new("security.selinux").into(), + Box::from(b"system_u:object_r:root_t:s0".as_ref()), + ); + let root_stat = Stat { + st_mode: 0o755, + st_uid: 0, + st_gid: 0, + st_mtim_sec: 1_700_000_000, + st_mtim_nsec: 0, + xattrs: root_xattrs, + }; + + let mut fs = FileSystem::::new(root_stat); + + // Insert leaves in a deterministic order so indices are predictable. + + // leaf 0: /usr/bin/hello (inline file with xattr) + let hello_id = LeafId(fs.leaves.len()); + fs.leaves.push(Leaf { + stat: leaf_stat_xattr(0o755, 0, 0, 1_700_000_001, &[("user.test", b"hello-value")]), + content: LeafContent::Regular(RegularFile::Inline( + b"hello world binary stub".as_ref().into(), + )), + }); + + // leaf 1: /usr/lib/readme.txt (inline file) + let readme_id = LeafId(fs.leaves.len()); + fs.leaves.push(Leaf { + stat: leaf_stat(0o644, 0, 0, 1_700_000_002), + content: LeafContent::Regular(RegularFile::Inline( + b"readme text content\n".as_ref().into(), + )), + }); + + // leaf 2: /etc/hostname (inline file) + let hostname_id = LeafId(fs.leaves.len()); + fs.leaves.push(Leaf { + stat: leaf_stat(0o644, 0, 0, 1_700_000_003), + content: LeafContent::Regular(RegularFile::Inline(b"integration-test\n".as_ref().into())), + }); + + // leaf 3: /usr/lib/os-release (inline file, also target of symlink) + let os_release_id = LeafId(fs.leaves.len()); + fs.leaves.push(Leaf { + stat: leaf_stat(0o644, 0, 0, 1_700_000_004), + content: LeafContent::Regular(RegularFile::Inline(b"ID=test\nNAME=Test\n".as_ref().into())), + }); + + // leaf 4: /etc/os-release (symlink → ../usr/lib/os-release) + let symlink_id = LeafId(fs.leaves.len()); + fs.leaves.push(Leaf { + stat: leaf_stat(0o777, 0, 0, 1_700_000_005), + content: LeafContent::Symlink(OsStr::new("../usr/lib/os-release").into()), + }); + + // leaf 5: /dev/null (char device, major=1 minor=3 → rdev = makedev(1,3) = 259) + let devnull_id = LeafId(fs.leaves.len()); + fs.leaves.push(Leaf { + stat: leaf_stat(0o666, 0, 0, 0), + // rdev = major * 256 + minor for the erofs encoding used here + // Linux makedev(1,3) = (1 << 8) | 3 = 259 + content: LeafContent::CharacterDevice(rustix::fs::makedev(1, 3)), + }); + + // leaf 6: /tmp/fifo (named pipe) + let fifo_id = LeafId(fs.leaves.len()); + fs.leaves.push(Leaf { + stat: leaf_stat(0o644, 0, 0, 1_700_000_006), + content: LeafContent::Fifo, + }); + + // leaf 7: /usr/bin/bigfile (external file, 600 bytes — above MAX_INLINE_CONTENT) + // Exercises the FUSE open()+read() path through Repository::open_object(). + let bigfile_data = bigfile_content(); + let bigfile_hash = repo.ensure_object(&bigfile_data)?; + let bigfile_id = LeafId(fs.leaves.len()); + fs.leaves.push(Leaf { + stat: leaf_stat(0o755, 0, 0, 1_700_000_007), + content: LeafContent::Regular(RegularFile::External( + bigfile_hash, + bigfile_data.len() as u64, + )), + }); + + // leaf 8: /usr/lib/biglib.so (external file, 800 bytes — different pattern) + let biglib_data = biglib_content(); + let biglib_hash = repo.ensure_object(&biglib_data)?; + let biglib_id = LeafId(fs.leaves.len()); + fs.leaves.push(Leaf { + stat: leaf_stat(0o755, 0, 0, 1_700_000_008), + content: LeafContent::Regular(RegularFile::External(biglib_hash, biglib_data.len() as u64)), + }); + + // Now build the directory tree. + + // /usr/bin/ + let mut usr_bin = Directory::::new(dir_stat(0o755, 0, 0, 1_700_000_010)); + usr_bin.insert(OsStr::new("hello"), Inode::leaf(hello_id)); + // hardlink: /usr/bin/hello2 → same leaf as /usr/bin/hello + usr_bin.insert(OsStr::new("hello2"), Inode::leaf(hello_id)); + usr_bin.insert(OsStr::new("bigfile"), Inode::leaf(bigfile_id)); + + // /usr/lib/ + let mut usr_lib = Directory::::new(dir_stat(0o755, 0, 0, 1_700_000_011)); + usr_lib.insert(OsStr::new("readme.txt"), Inode::leaf(readme_id)); + usr_lib.insert(OsStr::new("os-release"), Inode::leaf(os_release_id)); + usr_lib.insert(OsStr::new("biglib.so"), Inode::leaf(biglib_id)); + + // /usr/ + let mut usr = Directory::::new(dir_stat(0o755, 0, 0, 1_700_000_012)); + usr.insert(OsStr::new("bin"), Inode::Directory(Box::new(usr_bin))); + usr.insert(OsStr::new("lib"), Inode::Directory(Box::new(usr_lib))); + + // /etc/ + let mut etc = Directory::::new(dir_stat(0o755, 0, 0, 1_700_000_013)); + etc.insert(OsStr::new("hostname"), Inode::leaf(hostname_id)); + etc.insert(OsStr::new("os-release"), Inode::leaf(symlink_id)); + + // /dev/ + let mut dev = Directory::::new(dir_stat(0o755, 0, 0, 1_700_000_014)); + dev.insert(OsStr::new("null"), Inode::leaf(devnull_id)); + + // /tmp/ + let mut tmp_dir = Directory::::new(dir_stat(0o1777, 0, 0, 1_700_000_015)); + tmp_dir.insert(OsStr::new("fifo"), Inode::leaf(fifo_id)); + + // Root + fs.root + .insert(OsStr::new("usr"), Inode::Directory(Box::new(usr))); + fs.root + .insert(OsStr::new("etc"), Inode::Directory(Box::new(etc))); + fs.root + .insert(OsStr::new("dev"), Inode::Directory(Box::new(dev))); + fs.root + .insert(OsStr::new("tmp"), Inode::Directory(Box::new(tmp_dir))); + + Ok(fs) +} + +/// Mount a composefs [`FileSystem`] via FUSE, generate a dumpfile from the +/// FUSE mount using `cfsctl create-dumpfile`, and assert that it is +/// byte-for-byte identical to the dumpfile produced directly by +/// [`write_dumpfile`] on the same in-memory tree. +/// +/// This validates that the FUSE implementation correctly reports every piece +/// of metadata that the dumpfile format captures: modes, uid/gid, mtimes, +/// xattrs, symlink targets, hardlink structure, and device numbers. +fn privileged_fuse_dumpfile_roundtrip() -> Result<()> { + use std::os::unix::fs::MetadataExt as _; + use std::time::{Duration, Instant}; + + use composefs_oci::composefs::{ + dumpfile::write_dumpfile, + erofs::{ + reader::erofs_to_filesystem, + writer::{ValidatedFileSystem, mkfs_erofs}, + }, + repository::{Repository, RepositoryConfig}, + }; + + if require_privileged("privileged_fuse_dumpfile_roundtrip")?.is_some() { + return Ok(()); + } + + // 1. Temp dir: mountpoint, insecure SHA-256 repo, and EROFS image file. + let work_dir = tempfile::tempdir()?; + let mountpoint = work_dir.path().join("mnt"); + let repo_path = work_dir.path().join("repo"); + let image_path = work_dir.path().join("image.erofs"); + std::fs::create_dir(&mountpoint)?; + std::fs::create_dir(&repo_path)?; + + let repo_fd = rustix::fs::open( + &repo_path, + rustix::fs::OFlags::CLOEXEC | rustix::fs::OFlags::RDONLY, + rustix::fs::Mode::empty(), + )?; + let (mut repo, _created) = Repository::::init_path( + &repo_fd, + ".", + RepositoryConfig::default().set_insecure(), + )?; + repo.set_insecure(); + + // 2. Build the synthetic tree, write external objects to the repo, and + // round-trip through EROFS for canonical form. + let synthetic = build_test_filesystem(&repo)?; + let erofs_bytes = mkfs_erofs(&mut ValidatedFileSystem::new(synthetic)?); + std::fs::write(&image_path, &*erofs_bytes)?; + let canonical_fs = erofs_to_filesystem::(&erofs_bytes)?; + + // 3. Expected dumpfile from the in-memory canonical tree. + let mut expected_buf = Vec::new(); + write_dumpfile(&mut expected_buf, &canonical_fs)?; + let expected_dump = String::from_utf8(expected_buf)?; + + // 4. Record the mountpoint's device number so we can detect when the + // FUSE mount becomes visible (st_dev changes). + let pre_mount_dev = std::fs::metadata(&mountpoint)?.dev(); + + // 5. Spawn `cfsctl mount --raw-image --fuse` in the background. It opens + // /dev/fuse, mounts, attaches at , and serves until killed. + let cfsctl_bin = cfsctl()?; + let child = std::process::Command::new(&cfsctl_bin) + .arg("--repo") + .arg(&repo_path) + .arg("mount") + .arg("--raw-image") + .arg("--fuse") + .arg(&image_path) + .arg(&mountpoint) + .spawn() + .context("spawning cfsctl mount --fuse")?; + + let mut guard = MountGuard { + mountpoint: mountpoint.clone(), + child: Some(child), + }; + + // 6. Poll until the mount is ready: st_dev of mountpoint changes once + // the FUSE filesystem is attached. Bail if the child exits early. + let deadline = Instant::now() + Duration::from_secs(30); + loop { + if let Some(child) = guard.child.as_mut() + && let Some(status) = child.try_wait()? + { + bail!("cfsctl mount --fuse exited before mount was ready: {status}"); + } + if std::fs::metadata(&mountpoint) + .map(|m| m.dev()) + .unwrap_or(pre_mount_dev) + != pre_mount_dev + { + break; + } + if Instant::now() >= deadline { + bail!("timed out waiting for FUSE mount"); + } + std::thread::sleep(Duration::from_millis(20)); + } + + // 7. Verify external file content is served correctly. + let bigfile_actual = std::fs::read(mountpoint.join("usr/bin/bigfile")) + .context("reading bigfile from FUSE mount")?; + ensure!( + bigfile_actual == bigfile_content(), + "bigfile content mismatch: got {} bytes, expected {}", + bigfile_actual.len(), + bigfile_content().len(), + ); + let biglib_actual = std::fs::read(mountpoint.join("usr/lib/biglib.so")) + .context("reading biglib.so from FUSE mount")?; + ensure!( + biglib_actual == biglib_content(), + "biglib.so content mismatch: got {} bytes, expected {}", + biglib_actual.len(), + biglib_content().len(), + ); + + // 8. Generate the actual dumpfile by walking the FUSE mount via cfsctl. + // --no-propagate-usr-to-root preserves raw metadata; --repo points at + // the SHA-256 repo so external-file digests match expected_dump. + let sh = Shell::new()?; + let mp = mountpoint.to_str().context("non-UTF-8 mountpoint")?; + let repo_arg = repo_path.to_str().context("non-UTF-8 repo path")?; + let actual_dump = cmd!( + sh, + "{cfsctl_bin} --repo {repo_arg} create-dumpfile --no-propagate-usr-to-root {mp}" + ) + .read()?; + + // 9. Tear down before asserting so a mismatch doesn't leak the mount. + drop(guard); + + // 10. Compare with a readable diff on mismatch. + similar_asserts::assert_eq!( + expected_dump.trim_end_matches('\n'), + actual_dump.trim_end_matches('\n') + ); + + Ok(()) +} +integration_test!(privileged_fuse_dumpfile_roundtrip);