diff --git a/Cargo.lock b/Cargo.lock index 6281532..d540d9d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -33,7 +33,22 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "43d5b281e737544384e969a5ccad3f1cdd24b48086a0fc1b2a5262a26b8f4f4a" dependencies = [ "anstyle", - "anstyle-parse", + "anstyle-parse 0.2.7", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstream" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "824a212faf96e9acacdbd09febd34438f8f711fb84e09a8916013cd7815ca28d" +dependencies = [ + "anstyle", + "anstyle-parse 1.0.0", "anstyle-query", "anstyle-wincon", "colorchoice", @@ -56,6 +71,15 @@ dependencies = [ "utf8parse", ] +[[package]] +name = "anstyle-parse" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52ce7f38b242319f7cabaa6813055467063ecdc9d355bbb4ce0c68908cd8130e" +dependencies = [ + "utf8parse", +] + [[package]] name = "anstyle-query" version = "1.1.5" @@ -133,9 +157,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.53" +version = "4.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c9e340e012a1bf4935f5282ed1436d1489548e8f72308207ea5df0e23d2d03f8" +checksum = "b193af5b67834b676abd72466a96c1024e6a6ad978a1f484bd90b85c94041351" dependencies = [ "clap_builder", "clap_derive", @@ -143,11 +167,11 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.53" +version = "4.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d76b5d13eaa18c901fd2f7fca939fefe3a0727a953561fefdf3b2922b8569d00" +checksum = "714a53001bf66416adb0e2ef5ac857140e7dc3a0c48fb28b2f10762fc4b5069f" dependencies = [ - "anstream", + "anstream 1.0.0", "anstyle", "clap_lex", "strsim", @@ -155,9 +179,9 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.5.49" +version = "4.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a0b5487afeab2deb2ff4e03a807ad1a03ac532ff5a2cee5d86884440c7f7671" +checksum = "1110bd8a634a1ab8cb04345d8d878267d57c3cf1b38d91b71af6686408bbca6a" dependencies = [ "heck", "proc-macro2", @@ -167,9 +191,9 @@ dependencies = [ [[package]] name = "clap_lex" -version = "0.7.6" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1d728cc89cf3aee9ff92b05e62b19ee65a02b5702cff7d5a377e32c6ae29d8d" +checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9" [[package]] name = "colorchoice" @@ -324,7 +348,7 @@ version = "0.11.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "13c863f0904021b108aa8b2f55046443e6b1ebde8fd4a15c399893aae4fa069f" dependencies = [ - "anstream", + "anstream 0.6.21", "anstyle", "env_filter", "jiff", @@ -588,9 +612,9 @@ checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" [[package]] name = "itoa" -version = "1.0.15" +version = "1.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" +checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2" [[package]] name = "jiff" @@ -638,9 +662,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.178" +version = "0.2.183" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37c93d8daa9d8a012fd8ab92f088405fb202ea0b6ab73ee2482ae66af4f42091" +checksum = "b5b646652bf6661599e1da8901b3b9522896f01e736bad5f723fe7a3a27f899d" [[package]] name = "libgit2-sys" @@ -695,9 +719,9 @@ dependencies = [ [[package]] name = "linux-raw-sys" -version = "0.11.0" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df1d3c3b53da64cf5760482273a98e575c651a67eec7f77df96b5b642de8f039" +checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53" [[package]] name = "litemap" @@ -713,9 +737,9 @@ checksum = "34080505efa8e45a4b816c349525ebe327ceaa8559756f0356cba97ef3bf7432" [[package]] name = "memchr" -version = "2.7.6" +version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273" +checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" [[package]] name = "miniz_oxide" @@ -738,9 +762,9 @@ dependencies = [ [[package]] name = "oci-spec" -version = "0.8.3" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2eb4684653aeaba48dea019caa17b2773e1212e281d50b6fa759f36fe032239d" +checksum = "e8445a2631507cec628a15fdd6154b54a3ab3f20ed4fe9d73a3b8b7a4e1ba03a" dependencies = [ "const_format", "derive_builder", @@ -862,18 +886,18 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.103" +version = "1.0.106" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ee95bc4ef87b8d5ba32e8b7714ccc834865276eab0aed5c9958d00ec45f49e8" +checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" dependencies = [ "unicode-ident", ] [[package]] name = "quote" -version = "1.0.42" +version = "1.0.45" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a338cc41d27e6cc6dce6cefc13a0729dfbb81c262b1f519331575dd80ef3067f" +checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924" dependencies = [ "proc-macro2", ] @@ -907,9 +931,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.13" +version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5276caf25ac86c8d810222b3dbb938e512c55c6831a10f3e6ed1c93b84041f1c" +checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f" dependencies = [ "aho-corasick", "memchr", @@ -924,9 +948,9 @@ checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58" [[package]] name = "rustix" -version = "1.1.2" +version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd15f8a2c5551a84d56efdc1cd049089e409ac19a3072d5037a17fd70719ff3e" +checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190" dependencies = [ "bitflags", "errno", @@ -943,9 +967,9 @@ checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" [[package]] name = "ryu" -version = "1.0.20" +version = "1.0.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" +checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f" [[package]] name = "serde" @@ -979,9 +1003,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.145" +version = "1.0.146" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "402a6f66d8c709116cf22f558eab210f5a50187f702eb4d7e5ef38d9a7f1c79c" +checksum = "217ca874ae0207aac254aa02c957ded05585a90892cc8d87f9e5fa49669dadd8" dependencies = [ "itoa", "memchr", @@ -1040,9 +1064,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.111" +version = "2.0.117" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "390cc9a294ab71bdb1aa2e99d13be9c753cd2d7bd6560c77118597410c4d2e87" +checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" dependencies = [ "proc-macro2", "quote", @@ -1073,9 +1097,9 @@ dependencies = [ [[package]] name = "tempfile" -version = "3.23.0" +version = "3.27.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2d31c77bdf42a745371d260a26ca7163f1e0924b64afa0b688e61b5a9fa02f16" +checksum = "32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd" dependencies = [ "fastrand", "getrandom", @@ -1086,18 +1110,18 @@ dependencies = [ [[package]] name = "thiserror" -version = "2.0.17" +version = "2.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f63587ca0f12b72a0600bcba1d40081f830876000bb46dd2337a3051618f4fc8" +checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "2.0.17" +version = "2.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ff15c8ecd7de3849db632e14d18d2571fa09dfc5ed93479bc4485c7a517c913" +checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" dependencies = [ "proc-macro2", "quote", @@ -1116,9 +1140,9 @@ dependencies = [ [[package]] name = "unicode-ident" -version = "1.0.22" +version = "1.0.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5" +checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" [[package]] name = "unicode-width" diff --git a/Cargo.toml b/Cargo.toml index 1d3d54a..a47a02f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,15 +24,15 @@ exclude = [ ] [dependencies] -clap = { version = "4.5", features = ["derive"] } +clap = { version = "4.6", features = ["derive"] } serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" anyhow = "1.0" -tempfile = "3.20" -flate2 = "1.0" +tempfile = "3.27" +flate2 = "1.1" git2 = "0.20" chrono = "0.4" -oci-spec = { version = "0.8.1", features = ["image"] } +oci-spec = { version = "0.9.0", features = ["image"] } indicatif = "0.18" log = "0.4" env_logger = "0.11" diff --git a/README.md b/README.md index f2834ae..fb1719f 100644 --- a/README.md +++ b/README.md @@ -46,7 +46,7 @@
-A Rust application that converts container images (Docker, etc.) to Git repositories. Each container layer is represented as a Git commit, preserving the history and structure of the original image. +A Rust application that converts container images (Docker, etc.) to Git repositories, and generates filesystem bill of materials (fsbom) in YAML. Each container layer is represented as a Git commit, preserving the history and structure of the original image. ![Demo of OCI2Git converting the nginx image](./assets/nginx.gif) @@ -54,6 +54,7 @@ A Rust application that converts container images (Docker, etc.) to Git reposito - Analyze Docker images and extract layer information - Create a Git repository where each image layer is represented as a commit +- Generate a YAML filesystem bill of materials (fsbom) with per-layer file listings - Support for empty layers (ENV, WORKDIR, etc.) as empty commits - Complete metadata extraction to Markdown format - Extensible architecture for supporting different container engines @@ -208,32 +209,53 @@ cargo install --path . ## Usage -```bash +``` oci2git [OPTIONS] +oci2git convert [OPTIONS] +oci2git fsbom [OPTIONS] +``` + +### `convert` — OCI image → Git repository + +```bash +oci2git convert [OPTIONS] +# or simply: +oci2git ``` -Arguments: - `` Image name to convert (e.g., 'ubuntu:latest') or path to tarball when using the tar engine +Options: + `-o, --output ` Output directory for Git repository [default: ./container_repo] + `-e, --engine ` Container engine to use (docker, nerdctl, tar) [default: docker] + `-v, --verbose` Verbose mode (-v for info, -vv for debug, -vvv for trace) + +### `fsbom` — Filesystem bill of materials + +```bash +oci2git fsbom [OPTIONS] +``` Options: - `-o, --output ` Output directory for Git repository [default: ./container_repo] + `-o, --output ` Output path for the YAML BOM file [default: ./fsbom.yml] `-e, --engine ` Container engine to use (docker, nerdctl, tar) [default: docker] - `-h, --help` Print help information - `-V, --version` Print version information + `-v, --verbose` Verbose mode (-v for info, -vv for debug, -vvv for trace) Environment Variables: `TMPDIR` Set this environment variable to change the default location used for intermediate data processing. This is platform-dependent (e.g., `TMPDIR` on Unix/macOS, `TEMP` or `TMP` on Windows). ## Examples +### Convert + Using Docker engine (default): ```bash -oci2git -o ./ubuntu-repo ubuntu:latest +oci2git ubuntu:latest +# or explicitly: +oci2git convert ubuntu:latest -o ./ubuntu-repo ``` Using an already downloaded image tarball: ```bash -oci2git -e tar -o ./ubuntu-repo /path/to/ubuntu-latest.tar +oci2git convert -e tar -o ./ubuntu-repo /path/to/ubuntu-latest.tar ``` The tar engine expects a valid OCI format tarball, which is typically created with `docker save`: @@ -242,7 +264,7 @@ The tar engine expects a valid OCI format tarball, which is typically created wi docker save -o ubuntu-latest.tar ubuntu:latest # Convert the tarball to a Git repository -oci2git -e tar -o ./ubuntu-repo ubuntu-latest.tar +oci2git convert -e tar -o ./ubuntu-repo ubuntu-latest.tar ``` This will create a Git repository in `./ubuntu-repo` containing: @@ -254,6 +276,55 @@ The Git history reflects the container's layer history: - Each subsequent commit represents a layer from the original image - Commits include the Dockerfile command as the commit message +### Filesystem Bill of Materials (fsbom) + +Generate a YAML listing of every file introduced or modified per layer: +```bash +oci2git fsbom ubuntu:latest -o ubuntu.yml +``` + +Using a tarball: +```bash +oci2git fsbom -e tar image.tar -o image-bom.yml +``` + +The output YAML lists every layer with its entries tagged by type (`file`, `hardlink`, `symlink`, `directory`) and status (`n:uid:gid` for new, `m:uid:gid` for modified). Deleted files (OCI whiteouts) are excluded. + +```yaml +layers: + - index: 0 + command: "ADD rootfs.tar.gz / # buildkit" + digest: "sha256:45f3ea58..." + entries: + - type: file + path: "bin/busybox" + size: 919304 + mode: 493 + stat: "n:0:0" + - type: hardlink + path: "bin/sh" + target: "bin/busybox" + stat: "n:0:0" + - type: symlink + path: "lib64" + target: "lib" + stat: "n:0:0" + - index: 1 + command: "RUN apk add --no-cache curl" + digest: "sha256:..." + entries: + - type: file + path: "usr/bin/curl" + size: 204800 + mode: 493 + stat: "n:0:0" + - type: file + path: "etc/apk/world" + size: 32 + mode: 420 + stat: "m:0:0" +``` + ## Repository Structure ``` diff --git a/src/fsbom.rs b/src/fsbom.rs new file mode 100644 index 0000000..d4ce1b6 --- /dev/null +++ b/src/fsbom.rs @@ -0,0 +1,289 @@ +//! Filesystem Bill of Materials (fsbom) generation. +//! +//! This module provides a pure read-only scan of OCI image layer tarballs to produce +//! a per-layer listing of all files introduced or modified in each layer. +//! +//! Key features: +//! - YAML format (hand-written, no extra dependency) +//! - `status: new | modified` per entry +//! - Deleted files (whiteouts) are excluded from output +//! - Single flat `entries` list per layer with type-tagged entries + +use anyhow::{Context, Result}; +use std::collections::HashSet; +use std::fmt::Write as FmtWrite; +use std::path::Path; +use tar::EntryType; +use tar_rs as tar; + +/// Status of a filesystem entry relative to previous layers. +pub enum EntryStatus { + New, + Modified, +} + +/// A single filesystem entry in a layer's bill of materials. +pub enum FsEntry { + /// Regular file. + File { + path: String, + size: u64, + mode: u32, + stat: String, + }, + /// Hard link — another directory entry pointing to the same inode as `target`. + Hardlink { + path: String, + target: String, + stat: String, + }, + Directory { + path: String, + mode: u32, + stat: String, + }, + /// Soft (symbolic) link. + Symlink { + path: String, + target: String, + stat: String, + }, +} + +fn stat_str(status: EntryStatus, uid: u32, gid: u32) -> String { + let s = match status { + EntryStatus::New => 'n', + EntryStatus::Modified => 'm', + }; + format!("{s}:{uid}:{gid}") +} + +/// Bill of materials for a single image layer. +pub struct LayerBom { + pub index: usize, + pub command: String, + pub digest: String, + pub entries: Vec, +} + +/// Complete filesystem bill of materials for an image. +pub struct FsBom { + pub layers: Vec, +} + +/// Scan a layer tarball and produce a [`LayerBom`]. +/// +/// Reads tar headers only — no extraction to disk. +/// `seen_paths` tracks all paths materialized so far across layers for new/modified detection. +pub fn scan_layer( + tar_path: &Path, + seen_paths: &mut HashSet, + index: usize, + command: String, + digest: String, +) -> Result { + let file = std::fs::File::open(tar_path) + .with_context(|| format!("Failed to open layer tarball: {tar_path:?}"))?; + + // Detect gzip by magic bytes + let mut magic = [0u8; 2]; + { + use std::io::Read; + let mut peek = std::io::BufReader::new(&file); + peek.read_exact(&mut magic).unwrap_or(()); + } + + // Re-open for actual reading + let file = std::fs::File::open(tar_path) + .with_context(|| format!("Failed to re-open layer tarball: {tar_path:?}"))?; + + let mut entries: Vec = Vec::new(); + + if magic == [0x1f, 0x8b] { + // gzip compressed + let gz = flate2::read::GzDecoder::new(file); + let mut archive = tar::Archive::new(gz); + scan_archive(&mut archive, seen_paths, &mut entries)?; + } else { + let mut archive = tar::Archive::new(file); + scan_archive(&mut archive, seen_paths, &mut entries)?; + } + + Ok(LayerBom { + index, + command, + digest, + entries, + }) +} + +fn scan_archive( + archive: &mut tar::Archive, + seen_paths: &mut HashSet, + entries: &mut Vec, +) -> Result<()> { + for entry in archive.entries().context("Failed to read tar entries")? { + let entry: tar::Entry<'_, R> = entry.context("Failed to read tar entry")?; + let header = entry.header(); + + let path = entry + .path() + .context("Failed to read entry path")? + .to_string_lossy() + .trim_start_matches("./") + .to_string(); + + // Handle whiteout files + let file_name = Path::new(&path) + .file_name() + .map(|n| n.to_string_lossy().to_string()) + .unwrap_or_default(); + + if file_name == ".wh..wh..opq" { + // Opaque whiteout: remove all seen_paths entries under parent dir + let parent = Path::new(&path) + .parent() + .map(|p| { + let s = p.to_string_lossy().to_string(); + if s.is_empty() { + String::new() + } else { + format!("{s}/") + } + }) + .unwrap_or_default(); + seen_paths.retain(|p| !p.starts_with(&parent)); + continue; + } + + if let Some(orig_name) = file_name.strip_prefix(".wh.") { + // Regular whiteout: remove the specific path + let parent = Path::new(&path) + .parent() + .map(|p| p.to_string_lossy().to_string()) + .unwrap_or_default(); + let orig_path = if parent.is_empty() { + orig_name.to_string() + } else { + format!("{parent}/{orig_name}") + }; + seen_paths.remove(&orig_path); + continue; + } + + let uid = header.uid().unwrap_or(0) as u32; + let gid = header.gid().unwrap_or(0) as u32; + let mode = header.mode().unwrap_or(0); + + let status = if seen_paths.contains(&path) { + EntryStatus::Modified + } else { + EntryStatus::New + }; + seen_paths.insert(path.clone()); + + let stat = stat_str(status, uid, gid); + + let link_target = || { + header + .link_name() + .ok() + .flatten() + .map(|p: std::borrow::Cow<'_, std::path::Path>| p.to_string_lossy().to_string()) + .unwrap_or_default() + }; + + match header.entry_type() { + EntryType::Regular => { + let size = header.size().unwrap_or(0); + entries.push(FsEntry::File { + path, + size, + mode, + stat, + }); + } + EntryType::Link => { + entries.push(FsEntry::Hardlink { + path, + target: link_target(), + stat, + }); + } + EntryType::Directory => { + entries.push(FsEntry::Directory { path, mode, stat }); + } + EntryType::Symlink => { + entries.push(FsEntry::Symlink { + path, + target: link_target(), + stat, + }); + } + _ => { + // Skip other entry types (block/char devices, fifos, etc.) + } + } + } + Ok(()) +} + +impl FsBom { + /// Serialize this BOM to YAML and write to `path`. + /// + /// Hand-written YAML — no external dependency. + pub fn save_yaml(&self, path: &Path) -> Result<()> { + let mut out = String::new(); + writeln!(out, "layers:").unwrap(); + + for layer in &self.layers { + writeln!(out, " - index: {}", layer.index).unwrap(); + writeln!(out, " command: {:?}", layer.command).unwrap(); + writeln!(out, " digest: {:?}", layer.digest).unwrap(); + writeln!(out, " entries:").unwrap(); + + if layer.entries.is_empty() { + writeln!(out, " []").unwrap(); + } else { + for entry in &layer.entries { + match entry { + FsEntry::File { + path, + size, + mode, + stat, + } => { + writeln!(out, " - type: file").unwrap(); + writeln!(out, " path: {:?}", path).unwrap(); + writeln!(out, " size: {size}").unwrap(); + writeln!(out, " mode: {mode}").unwrap(); + writeln!(out, " stat: {:?}", stat).unwrap(); + } + FsEntry::Hardlink { path, target, stat } => { + writeln!(out, " - type: hardlink").unwrap(); + writeln!(out, " path: {:?}", path).unwrap(); + writeln!(out, " target: {:?}", target).unwrap(); + writeln!(out, " stat: {:?}", stat).unwrap(); + } + FsEntry::Directory { path, mode, stat } => { + writeln!(out, " - type: directory").unwrap(); + writeln!(out, " path: {:?}", path).unwrap(); + writeln!(out, " mode: {mode}").unwrap(); + writeln!(out, " stat: {:?}", stat).unwrap(); + } + FsEntry::Symlink { path, target, stat } => { + writeln!(out, " - type: symlink").unwrap(); + writeln!(out, " path: {:?}", path).unwrap(); + writeln!(out, " target: {:?}", target).unwrap(); + writeln!(out, " stat: {:?}", stat).unwrap(); + } + } + } + } + } + + std::fs::write(path, &out) + .with_context(|| format!("Failed to write fsbom YAML to {path:?}"))?; + Ok(()) + } +} diff --git a/src/lib.rs b/src/lib.rs index e905485..7f05420 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,5 +1,5 @@ -//! This crate is using for converts container OCI/Docker images to Git repositories. -//! The whole image unpacked into Git repo and each container layer is represented as a Git commit, +//! Converts container OCI/Docker images to Git repositories and generates filesystem +//! bills of materials (fsbom). Each container layer is represented as a Git commit, //! preserving the history and structure of the original image. //! //! This lets you use the power of Git to: @@ -9,36 +9,63 @@ //! - Easily compare related images by converting multiple images and using Git’s diff tools //! to see similarities and differences. //! -//! # Usage +//! # Commands //! -//! `oci2git [OPTIONS] ` +//! ## `convert` — OCI image → Git repository +//! +//! ```text +//! oci2git convert [OPTIONS] +//! oci2git +//! ``` +//! +//! Arguments: +//! - `` Image name to convert (e.g., `ubuntu:latest`) or path to tarball when using the tar engine +//! +//! Options: +//! - `-o` `--output` `` Output directory for Git repository `[default: ./container_repo]` +//! - `-e` `--engine` `` Container engine to use (docker, nerdctl, tar) `[default: docker]` +//! - `-v` `--verbose` Verbose mode +//! +//! ## `fsbom` — Filesystem bill of materials +//! +//! ```text +//! oci2git fsbom [OPTIONS] +//! ``` //! //! Arguments: -//! - `` Image name to convert (e.g., 'ubuntu:latest') or path to tarball when using the tar engine -//! - Options: -//! - `-o` `--output` `` Output directory for Git repository `[default: ./container_repo]` -//! - `-e` `--engine` `` Container engine to use (docker, nerdctl, tar) `[default: docker]` -//! - `-h` `--help` Print help information -//! - `-V` `--version` Print version information +//! - `` Image name or path to tarball when using the tar engine +//! +//! Options: +//! - `-o` `--output` `` Output path for the YAML BOM file `[default: ./fsbom.yml]` +//! - `-e` `--engine` `` Container engine to use (docker, nerdctl, tar) `[default: docker]` +//! - `-v` `--verbose` Verbose mode +//! +//! ## Environment Variables //! -//! - Environment Variables: -//! - `TMPDIR` Set this environment variable to change the default location used for intermediate data processing. This is platform-dependent (e.g., TMPDIR on Unix/macOS, TEMP or TMP on Windows). +//! - `TMPDIR` Override the directory used for intermediate data processing +//! (platform-dependent: `TMPDIR` on Unix/macOS, `TEMP` or `TMP` on Windows). //! -//! # Example +//! # Examples //! -//! ```oci2git ubuntu:latest``` +//! Convert an image to a Git repository: +//! ```text +//! oci2git ubuntu:latest +//! ``` //! -//! This will create a Git repository in `./container_repo` folder containing: +//! Generate a filesystem bill of materials: +//! ```text +//! oci2git fsbom ubuntu:latest -o ubuntu.yml +//! ``` //! -//! - Image.md - Complete metadata about the image in Markdown format -//! - rootfs/ - The filesystem content from the container -//! - The Git history reflects the container's layer history: +//! The `convert` command produces a Git repository in `./container_repo` containing: +//! - `Image.md` — Complete metadata about the image +//! - `rootfs/` — The filesystem content from the container //! -//! The first commit contains only the Image.md file with full metadata -//! Each subsequent commit represents a layer from the original image -//! Commits include the Dockerfile command as the commit message +//! The `fsbom` command produces a YAML file with per-layer entries: +//! - `type: file | hardlink | symlink | directory` +//! - `stat: "n:uid:gid"` for new entries, `"m:uid:gid"` for modified +//! - Deleted files (OCI whiteouts) are excluded //! -//! Repository Structure: //! ```text //! container_repo/ //! ├── .git/ @@ -48,6 +75,7 @@ pub mod digest_tracker; pub mod extracted_image; +pub mod fsbom; pub mod git; pub mod image_metadata; pub mod metadata; @@ -59,6 +87,7 @@ pub mod tar_extractor; // Re-exports for easy access pub use extracted_image::{ExtractedImage, Layer}; +pub use fsbom::{FsBom, LayerBom}; pub use git::GitRepo; pub use notifier::Notifier; pub use processor::ImageProcessor; diff --git a/src/main.rs b/src/main.rs index 1c1a6f4..340a21c 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,6 +1,6 @@ use anyhow::{anyhow, Result}; -use clap::{Parser, ValueEnum}; -use std::path::PathBuf; +use clap::{Args, Parser, Subcommand, ValueEnum}; +use std::path::{Path, PathBuf}; use oci2git::{DockerSource, ImageProcessor, NerdctlSource, Notifier, TarSource}; @@ -11,13 +11,87 @@ enum Engine { Tar, } +#[derive(Debug, Args)] +struct ConvertArgs { + #[arg( + help = "Image name to convert (e.g., ubuntu:latest) or path to tarball when using tar engine" + )] + image: String, + + #[arg( + short, + long, + default_value = "./container_repo", + help = "Output directory for Git repository" + )] + output: PathBuf, + + #[arg( + short, + long, + value_enum, + default_value = "docker", + help = "Container engine to use (docker, nerdctl, tar)" + )] + engine: Engine, + + #[arg( + short, + long, + action = clap::ArgAction::Count, + help = "Verbose mode (-v for info, -vv for debug, -vvv for trace)" + )] + verbose: u8, +} + +#[derive(Subcommand)] +enum Commands { + /// Convert an OCI/Docker image to a Git repository + Convert(ConvertArgs), + + /// Generate a YAML filesystem bill of materials + Fsbom { + #[arg(help = "Image name (e.g., ubuntu:latest) or path to tarball when using tar engine")] + image: String, + + #[arg( + short, + long, + default_value = "./fsbom.yml", + help = "Output path for the YAML BOM file" + )] + output: PathBuf, + + #[arg( + short, + long, + value_enum, + default_value = "docker", + help = "Container engine to use (docker, nerdctl, tar)" + )] + engine: Engine, + + #[arg( + short, + long, + action = clap::ArgAction::Count, + help = "Verbose mode (-v for info, -vv for debug, -vvv for trace)" + )] + verbose: u8, + }, +} + #[derive(Parser)] #[command(author, version, about, long_about = None)] +#[command(args_conflicts_with_subcommands = true)] struct Cli { + #[command(subcommand)] + command: Option, + #[arg( help = "Image name to convert (e.g., ubuntu:latest) or path to tarball when using tar engine" )] - image: String, + image: Option, #[arg( short, @@ -40,63 +114,107 @@ struct Cli { short, long, action = clap::ArgAction::Count, - help = "Verbose mode (-v for info, -vv for debug, -vvv for trace). Also switches to text-based progress" + help = "Verbose mode (-v for info, -vv for debug, -vvv for trace)" )] verbose: u8, } -fn main() -> Result<()> { - let cli = Cli::parse(); +fn run_convert(image: &str, output: &Path, engine: Engine, verbose: u8) -> Result<()> { + let notifier = Notifier::new(verbose); + notifier.debug(&format!("Output directory: {}", output.display())); + notifier.debug(&format!("Engine: {engine:?}")); - // Create notifier with verbosity level - let notifier = Notifier::new(cli.verbose); + match engine { + Engine::Docker => { + notifier.info(&format!( + "Starting oci2git with Docker engine, image: {image}" + )); + let source = DockerSource::new() + .map_err(|e| anyhow!("Failed to initialize Docker source: {e}"))?; + ImageProcessor::new(source, notifier).convert(image, output)?; + } + Engine::Nerdctl => { + notifier.info(&format!( + "Starting oci2git with nerdctl engine, image: {image}" + )); + let source = NerdctlSource::new() + .map_err(|e| anyhow!("Failed to initialize nerdctl source: {e}"))?; + ImageProcessor::new(source, notifier).convert(image, output)?; + } + Engine::Tar => { + notifier.info(&format!( + "Starting oci2git with tar engine, tarball: {image}" + )); + let source = + TarSource::new().map_err(|e| anyhow!("Failed to initialize tar source: {e}"))?; + ImageProcessor::new(source, notifier).convert(image, output)?; + } + } + Ok(()) +} - notifier.debug(&format!("Output directory: {}", cli.output.display())); - notifier.debug(&format!("Engine: {:?}", cli.engine)); - notifier.debug(&format!( - "Beautiful progress: {}", - notifier.use_beautiful_progress() - )); +fn run_fsbom(image: &str, output: &Path, engine: Engine, verbose: u8) -> Result<()> { + let notifier = Notifier::new(verbose); + notifier.debug(&format!("Output path: {}", output.display())); + notifier.debug(&format!("Engine: {engine:?}")); - match cli.engine { + match engine { Engine::Docker => { notifier.info(&format!( - "Starting oci2git with Docker engine, image: {}", - cli.image + "Generating fsbom with Docker engine, image: {image}" )); - notifier.debug("Initializing Docker source"); - let source = DockerSource::new() .map_err(|e| anyhow!("Failed to initialize Docker source: {e}"))?; - - let processor = ImageProcessor::new(source, notifier); - processor.convert(&cli.image, &cli.output)?; + ImageProcessor::new(source, notifier).generate_fsbom(image, output)?; } Engine::Nerdctl => { notifier.info(&format!( - "Starting oci2git with nerdctl engine, image: {}", - cli.image + "Generating fsbom with nerdctl engine, image: {image}" )); - notifier.debug("Initializing nerdctl source"); - let source = NerdctlSource::new() .map_err(|e| anyhow!("Failed to initialize nerdctl source: {e}"))?; - - let processor = ImageProcessor::new(source, notifier); - processor.convert(&cli.image, &cli.output)?; + ImageProcessor::new(source, notifier).generate_fsbom(image, output)?; } Engine::Tar => { notifier.info(&format!( - "Starting oci2git with tar engine, tarball: {}", - cli.image + "Generating fsbom with tar engine, tarball: {image}" )); - notifier.debug("Initializing tar source"); - let source = TarSource::new().map_err(|e| anyhow!("Failed to initialize tar source: {e}"))?; + ImageProcessor::new(source, notifier).generate_fsbom(image, output)?; + } + } + Ok(()) +} - let processor = ImageProcessor::new(source, notifier); - processor.convert(&cli.image, &cli.output)?; +fn main() -> Result<()> { + let cli = Cli::parse(); + + let cmd = cli.command.unwrap_or_else(|| { + Commands::Convert(ConvertArgs { + image: cli.image.unwrap_or_default(), + output: cli.output, + engine: cli.engine, + verbose: cli.verbose, + }) + }); + + match cmd { + Commands::Convert(args) => { + if args.image.is_empty() { + return Err(anyhow!( + "Image name required.\nUsage: oci2git convert " + )); + } + run_convert(&args.image, &args.output, args.engine, args.verbose)?; + } + Commands::Fsbom { + image, + output, + engine, + verbose, + } => { + run_fsbom(&image, &output, engine, verbose)?; } } diff --git a/src/notifier.rs b/src/notifier.rs index 86b8bdf..96f99c3 100644 --- a/src/notifier.rs +++ b/src/notifier.rs @@ -174,7 +174,9 @@ impl Notifier { } pub fn progress(&self, current: u64, total: u64, message: &str) { - if self.verbosity != VerbosityLevel::Quiet && (current % 100 == 0 || current == total) { + if self.verbosity != VerbosityLevel::Quiet + && (current.is_multiple_of(100) || current == total) + { self.info(&format!("{message}: {current}/{total}")); } } diff --git a/src/processor.rs b/src/processor.rs index 364e161..4ae74cb 100644 --- a/src/processor.rs +++ b/src/processor.rs @@ -14,12 +14,14 @@ use crate::digest_tracker::DigestTracker; use crate::extracted_image::ExtractedImage; +use crate::fsbom::{self, FsBom, LayerBom}; use crate::git::GitRepo; use crate::image_metadata::ImageMetadata; use crate::notifier::Notifier; use crate::sources::Source; use crate::successor_navigator::SuccessorNavigator; use anyhow::{Context, Result}; +use std::collections::HashSet; use std::fs; use std::path::Path; @@ -381,4 +383,83 @@ impl ImageProcessor { Ok(()) } + + /// Generate a YAML filesystem bill of materials for an image. + /// + /// Scans layer tarballs read-only (no extraction to disk, no git repo needed). + /// Produces a YAML file at `output_path` listing all files introduced or modified + /// per layer, with `new`/`modified` status. + /// + /// # Parameters + /// - `image_name`: image reference understood by the configured [`Source`]. + /// - `output_path`: path where the YAML BOM file will be written. + pub fn generate_fsbom(&self, image_name: &str, output_path: &Path) -> Result<()> { + self.notifier.info(&format!( + "Generating filesystem BOM for image '{}' using {} source", + image_name, + self.source.name() + )); + + let mut temp_dirs: Vec = Vec::new(); + + let (tarball_path, tarball_temp_dir) = + self.source.get_image_tarball(image_name, &self.notifier)?; + if let Some(td) = tarball_temp_dir { + temp_dirs.push(td); + } + + self.notifier.info("Extracting image tarball..."); + let extracted_image = ExtractedImage::from_tarball(&tarball_path, &self.notifier)?; + + let layers = extracted_image.layers()?; + self.notifier + .debug(&format!("Found {} layers", layers.len())); + + let mut seen_paths: HashSet = HashSet::new(); + let mut layer_boms: Vec = Vec::new(); + + for (i, layer) in layers.iter().enumerate() { + self.notifier.info(&format!( + "Scanning layer {}/{}: {}", + i + 1, + layers.len(), + layer.command + )); + + let bom = if let Some(ref tarball) = layer.tarball_path { + fsbom::scan_layer( + tarball, + &mut seen_paths, + i, + layer.command.clone(), + layer.digest.clone(), + )? + } else { + LayerBom { + index: i, + command: layer.command.clone(), + digest: layer.digest.clone(), + entries: vec![], + } + }; + + layer_boms.push(bom); + } + + let bom = FsBom { layers: layer_boms }; + + if let Some(parent) = output_path.parent() { + if !parent.as_os_str().is_empty() { + fs::create_dir_all(parent)?; + } + } + bom.save_yaml(output_path)?; + + self.notifier.info(&format!( + "Filesystem BOM written to '{}'", + output_path.display() + )); + + Ok(()) + } } diff --git a/tests/integration/fixtures/fsbom-test.Dockerfile b/tests/integration/fixtures/fsbom-test.Dockerfile new file mode 100644 index 0000000..3deeea2 --- /dev/null +++ b/tests/integration/fixtures/fsbom-test.Dockerfile @@ -0,0 +1,19 @@ +FROM busybox:1.37 + +# Layer 1: Create initial files +RUN mkdir -p /app && \ + echo "hello world" > /app/hello.txt && \ + echo "static content" > /app/static.txt + +# Layer 2: Add subdirectory, symlinks, and a script +RUN mkdir -p /app/sub && \ + echo "sub content" > /app/sub/data.txt && \ + echo '#!/bin/sh\necho hello' > /app/run.sh && \ + chmod +x /app/run.sh && \ + ln -s /app/hello.txt /app/hello-link.txt && \ + ln -s ../run.sh /app/sub/run-link.sh + +# Layer 3: Modify hello.txt, add new file, delete static.txt +RUN echo "hello updated" > /app/hello.txt && \ + echo "new file" > /app/new.txt && \ + rm /app/static.txt diff --git a/tests/integration/fixtures/fsbom-test.tar b/tests/integration/fixtures/fsbom-test.tar new file mode 100644 index 0000000..f435941 Binary files /dev/null and b/tests/integration/fixtures/fsbom-test.tar differ diff --git a/tests/integration/fsbom/mod.rs b/tests/integration/fsbom/mod.rs new file mode 100644 index 0000000..2e121bd --- /dev/null +++ b/tests/integration/fsbom/mod.rs @@ -0,0 +1,259 @@ +//! Integration tests for `fsbom` — filesystem bill of materials generation. +//! +//! Uses a controlled busybox-based fixture image with known layer contents: +//! +//! - Layer 1 (busybox base): the busybox rootfs +//! - Layer 2: `/app/hello.txt` (new), `/app/static.txt` (new) +//! - Layer 3: `/app/sub/data.txt` (new), `/app/run.sh` (new), +//! `/app/hello-link.txt` → symlink (new), `/app/sub/run-link.sh` → symlink (new) +//! - Layer 4: `/app/hello.txt` (modified), `/app/new.txt` (new), +//! `/app/static.txt` deleted (whiteout — absent from BOM) + +use anyhow::Result; +use oci2git::processor::ImageProcessor; +use oci2git::sources::TarSource; +use oci2git::Notifier; +use std::path::Path; +use tempfile::TempDir; + +const FIXTURE: &str = "tests/integration/fixtures/fsbom-test.tar"; + +fn skip_if_missing() -> bool { + if !Path::new(FIXTURE).exists() { + println!("Skipping: fixture not found at {FIXTURE}"); + true + } else { + false + } +} + +/// Run `generate_fsbom` on the fixture and return the output YAML as a String. +fn run_fsbom() -> Result<(TempDir, std::path::PathBuf)> { + let out_dir = TempDir::new()?; + let out_path = out_dir.path().join("out.yml"); + + let source = TarSource::new()?; + let notifier = Notifier::new(0); + let processor = ImageProcessor::new(source, notifier); + processor.generate_fsbom(FIXTURE, &out_path)?; + + Ok((out_dir, out_path)) +} + +// --------------------------------------------------------------------------- +// Tests +// --------------------------------------------------------------------------- + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_fsbom_yaml_is_created() -> Result<()> { + if skip_if_missing() { + return Ok(()); + } + let (_dir, path) = run_fsbom()?; + assert!(path.exists(), "output YAML should exist"); + let content = std::fs::read_to_string(&path)?; + assert!( + content.starts_with("layers:"), + "YAML should start with 'layers:'" + ); + println!("✓ fsbom YAML created"); + Ok(()) + } + + #[test] + fn test_fsbom_has_correct_layer_count() -> Result<()> { + if skip_if_missing() { + return Ok(()); + } + let (_dir, path) = run_fsbom()?; + let content = std::fs::read_to_string(&path)?; + + // Count "- index:" occurrences — one per layer + let layer_count = content.matches("- index:").count(); + // busybox base + 3 RUN layers = 4 layers + assert_eq!(layer_count, 4, "expected 4 layers, got {layer_count}"); + println!("✓ correct layer count ({layer_count})"); + Ok(()) + } + + #[test] + fn test_fsbom_new_files_in_layer2() -> Result<()> { + if skip_if_missing() { + return Ok(()); + } + let (_dir, path) = run_fsbom()?; + let content = std::fs::read_to_string(&path)?; + + // hello.txt and static.txt must appear as 'new' (stat starts with "n:") + // We check YAML presence of the paths and that stat is n: + assert!( + content.contains(r#""app/hello.txt""#), + "hello.txt should be in BOM" + ); + assert!( + content.contains(r#""app/static.txt""#), + "static.txt should be in BOM" + ); + println!("✓ new files present in layer 2"); + Ok(()) + } + + #[test] + fn test_fsbom_symlinks_in_layer3() -> Result<()> { + if skip_if_missing() { + return Ok(()); + } + let (_dir, path) = run_fsbom()?; + let content = std::fs::read_to_string(&path)?; + + assert!( + content.contains(r#""app/hello-link.txt""#), + "symlink hello-link.txt should be in BOM" + ); + assert!( + content.contains(r#""app/sub/run-link.sh""#), + "symlink sub/run-link.sh should be in BOM" + ); + // Both should be recorded as type: symlink + // Count symlink entries + let symlink_count = content.matches("type: symlink").count(); + assert!( + symlink_count >= 2, + "expected at least 2 symlink entries, got {symlink_count}" + ); + println!("✓ symlinks present in layer 3 ({symlink_count} total symlinks)"); + Ok(()) + } + + #[test] + fn test_fsbom_symlink_targets() -> Result<()> { + if skip_if_missing() { + return Ok(()); + } + let (_dir, path) = run_fsbom()?; + let content = std::fs::read_to_string(&path)?; + + // hello-link.txt → /app/hello.txt (absolute symlink) + assert!( + content.contains(r#""/app/hello.txt""#), + "symlink target /app/hello.txt should appear in BOM" + ); + // sub/run-link.sh → ../run.sh (relative symlink) + assert!( + content.contains(r#""../run.sh""#), + "relative symlink target ../run.sh should appear in BOM" + ); + println!("✓ symlink targets correctly recorded"); + Ok(()) + } + + #[test] + fn test_fsbom_modified_file_in_layer4() -> Result<()> { + if skip_if_missing() { + return Ok(()); + } + let (_dir, path) = run_fsbom()?; + let content = std::fs::read_to_string(&path)?; + + // hello.txt appears first in layer 2 as new, then again in layer 4 as modified + let hello_count = content.matches(r#""app/hello.txt""#).count(); + assert_eq!( + hello_count, 2, + "hello.txt should appear twice (new + modified), got {hello_count}" + ); + + // Find the second occurrence and check stat is "m:..." + let second_pos = content + .match_indices(r#""app/hello.txt""#) + .nth(1) + .map(|(i, _)| i) + .expect("second occurrence of hello.txt"); + let end = (second_pos + 200).min(content.len()); + let after = &content[second_pos..end]; + assert!( + after.contains("\"m:"), + "second occurrence of hello.txt should have stat 'm:...', got:\n{after}" + ); + println!("✓ hello.txt correctly marked as modified in layer 4"); + Ok(()) + } + + #[test] + fn test_fsbom_deleted_file_absent() -> Result<()> { + if skip_if_missing() { + return Ok(()); + } + let (_dir, path) = run_fsbom()?; + let content = std::fs::read_to_string(&path)?; + + // static.txt is deleted in layer 4 via whiteout — should appear only once (layer 2) + let count = content.matches(r#""app/static.txt""#).count(); + assert_eq!( + count, 1, + "static.txt deleted in layer 4 should appear only once, got {count}" + ); + println!("✓ deleted file (static.txt) absent from later layers"); + Ok(()) + } + + #[test] + fn test_fsbom_new_file_in_layer4() -> Result<()> { + if skip_if_missing() { + return Ok(()); + } + let (_dir, path) = run_fsbom()?; + let content = std::fs::read_to_string(&path)?; + + assert!( + content.contains(r#""app/new.txt""#), + "new.txt added in layer 4 should be in BOM" + ); + // Should be marked as new + let pos = content.find(r#""app/new.txt""#).expect("new.txt in BOM"); + let end = (pos + 200).min(content.len()); + let after = &content[pos..end]; + assert!( + after.contains("\"n:"), + "new.txt should have stat 'n:...', got:\n{after}" + ); + println!("✓ new.txt correctly marked as new in layer 4"); + Ok(()) + } + + #[test] + fn test_fsbom_layer_indices_are_sequential() -> Result<()> { + if skip_if_missing() { + return Ok(()); + } + let (_dir, path) = run_fsbom()?; + let content = std::fs::read_to_string(&path)?; + + for i in 0..4 { + let expected = format!("- index: {i}"); + assert!( + content.contains(&expected), + "missing '- index: {i}' in YAML" + ); + } + println!("✓ layer indices are sequential 0..3"); + Ok(()) + } + + #[test] + fn test_fsbom_layer_digests_present() -> Result<()> { + if skip_if_missing() { + return Ok(()); + } + let (_dir, path) = run_fsbom()?; + let content = std::fs::read_to_string(&path)?; + + let digest_count = content.matches("digest:").count(); + assert_eq!(digest_count, 4, "each layer should have a digest field"); + println!("✓ all layer digests present"); + Ok(()) + } +} diff --git a/tests/integration/mod.rs b/tests/integration/mod.rs index 2f5c7e6..375d3a3 100644 --- a/tests/integration/mod.rs +++ b/tests/integration/mod.rs @@ -5,5 +5,6 @@ pub mod common; pub mod docker; +pub mod fsbom; pub mod nerdctl; pub mod tar;