diff --git a/.gitignore b/.gitignore index e93be391..391b5b6c 100644 --- a/.gitignore +++ b/.gitignore @@ -19,6 +19,9 @@ Cargo.lock out test_data/**/*.json +# Local end-to-end linker scratchpad (kept locally, never committed) +scratch/ + # Coverage reports lcov.info *.profraw diff --git a/CHANGELOG.md b/CHANGELOG.md index bfffd494..8c36d376 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -38,6 +38,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Language +- `external fn` + `use { … } from ` — declare and call functions from external + `.wasm` libraries using logical (platform-independent) module references. The compiler + emits a WASM import section with one entry per bound extern; a separate link step + (`inference-wasm-linker`) produces a single self-contained `.wasm` and `.v` with no + dangling imports. Tier-A (pure) and Tier-B (caller-pointer memory) closures merge + automatically; Tier-C (own static data/globals/tables) produces a clear error with a + relocatable-build recommendation ([#9]) - Add struct definition and parsing support ([#14]) - Add division operator (`/`) support ([#86]) - Add unary negation (`-`) and bitwise NOT (`~`) operators ([#86]) @@ -45,6 +52,63 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Compiler +- wasm-linker: New `core/wasm-linker` crate (`inference-wasm-linker`) implementing the + static-merge link pass. `link(main_wasm, &[external_wasm])` folds satisfied imports' + transitive closures into the main module, rewrites all index-bearing operators into a + unified index space, deduplicates function types, preserves the `name` custom section for + Rocq translation, and emits the unified WASM binary ([#9]) +- wasm-linker: External modules using **floating-point** (any `f32`/`f64` value type in a + signature, local, or global, or any float instruction) are now rejected by the linker. 
The + Inference language has no `f32`/`f64` types and the Rocq translator models none; floats were + previously admitted at the feature gate via `WASM1` but are now excluded. The feature gate + (`SUPPORTED_WASM_FEATURES`) is `GC_TYPES | MUTABLE_GLOBAL | BULK_MEMORY`; the safety + allow-list provides a second, independent backstop that rejects every float opcode with a + diagnostic naming the exact mnemonic (e.g. `floating-point instruction 'f32.add' is not + supported by the static merge`). **Sign-extension** and **saturating float-to-int** are + also removed from the supported set: the Rocq translator has no lowering for either, and + Inference codegen emits neither ([#9]) +- wasm-linker: **Tail calls** (`return_call`/`return_call_indirect`) and **segment-indexed + table ops** (`table.init`/`elem.drop`/`table.copy`) are rejected by the safety allow-list + (`UnsupportedConstruct`). The Rocq translator has no lowering for either; Inference codegen + never emits them, so the rejection applies only to third-party externals ([#9]) +- wasm-linker: The main-module rebuild is now fail-closed on constructs the merge cannot + preserve: a main module that declares a **start function**, imports **non-function + entities** (globals/memories/tables) from its environment, or declares a **table section** + is rejected up front with `UnsupportedConstruct`. Previously the start section and + non-function imports were silently dropped — the latter shifting the global index space so + `global.get` could read the wrong global — and table-using mains failed after the merge + with a misleading `InvalidMergedModule`. **v128** value types are likewise rejected in + merged signatures, locals, and block types: the Inference language has no SIMD types and + every SIMD operator is already rejected ([#9]) +- wasm-linker: Fixed an unsound Tier-B provenance rule. 
Pointer subtraction classified + `Param - NotParam` as still parameter-derived; because `NotParam` only means *not provably + parameter-derived*, the subtrahend could itself be `p - C`, so `p - (p - C) == C` fabricated + a fixed absolute address that the analysis accepted as caller-relative — letting a Tier-B + external read or write host memory outside the caller's buffer. Subtraction now preserves + parameter-derivation only when subtracting a provable constant (`Param - Const`), mirroring + the existing `add` cancellation guard. The main-module rebuild also now enforces the same + 256-level control-flow nesting cap as the external scan and the Rocq translator, rejects a + duplicate `inference.spec_funcs` section instead of silently keeping only the last, rejects + a multi-memory main, and rejects trailing bytes in a `spec_funcs` payload ([#9]) +- wasm-linker: Merged external function names in the output name section are now + **module-prefixed** using a `module.field` dot convention. A closure root satisfying import + `sum` from logical module `mathlib` is recorded as `mathlib.sum`; an inner callee the + source named `helper` becomes `mathlib.helper`; a nameless callee receives a deterministic + fallback `mathlib.func_`. The prefix is collision-free by construction (two externals + bound under different logical modules can export the same field without colliding in the + name section). The Rocq translator sanitizes `.` to `_`, so `mathlib.sum` translates to + `Definition mathlib_sum` in the `.v` ([#9]) +- wasm-codegen: Emit WASM import section for `external fn` declarations. The three-stage + index pre-scan now runs `register_imports` before local functions, so every + `Def::ExternFunction` bound via `use … from` is assigned a function import index (lowest + indices, `0..N`), the local-function base is shifted to `N`, and extern calls lower to + `call ` identically to local calls. 
The import section is emitted between the + Type and Function sections per the WASM binary format; it is omitted when there are no + externs. Function type deduplication (`intern_type`) ensures imports with identical + signatures share one type entry ([#9]) +- type-checker: `ExternOrigin { logical_module, export_field }` binds each `external fn` + declaration to its source module; `extern_origins()` on `SymbolTable` collects all bound + externs for use by codegen ([#9]) - ast: Remove dead `OperatorKind::BitNot` variant — `~x` is always parsed as `UnaryOperatorKind::BitNot` in a `PrefixUnaryExpression`; the binary enum variant was never produced by the AST builder ([#142]) - parser: Replace the `tree-sitter` + `tree-sitter-inference` front end with a resilient recursive-descent parser in the new `inference-parser` crate (`core/parser`). The parser lexes, parses, and lowers directly into the same `inference_ast::arena::AstArena`, producing byte-identical ASTs for all previously valid inputs, so the type-checker, analysis, codegen, and wasm-to-v phases are unchanged. The `tree-sitter`/`tree-sitter-inference` dependencies are removed from the default build, eliminating the C toolchain requirement. Parsing is now resilient (collects every syntax error instead of aborting on the first) and never panics on malformed input. 
`parse_external_module` moves from `inference_ast::extern_prelude` to `inference::extern_prelude` so that `inference-ast` no longer depends on the parser ([#62]) - ast: Introduce `SimpleTypeKind` enum for primitive types, replacing string-based type matching ([#50]) diff --git a/Cargo.toml b/Cargo.toml index 039c8e53..fffa3925 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -34,6 +34,7 @@ inference-type-checker = { path = "./core/type-checker", version = "0.0.1" } inference-cli = { path = "./core/cli", version = "0.0.1" } inference-wasm-to-v-translator = { path = "./core/wasm-to-v", version = "0.0.1" } inference-wasm-codegen = { path = "./core/wasm-codegen", version = "0.0.1" } +inference-wasm-linker = { path = "./core/wasm-linker", version = "0.0.1" } inference-analysis = { path = "./core/analysis", version = "0.0.1" } inference-compiler-interface = { path = "./core/compiler-interface", version = "0.0.1" } diff --git a/apps/infs/src/commands/build.rs b/apps/infs/src/commands/build.rs index 2673b442..e6d07a79 100644 --- a/apps/infs/src/commands/build.rs +++ b/apps/infs/src/commands/build.rs @@ -66,6 +66,7 @@ use std::process::Command; use crate::commands::project_build::{check_compiler_compatibility, mode_flag, run_project_build}; use crate::errors::InfsError; +use crate::project::manifest::{find_manifest_dir, InferenceToml, MANIFEST_FILE_NAME}; use crate::project::{self, ProjectContext}; use crate::toolchain::find_infc; @@ -111,6 +112,11 @@ pub struct BuildArgs { /// and implies `-v`. #[clap(long = "mode", value_enum)] pub mode: Option, + + /// Directory to search for external `.wasm` modules referenced by + /// `use { … } from ;`. Repeatable; forwarded verbatim to `infc`. + #[clap(short = 'L', long = "wasm-lib-dir", value_name = "DIR")] + pub wasm_lib_dirs: Vec, } /// Executes the build command with the given arguments. 
@@ -175,6 +181,14 @@ fn execute_single_file(path: &Path, args: &BuildArgs) -> Result<()> { cmd.arg("--mode").arg(mode_flag(mode)); } + for dir in &args.wasm_lib_dirs { + cmd.arg("--wasm-lib-dir").arg(dir); + } + + for (name, path) in manifest_wasm_dependencies(path)? { + cmd.arg("--wasm-dep").arg(format_wasm_dep_arg(&name, &path)?); + } + let status = cmd .stdin(std::process::Stdio::inherit()) .stdout(std::process::Stdio::inherit()) @@ -190,6 +204,59 @@ fn execute_single_file(path: &Path, args: &BuildArgs) -> Result<()> { } } +/// Formats one resolved manifest dependency as the `<name>=<path>` argument +/// forwarded to `infc --wasm-dep`. +/// +/// `name` is already validated against the logical-name grammar in +/// [`crate::project::manifest::validate_wasm_dependency_key`], so it never +/// contains `=`. The receiver splits on the FIRST `=`, which is therefore always +/// the name/path boundary — a path that itself contains `=` is preserved intact. +/// +/// The argument is a single UTF-8 `String`, so the path must round-trip through +/// UTF-8. Using `Path::display()` would lossily substitute U+FFFD for any +/// non-UTF-8 component and silently forward a corrupted path that resolves to the +/// wrong file (or none). The manifest declares its paths as UTF-8 strings, so a +/// non-UTF-8 *resolved* path can only come from a non-UTF-8 manifest directory. +/// Reject it with an actionable error instead of corrupting it. (An +/// OsString-preserving argument channel would lift this restriction, but is out +/// of scope for this pass.) +/// +/// ## Errors +/// +/// Returns an error when `path` is not valid UTF-8. 
+fn format_wasm_dep_arg(name: &str, path: &Path) -> Result { + let Some(path) = path.to_str() else { + bail!( + "wasm dependency `{name}` resolves to a path that is not valid UTF-8 ({}); \ + rename the containing directory to a UTF-8 path so it can be forwarded to \ + the compiler", + path.display() + ); + }; + Ok(format!("{name}={path}")) +} + +/// Resolves the `[wasm-dependencies]` of the project enclosing `source_path`. +/// +/// Walks up from the source file to the nearest `Inference.toml`, loads it, and +/// returns each declared dependency's logical name paired with its absolute +/// `.wasm` path (relative entries resolved against the manifest directory). +/// A source compiled outside any project (no manifest found) yields an empty +/// list — a manifest-free build is valid and simply has no manifest deps. +/// +/// ## Errors +/// +/// Returns an error only if a manifest exists but cannot be read or parsed; a +/// missing manifest is not an error. +fn manifest_wasm_dependencies(source_path: &Path) -> Result> { + let Some(manifest_dir) = find_manifest_dir(source_path) else { + return Ok(Vec::new()); + }; + let manifest_path = manifest_dir.join(MANIFEST_FILE_NAME); + let manifest = InferenceToml::from_file(&manifest_path)?; + manifest.resolved_wasm_dependencies(&manifest_dir) +} + /// Compiles the entry point of a discovered project (project mode). 
/// /// Resolves the *effective* build configuration from the CLI flags and the @@ -271,6 +338,95 @@ fn resolve_out_dir( } } +#[cfg(test)] +mod manifest_dep_tests { + use super::*; + use assert_fs::prelude::*; + + #[test] + fn forwards_declared_wasm_dependencies_as_absolute_paths() { + let temp = assert_fs::TempDir::new().unwrap(); + let manifest = temp.child("Inference.toml"); + manifest + .write_str( + "[package]\n\ + name = \"demo\"\n\ + version = \"0.1.0\"\n\ + infc_version = \"0.1.0\"\n\n\ + [wasm-dependencies]\n\ + arith = { path = \"libs/arith.wasm\" }\n", + ) + .unwrap(); + let source = temp.child("src").child("main.inf"); + source.write_str("").unwrap(); + + let deps = manifest_wasm_dependencies(source.path()).expect("should resolve"); + + assert_eq!(deps.len(), 1); + assert_eq!(deps[0].0, "arith"); + assert_eq!(deps[0].1, temp.path().join("libs/arith.wasm")); + } + + #[test] + fn no_manifest_yields_no_dependencies() { + let temp = assert_fs::TempDir::new().unwrap(); + let source = temp.child("main.inf"); + source.write_str("").unwrap(); + + let deps = manifest_wasm_dependencies(source.path()).expect("should succeed"); + assert!(deps.is_empty()); + } + + #[test] + fn manifest_without_wasm_dependencies_yields_none() { + let temp = assert_fs::TempDir::new().unwrap(); + let manifest = temp.child("Inference.toml"); + manifest + .write_str("[package]\nname = \"demo\"\nversion = \"0.1.0\"\ninfc_version = \"0.1.0\"\n") + .unwrap(); + let source = temp.child("main.inf"); + source.write_str("").unwrap(); + + let deps = manifest_wasm_dependencies(source.path()).expect("should succeed"); + assert!(deps.is_empty()); + } + + #[test] + fn formats_utf8_dependency_path() { + let arg = format_wasm_dep_arg("arith", Path::new("/libs/arith.wasm")) + .expect("a UTF-8 path must format"); + assert_eq!(arg, "arith=/libs/arith.wasm"); + } + + #[test] + fn preserves_equals_sign_in_path() { + // The receiver splits on the first `=`, so a `=` inside the path is + // preserved intact 
(the name is `=`-free by grammar validation). + let arg = format_wasm_dep_arg("arith", Path::new("/a=b/arith.wasm")) + .expect("a path containing `=` must format"); + assert_eq!(arg, "arith=/a=b/arith.wasm"); + assert_eq!(arg.split_once('=').map(|(n, _)| n), Some("arith")); + } + + #[cfg(unix)] + #[test] + fn rejects_non_utf8_dependency_path() { + use std::os::unix::ffi::OsStrExt; + + // A path component with an invalid UTF-8 byte (0xFF) cannot round-trip + // through the single-`String` `=` argument. + let bytes = b"/libs/\xFF/arith.wasm"; + let path = PathBuf::from(std::ffi::OsStr::from_bytes(bytes)); + let err = format_wasm_dep_arg("arith", &path) + .expect_err("a non-UTF-8 path must be rejected, not lossily forwarded"); + let msg = err.to_string(); + assert!( + msg.contains("arith") && msg.contains("not valid UTF-8"), + "diagnostic should name the dependency and the UTF-8 cause; got: {msg}" + ); + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/apps/infs/src/project/manifest.rs b/apps/infs/src/project/manifest.rs index dfaad940..fed50a4b 100644 --- a/apps/infs/src/project/manifest.rs +++ b/apps/infs/src/project/manifest.rs @@ -16,6 +16,11 @@ //! [dependencies] //! # Future: package dependencies //! +//! [wasm-dependencies] +//! # Logical module name -> location of a compiled `.wasm` module. +//! # The logical name is what source refers to via `use { f } from ;`. +//! arith = { path = "libs/arith.wasm" } +//! //! [build] //! target = "wasm32" //! optimize = "release" @@ -95,6 +100,14 @@ pub struct InferenceToml { #[serde(default, skip_serializing_if = "Dependencies::is_empty")] pub dependencies: Dependencies, + /// External `.wasm` module dependencies, keyed by logical module name. + #[serde( + rename = "wasm-dependencies", + default, + skip_serializing_if = "WasmDependencies::is_empty" + )] + pub wasm_dependencies: WasmDependencies, + /// Build configuration. 
#[serde(default, skip_serializing_if = "BuildConfig::is_default")] pub build: BuildConfig, @@ -148,6 +161,86 @@ impl Dependencies { } } +/// External `.wasm` module dependencies, keyed by logical module name. +/// +/// Each entry maps a logical name — the identifier source refers to in +/// `use { f } from <module>;` — to the location of a compiled `.wasm` module. +/// These declarations are the highest-priority source feeding the compiler's +/// module resolver; `-L` search directories and `INFERENCE_*` environment +/// directories act as fallbacks, consulted only when a logical name is *not* declared here. +#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)] +pub struct WasmDependencies { + /// Map of logical module name to its location entry. + #[serde(flatten)] + pub modules: HashMap, +} + +impl WasmDependencies { + /// Returns true if no `.wasm` dependencies are declared. + #[must_use] + pub fn is_empty(&self) -> bool { + self.modules.is_empty() + } +} + +/// Validates a `[wasm-dependencies]` key against the logical-module-name grammar. +/// +/// A logical name is one or more `::`-joined segments, each a non-empty ASCII +/// identifier (`[A-Za-z_][A-Za-z0-9_]*`). This is the same name source refers to +/// in `use { f } from <module>;`. Rejecting any other shape — in particular a key +/// containing `=` — keeps the `infs build` → `infc --wasm-dep <name>=<path>` +/// forwarding unambiguous, since the receiver splits on the first `=`. +/// +/// # Errors +/// +/// Returns an error naming the offending key when it is not a well-formed +/// logical name. 
+pub fn validate_wasm_dependency_key(key: &str) -> Result<()> { + if key.is_empty() { + bail!("invalid [wasm-dependencies] key: the module name is empty"); + } + if key.contains('=') { + bail!( + "invalid [wasm-dependencies] key `{key}`: a module name cannot contain `=`" + ); + } + + let segments: Vec<&str> = key.split("::").collect(); + for segment in &segments { + if !is_logical_name_segment(segment) { + bail!( + "invalid [wasm-dependencies] key `{key}`: `{segment}` is not a valid \ + module-name segment (expected `::`-joined ASCII identifiers)" + ); + } + } + Ok(()) +} + +/// Returns true when `segment` is a non-empty ASCII identifier: +/// the first character is a letter or `_`, the rest are alphanumeric or `_`. +fn is_logical_name_segment(segment: &str) -> bool { + let mut chars = segment.chars(); + let Some(first) = chars.next() else { + return false; + }; + if !first.is_ascii_alphabetic() && first != '_' { + return false; + } + chars.all(|ch| ch.is_ascii_alphanumeric() || ch == '_') +} + +/// The location of a single external `.wasm` module dependency. +/// +/// Only a filesystem `path` is supported today. The entry is a table — not a +/// bare string — so future producers (version pins, registries) can add fields +/// without a breaking change to the manifest format. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub struct WasmDependency { + /// Filesystem path to the compiled `.wasm` module, relative to the manifest. + pub path: String, +} + /// Build configuration section. #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] pub struct BuildConfig { @@ -394,11 +487,56 @@ impl InferenceToml { license: None, }, dependencies: Dependencies::default(), + wasm_dependencies: WasmDependencies::default(), build: BuildConfig::default(), verification: VerificationConfig::default(), } } + /// Loads and parses a manifest from a file. 
+ /// + /// # Errors + /// + /// Returns an error if the file cannot be read or is not valid + /// `Inference.toml`. + pub fn from_file(path: &Path) -> Result { + let content = std::fs::read_to_string(path) + .with_context(|| format!("Failed to read manifest: {}", path.display()))?; + Self::from_toml(&content) + } + + /// Resolves every `[wasm-dependencies]` entry against the manifest directory. + /// + /// Each entry's `path` is interpreted relative to `base_dir` (the directory + /// containing the manifest) and joined onto it via [`Path::join`], so the + /// result is absolute whenever `base_dir` is absolute. Entries + /// already absolute are returned unchanged. The result preserves the logical + /// name so the resolver can key on it. + /// + /// Each key is validated against the logical-module-name grammar + /// ([`validate_wasm_dependency_key`]) so a malformed name — in particular one + /// containing `=` — never silently corrupts the `--wasm-dep <name>=<path>` + /// forwarding to `infc`. + /// + /// The returned order is sorted by logical name for determinism. + /// + /// # Errors + /// + /// Returns an error if any `[wasm-dependencies]` key is not a well-formed + /// logical module name. + pub fn resolved_wasm_dependencies( + &self, + base_dir: &Path, + ) -> Result> { + let mut resolved: Vec<(String, std::path::PathBuf)> = + Vec::with_capacity(self.wasm_dependencies.modules.len()); + for (name, dep) in &self.wasm_dependencies.modules { + validate_wasm_dependency_key(name)?; + resolved.push((name.clone(), base_dir.join(&dep.path))); + } + resolved.sort_by(|a, b| a.0.cmp(&b.0)); + Ok(resolved) + } + /// Serializes the manifest to TOML format. /// /// # Errors @@ -487,6 +625,29 @@ pub fn discover_manifest(start: &Path) -> Result { ) } +/// Locates the nearest `Inference.toml` by walking up from `start`. +/// +/// `start` may be a file (e.g. the source being compiled) or a directory; the +/// search begins at `start`'s directory and ascends through its lexical ancestors +/// (up to the filesystem root when `start` is absolute; a relative `start` is +/// followed only as far as the current directory), +/// returning the first directory that contains an `Inference.toml`. 
Returns +/// `None` when no manifest is found — a bare file compiled outside any project +/// is a valid, manifest-free build. +#[must_use] +pub fn find_manifest_dir(start: &Path) -> Option { + let mut dir = if start.is_dir() { + Some(start) + } else { + start.parent() + }; + while let Some(current) = dir { + if current.join(MANIFEST_FILE_NAME).is_file() { + return Some(current.to_path_buf()); + } + dir = current.parent(); + } + None +} + /// Validates a project name for use in Inference projects. /// /// # Rules @@ -531,6 +692,7 @@ pub fn validate_project_name(name: &str) -> Result<()> { #[cfg(test)] mod tests { use super::*; + use assert_fs::prelude::*; use semver::Version; #[test] @@ -722,6 +884,24 @@ mod tests { ); } + #[test] + fn test_new_manifest_has_no_wasm_dependencies() { + let manifest = InferenceToml::new("myproject"); + assert!(manifest.wasm_dependencies.is_empty()); + } + + #[test] + fn test_wasm_dependencies_default_is_omitted_from_toml() { + // An empty `[wasm-dependencies]` table must not be serialized — a fresh + // manifest stays minimal. 
+ let manifest = InferenceToml::new("myproject"); + let output = manifest.to_toml().unwrap(); + assert!( + !output.contains("wasm-dependencies"), + "empty wasm-dependencies should be skipped, got:\n{output}" + ); + } + #[test] fn from_toml_parses_explicit_compile_mode() { let src = r#" @@ -781,6 +961,31 @@ infc_version = "0.1.0" ); } + #[test] + fn test_parse_wasm_dependencies_table() { + let content = r#" + [package] + name = "demo" + version = "0.1.0" + infc_version = "0.1.0" + + [wasm-dependencies] + arith = { path = "libs/arith.wasm" } + crypto = { path = "vendor/sha256.wasm" } + "#; + let manifest = InferenceToml::from_toml(content).expect("should parse"); + + assert_eq!(manifest.wasm_dependencies.modules.len(), 2); + assert_eq!( + manifest.wasm_dependencies.modules["arith"].path, + "libs/arith.wasm" + ); + assert_eq!( + manifest.wasm_dependencies.modules["crypto"].path, + "vendor/sha256.wasm" + ); + } + #[test] fn from_toml_rejects_invalid_mode() { let src = r#" @@ -1185,4 +1390,152 @@ target = "wasm32" "a non-existent start directory cannot be canonicalized" ); } + + #[test] + fn test_parse_manifest_without_wasm_dependencies() { + // A manifest that predates the feature must still parse, with an empty + // dependency set. 
+ let content = r#" + [package] + name = "demo" + version = "0.1.0" + infc_version = "0.1.0" + "#; + let manifest = InferenceToml::from_toml(content).expect("should parse"); + assert!(manifest.wasm_dependencies.is_empty()); + } + + #[test] + fn test_wasm_dependencies_round_trip() { + let content = r#" + [package] + name = "demo" + version = "0.1.0" + infc_version = "0.1.0" + + [wasm-dependencies] + arith = { path = "libs/arith.wasm" } + "#; + let manifest = InferenceToml::from_toml(content).expect("should parse"); + let serialized = manifest.to_toml().expect("should serialize"); + let reparsed = InferenceToml::from_toml(&serialized).expect("should reparse"); + assert_eq!(manifest, reparsed); + assert!(serialized.contains("wasm-dependencies")); + } + + #[test] + fn test_resolved_wasm_dependencies_joins_against_base_dir() { + let content = r#" + [package] + name = "demo" + version = "0.1.0" + infc_version = "0.1.0" + + [wasm-dependencies] + arith = { path = "libs/arith.wasm" } + beta = { path = "vendor/beta.wasm" } + "#; + let manifest = InferenceToml::from_toml(content).expect("should parse"); + let base = Path::new("/projects/demo"); + + let resolved = manifest + .resolved_wasm_dependencies(base) + .expect("valid keys resolve"); + + // Sorted by logical name for determinism. + assert_eq!(resolved.len(), 2); + assert_eq!(resolved[0].0, "arith"); + assert_eq!(resolved[0].1, base.join("libs/arith.wasm")); + assert_eq!(resolved[1].0, "beta"); + assert_eq!(resolved[1].1, base.join("vendor/beta.wasm")); + } + + #[test] + fn validate_wasm_dependency_key_accepts_logical_names() { + for key in ["arith", "crypto", "_priv", "a1", "crypto::sha256", "a::b::c"] { + assert!( + validate_wasm_dependency_key(key).is_ok(), + "`{key}` should be a valid logical name" + ); + } + } + + #[test] + fn validate_wasm_dependency_key_rejects_equals_bearing_keys() { + // L1: a `=` in a key would corrupt the `--wasm-dep =` + // forwarding, which splits on the first `=`. Reject it outright. 
+ let err = validate_wasm_dependency_key("arith=evil").unwrap_err(); + assert!(err.to_string().contains("cannot contain `=`"), "{err}"); + } + + #[test] + fn validate_wasm_dependency_key_rejects_malformed_segments() { + for bad in ["", "1arith", "a-b", "a/b", "a::", "::a", "a..b", "a b"] { + assert!( + validate_wasm_dependency_key(bad).is_err(), + "`{bad}` should be rejected as an invalid logical name" + ); + } + } + + #[test] + fn resolved_wasm_dependencies_rejects_an_invalid_key() { + let content = r#" + [package] + name = "demo" + version = "0.1.0" + infc_version = "0.1.0" + + [wasm-dependencies] + "bad=key" = { path = "libs/x.wasm" } + "#; + let manifest = InferenceToml::from_toml(content).expect("manifest parses"); + let err = manifest + .resolved_wasm_dependencies(Path::new("/projects/demo")) + .expect_err("an `=`-bearing key must be rejected"); + assert!(err.to_string().contains("bad=key"), "{err}"); + } + + #[test] + fn test_resolved_wasm_dependencies_empty_when_none_declared() { + let manifest = InferenceToml::new("demo"); + let resolved = manifest + .resolved_wasm_dependencies(Path::new("/projects/demo")) + .expect("no keys to validate"); + assert!(resolved.is_empty()); + } + + #[test] + fn test_find_manifest_dir_in_same_directory() { + let temp = assert_fs::TempDir::new().unwrap(); + let manifest = temp.child(MANIFEST_FILE_NAME); + manifest.write_str("[package]\nname = \"x\"\nversion = \"0.1.0\"\n").unwrap(); + let source = temp.child("main.inf"); + source.write_str("").unwrap(); + + let found = find_manifest_dir(source.path()).expect("manifest should be found"); + assert_eq!(found, temp.path()); + } + + #[test] + fn test_find_manifest_dir_walks_up_from_nested_source() { + let temp = assert_fs::TempDir::new().unwrap(); + let manifest = temp.child(MANIFEST_FILE_NAME); + manifest.write_str("[package]\nname = \"x\"\nversion = \"0.1.0\"\n").unwrap(); + let nested = temp.child("src").child("deep"); + nested.create_dir_all().unwrap(); + let source = 
nested.child("main.inf"); + source.write_str("").unwrap(); + + let found = find_manifest_dir(source.path()).expect("manifest should be found"); + assert_eq!(found, temp.path()); + } + + #[test] + fn test_find_manifest_dir_returns_none_without_manifest() { + let temp = assert_fs::TempDir::new().unwrap(); + let source = temp.child("main.inf"); + source.write_str("").unwrap(); + assert!(find_manifest_dir(source.path()).is_none()); + } } diff --git a/apps/infs/tests/fixtures/example.inf b/apps/infs/tests/fixtures/example.inf index 4fd75285..a080ba70 100644 --- a/apps/infs/tests/fixtures/example.inf +++ b/apps/infs/tests/fixtures/example.inf @@ -90,8 +90,8 @@ struct identity { //Use use inference::std; use inference::std::algorithms::sort; -use { sort } from "./sort.rs"; -use { sort, hash } from "./sort.rs"; +use { sort } from sorting; +use { sort, hash } from sorting; use inference::std::algorithms::{sort,hash}; //Binary Expression fn spec_assign() -> () { @@ -281,7 +281,7 @@ struct Account { } } use inference::std::algorithms::sort; -use { sort, hash } from "./sort.0.wasm"; +use { sort, hash } from sorting; use inference::std::algorithms::{sort, hash}; fn example() -> u32 { let a: u32 = 42; @@ -374,7 +374,7 @@ fn bubble_sort(arr: [i32;10], compare_function: fn(left: i32, right: i32) -> i32 } } } -use { hash } from "./cryptography.0.wasm"; +use { hash } from cryptography; spec HashContext { type HashFunction = fn([u8; 100]) -> [u8; 32]; fn verify_hash_transitivity(hash_f: HashFunction) -> () { diff --git a/book/src/SUMMARY.md b/book/src/SUMMARY.md index e22f258b..e6460d9d 100644 --- a/book/src/SUMMARY.md +++ b/book/src/SUMMARY.md @@ -10,6 +10,7 @@ - [Arithmetic Overflow in WASM Codegen](arithmetic-overflow-in-wasm-codegen.md) - [Unreachable Emission in Codegen](unreachable-emission-in-codegen.md) - [Compilation Targets](compilation_targets.md) +- [External Functions and WASM Linking](external-functions-and-wasm-linking.md) # Appendix diff --git 
a/book/src/external-functions-and-wasm-linking.md b/book/src/external-functions-and-wasm-linking.md new file mode 100644 index 00000000..1015718c --- /dev/null +++ b/book/src/external-functions-and-wasm-linking.md @@ -0,0 +1,182 @@ +# External Functions and WASM Linking + +Inference programs can call functions from pre-compiled `.wasm` libraries using +two cooperating language constructs: `external fn` and `use … from`. The +compiler emits the calls as WebAssembly imports, and a separate link step +(provided by `inference-wasm-linker`) folds the external function bodies into the +output so the final `.wasm` and `.v` files are self-contained. + +## Declaring an External Function + +Use `external fn` to declare a function whose body lives in another `.wasm` +module. The declaration looks like an ordinary function signature without a body: + +```inference +external fn sum(a: i32, b: i32) -> i32; +``` + +Parameter names are optional in the declaration. The following is equivalent: + +```inference +external fn sum(i32, i32) -> i32; +``` + +The type signature must match the exported function in the external module exactly. +If the types disagree, the front-end validation step (`validate_extern`) reports a +`SignatureMismatch` error before any code is generated. + +## Binding an External Function to a Module + +An `external fn` declaration is not tied to a particular module until a `use` +directive names the source: + +```inference +use { sum } from arith; +``` + +The name after `from` is a **logical module reference**, not a file path. The +compiler resolves it at build time by searching: + +1. The `[wasm-dependencies]` table in `Inference.toml` (highest priority). +2. Directories passed via `-L` / `--wasm-lib-dir` on the command line. +3. Directories listed in the `INFERENCE_WASM_LIB_PATH` environment variable + (a `PATH`-style list, separated by `:` on Unix and `;` on Windows). 
+ +A `::` separator is used for namespaced logical names: + +```inference +use { sha256 } from crypto::digest; +``` + +This resolves to `crypto/digest.wasm` in one of the search directories (using the +platform's path separator at resolution time, so the source stays portable across +operating systems). + +Multiple names from the same module are grouped in one `use` directive: + +```inference +external fn sum(a: i32, b: i32) -> i32; +external fn neg(a: i32) -> i32; +use { sum, neg } from arith; +``` + +## Calling an External Function + +Once declared and bound, an external function is called exactly like a local one: + +```inference +external fn sum(a: i32, b: i32) -> i32; +use { sum } from arith; + +pub fn add_three(x: i32) -> i32 { + return sum(x, 3); +} +``` + +The type-checker validates the call site (argument types, return type) using the +declared signature. If the call passes type checking, codegen emits `call 0` — the +import index — identically to how it would emit a call to a local function. + +## What the Compiler Emits (Intermediate Form) + +Before linking, the compiled module contains a WASM import section. The single-import +example above produces: + +```wat +(module + (type (;0;) (func (param i32 i32) (result i32))) + (type (;1;) (func (param i32) (result i32))) + (import "arith" "sum" (func (;0;) (type 0))) + (func $add_three (;1;) (type 1) (param $x i32) (result i32) + local.get $x + i32.const 3 + call 0 + return + unreachable) + (export "add_three" (func 1))) +``` + +Imported functions occupy the lowest WASM function indices. The local `add_three` +is shifted to index 1 (after the one import at index 0). The call target `call 0` +is the import index, resolved statically from the `extern_name_to_idx` table built +during the pre-scan phase. + +## The Link Step + +`inference-wasm-linker` consumes the intermediate module and the resolved external +`.wasm` binaries, and produces a single self-contained module with the imports +satisfied and removed. 
The external function bodies are merged in and every index +reference is rewritten into the unified index space. + +```text +main.wasm (with imports) ──┐ +arith.wasm ────────────────┼──▶ inference-wasm-linker ──▶ unified.wasm + │ │ + wasm-to-v + ↓ + unified.v +``` + +After linking: +- No `(import …)` referencing `arith` remains in the output. +- The bodies of `sum` (and any functions it calls transitively) are appended after + `add_three` and called by index. +- The unified module passes validation and flows into `wasm-to-v` as an ordinary + module whose merged functions translate to Rocq `Definition`s. + +## Memory-Merge Feasibility + +Not all external functions can be merged. The linker classifies each closure: + +| Tier | What the function touches | Merged? | +|------|--------------------------|---------| +| A | No memory, globals, data, or tables — pure arithmetic | Yes | +| B | Memory only through caller-supplied pointers (e.g., `sort(ptr, len)`) | Yes | +| C | Own static data, globals, or indirect-call tables | No — requires a relocatable build | + +A Tier-C function produces a clear error at link time: + +```text +error: external function `lookup` requires a relocatable build: + defines or initializes its own static data segments +``` + +Build the library with a relocatable/position-independent toolchain to enable +Tier-C support in a future release. + +## Current Restrictions + +- External functions that themselves import their host environment (memory, globals) + are rejected with a clear error: a static merge cannot reconstruct that environment. +- Analysis rule A024 (`ExternFunctionCall`) is scope-aware: a call to a *bound* + external (one named by a `use { … } from <module>;` in scope) is allowed and + flows through the codegen + link path. Only a call to an *unbound* bare + `external fn` — one with no `use` binding — is rejected, since codegen emits no + import for it and so cannot compile the call. 
+- Only one version of each logical module is resolved per build. Multi-version + dependency resolution is deferred to a future manifest update. + +## Example: Two Libraries, One Module + +```inference +external fn sort(ptr: i32, len: i32); +external fn checksum(ptr: i32, len: i32) -> i32; +use { sort } from collections; +use { checksum } from crypto; + +pub fn process(ptr: i32, len: i32) -> i32 { + sort(ptr, len); + return checksum(ptr, len); +} +``` + +The compiler emits two imports (indices 0 and 1), the local `process` at index 2. +The linker searches both `collections.wasm` and `crypto.wasm`, computes the closure +of each export, and merges the bodies into a single output module. + +## Related Resources + +- `core/wasm-linker/README.md` — the merge algorithm, tier classification, and entry point API +- `core/wasm-codegen/docs/function-calls-lowering.md` — three-stage index pre-scan and import section emission +- `core/type-checker` — `ExternOrigin`, `extern_origins()`, and the `A024 ExternFunctionCall` analysis rule +- [WebAssembly import section](https://webassembly.github.io/spec/core/binary/modules.html#import-section) — binary format reference diff --git a/core/analysis/src/rules/extern_function_call.rs b/core/analysis/src/rules/extern_function_call.rs index 181dcf1e..71bb0a9d 100644 --- a/core/analysis/src/rules/extern_function_call.rs +++ b/core/analysis/src/rules/extern_function_call.rs @@ -1,83 +1,134 @@ -//! A024: Calls to external functions are not yet supported in codegen. +//! A024: Calls to *unbound* external functions are not supported in codegen. //! -//! External functions are declared with `external fn` but the WebAssembly code -//! generator does not yet emit WASM imports for them. Calling an external -//! function would panic during code generation, so the analysis pass rejects -//! such calls with a clear error message. +//! An `external fn` bound to a source module via `use { f } from <module>;` +//! 
lowers to a WASM import that the static-merge linker later satisfies (issue +//! #9), so calling it is fully supported. An *unbound* bare extern — declared +//! `external fn` with no binding `use` — has no source module to merge, would +//! emit no import, and so cannot be compiled. This rule rejects calls to those +//! unbound externs only. +//! +//! Resolution is *scope-aware*, not name-keyed. Two distinct `external fn f` +//! declarations — a bound top-level `f` and an unbound spec-inner `f` — share +//! a name but bind differently: a call inside the spec resolves to the +//! spec-inner declaration, while a call at the top level resolves to the +//! top-level one. The rule resolves each call to the specific `external fn` +//! declaration visible in its enclosing scope and flags it only when *that* +//! declaration is unbound. A purely name-keyed check would let an unbound +//! same-named declaration poison every call to a bound extern (round-2 H-1). //! //! NOTE: This rule only matches direct calls by name (`foo()`). External //! functions cannot currently be struct members or passed as values, so -//! name-based matching is sufficient. If the language later allows extern -//! functions in structs or as first-class values, this rule will need to -//! be extended. +//! name-based matching within a scope is sufficient. If the language later +//! allows extern functions in structs or as first-class values, this rule +//! will need to be extended. -use std::collections::HashSet; +use std::collections::HashMap; use inference_ast::arena::AstArena; +use inference_ast::ids::DefId; use inference_ast::nodes::{Def, Expr}; use inference_type_checker::typed_context::TypedContext; use crate::{errors::AnalysisDiagnostic, walker}; crate::rule! { - /// Calls to external functions are not yet supported in codegen. + /// Calls to unbound external functions are not supported in codegen. 
#[id = "A024"] #[name = "External function call"] #[severity = error] pub struct ExternFunctionCall; fn check(ctx: &TypedContext) -> Vec { let arena = ctx.arena(); - let extern_names = collect_extern_function_names(arena, ctx); - if extern_names.is_empty() { - return Vec::new(); - } let mut errors = Vec::new(); - walker::walk_function_bodies(ctx, &mut |stmt_id, _walk_ctx| { - walker::for_each_stmt_expr(&arena[stmt_id].kind, arena, &mut |expr_id| { - walker::walk_expr(arena, expr_id, &mut |sub_id| { - if let Expr::FunctionCall { function, .. } = &arena[sub_id].kind - && let Expr::Identifier(ident_id) = &arena[*function].kind - { - let callee_name = &arena[*ident_id].name; - if extern_names.contains(callee_name) { - errors.push(AnalysisDiagnostic::ExternFunctionCall { - name: callee_name.clone(), - location: arena[sub_id].location, - }); - } - } - }); - }); - }); + let mut scopes: Vec> = Vec::new(); + for source_file in ctx.source_files() { + check_defs(arena, ctx, &source_file.defs, &mut scopes, &mut errors); + } errors } } -fn collect_extern_function_names( - arena: &AstArena, +/// Walks the definition tree, maintaining a stack of extern declarations in +/// scope, and flags every call that resolves to an *unbound* extern. +/// +/// Each `Spec`/`Module` pushes its own extern declarations as a new scope +/// layer before its function bodies are checked, so a spec-inner `external fn` +/// shadows a same-named top-level one for calls inside that spec. The layer is +/// popped on exit, keeping sibling specs isolated from one another. 
+fn check_defs<'a>( + arena: &'a AstArena, ctx: &TypedContext, -) -> HashSet { - let mut names = HashSet::default(); - for source_file in ctx.source_files() { - collect_extern_names_from_defs(arena, &source_file.defs, &mut names); - } - names -} - -fn collect_extern_names_from_defs( - arena: &AstArena, - defs: &[inference_ast::ids::DefId], - names: &mut HashSet, + defs: &[DefId], + scopes: &mut Vec>, + errors: &mut Vec, ) { + scopes.push(collect_extern_decls(arena, defs)); for &def_id in defs { match &arena[def_id].kind { - Def::ExternFunction { name, .. } => { - names.insert(arena[*name].name.clone()); + Def::Function { body, .. } => { + check_function_body(arena, ctx, *body, scopes, errors); + } + Def::Struct { methods, .. } => { + for &method_id in methods { + if let Def::Function { body, .. } = &arena[method_id].kind { + check_function_body(arena, ctx, *body, scopes, errors); + } + } } Def::Spec { defs, .. } | Def::Module { defs: Some(defs), .. } => { - collect_extern_names_from_defs(arena, defs, names); + check_defs(arena, ctx, defs, scopes, errors); } _ => {} } } + scopes.pop(); +} + +/// Records the `external fn` declarations introduced directly by `defs`, +/// mapping each extern name to its declaring [`DefId`]. Keeps the first +/// declaration for a name; a same-name redeclaration in one scope is a type +/// error caught earlier, so the choice is immaterial to a valid program. +fn collect_extern_decls<'a>(arena: &'a AstArena, defs: &[DefId]) -> HashMap<&'a str, DefId> { + let mut decls = HashMap::default(); + for &def_id in defs { + if let Def::ExternFunction { name, .. } = &arena[def_id].kind { + decls.entry(arena[*name].name.as_str()).or_insert(def_id); + } + } + decls +} + +/// Resolves a callee name against the scope stack, innermost first, returning +/// the declaring [`DefId`] of the nearest `external fn` of that name, or `None` +/// if the name does not resolve to any extern in scope (a regular function). 
+fn resolve_extern_decl(scopes: &[HashMap<&str, DefId>], name: &str) -> Option { + scopes.iter().rev().find_map(|scope| scope.get(name).copied()) +} + +fn check_function_body( + arena: &AstArena, + ctx: &TypedContext, + body: inference_ast::ids::BlockId, + scopes: &[HashMap<&str, DefId>], + errors: &mut Vec, +) { + walker::walk_block_stmts(arena, body, &mut |stmt_id| { + walker::for_each_stmt_expr(&arena[stmt_id].kind, arena, &mut |expr_id| { + walker::walk_expr(arena, expr_id, &mut |sub_id| { + if let Expr::FunctionCall { function, .. } = &arena[sub_id].kind + && let Expr::Identifier(ident_id) = &arena[*function].kind + { + let callee_name = &arena[*ident_id].name; + if let Some(decl) = resolve_extern_decl(scopes, callee_name) + && ctx.extern_origin_by_decl(decl).is_none() + { + errors.push(AnalysisDiagnostic::ExternFunctionCall { + name: callee_name.clone(), + location: arena[sub_id].location, + }); + } + } + }); + }); + }); } diff --git a/core/analysis/src/walker.rs b/core/analysis/src/walker.rs index 4d622ea1..d69af52f 100644 --- a/core/analysis/src/walker.rs +++ b/core/analysis/src/walker.rs @@ -257,6 +257,51 @@ fn contains_break_in_stmt(arena: &AstArena, stmt_id: StmtId) -> bool { } } +/// Recursively visits every statement in a single block, calling `visitor` +/// for each one. Unlike [`walk_function_bodies`], this walks one body in +/// isolation and tracks no loop/non-det depth — for rules that maintain their +/// own per-body context (e.g. an enclosing-scope stack) across the traversal. +pub(crate) fn walk_block_stmts( + arena: &AstArena, + block_id: BlockId, + visitor: &mut dyn FnMut(StmtId), +) { + for &stmt_id in &arena[block_id].stmts { + walk_stmt_recursive(arena, stmt_id, visitor); + } +} + +fn walk_stmt_recursive( + arena: &AstArena, + stmt_id: StmtId, + visitor: &mut dyn FnMut(StmtId), +) { + visitor(stmt_id); + match &arena[stmt_id].kind { + Stmt::Loop { body, .. 
} | Stmt::Block(body) => { + walk_block_stmts(arena, *body, visitor); + } + Stmt::If { + then_block, + else_block, + .. + } => { + walk_block_stmts(arena, *then_block, visitor); + if let Some(else_id) = else_block { + walk_block_stmts(arena, *else_id, visitor); + } + } + Stmt::Assign { .. } + | Stmt::Return { .. } + | Stmt::Break + | Stmt::Expr(_) + | Stmt::VarDef { .. } + | Stmt::TypeDef { .. } + | Stmt::Assert { .. } + | Stmt::ConstDef(_) => {} + } +} + /// Recursively walks all `Def` variants and calls `callback` for each /// function body found. Handles struct methods, spec definitions (recursive), /// and module definitions (recursive). diff --git a/core/ast/docs/nodes.md b/core/ast/docs/nodes.md index 55b248a1..3a646334 100644 --- a/core/ast/docs/nodes.md +++ b/core/ast/docs/nodes.md @@ -160,11 +160,10 @@ Import statement for bringing external symbols into scope. ```rust pub struct UseDirective { - pub id: u32, pub location: Location, - pub imported_types: Option>>, - pub segments: Option>>, - pub from: Option, + pub imported_types: Vec, + pub segments: Vec, + pub from: Option, } ``` @@ -172,12 +171,16 @@ pub struct UseDirective { ```inference use std::{io, fs}; use core::option::Option; +use { sort } from sorting; +use { hash } from crypto::sha256; ``` **Fields:** - `imported_types`: Specific types to import (e.g., `{io, fs}`) -- `segments`: Module path segments (e.g., `std`, `core`) -- `from`: Optional source path +- `segments`: Path-form module segments (e.g., `std`, `core`) +- `from`: Optional logical module reference of a `from` clause. A `ModuleRef` + carries identifier-path segments (e.g. `crypto::sha256`), not a filesystem + path — the driver resolves it to a `.wasm` file, keeping source portable. 
## Definitions diff --git a/core/ast/src/nodes.rs b/core/ast/src/nodes.rs index 4d76a9b8..0dbc97ab 100644 --- a/core/ast/src/nodes.rs +++ b/core/ast/src/nodes.rs @@ -226,12 +226,25 @@ pub enum Directive { Use(UseDirective), } +/// A logical, platform-independent module reference. +/// +/// Carries the identifier-path `segments` of a `from` clause (e.g. `crypto::sha256` +/// lowers to `["crypto", "sha256"]`). It is deliberately *not* a filesystem path: +/// the driver maps it to a concrete `.wasm` file at resolve time, so source stays +/// portable across operating systems (no `./`, no OS separators). +#[derive(Clone, PartialEq, Eq, Debug)] +pub struct ModuleRef { + pub segments: Vec, +} + #[derive(Clone, PartialEq, Eq, Debug)] pub struct UseDirective { pub location: Location, pub imported_types: Vec, pub segments: Vec, - pub from: Option, + /// Logical module reference of a `from` clause, if present. The string-literal + /// path form (`from "./sort.wasm"`) was removed in favour of this portable form. + pub from: Option, } // --------------------------------------------------------------------------- diff --git a/core/cli/src/main.rs b/core/cli/src/main.rs index d6c132ac..f0672158 100644 --- a/core/cli/src/main.rs +++ b/core/cli/src/main.rs @@ -150,7 +150,10 @@ mod parser; pub(crate) mod toolchain; use clap::Parser; -use inference::{analyze, parse, type_check, wasm_to_v}; +use inference::wasm_link::{ + resolve_external_modules, ManifestDeps, ResolvedExternalModule, SearchPath, +}; +use inference::{analyze, link, parse, type_check, wasm_to_v}; use parser::{Cli, CliMode}; use std::{ fs, @@ -159,6 +162,74 @@ use std::{ }; use toolchain::BuildProfile; +/// Environment variable holding a `PATH`-style list of directories to search +/// for external `.wasm` modules, after any `-L` directories. +const WASM_LIB_PATH_ENV: &str = "INFERENCE_WASM_LIB_PATH"; + +/// Builds the manifest-declared dependency map from `--wasm-dep =` +/// entries. 
+/// +/// `infs build` forwards one entry per `Inference.toml [wasm-dependencies]` +/// declaration; these bind a logical module name directly to a `.wasm` file and +/// take precedence over every search directory. A malformed entry (no `=`, or an +/// empty name) is a hard error so a typo never silently falls through to the +/// search path. +fn parse_manifest_deps(entries: &[String]) -> anyhow::Result { + let mut deps = ManifestDeps::new(); + for entry in entries { + let (name, path) = entry.split_once('=').ok_or_else(|| { + anyhow::anyhow!("invalid --wasm-dep `{entry}`: expected `=`") + })?; + if name.is_empty() { + anyhow::bail!("invalid --wasm-dep `{entry}`: module name is empty"); + } + deps.insert(name, PathBuf::from(path)); + } + Ok(deps) +} + +/// Resolves and validates every external `.wasm` module the program binds. +/// +/// Resolution priority, highest first: +/// 1. manifest dependencies (`--wasm-dep`, forwarded from +/// `Inference.toml [wasm-dependencies]`), +/// 2. `-L` / `--wasm-lib-dir` directories, +/// 3. `INFERENCE_WASM_LIB_PATH` environment directories. +fn resolve_externals( + typed_context: &inference::TypedContext, + lib_dirs: &[PathBuf], + manifest_deps: &ManifestDeps, +) -> anyhow::Result> { + let mut search_path = SearchPath::new(); + for dir in lib_dirs { + search_path.push_lib_dir(dir.clone()); + } + if let Some(env_path) = std::env::var_os(WASM_LIB_PATH_ENV) { + for dir in env_search_dirs(&env_path) { + search_path.push_env_dir(dir); + } + } + Ok(resolve_external_modules( + typed_context, + &search_path, + Some(manifest_deps), + )?) +} + +/// Splits an `INFERENCE_WASM_LIB_PATH`-style value into search directories, +/// dropping empty entries. +/// +/// An empty entry (a leading/trailing/interior separator, or a wholly-empty +/// value) would otherwise yield an empty `PathBuf` whose `join(relative)` +/// resolves against the process CWD — silently turning the build directory into +/// a `.wasm` search root. 
Dropping it makes `""` and `":"` behave exactly like +/// the variable being unset. +fn env_search_dirs(env_path: &std::ffi::OsStr) -> Vec { + std::env::split_paths(env_path) + .filter(|dir| !dir.as_os_str().is_empty()) + .collect() +} + /// Applies default phase normalization to parsed CLI arguments. /// /// When no phase flag (`--parse`, `--analyze`, `--codegen`) is given, defaults @@ -431,6 +502,28 @@ fn main() { } } } + + // Resolve every external `.wasm` the program binds, ahead of codegen, so a + // resolution or validation failure aborts before any output is produced. + let manifest_deps = match parse_manifest_deps(&args.wasm_deps) { + Ok(deps) => deps, + Err(e) => { + eprintln!("External module resolution failed: {e}"); + process::exit(1); + } + }; + let external_modules = match &typed_context { + Some(tctx) if need_codegen => { + match resolve_externals(tctx, &args.wasm_lib_dirs, &manifest_deps) { + Ok(modules) => modules, + Err(e) => { + eprintln!("External module resolution failed: {e}"); + process::exit(1); + } + } + } + _ => Vec::new(), + }; if need_codegen { let Some(tctx) = typed_context else { eprintln!("Internal error: type check phase did not produce typed context"); @@ -455,7 +548,26 @@ fn main() { }; println!("Codegen complete"); - let wasm_bytes = codegen_output.wasm(); + // Fold the resolved external modules into the codegen output: a single + // self-contained module with no cross-module imports. Each external is + // paired with the logical module it was bound under, so the merge + // matches each import's recorded `(module, field)` against the right + // external. With no externs this is a byte-identical pass-through. 
+ let external_bytes: Vec<(&str, &[u8])> = external_modules + .iter() + .map(|m| (m.logical_module.as_str(), m.bytes.as_slice())) + .collect(); + let wasm_owned = match link(codegen_output.wasm(), &external_bytes) { + Ok(bytes) => bytes, + Err(e) => { + eprintln!("Linking external modules failed: {e}"); + process::exit(1); + } + }; + if !external_modules.is_empty() { + println!("Linked {} external module(s)", external_modules.len()); + } + let wasm_bytes = wasm_owned.as_slice(); if args.generate_wasm_output { let wasm_file_path = output_path.join(format!("{source_fname}.wasm")); @@ -470,11 +582,20 @@ fn main() { println!("WASM generated at: {}", wasm_file_path.to_string_lossy()); } if args.generate_v_output { - match wasm_to_v( - source_fname, - wasm_bytes, - codegen_output.spec_func_indices_by_spec(), - ) { + // The spec-function indices codegen records are in the *pre-link* + // space; the linker rewrote the embedded `inference.spec_funcs` + // section into the post-link space. When externals were merged the + // pre-link map is stale, so defer entirely to the embedded post-link + // section (an empty explicit map makes the translator adopt it). + // With no externals the merge is a byte-identical pass-through and + // the explicit map still cross-checks against the embedded one. 
+ let empty_spec_funcs = inference::FxHashMap::default(); + let explicit_spec_funcs = if external_modules.is_empty() { + codegen_output.spec_func_indices_by_spec() + } else { + &empty_spec_funcs + }; + match wasm_to_v(source_fname, wasm_bytes, explicit_spec_funcs) { Ok(v_output) => { let v_file_path = output_path.join(format!("{source_fname}.v")); if let Err(e) = fs::create_dir_all(&output_path) { @@ -500,7 +621,7 @@ fn main() { #[cfg(test)] mod tests { use super::*; - use std::path::PathBuf; + use std::path::{Path, PathBuf}; fn make_args(parse: bool, analyze: bool, codegen: bool) -> Cli { Cli { @@ -512,6 +633,8 @@ mod tests { generate_wasm_output: false, generate_v_output: false, mode: None, + wasm_lib_dirs: Vec::new(), + wasm_deps: Vec::new(), commit_hash: false, abi_version: false, } @@ -630,4 +753,78 @@ mod tests { fn get_out_path() -> std::path::PathBuf { get_test_data_path().parent().unwrap().join("out") } + + #[test] + fn parse_manifest_deps_binds_name_to_path() { + let deps = + parse_manifest_deps(&["arith=/libs/arith.wasm".to_string()]).expect("should parse"); + assert_eq!(deps.get("arith"), Some(Path::new("/libs/arith.wasm"))); + } + + #[test] + fn parse_manifest_deps_accepts_multiple_entries() { + let deps = parse_manifest_deps(&[ + "arith=/libs/arith.wasm".to_string(), + "crypto=/vendor/sha256.wasm".to_string(), + ]) + .expect("should parse"); + assert_eq!(deps.get("arith"), Some(Path::new("/libs/arith.wasm"))); + assert_eq!(deps.get("crypto"), Some(Path::new("/vendor/sha256.wasm"))); + } + + #[test] + fn parse_manifest_deps_preserves_path_with_equals() { + // Only the first `=` separates name from path; later ones belong to the + // path so values like `a=b=c` survive intact. 
+ let deps = parse_manifest_deps(&["arith=/odd=dir/arith.wasm".to_string()]) + .expect("should parse"); + assert_eq!(deps.get("arith"), Some(Path::new("/odd=dir/arith.wasm"))); + } + + #[test] + fn parse_manifest_deps_rejects_missing_separator() { + let err = parse_manifest_deps(&["arith".to_string()]).unwrap_err(); + assert!(err.to_string().contains("expected `=`")); + } + + #[test] + fn parse_manifest_deps_rejects_empty_name() { + let err = parse_manifest_deps(&["=/libs/arith.wasm".to_string()]).unwrap_err(); + assert!(err.to_string().contains("module name is empty")); + } + + #[test] + fn parse_manifest_deps_empty_input_yields_empty_map() { + let deps = parse_manifest_deps(&[]).expect("should parse"); + assert!(deps.get("anything").is_none()); + } + + #[test] + fn empty_wasm_lib_path_resolves_like_unset() { + // H5: a wholly-empty value, and a lone separator, must each yield zero + // search directories — identical to the variable being unset — rather + // than injecting the process CWD as a silent `.wasm` search root. + use std::ffi::OsString; + + let empty = env_search_dirs(&OsString::from("")); + assert!(empty.is_empty(), "empty value yields no dirs: {empty:?}"); + + // A lone PATH list separator (`:` on Unix, `;` on Windows) splits into + // two empty entries; both must be dropped. + let list_sep = if cfg!(windows) { ";" } else { ":" }; + let bare = env_search_dirs(&OsString::from(list_sep)); + assert!(bare.is_empty(), "a lone list separator yields no dirs: {bare:?}"); + } + + #[test] + fn wasm_lib_path_keeps_real_dirs_and_drops_empties() { + // `"/real/dir"` (a trailing separator) must keep the real directory + // and drop only the empty trailing entry. 
+ use std::ffi::OsString; + + let list_sep = if cfg!(windows) { ";" } else { ":" }; + let value = OsString::from(format!("real{list_sep}")); + let dirs = env_search_dirs(&value); + assert_eq!(dirs, [PathBuf::from("real")]); + } } diff --git a/core/cli/src/parser.rs b/core/cli/src/parser.rs index 54943a20..9a0b7f59 100644 --- a/core/cli/src/parser.rs +++ b/core/cli/src/parser.rs @@ -207,6 +207,25 @@ pub(crate) struct Cli { #[clap(long = "mode", value_enum)] pub(crate) mode: Option, + /// Directory to search for external `.wasm` modules referenced by + /// `use { … } from <module>;`. + /// + /// Repeatable; directories are searched in the order given, ahead of any + /// `INFERENCE_WASM_LIB_PATH` environment directories. A logical module + /// `a::b` resolves to `<dir>/a/b.wasm` under each directory. + #[clap(short = 'L', long = "wasm-lib-dir", value_name = "DIR")] + pub(crate) wasm_lib_dirs: Vec, + + /// A manifest-declared external `.wasm` module, as `<name>=<path>`. + /// + /// Repeatable; binds the logical module `<name>` directly to the `.wasm` + /// file at `<path>`, taking precedence over every `-L` / `INFERENCE_*` + /// search directory. `infs build` forwards one entry per + /// `Inference.toml [wasm-dependencies]` declaration; direct `infc` callers + /// may pass them by hand. + #[clap(long = "wasm-dep", value_name = "NAME=PATH")] + pub(crate) wasm_deps: Vec, + + /// Print the git commit hash embedded at build time and exit 0. 
/// /// Used by `infs build` to detect version drift between paired `infs` and diff --git a/core/inference/Cargo.toml b/core/inference/Cargo.toml index 9e4d1f24..f970f613 100644 --- a/core/inference/Cargo.toml +++ b/core/inference/Cargo.toml @@ -12,9 +12,14 @@ categories = ["compilers", "wasm"] [dependencies] anyhow.workspace = true rustc-hash.workspace = true +inf-wasmparser.workspace = true inference-ast.workspace = true inference-parser.workspace = true inference-wasm-codegen.workspace = true inference-wasm-to-v-translator.workspace = true +inference-wasm-linker.workspace = true inference-type-checker.workspace = true inference-analysis.workspace = true + +[dev-dependencies] +wasm-encoder = "0.249.0" diff --git a/core/inference/src/lib.rs b/core/inference/src/lib.rs index 578d133f..37046411 100644 --- a/core/inference/src/lib.rs +++ b/core/inference/src/lib.rs @@ -265,9 +265,10 @@ pub use inference_analysis::errors::{AnalysisErrors, AnalysisResult}; use inference_ast::arena::AstArena; -use inference_type_checker::typed_context::TypedContext; +pub use inference_type_checker::typed_context::TypedContext; pub mod extern_prelude; +pub mod wasm_link; /// Re-export of `rustc_hash::FxHashMap` so library consumers of `inference` /// can construct the spec-funcs map passed to [`wasm_to_v`] without taking a @@ -279,6 +280,11 @@ pub use rustc_hash::FxHashMap; /// dependency on `inference-wasm-to-v-translator`. pub use inference_wasm_to_v_translator::errors::{InvalidIdentifierReason, WasmToVError}; +/// Re-export of the static-merge linker's error type so downstream consumers +/// can match on link failures (e.g. an unsatisfied import or a Tier-C module) +/// without taking a direct dependency on `inference-wasm-linker`. +pub use inference_wasm_linker::LinkError; + /// Re-export of the `inference.spec_funcs` custom-section identifiers so /// downstream consumers (CLI tools, integration tests) share a single source /// of truth with the codegen and translator crates. 
@@ -585,6 +591,66 @@ pub fn codegen( ) } +/// Folds external `.wasm` modules into the codegen output, producing a single +/// self-contained module with no cross-module imports. +/// +/// This is the post-codegen link step (Phase 4 of Issue #9). When a program +/// `use`s functions from an external module, [`codegen`] emits those calls as +/// WASM `(import …)` entries. This function consumes that intermediate module +/// plus the resolved external module bytes and merges the imported functions' +/// bodies in, re-indexing so the result imports nothing — the single artifact +/// the user asked for, ready for [`wasm_to_v`]. +/// +/// `externals` is the set of resolved, validated external module binaries, each +/// paired with the logical `::`-joined module name it was bound under so the +/// merge can match an import's recorded `(module, field)` against the right +/// external. When it is empty the call is a no-op pass-through: a program +/// without externs links to byte-identical output, so callers can route every +/// program through this step unconditionally. +/// +/// # Errors +/// +/// Returns an error if any module fails to parse, an import is left unsatisfied +/// by the supplied externals, or a merged function falls into the unsupported +/// Tier C (own static data / mutable globals). The underlying error downcasts +/// to [`LinkError`]. +pub fn link(main_wasm: &[u8], externals: &[(&str, &[u8])]) -> anyhow::Result> { + // Byte-identical fast path *only* for a module that is provably import-free — + // it is already the self-contained artifact this step would produce. A module + // that still carries imports (e.g. a caller that passed no resolved externals + // for a program that actually uses them), or one that does not parse, must go + // through the linker so the unsatisfied-import / parse failure surfaces as an + // error instead of being silently passed through. 
Keying the fast path on the + // *module's own imports* rather than merely on `externals.is_empty()` keeps it + // fail-closed and honours the documented error contract above. + if externals.is_empty() && module_is_import_free(main_wasm) { + return Ok(main_wasm.to_vec()); + } + Ok(inference_wasm_linker::link(main_wasm, externals)?) +} + +/// Whether `wasm` parses and declares no imports. Returns `false` on any parse +/// failure or on the first surviving import, so [`link`] routes such a module +/// through the linker — which validates it and reports the precise error — +/// rather than taking the byte-identical no-op path. +fn module_is_import_free(wasm: &[u8]) -> bool { + use inf_wasmparser::{Parser, Payload}; + for payload in Parser::new(0).parse_all(wasm) { + match payload { + Ok(Payload::ImportSection(reader)) => { + // Any entry (well-formed or not) means the module is not yet + // self-contained, so it must not take the no-op path. + if reader.into_iter().next().is_some() { + return false; + } + } + Ok(_) => {} + Err(_) => return false, + } + } + true +} + /// Translates WebAssembly binary to Rocq (Coq) verification code. /// /// This function parses a WebAssembly binary and generates equivalent Rocq diff --git a/core/inference/src/wasm_link/driver.rs b/core/inference/src/wasm_link/driver.rs new file mode 100644 index 00000000..d00dbdf4 --- /dev/null +++ b/core/inference/src/wasm_link/driver.rs @@ -0,0 +1,434 @@ +//! Driver-side orchestration of external `.wasm` resolution and validation. +//! +//! Between type checking and linking, the build driver must turn each bound +//! `external fn` into actual `.wasm` bytes the static-merge linker can consume. +//! This module performs that, end to end, for every extern in a program: +//! +//! 1. enumerate the program's bound externs ([`TypedContext::extern_origins`]), +//! 2. [resolve](super::resolve) each logical module to a concrete `.wasm` path, +//! 3. 
[validate](super::validate) that the resolved module exports the named +//! function with the declared signature, and +//! 4. read the deduplicated module bytes for the linker. +//! +//! Signature validation needs the `external fn`'s declared parameter and return +//! types, which live in the AST. The arena is reachable from the +//! [`TypedContext`], so this stays a pure post-type-check step with no extra +//! plumbing through the front end. + +use std::collections::BTreeMap; +use std::io::Read; +use std::path::PathBuf; + +use inference_ast::nodes::Def; +use inference_type_checker::typed_context::TypedContext; + +use super::resolve::{resolve_wasm_module, ManifestDeps, ModulePath, SearchPath}; +use super::validate::{lower_extern_signature, validate_extern}; + +/// Maximum size, in bytes, of a resolved external `.wasm` module. +/// +/// External modules are read fully into memory before validation, so an +/// unbounded read of an attacker-influenced (sparse, multi-GB) file in a search +/// location would drive the compiler toward OOM. A real `.wasm` library is well +/// under this bound; the cap exists solely to defeat that resource cliff. +pub const MAX_EXTERNAL_MODULE_BYTES: u64 = 64 * 1024 * 1024; + +/// A resolved external module: its logical name, the file it resolved to, and +/// the bytes read from disk. +#[derive(Debug, Clone)] +pub struct ResolvedExternalModule { + /// Logical `::`-joined module reference, for diagnostics. + pub logical_module: String, + /// The `.wasm` file the logical module resolved to. + pub path: PathBuf, + /// The module's bytes, ready for the linker. + pub bytes: Vec, +} + +/// Why the driver could not assemble the external-module set. +#[derive(Debug)] +pub enum ExternalResolutionError { + /// A logical module could not be resolved to a `.wasm` file. + Resolve(super::resolve::ResolveError), + /// A resolved module failed export/signature validation. 
The error is boxed + /// because [`super::validate::ValidateError`] carries a signature mismatch + /// payload large enough to dominate the enum's size. + Validate { + logical_module: String, + error: Box, + }, + /// A resolved module failed full WASM validation (its bytes do not decode to + /// a structurally and semantically valid module). A malformed-but-decodable + /// external must be rejected here, before it can reach the linker. + Invalid { + logical_module: String, + path: PathBuf, + reason: String, + }, + /// A resolved module is well-formed WebAssembly but uses a feature outside the + /// linker's supported WASM 1.0 subset (see + /// [`inference_wasm_linker::SUPPORTED_WASM_FEATURES`]). Rejecting it here — the + /// same gate the linker applies — surfaces the feature-named diagnostic at the + /// earliest point in the build, keeping the supported-version contract a single + /// source of truth. + UnsupportedFeature { + logical_module: String, + path: PathBuf, + reason: String, + }, + /// A resolved `.wasm` file exceeded [`MAX_EXTERNAL_MODULE_BYTES`]. + TooLarge { + path: PathBuf, + size: u64, + limit: u64, + }, + /// An `external fn`'s declared signature could not be lowered to WASM value + /// types (e.g. a `unit` parameter or an unsupported type form). + Signature { + export_field: String, + error: super::validate::LowerSignatureError, + }, + /// A logical name was not a valid module path (empty or separator-bearing + /// segment). + ModulePath(super::resolve::ModulePathError), + /// The resolved `.wasm` file could not be read. + Read { + path: PathBuf, + error: std::io::Error, + }, + /// A bound extern named a function the AST has no `external fn` declaration + /// for — an internal inconsistency between provenance and the parsed tree. 
+ MissingDeclaration { export_field: String }, +} + +impl std::fmt::Display for ExternalResolutionError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + ExternalResolutionError::Resolve(e) => write!(f, "{e}"), + ExternalResolutionError::Validate { + logical_module, + error, + } => write!(f, "module `{logical_module}`: {error}"), + ExternalResolutionError::Invalid { + logical_module, + path, + reason, + } => write!( + f, + "module `{logical_module}` at `{}` is not a valid WASM module: {reason}", + path.display() + ), + ExternalResolutionError::UnsupportedFeature { + logical_module, + path, + reason, + } => write!( + f, + "module `{logical_module}` at `{}` uses an unsupported WebAssembly feature: {reason}", + path.display() + ), + ExternalResolutionError::TooLarge { path, size, limit } => write!( + f, + "external `.wasm` `{}` is {size} bytes, exceeding the {limit}-byte limit", + path.display() + ), + ExternalResolutionError::Signature { + export_field, + error, + } => write!(f, "external fn `{export_field}`: {error}"), + ExternalResolutionError::ModulePath(e) => write!(f, "{e}"), + ExternalResolutionError::Read { path, error } => { + write!(f, "failed to read `{}`: {error}", path.display()) + } + ExternalResolutionError::MissingDeclaration { export_field } => write!( + f, + "internal error: extern `{export_field}` is bound but has no declaration" + ), + } + } +} + +impl std::error::Error for ExternalResolutionError {} + +/// Resolves, validates, and reads every external `.wasm` module a program binds. +/// +/// Returns one resolved module per distinct **logical module** the program +/// binds. Two externs from the same logical module yield a single entry, and a +/// physical `.wasm` file is read and validated once even if two logical modules +/// resolve to it — but each logical module still gets its own entry, because the +/// linker matches an import's recorded `(module, field)` on the logical module. 
+/// The order is deterministic (sorted by logical module name). +/// +/// A program with no externs yields an empty vector, and the build proceeds +/// without invoking the linker. +/// +/// # Errors +/// +/// Returns an [`ExternalResolutionError`] if any extern fails to resolve, +/// validate, lower its signature, or read its bytes. +pub fn resolve_external_modules( + typed_context: &TypedContext, + search_path: &SearchPath, + manifest_deps: Option<&ManifestDeps>, +) -> Result, ExternalResolutionError> { + let origins = typed_context.extern_origins(); + if origins.is_empty() { + return Ok(Vec::new()); + } + + let arena = typed_context.arena(); + + // Cache reads/validations by resolved path so a physical file is read once, + // even when two logical modules resolve to it. + let mut read_cache: BTreeMap> = BTreeMap::new(); + // Output keyed by logical module: the linker matches each import's recorded + // `(module, field)` on the logical module, so every bound logical module + // needs its own entry even if it shares bytes with another. `BTreeMap` keeps + // the output deterministic. + let mut by_module: BTreeMap = BTreeMap::new(); + + for origin in &origins { + let module_path = parse_module_path(&origin.logical_module)?; + let resolved = resolve_wasm_module(&module_path, search_path, manifest_deps) + .map_err(ExternalResolutionError::Resolve)?; + + // For a path seen before, the bytes are already read AND validated; reuse + // them. A fresh path is size-checked, read with a bounded streaming read, + // and validated as a real WASM module before any byte reaches the linker. + let bytes = if let Some(existing) = read_cache.get(&resolved) { + existing.clone() + } else { + let bytes = read_external_module(&resolved)?; + validate_module_bytes(&bytes, &origin.logical_module, &resolved)?; + read_cache.insert(resolved.clone(), bytes.clone()); + bytes + }; + + // Recover the declared signature from the *exact* declaration this + // binding attaches to, by `DefId`. 
Two same-named externs (e.g. a + // top-level and a spec-inner `sort`) must not collide into one slot: + // validating the resolved library against a same-named sibling's + // signature would either reject a matching library or accept a + // mismatching one. Only the bound declaration is the source of truth. + let (args, returns) = extern_declaration(arena, origin.decl).ok_or_else(|| { + ExternalResolutionError::MissingDeclaration { + export_field: origin.export_field.clone(), + } + })?; + let declared_sig = lower_extern_signature(arena, &args, returns).map_err(|error| { + ExternalResolutionError::Signature { + export_field: origin.export_field.clone(), + error, + } + })?; + + validate_extern(&bytes, &origin.export_field, &declared_sig).map_err(|error| { + ExternalResolutionError::Validate { + logical_module: origin.logical_module.clone(), + error: Box::new(error), + } + })?; + + by_module + .entry(origin.logical_module.clone()) + .or_insert(ResolvedExternalModule { + logical_module: origin.logical_module.clone(), + path: resolved, + bytes, + }); + } + + Ok(by_module.into_values().collect()) +} + +/// Reads a resolved external `.wasm` module's bytes, enforcing +/// [`MAX_EXTERNAL_MODULE_BYTES`]. +/// +/// The size is checked twice to defeat a TOCTOU race: once against the file +/// metadata before opening, and once against the actual bytes read. The read is +/// bounded to `limit + 1` bytes via [`Read::take`], so a file that grows past +/// the cap between the `stat` and the read still cannot force an unbounded +/// allocation — the streaming read stops one byte past the limit and the module +/// is rejected. 
+fn read_external_module(path: &std::path::Path) -> Result, ExternalResolutionError> { + let limit = MAX_EXTERNAL_MODULE_BYTES; + + let metadata = std::fs::metadata(path).map_err(|error| ExternalResolutionError::Read { + path: path.to_path_buf(), + error, + })?; + if metadata.len() > limit { + return Err(ExternalResolutionError::TooLarge { + path: path.to_path_buf(), + size: metadata.len(), + limit, + }); + } + + let file = std::fs::File::open(path).map_err(|error| ExternalResolutionError::Read { + path: path.to_path_buf(), + error, + })?; + let mut bytes = Vec::new(); + file.take(limit + 1) + .read_to_end(&mut bytes) + .map_err(|error| ExternalResolutionError::Read { + path: path.to_path_buf(), + error, + })?; + + if bytes.len() as u64 > limit { + return Err(ExternalResolutionError::TooLarge { + path: path.to_path_buf(), + size: bytes.len() as u64, + limit, + }); + } + + Ok(bytes) +} + +/// Runs the linker's supported-version gate over a resolved external module, +/// rejecting any module that is not structurally valid WASM or that uses a +/// feature outside the supported WASM 1.0 subset. +/// +/// `validate_extern` only inspects the exported function's signature; it never +/// decodes bodies, locals, or non-root sections. This gate closes that gap so a +/// malformed-but-decodable external cannot reach the linker, where it would +/// otherwise drive a recoverable error into a panic. +/// +/// The check delegates to [`inference_wasm_linker::validate_external`], the same +/// two-pass gate the linker applies at `link()`, so the CLI rejects a non-1.0 +/// external at the earliest point with the *same* feature-named diagnostic — a +/// single source of truth for the supported-version contract rather than two +/// divergent validations. A structural failure surfaces as +/// [`ExternalResolutionError::Invalid`]; a well-formed-but-unsupported module +/// surfaces as [`ExternalResolutionError::UnsupportedFeature`]. 
+fn validate_module_bytes( + bytes: &[u8], + logical_module: &str, + path: &std::path::Path, +) -> Result<(), ExternalResolutionError> { + inference_wasm_linker::validate_external(logical_module, bytes).map_err(|error| match error { + inference_wasm_linker::LinkError::UnsupportedWasmFeature { details, .. } => { + ExternalResolutionError::UnsupportedFeature { + logical_module: logical_module.to_string(), + path: path.to_path_buf(), + reason: details, + } + } + other => ExternalResolutionError::Invalid { + logical_module: logical_module.to_string(), + path: path.to_path_buf(), + reason: other.to_string(), + }, + }) +} + +/// Splits a `::`-joined logical module string into a validated [`ModulePath`]. +fn parse_module_path(logical_module: &str) -> Result { + ModulePath::from_segments(logical_module.split("::")) + .map_err(ExternalResolutionError::ModulePath) +} + +/// The declared argument list and return type of the `external fn` at `decl`. +/// +/// Resolving by [`DefId`] (rather than by bare name) is what lets the driver +/// validate a bound extern against its *own* declaration when two same-named +/// externs exist — the top-level and a spec-inner `sort` no longer collide into +/// one signature slot. +fn extern_declaration( + arena: &inference_ast::arena::AstArena, + decl: inference_ast::ids::DefId, +) -> Option<( + Vec, + Option, +)> { + match &arena[decl].kind { + Def::ExternFunction { args, returns, .. } => Some((args.clone(), *returns)), + _ => None, + } +} + +#[cfg(test)] +mod tests { + //! Unit tests for the driver error diagnostics. Each `ExternalResolutionError` + //! variant renders a distinct, actionable message; these assert the rendered + //! text so a future refactor cannot silently drop the context a build needs. 
+ + use super::*; + use crate::wasm_link::resolve::{ModulePathError, ResolveError}; + use crate::wasm_link::validate::{ + DeclaredSignature, LowerSignatureError, SignatureMismatch, ValidateError, WasmValType, + }; + + #[test] + fn resolve_error_display_forwards_inner_message() { + let inner = ResolveError::NotFound { + logical_name: "sorting".into(), + searched: vec![PathBuf::from("lib").join("sorting.wasm")], + }; + let rendered = ExternalResolutionError::Resolve(inner).to_string(); + assert!(rendered.contains("sorting"), "{rendered}"); + } + + #[test] + fn validate_error_display_names_the_module() { + let rendered = ExternalResolutionError::Validate { + logical_module: "crypto::sha256".into(), + error: Box::new(ValidateError::SignatureMismatch { + export_field: "hash".into(), + mismatch: SignatureMismatch { + declared: DeclaredSignature { + params: vec![WasmValType::I32], + results: vec![WasmValType::I32], + }, + found_params: vec![WasmValType::I64], + found_results: vec![WasmValType::I32], + }, + }), + } + .to_string(); + assert!(rendered.contains("crypto::sha256"), "names the module: {rendered}"); + assert!(rendered.contains("hash"), "names the export: {rendered}"); + } + + #[test] + fn signature_error_display_names_the_export() { + let rendered = ExternalResolutionError::Signature { + export_field: "f".into(), + error: LowerSignatureError::UnitParameter, + } + .to_string(); + assert!(rendered.contains("external fn `f`"), "{rendered}"); + assert!(rendered.contains("unit"), "{rendered}"); + } + + #[test] + fn module_path_error_display_forwards_inner_message() { + let rendered = + ExternalResolutionError::ModulePath(ModulePathError::Empty).to_string(); + assert!(!rendered.is_empty(), "empty module path renders a message"); + } + + #[test] + fn read_error_display_shows_path() { + let rendered = ExternalResolutionError::Read { + path: PathBuf::from("vendor").join("arith.wasm"), + error: std::io::Error::new(std::io::ErrorKind::NotFound, "missing"), + } + 
.to_string(); + assert!(rendered.contains("arith.wasm"), "names the path: {rendered}"); + assert!(rendered.contains("missing"), "carries the io error: {rendered}"); + } + + #[test] + fn missing_declaration_display_is_an_internal_error() { + let rendered = ExternalResolutionError::MissingDeclaration { + export_field: "ghost".into(), + } + .to_string(); + assert!(rendered.contains("ghost"), "{rendered}"); + assert!(rendered.contains("internal error"), "{rendered}"); + } +} diff --git a/core/inference/src/wasm_link/mod.rs b/core/inference/src/wasm_link/mod.rs new file mode 100644 index 00000000..096fcbcb --- /dev/null +++ b/core/inference/src/wasm_link/mod.rs @@ -0,0 +1,29 @@ +//! Front-end support for linking external `.wasm` modules. +//! +//! This module hosts the **driver-side** half of Issue #9's `.wasm` static-merge +//! feature — everything that runs before any bytes are merged: +//! +//! - [`resolve`] turns a logical module reference (`use { f } from a::b;`) into a +//! concrete `.wasm` [`std::path::PathBuf`], portably and with a precise miss +//! diagnostic. +//! - [`validate`] confirms that a resolved `.wasm` actually exports the named +//! function and that its signature matches the `external fn` declaration. +//! +//! The later codegen and merge phases (a dedicated `core/wasm-linker/` crate) +//! consume the validated bindings these utilities produce. 
+ +pub mod driver; +pub mod resolve; +pub mod validate; + +pub use driver::{ + resolve_external_modules, ExternalResolutionError, ResolvedExternalModule, + MAX_EXTERNAL_MODULE_BYTES, +}; +pub use resolve::{ + resolve_wasm_module, ManifestDeps, ModulePath, ModulePathError, ResolveError, SearchPath, +}; +pub use validate::{ + lower_extern_signature, validate_extern, DeclaredSignature, LowerSignatureError, + SignatureMismatch, ValidateError, WasmValType, +}; diff --git a/core/inference/src/wasm_link/resolve.rs b/core/inference/src/wasm_link/resolve.rs new file mode 100644 index 00000000..aaf2c889 --- /dev/null +++ b/core/inference/src/wasm_link/resolve.rs @@ -0,0 +1,438 @@ +//! Platform-independent resolution of a logical module reference to a `.wasm` +//! file on disk. +//! +//! Source never names a filesystem path (no `./`, no OS separators); it names a +//! *logical* module — a `::`-separated identifier path mirrored by +//! [`inference_ast::nodes::ModuleRef`]. This module turns that logical name into +//! a concrete [`PathBuf`] by searching, in priority order: +//! +//! 1. **manifest** dependency entries (`Inference.toml [wasm-dependencies]`, +//! delivered fully in a later phase — accepted here as a stub map so the +//! precedence is wired from the start), +//! 2. **`-L` / `--wasm-lib-dir`** directories, +//! 3. **`INFERENCE_*`** environment directories. +//! +//! The `-L` and environment directories arrive already concatenated in +//! [`SearchPath::dirs`] in exactly that order, so the resolver walks them +//! front-to-back. A logical name `a::b` maps to the relative path `a/b.wasm` +//! using [`std::path::MAIN_SEPARATOR`] (via [`Path::join`]) at resolve time, so +//! the same source resolves identically on every operating system. + +use std::path::{Path, PathBuf}; + +use inference_ast::arena::AstArena; +use inference_ast::nodes::ModuleRef; +use rustc_hash::FxHashMap; + +/// File extension of a compiled WebAssembly module. 
+const WASM_EXTENSION: &str = "wasm"; + +/// A logical, platform-independent module name as a sequence of identifier +/// segments (e.g. `crypto::sha256` → `["crypto", "sha256"]`). +/// +/// The segments are validated to be non-empty and free of path separators at +/// construction, so mapping them onto a [`Path`] can never escape the search +/// directory or smuggle in an OS separator. +#[derive(Clone, Debug, PartialEq, Eq, Hash)] +pub struct ModulePath { + segments: Vec, +} + +/// Reason a logical name could not be turned into a [`ModulePath`]. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum ModulePathError { + /// The reference had no segments at all. + Empty, + /// A segment was empty or contained a path separator / `.` component. + InvalidSegment(String), +} + +impl std::fmt::Display for ModulePathError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + ModulePathError::Empty => write!(f, "module reference has no path segments"), + ModulePathError::InvalidSegment(seg) => { + write!(f, "invalid module path segment `{seg}`") + } + } + } +} + +impl std::error::Error for ModulePathError {} + +impl ModulePath { + /// Builds a [`ModulePath`] from already-owned segment strings, validating each. + /// + /// # Errors + /// + /// Returns [`ModulePathError::Empty`] if there are no segments, or + /// [`ModulePathError::InvalidSegment`] if a segment is empty, a `.`/`..` + /// component, or contains a path separator. + pub fn from_segments(segments: I) -> Result + where + I: IntoIterator, + S: Into, + { + let segments: Vec = segments.into_iter().map(Into::into).collect(); + if segments.is_empty() { + return Err(ModulePathError::Empty); + } + for segment in &segments { + if segment.is_empty() + || segment == "." + || segment == ".." 
+ || segment.contains('/') + || segment.contains('\\') + { + return Err(ModulePathError::InvalidSegment(segment.clone())); + } + } + Ok(ModulePath { segments }) + } + + /// Builds a [`ModulePath`] from a parsed [`ModuleRef`], resolving each + /// identifier index against `arena`. + /// + /// # Errors + /// + /// Propagates the validation errors of [`ModulePath::from_segments`]. + pub fn from_module_ref( + module_ref: &ModuleRef, + arena: &AstArena, + ) -> Result { + Self::from_segments(module_ref.segments.iter().map(|&id| arena.ident_name(id))) + } + + /// The logical name in `a::b` form, for diagnostics. + #[must_use] + pub fn display_name(&self) -> String { + self.segments.join("::") + } + + /// The relative filesystem path this logical name maps to, e.g. + /// `crypto::sha256` → `crypto/sha256.wasm` (with the host separator). + /// + /// Built exclusively through [`Path::join`] / [`Path::with_extension`], so no + /// literal separator ever appears in source or here. + #[must_use] + pub fn to_relative_path(&self) -> PathBuf { + let mut path = PathBuf::new(); + for segment in &self.segments { + path.push(segment); + } + path.with_extension(WASM_EXTENSION) + } +} + +/// Ordered search directories for the resolver. `-L` directories precede +/// `INFERENCE_*` environment directories; callers assemble them in that order. +#[derive(Debug, Default, Clone)] +pub struct SearchPath { + dirs: Vec, +} + +impl SearchPath { + /// Creates an empty search path. + #[must_use] + pub fn new() -> Self { + Self::default() + } + + /// Appends a `-L` / `--wasm-lib-dir` directory (highest of the directory tiers). + /// + /// An empty path is dropped: a bare `dir.join(relative)` against an empty + /// directory resolves against the process CWD, silently turning the build + /// directory into a `.wasm` search root. 
+ pub fn push_lib_dir(&mut self, dir: impl Into) { + let dir = dir.into(); + if dir.as_os_str().is_empty() { + return; + } + self.dirs.push(dir); + } + + /// Appends an `INFERENCE_*` environment directory (lowest tier). + /// + /// An empty path is dropped, for the same reason as [`Self::push_lib_dir`]. + pub fn push_env_dir(&mut self, dir: impl Into) { + let dir = dir.into(); + if dir.as_os_str().is_empty() { + return; + } + self.dirs.push(dir); + } + + /// The directories in resolution order. + #[must_use] + pub fn dirs(&self) -> &[PathBuf] { + &self.dirs + } +} + +/// Manifest-declared `.wasm` dependencies (`Inference.toml [wasm-dependencies]`). +/// +/// Phase 0 accepts this as a plain logical-name → file map so the resolver's +/// precedence is exercised; the manifest *producer* lands in a later phase. +#[derive(Debug, Default, Clone)] +pub struct ManifestDeps { + entries: FxHashMap, +} + +impl ManifestDeps { + /// Creates an empty manifest dependency set. + #[must_use] + pub fn new() -> Self { + Self::default() + } + + /// Declares that logical `name` resolves to `path`. + pub fn insert(&mut self, name: impl Into, path: impl Into) { + self.entries.insert(name.into(), path.into()); + } + + /// The manifest entry for `name`, if any. + #[must_use] + pub fn get(&self, name: &str) -> Option<&Path> { + self.entries.get(name).map(PathBuf::as_path) + } +} + +/// Failure to resolve a logical module reference to a `.wasm` file. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum ResolveError { + /// No candidate path existed under any searched location. + NotFound { + /// The logical name in `a::b` form. + logical_name: String, + /// Every absolute/relative candidate that was probed, in order. + searched: Vec, + }, + /// The manifest named a path that does not exist on disk. + ManifestPathMissing { + /// The logical name in `a::b` form. + logical_name: String, + /// The path the manifest pointed at. 
+ path: PathBuf, + }, +} + +impl std::fmt::Display for ResolveError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + ResolveError::NotFound { + logical_name, + searched, + } => { + writeln!( + f, + "could not resolve module `{logical_name}` to a `.wasm` file" + )?; + if searched.is_empty() { + write!(f, " (no search directories were configured)") + } else { + writeln!(f, " searched the following locations:")?; + for (i, path) in searched.iter().enumerate() { + let last = i + 1 == searched.len(); + if last { + write!(f, " - {}", path.display())?; + } else { + writeln!(f, " - {}", path.display())?; + } + } + Ok(()) + } + } + ResolveError::ManifestPathMissing { logical_name, path } => { + write!( + f, + "manifest declares module `{logical_name}` at `{}`, but no file exists there", + path.display() + ) + } + } + } +} + +impl std::error::Error for ResolveError {} + +/// Resolves a logical module reference to a concrete `.wasm` file. +/// +/// Order: `manifest_deps` (if given) → `search_path` directories (`-L` then env). +/// The logical name `a::b` maps to the relative path `a/b.wasm` under each +/// directory, using the host separator via [`Path::join`]. +/// +/// # Errors +/// +/// Returns [`ResolveError::ManifestPathMissing`] when the manifest names a file +/// that does not exist, and [`ResolveError::NotFound`] when no candidate exists +/// under any searched location (the error lists every probed path). 
+pub fn resolve_wasm_module(
+    logical_name: &ModulePath,
+    search_path: &SearchPath,
+    manifest_deps: Option<&ManifestDeps>,
+) -> Result<PathBuf, ResolveError> {
+    if let Some(path) = manifest_deps.and_then(|m| m.get(&logical_name.display_name())) {
+        if path.is_file() {
+            return Ok(path.to_path_buf());
+        }
+        return Err(ResolveError::ManifestPathMissing {
+            logical_name: logical_name.display_name(),
+            path: path.to_path_buf(),
+        });
+    }
+
+    let relative = logical_name.to_relative_path();
+    let mut searched = Vec::with_capacity(search_path.dirs().len());
+    for dir in search_path.dirs() {
+        let candidate = dir.join(&relative);
+        if candidate.is_file() {
+            return Ok(candidate);
+        }
+        searched.push(candidate);
+    }
+
+    Err(ResolveError::NotFound {
+        logical_name: logical_name.display_name(),
+        searched,
+    })
+}
+
+#[cfg(test)]
+mod tests {
+    //! Unit tests for module-path construction from the AST and the error
+    //! diagnostics' rendered form. The resolution-precedence behaviour itself is
+    //! covered by the integration suite in `tests/wasm_resolve.rs`; these focus
+    //! on the AST bridge and the `Display` rendering the integration tests reach
+    //! only partially.
+
+    use super::*;
+    use inference_ast::nodes::Directive;
+
+    /// Parses `source` and returns the `ModuleRef` of its first `use … from …;`.
+    fn first_module_ref(source: &str) -> (inference_ast::arena::AstArena, ModuleRef) {
+        let arena = crate::parse(source).expect("source parses");
+        let module_ref = arena
+            .source_files()
+            .flat_map(|file| file.directives.iter())
+            .find_map(|directive| {
+                let Directive::Use(use_dir) = directive;
+                use_dir.from.clone()
+            })
+            .expect("a `use … from …;` directive");
+        (arena, module_ref)
+    }
+
+    #[test]
+    fn module_path_from_a_parsed_use_directive() {
+        let (arena, module_ref) = first_module_ref(
+            "external fn hash(a: i32) -> i32;\n\
+             use { hash } from crypto::sha256;",
+        );
+        let path = ModulePath::from_module_ref(&module_ref, &arena).expect("valid module ref");
+        assert_eq!(path.display_name(), "crypto::sha256");
+
+        let components: Vec<_> = path
+            .to_relative_path()
+            .components()
+            .map(|c| c.as_os_str().to_string_lossy().into_owned())
+            .collect();
+        assert_eq!(
+            components,
+            ["crypto", "sha256.wasm"],
+            "the relative path uses host separators, never a literal slash"
+        );
+    }
+
+    #[test]
+    fn single_segment_use_directive_maps_to_a_flat_file() {
+        let (arena, module_ref) = first_module_ref(
+            "external fn sum(a: i32) -> i32;\n\
+             use { sum } from arith;",
+        );
+        let path = ModulePath::from_module_ref(&module_ref, &arena).expect("valid module ref");
+        assert_eq!(path.display_name(), "arith");
+        assert_eq!(path.to_relative_path(), PathBuf::from("arith.wasm"));
+    }
+
+    #[test]
+    fn module_path_error_display_renders_both_variants() {
+        assert!(ModulePathError::Empty
+            .to_string()
+            .contains("no path segments"));
+        assert!(ModulePathError::InvalidSegment("a/b".into())
+            .to_string()
+            .contains("a/b"));
+    }
+
+    #[test]
+    fn not_found_display_lists_every_searched_location() {
+        let rendered = ResolveError::NotFound {
+            logical_name: "crypto::sha256".into(),
+            searched: vec![
+                PathBuf::from("lib").join("crypto").join("sha256.wasm"),
+                PathBuf::from("env").join("crypto").join("sha256.wasm"),
+            ],
+        }
+        .to_string();
+        assert!(rendered.contains("crypto::sha256"), "names the module");
+        assert!(rendered.contains("searched the following locations"), "{rendered}");
+        // Both probed paths appear in the rendered message.
+        let lib_line = Path::new("lib").join("crypto").join("sha256.wasm").display().to_string();
+        let env_line = Path::new("env").join("crypto").join("sha256.wasm").display().to_string();
+        assert!(rendered.contains(&lib_line), "lists first path: {rendered}");
+        assert!(rendered.contains(&env_line), "lists last path: {rendered}");
+    }
+
+    #[test]
+    fn empty_search_dirs_are_dropped() {
+        // An empty `PathBuf` from a stray separator in `INFERENCE_WASM_LIB_PATH`
+        // (or an empty `-L`) must never become a search root: `dir.join(rel)`
+        // against an empty dir resolves relative to the process CWD. Both push
+        // entry points drop it, so a path built only from empty entries searches
+        // nothing — identical to no directories being configured at all.
+        let mut search = SearchPath::new();
+        search.push_env_dir(PathBuf::new());
+        search.push_lib_dir(PathBuf::from(""));
+        assert!(
+            search.dirs().is_empty(),
+            "empty directory entries must be dropped, got {:?}",
+            search.dirs()
+        );
+
+        let module = ModulePath::from_segments(["arith"]).unwrap();
+        let err = resolve_wasm_module(&module, &search, None).unwrap_err();
+        let ResolveError::NotFound { searched, .. } = err else {
+            panic!("expected NotFound, got {err:?}");
+        };
+        assert!(
+            searched.is_empty(),
+            "an all-empty search path probes nothing, like an unset path"
+        );
+    }
+
+    #[test]
+    fn non_empty_dirs_are_kept_alongside_dropped_empties() {
+        // A real directory survives even when interleaved with empty entries,
+        // mirroring `"/real/dir:"` splitting into `["/real/dir", ""]`.
+        let mut search = SearchPath::new();
+        search.push_env_dir(PathBuf::new());
+        search.push_env_dir(PathBuf::from("real"));
+        search.push_env_dir(PathBuf::new());
+        assert_eq!(search.dirs(), [PathBuf::from("real")]);
+    }
+
+    #[test]
+    fn manifest_path_missing_display_names_the_module_and_path() {
+        let path = PathBuf::from("vendor").join("missing.wasm");
+        let rendered = ResolveError::ManifestPathMissing {
+            logical_name: "sorting".into(),
+            path: path.clone(),
+        }
+        .to_string();
+        assert!(rendered.contains("sorting"), "names the module: {rendered}");
+        assert!(
+            rendered.contains(&path.display().to_string()),
+            "names the declared path: {rendered}"
+        );
+    }
+}
diff --git a/core/inference/src/wasm_link/validate.rs b/core/inference/src/wasm_link/validate.rs
new file mode 100644
index 00000000..f649832b
--- /dev/null
+++ b/core/inference/src/wasm_link/validate.rs
@@ -0,0 +1,623 @@
+//! Compile-time validation of an `external fn` declaration against the real
+//! `.wasm` module that is expected to provide it.
+//!
+//! After a logical module reference is [resolved](super::resolve), the compiler
+//! must confirm two things about the resolved binary before trusting the binding:
+//!
+//! 1. the named `export_field` is actually an **exported function**, and
+//! 2. that function's WASM signature **matches** the lowering of the
+//!    `external fn` declaration (parameter and result value types, in order).
+//!
+//! The two failure modes carry **distinct** error variants
+//! ([`ValidateError::ExportNotFound`] vs [`ValidateError::SignatureMismatch`]) so
+//! callers can report precisely what went wrong.
+//!
+//! ## Signature lowering
+//!
+//! Inference primitive types lower to WASM value types exactly as `wasm-codegen`
+//! does: `bool`, `i8`/`u8`, `i16`/`u16`, `i32`/`u32`, arrays, and struct/enum
+//! pointers become `i32`; `i64`/`u64` become `i64`; `unit` produces no value.
+//! Keeping this in lock-step with codegen is what makes validation meaningful —
+//! 
a mismatch here is a real mismatch at link time.
+
+use inf_wasmparser::{
+    CompositeInnerType, Export, ExternalKind, FuncType, Parser, Payload, RecGroup, ValType,
+};
+
+use inference_ast::arena::AstArena;
+use inference_ast::ids::TypeId;
+use inference_ast::nodes::{ArgKind, SimpleTypeKind, TypeNode};
+
+/// Maximum number of exported function names listed in an
+/// [`ValidateError::ExportNotFound`] hint before the rest are summarized as a
+/// count. Bounds the diagnostic against a module exporting thousands of names.
+const MAX_LISTED_EXPORTS: usize = 20;
+
+/// A WASM value type, restricted to the kinds Inference codegen emits.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum WasmValType {
+    I32,
+    I64,
+}
+
+impl WasmValType {
+    fn from_parser(val: ValType) -> Option<Self> {
+        match val {
+            ValType::I32 => Some(WasmValType::I32),
+            ValType::I64 => Some(WasmValType::I64),
+            _ => None,
+        }
+    }
+}
+
+impl std::fmt::Display for WasmValType {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            WasmValType::I32 => write!(f, "i32"),
+            WasmValType::I64 => write!(f, "i64"),
+        }
+    }
+}
+
+/// The lowered WASM signature of an `external fn` declaration.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct DeclaredSignature {
+    pub params: Vec<WasmValType>,
+    pub results: Vec<WasmValType>,
+}
+
+/// Reason an `external fn` type could not be lowered to a WASM value type.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum LowerSignatureError {
+    /// A parameter was declared `unit`, which has no WASM value representation.
+    UnitParameter,
+    /// A type form that codegen does not lower to a scalar value type
+    /// (e.g. a generic or function type) appeared in the signature.
+    UnsupportedType { rendered: String },
+}
+
+impl std::fmt::Display for LowerSignatureError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            LowerSignatureError::UnitParameter => {
+                write!(f, "`unit` cannot appear as an external function parameter")
+            }
+            LowerSignatureError::UnsupportedType { rendered } => {
+                write!(f, "unsupported type in external function signature: {rendered}")
+            }
+        }
+    }
+}
+
+impl std::error::Error for LowerSignatureError {}
+
+/// Lowers an Inference type to its WASM value type, mirroring
+/// `wasm-codegen`'s `val_type_from_type_id`. `unit` lowers to `None`
+/// (no value); struct/enum names lower to an `i32` pointer; other forms are rejected.
+fn lower_value_type(arena: &AstArena, ty: TypeId) -> Result<Option<WasmValType>, LowerSignatureError> {
+    match &arena[ty].kind {
+        TypeNode::Simple(SimpleTypeKind::Unit) => Ok(None),
+        TypeNode::Simple(
+            SimpleTypeKind::Bool
+            | SimpleTypeKind::I8
+            | SimpleTypeKind::U8
+            | SimpleTypeKind::I16
+            | SimpleTypeKind::U16
+            | SimpleTypeKind::I32
+            | SimpleTypeKind::U32,
+        )
+        | TypeNode::Array { .. }
+        // Struct / enum values are i32 pointers into linear memory, matching codegen.
+        | TypeNode::Custom(_) => Ok(Some(WasmValType::I32)),
+        TypeNode::Simple(SimpleTypeKind::I64 | SimpleTypeKind::U64) => Ok(Some(WasmValType::I64)),
+        // `Generic`, `Function`, and qualified-name forms are not lowered to a
+        // scalar value type by codegen (it `todo!()`s on them); reject them here
+        // with a clear error rather than guessing a representation.
+        other => Err(LowerSignatureError::UnsupportedType {
+            rendered: format!("{other:?}"),
+        }),
+    }
+}
+
+/// Lowers an `external fn`'s declared argument types and return type into a
+/// [`DeclaredSignature`] of WASM value types.
+///
+/// # Errors
+///
+/// Returns [`LowerSignatureError`] if a parameter is `unit` or a type form is
+/// not lowerable to a scalar value type. A `unit` return is valid and yields an
+/// empty `results` list.
+pub fn lower_extern_signature(
+    arena: &AstArena,
+    args: &[inference_ast::nodes::ArgData],
+    returns: Option<TypeId>,
+) -> Result<DeclaredSignature, LowerSignatureError> {
+    let mut params = Vec::with_capacity(args.len());
+    for arg in args {
+        let ty = match arg.kind {
+            ArgKind::Named { ty, .. } | ArgKind::Ignored { ty } | ArgKind::TypeOnly(ty) => ty,
+            // `external fn` declarations have no receiver; the type-checker now
+            // rejects a `self` here (H7). Drop it with no param so this validator
+            // genuinely agrees with codegen — which also emits no receiver — and
+            // a mismatching export is reported as a `SignatureMismatch` rather
+            // than silently validating against an extra i32 the call never pushes.
+            ArgKind::SelfRef { .. } => continue,
+        };
+        match lower_value_type(arena, ty)? {
+            Some(val) => params.push(val),
+            None => return Err(LowerSignatureError::UnitParameter),
+        }
+    }
+
+    let results = match returns {
+        Some(ty) => lower_value_type(arena, ty)?.into_iter().collect(),
+        None => Vec::new(),
+    };
+
+    Ok(DeclaredSignature { params, results })
+}
+
+/// A WASM signature mismatch, rendered for diagnostics.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct SignatureMismatch {
+    pub declared: DeclaredSignature,
+    pub found_params: Vec<WasmValType>,
+    pub found_results: Vec<WasmValType>,
+}
+
+/// Failure of [`validate_extern`].
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum ValidateError {
+    /// The `.wasm` bytes could not be parsed.
+    Parse(String),
+    /// No exported **function** named `export_field` exists in the module.
+    ExportNotFound {
+        export_field: String,
+        /// Names of the functions the module *does* export, for a helpful hint.
+        available_functions: Vec<String>,
+    },
+    /// The export exists and is a function, but its signature differs from the
+    /// lowered `external fn` declaration.
+    SignatureMismatch {
+        export_field: String,
+        mismatch: SignatureMismatch,
+    },
+    /// The exported function's signature uses a WASM value type Inference does
+    /// not model (e.g. `f64`), so it cannot back an `external fn`.
+    UnsupportedExportType { export_field: String },
+}
+
+impl std::fmt::Display for ValidateError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            ValidateError::Parse(msg) => write!(f, "failed to parse external `.wasm`: {msg}"),
+            ValidateError::ExportNotFound {
+                export_field,
+                available_functions,
+            } => {
+                write!(
+                    f,
+                    "external module has no exported function `{export_field}`"
+                )?;
+                if !available_functions.is_empty() {
+                    // Cap the hint so an adversarial module exporting thousands of
+                    // functions cannot flood stderr; the count covers the rest.
+                    let shown = available_functions
+                        .iter()
+                        .take(MAX_LISTED_EXPORTS)
+                        .map(String::as_str)
+                        .collect::<Vec<_>>()
+                        .join(", ");
+                    write!(f, " (exported functions: {shown}")?;
+                    let hidden = available_functions.len().saturating_sub(MAX_LISTED_EXPORTS);
+                    if hidden > 0 {
+                        write!(f, ", ... and {hidden} more")?;
+                    }
+                    write!(f, ")")?;
+                }
+                Ok(())
+            }
+            ValidateError::SignatureMismatch {
+                export_field,
+                mismatch,
+            } => {
+                write!(
+                    f,
+                    "signature mismatch for external function `{export_field}`: declared {}, found {}",
+                    render_signature(&mismatch.declared.params, &mismatch.declared.results),
+                    render_signature(&mismatch.found_params, &mismatch.found_results),
+                )
+            }
+            ValidateError::UnsupportedExportType { export_field } => write!(
+                f,
+                "exported function `{export_field}` uses a WASM value type Inference does not model"
+            ),
+        }
+    }
+}
+
+impl std::error::Error for ValidateError {}
+
+fn render_signature(params: &[WasmValType], results: &[WasmValType]) -> String {
+    let p = params
+        .iter()
+        .map(ToString::to_string)
+        .collect::<Vec<_>>()
+        .join(", ");
+    let r = results
+        .iter()
+        .map(ToString::to_string)
+        .collect::<Vec<_>>()
+        .join(", ");
+    format!("({p}) -> ({r})")
+}
+
+/// Validates that `wasm_bytes` exports a function named `export_field` whose
+/// signature equals `declared_sig`.
+///
+/// # Errors
+///
+/// - [`ValidateError::Parse`] if the bytes fail to decode or the export's function has no type.
+/// - [`ValidateError::ExportNotFound`] if no exported *function* of that name
+///   exists (a non-function export of the same name is treated as "not found").
+/// - [`ValidateError::SignatureMismatch`] if the function exists but its
+///   parameters or results differ from `declared_sig`.
+/// - [`ValidateError::UnsupportedExportType`] if the export uses a value type
+///   Inference does not model.
+pub fn validate_extern(
+    wasm_bytes: &[u8],
+    export_field: &str,
+    declared_sig: &DeclaredSignature,
+) -> Result<(), ValidateError> {
+    let module = ParsedModule::parse(wasm_bytes)?;
+
+    let Some(func_index) = module.exported_function_index(export_field) else {
+        return Err(ValidateError::ExportNotFound {
+            export_field: export_field.to_string(),
+            available_functions: module.exported_function_names(),
+        });
+    };
+
+    let func_type = module
+        .function_type(func_index)
+        .ok_or_else(|| ValidateError::Parse(format!(
+            "export `{export_field}` references function index {func_index} with no type"
+        )))?;
+
+    let found_params = to_val_types(func_type.params(), export_field)?;
+    let found_results = to_val_types(func_type.results(), export_field)?;
+
+    if found_params == declared_sig.params && found_results == declared_sig.results {
+        Ok(())
+    } else {
+        Err(ValidateError::SignatureMismatch {
+            export_field: export_field.to_string(),
+            mismatch: SignatureMismatch {
+                declared: declared_sig.clone(),
+                found_params,
+                found_results,
+            },
+        })
+    }
+}
+
+fn to_val_types(
+    types: &[ValType],
+    export_field: &str,
+) -> Result<Vec<WasmValType>, ValidateError> {
+    types
+        .iter()
+        .map(|&v| {
+            WasmValType::from_parser(v).ok_or_else(|| ValidateError::UnsupportedExportType {
+                export_field: export_field.to_string(),
+            })
+        })
+        .collect()
+}
+
+/// The subset of a parsed WASM module needed for export-signature validation:
+/// the function-type table, the per-function type indices (imports first, then
+/// locally-defined functions), and the function exports.
+struct ParsedModule {
+    /// Types indexed by their position in the module's type section. Non-function
+    /// composite types occupy a `None` slot so that every function-section type
+    /// index stays aligned with the section it came from.
+    types: Vec<Option<FuncType>>,
+    /// Type index for each function, ordered by function index. Imported
+    /// functions occupy the lowest indices, then locally-defined functions.
+    func_type_indices: Vec<u32>,
+    /// `export name → function index` for every function export.
+    function_exports: Vec<(String, u32)>,
+}
+
+impl ParsedModule {
+    fn parse(wasm_bytes: &[u8]) -> Result<Self, ValidateError> {
+        let mut types = Vec::new();
+        let mut func_type_indices = Vec::new();
+        let mut function_exports = Vec::new();
+
+        for payload in Parser::new(0).parse_all(wasm_bytes) {
+            let payload = payload.map_err(|e| ValidateError::Parse(e.to_string()))?;
+            match payload {
+                Payload::TypeSection(reader) => {
+                    for group in reader {
+                        let group = group.map_err(|e| ValidateError::Parse(e.to_string()))?;
+                        collect_types(&group, &mut types);
+                    }
+                }
+                Payload::ImportSection(reader) => {
+                    for import in reader {
+                        let import = import.map_err(|e| ValidateError::Parse(e.to_string()))?;
+                        if let inf_wasmparser::TypeRef::Func(type_idx) = import.ty {
+                            func_type_indices.push(type_idx);
+                        }
+                    }
+                }
+                Payload::FunctionSection(reader) => {
+                    for type_idx in reader {
+                        let type_idx = type_idx.map_err(|e| ValidateError::Parse(e.to_string()))?;
+                        func_type_indices.push(type_idx);
+                    }
+                }
+                Payload::ExportSection(reader) => {
+                    for export in reader {
+                        let export = export.map_err(|e| ValidateError::Parse(e.to_string()))?;
+                        let Export { name, kind, index } = export;
+                        if kind == ExternalKind::Func {
+                            function_exports.push((name.to_string(), index));
+                        }
+                    }
+                }
+                _ => {}
+            }
+        }
+
+        Ok(ParsedModule {
+            types,
+            func_type_indices,
+            function_exports,
+        })
+    }
+
+    fn exported_function_index(&self, name: &str) -> Option<u32> {
+        self.function_exports
+            .iter()
+            .find(|(export_name, _)| export_name == name)
+            .map(|(_, index)| *index)
+    }
+
+    fn exported_function_names(&self) -> Vec<String> {
+        self.function_exports
+            .iter()
+            .map(|(name, _)| name.clone())
+            .collect()
+    }
+
+    fn function_type(&self, func_index: u32) -> Option<&FuncType> {
+        let type_index = *self.func_type_indices.get(func_index as usize)?;
+        self.types.get(type_index as usize)?.as_ref()
+    }
+}
+
+/// Appends each type in a `RecGroup` to `out` in type-section order, keeping a
+/// `None` slot for non-function composite types so that function-section type
+/// indices remain aligned with the type section they reference.
+fn collect_types(group: &RecGroup, out: &mut Vec<Option<FuncType>>) {
+    for sub_type in group.types() {
+        match &sub_type.composite_type.inner {
+            CompositeInnerType::Func(func_type) => out.push(Some(func_type.clone())),
+            _ => out.push(None),
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    //! Unit tests for signature lowering, the diagnostic `Display` impls, and the
+    //! `f64`-export rejection — the parts the integration suite drives only for
+    //! the happy path.
+
+    use super::*;
+    use crate::parse;
+    use inference_ast::nodes::Def;
+
+    /// Lowers the first `external fn` found in `source` (descending into specs).
+    fn lower_first_extern(source: &str) -> Result<DeclaredSignature, LowerSignatureError> {
+        let arena = parse(source).expect("source parses");
+        let extern_def = arena
+            .source_files()
+            .flat_map(|file| file.defs.iter().copied())
+            .find_map(|def_id| find_extern(&arena, def_id))
+            .expect("an external fn");
+        let Def::ExternFunction { args, returns, .. } = &arena[extern_def].kind else {
+            unreachable!("find_extern only yields externs");
+        };
+        lower_extern_signature(&arena, args, *returns)
+    }
+
+    fn find_extern(
+        arena: &inference_ast::arena::AstArena,
+        def_id: inference_ast::ids::DefId,
+    ) -> Option<inference_ast::ids::DefId> {
+        match &arena[def_id].kind {
+            Def::ExternFunction { .. } => Some(def_id),
+            Def::Spec { defs, .. } => defs.iter().find_map(|&inner| find_extern(arena, inner)),
+            _ => None,
+        }
+    }
+
+    #[test]
+    fn lowers_scalar_and_pointer_types_to_value_types() {
+        // bool/u32/array/struct-name all lower to i32; i64/u64 to i64.
+        let sig = lower_first_extern(
+            "struct P { x: i32; }\n\
+             external fn f(a: bool, b: u32, c: i64, d: [i32; 4], e: P) -> u64;",
+        )
+        .expect("lowers");
+        assert_eq!(
+            sig.params,
+            vec![
+                WasmValType::I32,
+                WasmValType::I32,
+                WasmValType::I64,
+                WasmValType::I32,
+                WasmValType::I32,
+            ],
+            "bool/u32/array/struct lower to i32; i64 stays i64"
+        );
+        assert_eq!(sig.results, vec![WasmValType::I64]);
+    }
+
+    #[test]
+    fn unit_return_lowers_to_no_results() {
+        // The unit type is written `()`; a unit return produces no WASM result.
+        let sig = lower_first_extern("external fn f(a: i32) -> ();").expect("lowers");
+        assert_eq!(sig.params, vec![WasmValType::I32]);
+        assert!(sig.results.is_empty(), "a unit return yields no result value");
+    }
+
+    #[test]
+    fn unit_parameter_is_rejected() {
+        let err = lower_first_extern("external fn f(a: ()) -> i32;")
+            .expect_err("unit parameter must be rejected");
+        assert_eq!(err, LowerSignatureError::UnitParameter);
+    }
+
+    #[test]
+    fn f64_export_is_an_unsupported_export_type() {
+        // The module exports a function taking `f64` — a value type Inference does
+        // not model — so validation must reject it as unsupported, not as a
+        // signature mismatch.
+        let mut module = wasm_encoder::Module::new();
+        let mut types = wasm_encoder::TypeSection::new();
+        types
+            .ty()
+            .function([wasm_encoder::ValType::F64], [wasm_encoder::ValType::F64]);
+        module.section(&types);
+        let mut funcs = wasm_encoder::FunctionSection::new();
+        funcs.function(0);
+        module.section(&funcs);
+        let mut exports = wasm_encoder::ExportSection::new();
+        exports.export("f", wasm_encoder::ExportKind::Func, 0);
+        module.section(&exports);
+        let mut code = wasm_encoder::CodeSection::new();
+        let mut func = wasm_encoder::Function::new([]);
+        func.instruction(&wasm_encoder::Instruction::LocalGet(0));
+        func.instruction(&wasm_encoder::Instruction::End);
+        code.function(&func);
+        module.section(&code);
+        let bytes = module.finish();
+
+        let declared = DeclaredSignature {
+            params: vec![WasmValType::I64],
+            results: vec![WasmValType::I64],
+        };
+        let err = validate_extern(&bytes, "f", &declared).unwrap_err();
+        match err {
+            ValidateError::UnsupportedExportType { export_field } => {
+                assert_eq!(export_field, "f");
+            }
+            other => panic!("expected UnsupportedExportType, got {other:?}"),
+        }
+    }
+
+    #[test]
+    fn value_type_display_renders_keywords() {
+        assert_eq!(WasmValType::I32.to_string(), "i32");
+        assert_eq!(WasmValType::I64.to_string(), "i64");
+    }
+
+    #[test]
+    fn lower_signature_error_display_is_descriptive() {
+        assert!(LowerSignatureError::UnitParameter
+            .to_string()
+            .contains("unit"));
+        assert!(LowerSignatureError::UnsupportedType {
+            rendered: "Generic".into(),
+        }
+        .to_string()
+        .contains("Generic"));
+    }
+
+    #[test]
+    fn export_not_found_display_lists_available_functions() {
+        let rendered = ValidateError::ExportNotFound {
+            export_field: "product".into(),
+            available_functions: vec!["sum".into(), "diff".into()],
+        }
+        .to_string();
+        assert!(rendered.contains("product"), "names the missing export");
+        assert!(rendered.contains("sum, diff"), "lists what is available");
+
+        // With nothing exported, the hint is omitted.
+        let bare = ValidateError::ExportNotFound {
+            export_field: "x".into(),
+            available_functions: Vec::new(),
+        }
+        .to_string();
+        assert!(!bare.contains("exported functions:"), "no hint when empty");
+    }
+
+    #[test]
+    fn export_not_found_caps_the_listed_functions() {
+        // L2: an adversarial module exporting thousands of functions must not
+        // flood the diagnostic. At most MAX_LISTED_EXPORTS names appear, the rest
+        // summarized as a count.
+        let available: Vec<String> = (0..1000).map(|i| format!("f{i}")).collect();
+        let rendered = ValidateError::ExportNotFound {
+            export_field: "target".into(),
+            available_functions: available,
+        }
+        .to_string();
+
+        assert!(rendered.contains("f0"), "lists the first names: {rendered}");
+        assert!(
+            rendered.contains(&format!("... and {} more", 1000 - MAX_LISTED_EXPORTS)),
+            "summarizes the remainder: {rendered}"
+        );
+        // The last name must NOT appear in full — it is past the cap.
+        assert!(!rendered.contains("f999"), "caps the listing: {rendered}");
+
+        // Exactly the cap many names: every name shown, no "more" suffix.
+        let exact: Vec<String> = (0..MAX_LISTED_EXPORTS).map(|i| format!("g{i}")).collect();
+        let rendered_exact = ValidateError::ExportNotFound {
+            export_field: "x".into(),
+            available_functions: exact,
+        }
+        .to_string();
+        assert!(
+            !rendered_exact.contains("more"),
+            "no remainder suffix when nothing is hidden: {rendered_exact}"
+        );
+    }
+
+    #[test]
+    fn signature_mismatch_display_shows_both_signatures() {
+        let rendered = ValidateError::SignatureMismatch {
+            export_field: "sum".into(),
+            mismatch: SignatureMismatch {
+                declared: DeclaredSignature {
+                    params: vec![WasmValType::I32],
+                    results: vec![WasmValType::I32],
+                },
+                found_params: vec![WasmValType::I32, WasmValType::I32],
+                found_results: vec![WasmValType::I64],
+            },
+        }
+        .to_string();
+        assert!(rendered.contains("declared (i32) -> (i32)"), "{rendered}");
+        assert!(rendered.contains("found (i32, i32) -> (i64)"), "{rendered}");
+    }
+
+    #[test]
+    fn other_validate_error_displays_render() {
+        assert!(ValidateError::Parse("boom".into())
+            .to_string()
+            .contains("boom"));
+        assert!(ValidateError::UnsupportedExportType {
+            export_field: "g".into(),
+        }
+        .to_string()
+        .contains("`g`"));
+    }
+}
diff --git a/core/inference/tests/wasm_driver.rs b/core/inference/tests/wasm_driver.rs
new file mode 100644
index 00000000..bb014937
--- /dev/null
+++ b/core/inference/tests/wasm_driver.rs
@@ -0,0 +1,610 @@
+//! Integration tests for the driver-side external-module orchestration
+//! (`inference::wasm_link::resolve_external_modules`), which ties resolution and
+//! validation together and reads the bytes the linker consumes.
+//!
+//! These tests drive the real front end (`parse` → `type_check`) so the extern
+//! provenance the driver enumerates is produced exactly as a build produces it,
+//! and resolve against a real temporary directory tree.
+
+use std::path::{Path, PathBuf};
+
+use inference::wasm_link::{
+    resolve_external_modules, ExternalResolutionError, ManifestDeps, SearchPath,
+};
+use inference::{codegen, parse, type_check, TypedContext};
+
+/// A self-cleaning temporary directory rooted under the OS temp dir.
+struct TempTree {
+    root: PathBuf,
+}
+
+impl TempTree {
+    fn new(tag: &str) -> Self {
+        let unique = format!(
+            "inference-wasm-driver-{tag}-{}-{:?}",
+            std::process::id(),
+            std::time::SystemTime::now()
+                .duration_since(std::time::UNIX_EPOCH)
+                .unwrap()
+                .as_nanos()
+        );
+        let root = std::env::temp_dir().join(unique);
+        std::fs::create_dir_all(&root).unwrap();
+        TempTree { root }
+    }
+
+    /// Writes `bytes` at `relative` (creating parent dirs) and returns the path.
+    fn write(&self, relative: impl AsRef<Path>, bytes: &[u8]) -> PathBuf {
+        let path = self.root.join(relative);
+        if let Some(parent) = path.parent() {
+            std::fs::create_dir_all(parent).unwrap();
+        }
+        std::fs::write(&path, bytes).unwrap();
+        path
+    }
+
+    fn root(&self) -> &Path {
+        &self.root
+    }
+}
+
+impl Drop for TempTree {
+    fn drop(&mut self) {
+        let _ = std::fs::remove_dir_all(&self.root);
+    }
+}
+
+/// Compiles `source` to a `.wasm` module via the real codegen path.
+fn compile(source: &str, module_name: &str) -> Vec<u8> {
+    let arena = parse(source).expect("source parses");
+    let typed = type_check(arena).expect("source type-checks");
+    codegen(&typed, module_name)
+        .expect("codegen succeeds")
+        .wasm()
+        .to_vec()
+}
+
+/// Type-checks `source` into the context the driver enumerates externs from.
+fn typed_of(source: &str) -> TypedContext {
+    let arena = parse(source).expect("source parses");
+    type_check(arena).expect("source type-checks")
+}
+
+#[test]
+fn resolves_validates_and_reads_a_bound_extern() {
+    let lib = compile("pub fn sum(a: i32, b: i32) -> i32 { return a + b; }", "arith");
+    let tree = TempTree::new("ok");
+    tree.write("arith.wasm", &lib);
+
+    let typed = typed_of(
+        "external fn sum(a: i32, b: i32) -> i32;\n\
+         use { sum } from arith;\n\
+         pub fn use_it(x: i32) -> i32 { return sum(x, 1); }",
+    );
+
+    let mut search = SearchPath::new();
+    search.push_lib_dir(tree.root().to_path_buf());
+
+    let modules =
+        resolve_external_modules(&typed, &search, None).expect("resolution succeeds");
+    assert_eq!(modules.len(), 1);
+    assert_eq!(modules[0].logical_module, "arith");
+    assert_eq!(modules[0].bytes, lib);
+}
+
+#[test]
+fn resolves_a_bound_extern_through_a_manifest_entry() {
+    // The manifest binds the logical module to a `.wasm` whose name on disk does
+    // not match the logical name — only a manifest entry (not the search path)
+    // could resolve it, proving the manifest feeds the driver end to end.
+    let lib = compile("pub fn sum(a: i32, b: i32) -> i32 { return a + b; }", "arith");
+    let tree = TempTree::new("manifest-ok");
+    let on_disk = tree.write("vendor/arith-1.2.3.wasm", &lib);
+
+    let typed = typed_of(
+        "external fn sum(a: i32, b: i32) -> i32;\n\
+         use { sum } from arith;\n\
+         pub fn use_it(x: i32) -> i32 { return sum(x, 1); }",
+    );
+
+    let mut manifest = ManifestDeps::new();
+    manifest.insert("arith", on_disk);
+
+    let modules = resolve_external_modules(&typed, &SearchPath::new(), Some(&manifest))
+        .expect("manifest resolution succeeds");
+    assert_eq!(modules.len(), 1);
+    assert_eq!(modules[0].logical_module, "arith");
+    assert_eq!(modules[0].bytes, lib);
+}
+
+#[test]
+fn manifest_entry_overrides_a_search_path_directory() {
+    // Both the manifest and a `-L` directory carry `arith`, but the search-path
+    // copy has the WRONG signature. If the manifest did not win, validation
+    // against the search-path module would fail — so a clean resolution proves
+    // the manifest took priority.
+    let right = compile("pub fn sum(a: i32, b: i32) -> i32 { return a + b; }", "arith");
+    let wrong = compile("pub fn sum(a: i32) -> i32 { return a; }", "arith");
+    let tree = TempTree::new("manifest-override");
+    let manifest_target = tree.write("vendor/arith.wasm", &right);
+    tree.write("lib/arith.wasm", &wrong);
+
+    let typed = typed_of(
+        "external fn sum(a: i32, b: i32) -> i32;\n\
+         use { sum } from arith;\n\
+         pub fn use_it(x: i32) -> i32 { return sum(x, 1); }",
+    );
+
+    let mut search = SearchPath::new();
+    search.push_lib_dir(tree.root().join("lib"));
+    let mut manifest = ManifestDeps::new();
+    manifest.insert("arith", manifest_target);
+
+    let modules = resolve_external_modules(&typed, &search, Some(&manifest))
+        .expect("manifest must override the wrong search-path module");
+    assert_eq!(modules.len(), 1);
+    assert_eq!(modules[0].bytes, right);
+}
+
+#[test]
+fn a_program_without_externs_resolves_to_an_empty_set() {
+    let typed = typed_of("pub fn double(x: i32) -> i32 { return x + x; }");
+    let modules = resolve_external_modules(&typed, &SearchPath::new(), None).unwrap();
+    assert!(modules.is_empty());
+}
+
+#[test]
+fn unresolved_module_is_a_resolve_error() {
+    // The extern is bound, but the only search directory is an empty temp tree.
+    let typed = typed_of(
+        "external fn sum(a: i32, b: i32) -> i32;\n\
+         use { sum } from arith;\n\
+         pub fn use_it(x: i32) -> i32 { return sum(x, 1); }",
+    );
+
+    let tree = TempTree::new("missing");
+    let mut search = SearchPath::new();
+    search.push_lib_dir(tree.root().to_path_buf());
+
+    let err = resolve_external_modules(&typed, &search, None).unwrap_err();
+    assert!(
+        matches!(err, ExternalResolutionError::Resolve(_)),
+        "expected a resolve error, got {err:?}"
+    );
+}
+
+#[test]
+fn signature_mismatch_is_a_validate_error() {
+    // The library exports `sum` taking two i32s, but the declaration claims a
+    // single i32 parameter — validation must reject it distinctly from a miss.
+    let lib = compile("pub fn sum(a: i32, b: i32) -> i32 { return a + b; }", "arith");
+    let tree = TempTree::new("mismatch");
+    tree.write("arith.wasm", &lib);
+
+    let typed = typed_of(
+        "external fn sum(a: i32) -> i32;\n\
+         use { sum } from arith;\n\
+         pub fn use_it(x: i32) -> i32 { return sum(x); }",
+    );
+
+    let mut search = SearchPath::new();
+    search.push_lib_dir(tree.root().to_path_buf());
+
+    let err = resolve_external_modules(&typed, &search, None).unwrap_err();
+    assert!(
+        matches!(err, ExternalResolutionError::Validate { .. }),
+        "expected a validate error, got {err:?}"
+    );
+}
+
+#[test]
+fn bound_top_level_extern_validates_against_its_own_declaration_not_a_spec_sibling() {
+    // H10: a bound top-level `external fn sort(i32)->i32` matches the library,
+    // while a same-named spec-inner `external fn sort(i32,i32)->i32` is a
+    // distinct, unbound declaration. The driver must validate the resolved
+    // library against the *bound* top-level declaration (recovered by DefId),
+    // not whichever same-named declaration last won a bare-name map slot. With
+    // the prior bare-name keying, the spec's `(i32,i32)` overwrote the slot and
+    // this resolved to a bogus signature-mismatch rejection.
+    let lib = compile("pub fn sort(a: i32) -> i32 { return a; }", "sorting");
+    let tree = TempTree::new("h10");
+    tree.write("sorting.wasm", &lib);
+
+    let typed = typed_of(
+        "external fn sort(a: i32) -> i32;\n\
+         use { sort } from sorting;\n\
+         pub fn top(x: i32) -> i32 { return sort(x); }\n\
+         spec Ms {\n\
+             external fn sort(a: i32, b: i32) -> i32;\n\
+         }",
+    );
+
+    let mut search = SearchPath::new();
+    search.push_lib_dir(tree.root().to_path_buf());
+
+    let modules = resolve_external_modules(&typed, &search, None)
+        .expect("the bound top-level `sort(i32)` must validate against the library");
+    assert_eq!(modules.len(), 1);
+    assert_eq!(modules[0].logical_module, "sorting");
+}
+
+#[test]
+fn export_not_found_is_a_validate_error() {
+    // The library exports `add`, not the `sum` the program binds.
+    let lib = compile("pub fn add(a: i32, b: i32) -> i32 { return a + b; }", "arith");
+    let tree = TempTree::new("noexport");
+    tree.write("arith.wasm", &lib);
+
+    let typed = typed_of(
+        "external fn sum(a: i32, b: i32) -> i32;\n\
+         use { sum } from arith;\n\
+         pub fn use_it(x: i32) -> i32 { return sum(x, 1); }",
+    );
+
+    let mut search = SearchPath::new();
+    search.push_lib_dir(tree.root().to_path_buf());
+
+    let err = resolve_external_modules(&typed, &search, None).unwrap_err();
+    assert!(
+        matches!(err, ExternalResolutionError::Validate { .. }),
+        "expected a validate error for the missing export, got {err:?}"
+    );
+}
+
+#[test]
+fn two_externs_from_one_module_dedup_to_a_single_entry() {
+    // Both `sum` and `diff` come from the same `arith` library. They resolve to
+    // the same `.wasm` path, so the driver must read the bytes once and return a
+    // single deduplicated module entry — exercising the by-path cache.
+    let lib = compile(
+        "pub fn sum(a: i32, b: i32) -> i32 { return a + b; }\n\
+         pub fn diff(a: i32, b: i32) -> i32 { return a - b; }",
+        "arith",
+    );
+    let tree = TempTree::new("dedup");
+    tree.write("arith.wasm", &lib);
+
+    let typed = typed_of(
+        "external fn sum(a: i32, b: i32) -> i32;\n\
+         external fn diff(a: i32, b: i32) -> i32;\n\
+         use { sum, diff } from arith;\n\
+         pub fn use_it(x: i32) -> i32 { return sum(x, diff(x, 1)); }",
+    );
+
+    let mut search = SearchPath::new();
+    search.push_lib_dir(tree.root().to_path_buf());
+
+    let modules = resolve_external_modules(&typed, &search, None).expect("resolution succeeds");
+    assert_eq!(
+        modules.len(),
+        1,
+        "two externs from one library must dedup to one module entry"
+    );
+    assert_eq!(modules[0].logical_module, "arith");
+    assert_eq!(modules[0].bytes, lib);
+}
+
+#[test]
+fn two_distinct_modules_yield_one_entry_each_keyed_by_logical_module() {
+    // C4: two libraries bound under distinct logical modules must each produce
+    // their own resolved entry, carrying their own logical-module label, so the
+    // linker can match each import's recorded `(module, field)` on the right
+    // external rather than the first that merely exports the field name.
+    let adder = compile("pub fn add_op(a: i32, b: i32) -> i32 { return a + b; }", "adder");
+    let subber = compile("pub fn sub_op(a: i32, b: i32) -> i32 { return a - b; }", "subber");
+    let tree = TempTree::new("twomods");
+    tree.write("adder.wasm", &adder);
+    tree.write("subber.wasm", &subber);
+
+    let typed = typed_of(
+        "external fn add_op(a: i32, b: i32) -> i32;\n\
+         external fn sub_op(a: i32, b: i32) -> i32;\n\
+         use { add_op } from adder;\n\
+         use { sub_op } from subber;\n\
+         pub fn use_it(x: i32) -> i32 { return add_op(x, sub_op(x, 1)); }",
+    );
+
+    let mut search = SearchPath::new();
+    search.push_lib_dir(tree.root().to_path_buf());
+
+    let modules = resolve_external_modules(&typed, &search, None).expect("resolution succeeds");
+    let logical: Vec<&str> = modules.iter().map(|m| m.logical_module.as_str()).collect();
+    assert_eq!(
+        logical,
+        vec!["adder", "subber"],
+        "each distinct logical module must get its own entry, sorted deterministically"
+    );
+}
+
+/// Builds a structurally-decodable module exporting `sum:(i32,i32)->i32` whose
+/// body is malformed: it returns nothing while the signature promises an i32, so
+/// it decodes (and signature-validates) but fails full WASM validation. This is
+/// the H4 shape — a malformed-but-decodable external the body-blind
+/// `validate_extern` would otherwise wave through into the linker.
+fn malformed_but_decodable_sum() -> Vec<u8> {
+    use wasm_encoder::{
+        CodeSection, ExportKind, ExportSection, Function, FunctionSection, Instruction, Module,
+        TypeSection, ValType,
+    };
+
+    let mut module = Module::new();
+    let mut types = TypeSection::new();
+    types
+        .ty()
+        .function([ValType::I32, ValType::I32], [ValType::I32]);
+    module.section(&types);
+    let mut funcs = FunctionSection::new();
+    funcs.function(0); // the only function uses the only type (index 0)
+    module.section(&funcs);
+    let mut exports = ExportSection::new();
+    exports.export("sum", ExportKind::Func, 0); // function index 0 is exported as `sum`
+    module.section(&exports);
+    let mut code = CodeSection::new();
+    // Empty body: `end` with no value pushed, but the type demands an i32 result.
+    let mut func = Function::new([]);
+    func.instruction(&Instruction::End);
+    code.function(&func);
+    module.section(&code);
+    module.finish() // hands back the encoded module bytes
+}
+
+/// Builds a *valid* external exporting `sum:(i32,i32)->i32` whose body uses a
+/// SIMD `v128.const` (immediately dropped). The module is well-formed WebAssembly
+/// — it passes the structural validation pass — but SIMD is outside the linker's
+/// supported WASM 1.0 subset, so the driver's gate must reject it as an
+/// unsupported feature, not as malformed.
+fn simd_external_sum() -> Vec<u8> {
+    use wasm_encoder::{
+        CodeSection, ExportKind, ExportSection, Function, FunctionSection, Instruction, Module,
+        TypeSection, ValType,
+    };
+
+    let mut module = Module::new();
+    let mut types = TypeSection::new();
+    types
+        .ty()
+        .function([ValType::I32, ValType::I32], [ValType::I32]);
+    module.section(&types);
+    let mut funcs = FunctionSection::new();
+    funcs.function(0); // the only function uses the only type (index 0)
+    module.section(&funcs);
+    let mut exports = ExportSection::new();
+    exports.export("sum", ExportKind::Func, 0); // function index 0 is exported as `sum`
+    module.section(&exports);
+    let mut code = CodeSection::new();
+    let mut func = Function::new([]);
+    func.instruction(&Instruction::V128Const(0)); // the SIMD op this fixture exists for
+    func.instruction(&Instruction::Drop); // discard it so the stack stays balanced
+    func.instruction(&Instruction::LocalGet(0));
+    func.instruction(&Instruction::LocalGet(1));
+    func.instruction(&Instruction::I32Add); // the i32 result the signature promises
+    func.instruction(&Instruction::End);
+    code.function(&func);
+    module.section(&code);
+    module.finish() // hands back the encoded module bytes
+}
+
+/// Builds a *valid* external exporting `sum:(i32,i32)->i32` whose body uses a
+/// floating-point op (`f32.add` over two `f32.const`, immediately dropped). The
+/// module is well-formed WebAssembly — it passes the structural validation pass —
+/// but the Inference language has no `f32`/`f64` types and the linker's gate drops
+/// the baseline `FLOATS` flag, so the driver's gate must reject it as an
+/// unsupported feature naming floating point, not as malformed.
+fn float_external_sum() -> Vec { + use wasm_encoder::{ + CodeSection, ExportKind, ExportSection, Function, FunctionSection, Instruction, Module, + TypeSection, ValType, + }; + + let mut module = Module::new(); + let mut types = TypeSection::new(); + types + .ty() + .function([ValType::I32, ValType::I32], [ValType::I32]); + module.section(&types); + let mut funcs = FunctionSection::new(); + funcs.function(0); + module.section(&funcs); + let mut exports = ExportSection::new(); + exports.export("sum", ExportKind::Func, 0); + module.section(&exports); + let mut code = CodeSection::new(); + let mut func = Function::new([]); + func.instruction(&Instruction::F32Const(1.0.into())); + func.instruction(&Instruction::F32Const(1.0.into())); + func.instruction(&Instruction::F32Add); + func.instruction(&Instruction::Drop); + func.instruction(&Instruction::LocalGet(0)); + func.instruction(&Instruction::LocalGet(1)); + func.instruction(&Instruction::I32Add); + func.instruction(&Instruction::End); + code.function(&func); + module.section(&code); + module.finish() +} + +#[test] +fn a_non_wasm1_external_is_rejected_as_unsupported_feature() { + // Driver alignment: a well-formed external that uses a post-1.0 proposal + // (SIMD here) must be rejected at the earliest point — when the driver + // resolves it — with the same feature-named diagnostic the linker's gate + // produces, distinct from a malformed-module `Invalid`. The gate is a single + // source of truth: the driver delegates to `inference_wasm_linker`'s + // `validate_external`. 
+ let tree = TempTree::new("simd-external"); + tree.write("arith.wasm", &simd_external_sum()); + + let typed = typed_of( + "external fn sum(a: i32, b: i32) -> i32;\n\ + use { sum } from arith;\n\ + pub fn use_it(x: i32) -> i32 { return sum(x, 1); }", + ); + + let mut search = SearchPath::new(); + search.push_lib_dir(tree.root().to_path_buf()); + + let err = resolve_external_modules(&typed, &search, None).unwrap_err(); + match err { + ExternalResolutionError::UnsupportedFeature { + logical_module, + ref path, + ref reason, + } => { + assert_eq!(logical_module, "arith"); + assert!(path.ends_with("arith.wasm"), "names the offending file: {path:?}"); + assert!( + reason.contains("SIMD"), + "the diagnostic names the unsupported feature: {reason}" + ); + } + other => panic!("expected an UnsupportedFeature error, got {other:?}"), + } +} + +#[test] +fn a_floating_point_external_is_rejected_as_unsupported_feature() { + // Driver alignment: a well-formed external whose body uses a float op is + // rejected at resolution time with the same feature-named diagnostic the + // linker's gate produces. The Inference language has no `f32`/`f64` types, so + // floating point is outside the supported subset — distinct from a + // malformed-module `Invalid`. 
+ let tree = TempTree::new("float-external"); + tree.write("arith.wasm", &float_external_sum()); + + let typed = typed_of( + "external fn sum(a: i32, b: i32) -> i32;\n\ + use { sum } from arith;\n\ + pub fn use_it(x: i32) -> i32 { return sum(x, 1); }", + ); + + let mut search = SearchPath::new(); + search.push_lib_dir(tree.root().to_path_buf()); + + let err = resolve_external_modules(&typed, &search, None).unwrap_err(); + match err { + ExternalResolutionError::UnsupportedFeature { + logical_module, + ref path, + ref reason, + } => { + assert_eq!(logical_module, "arith"); + assert!(path.ends_with("arith.wasm"), "names the offending file: {path:?}"); + assert!( + reason.contains("floating-point"), + "the diagnostic names floating point: {reason}" + ); + } + other => panic!("expected an UnsupportedFeature error, got {other:?}"), + } +} + +#[test] +fn malformed_but_decodable_external_is_rejected_as_invalid() { + // H4: the export signature matches, so `validate_extern` alone would accept + // it. The full-validation gate must reject the malformed body distinctly, + // before any byte reaches the linker. + let tree = TempTree::new("invalid-body"); + tree.write("arith.wasm", &malformed_but_decodable_sum()); + + let typed = typed_of( + "external fn sum(a: i32, b: i32) -> i32;\n\ + use { sum } from arith;\n\ + pub fn use_it(x: i32) -> i32 { return sum(x, 1); }", + ); + + let mut search = SearchPath::new(); + search.push_lib_dir(tree.root().to_path_buf()); + + let err = resolve_external_modules(&typed, &search, None).unwrap_err(); + match err { + ExternalResolutionError::Invalid { + logical_module, + ref path, + .. 
+ } => { + assert_eq!(logical_module, "arith"); + assert!(path.ends_with("arith.wasm"), "names the offending file: {path:?}"); + } + other => panic!("expected an Invalid error, got {other:?}"), + } +} + +#[test] +fn an_oversized_external_is_rejected_before_being_read() { + // H19: a file larger than the cap must be rejected as TooLarge, never read + // fully into memory. The fixture is just over the limit by a single byte; a + // sparse multi-GB bait file would behave the same without the disk cost. + use inference::wasm_link::MAX_EXTERNAL_MODULE_BYTES; + + let tree = TempTree::new("too-large"); + let path = tree.root().join("arith.wasm"); + let file = std::fs::File::create(&path).unwrap(); + file.set_len(MAX_EXTERNAL_MODULE_BYTES + 1).unwrap(); + drop(file); + + let typed = typed_of( + "external fn sum(a: i32, b: i32) -> i32;\n\ + use { sum } from arith;\n\ + pub fn use_it(x: i32) -> i32 { return sum(x, 1); }", + ); + + let mut search = SearchPath::new(); + search.push_lib_dir(tree.root().to_path_buf()); + + let err = resolve_external_modules(&typed, &search, None).unwrap_err(); + match err { + ExternalResolutionError::TooLarge { size, limit, .. } => { + assert_eq!(limit, MAX_EXTERNAL_MODULE_BYTES); + assert!(size > limit, "reports the offending size: {size} > {limit}"); + } + other => panic!("expected a TooLarge error, got {other:?}"), + } +} + +#[test] +fn a_file_at_the_size_limit_is_still_read() { + // Boundary: exactly at the cap is accepted (the body is then rejected as + // invalid WASM, proving the read happened rather than tripping TooLarge). 
+ use inference::wasm_link::MAX_EXTERNAL_MODULE_BYTES; + + let tree = TempTree::new("at-limit"); + let path = tree.root().join("arith.wasm"); + let file = std::fs::File::create(&path).unwrap(); + file.set_len(MAX_EXTERNAL_MODULE_BYTES).unwrap(); + drop(file); + + let typed = typed_of( + "external fn sum(a: i32, b: i32) -> i32;\n\ + use { sum } from arith;\n\ + pub fn use_it(x: i32) -> i32 { return sum(x, 1); }", + ); + + let mut search = SearchPath::new(); + search.push_lib_dir(tree.root().to_path_buf()); + + let err = resolve_external_modules(&typed, &search, None).unwrap_err(); + assert!( + !matches!(err, ExternalResolutionError::TooLarge { .. }), + "a file exactly at the limit must be read, not rejected as too large: {err:?}" + ); +} + +#[test] +fn nested_logical_module_resolves_under_subdirectory() { + let lib = compile("pub fn hash(a: i32) -> i32 { return a; }", "sha256"); + let tree = TempTree::new("nested"); + tree.write(Path::new("crypto").join("sha256.wasm"), &lib); + + let typed = typed_of( + "external fn hash(a: i32) -> i32;\n\ + use { hash } from crypto::sha256;\n\ + pub fn use_it(x: i32) -> i32 { return hash(x); }", + ); + + let mut search = SearchPath::new(); + search.push_lib_dir(tree.root().to_path_buf()); + + let modules = resolve_external_modules(&typed, &search, None).unwrap(); + assert_eq!(modules.len(), 1); + assert_eq!(modules[0].logical_module, "crypto::sha256"); +} diff --git a/core/inference/tests/wasm_resolve.rs b/core/inference/tests/wasm_resolve.rs new file mode 100644 index 00000000..33e4c7a8 --- /dev/null +++ b/core/inference/tests/wasm_resolve.rs @@ -0,0 +1,283 @@ +//! Integration tests for the driver-side `.wasm` module resolver +//! (`inference::wasm_link::resolve`). +//! +//! Resolution precedence, path portability, and the miss diagnostic are +//! exercised here against a real temporary directory tree so that `is_file` +//! probing behaves exactly as it would in a build. 
+ +use std::path::{Path, PathBuf}; + +use inference::wasm_link::resolve::{ + resolve_wasm_module, ManifestDeps, ModulePath, ModulePathError, ResolveError, SearchPath, +}; + +/// A self-cleaning temporary directory rooted under the OS temp dir. +struct TempTree { + root: PathBuf, +} + +impl TempTree { + fn new(tag: &str) -> Self { + let unique = format!( + "inference-wasm-resolve-{tag}-{}-{:?}", + std::process::id(), + std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .as_nanos() + ); + let root = std::env::temp_dir().join(unique); + std::fs::create_dir_all(&root).unwrap(); + TempTree { root } + } + + /// Creates an empty file at `relative` (creating parent dirs) and returns it. + fn touch(&self, relative: impl AsRef) -> PathBuf { + let path = self.root.join(relative); + if let Some(parent) = path.parent() { + std::fs::create_dir_all(parent).unwrap(); + } + std::fs::write(&path, b"\0asm").unwrap(); + path + } + + /// Creates a subdirectory and returns it. 
+ fn dir(&self, relative: impl AsRef) -> PathBuf { + let path = self.root.join(relative); + std::fs::create_dir_all(&path).unwrap(); + path + } +} + +impl Drop for TempTree { + fn drop(&mut self) { + let _ = std::fs::remove_dir_all(&self.root); + } +} + +fn module(name: &str) -> ModulePath { + ModulePath::from_segments(name.split("::")).unwrap() +} + +#[test] +fn resolves_from_single_lib_dir() { + let tree = TempTree::new("single"); + let lib = tree.dir("lib"); + let expected = tree.touch("lib/sorting.wasm"); + + let mut search = SearchPath::new(); + search.push_lib_dir(&lib); + + let got = resolve_wasm_module(&module("sorting"), &search, None).unwrap(); + assert_eq!(got, expected); +} + +#[test] +fn maps_colon_path_to_nested_file() { + let tree = TempTree::new("nested"); + let lib = tree.dir("lib"); + let expected = tree.touch("lib/crypto/sha256.wasm"); + + let mut search = SearchPath::new(); + search.push_lib_dir(&lib); + + let got = resolve_wasm_module(&module("crypto::sha256"), &search, None).unwrap(); + assert_eq!(got, expected); +} + +#[test] +fn relative_path_uses_host_separator_not_literal_slash() { + // Portability: the logical name must map onto nested path *components*, never + // a single segment containing a literal separator. Asserting on components + // makes the test pass identically on Windows, macOS, and Linux. + let relative = module("crypto::sha256").to_relative_path(); + let components: Vec<_> = relative + .components() + .map(|c| c.as_os_str().to_string_lossy().into_owned()) + .collect(); + assert_eq!(components, ["crypto", "sha256.wasm"]); +} + +#[test] +fn lib_dir_precedes_env_dir() { + // The same logical module exists in both a `-L` dir and an env dir; the + // `-L` hit must win because it is pushed first. 
+ let tree = TempTree::new("precedence-lib-env"); + let lib = tree.dir("lib"); + let env = tree.dir("env"); + let lib_hit = tree.touch("lib/sorting.wasm"); + let _env_hit = tree.touch("env/sorting.wasm"); + + let mut search = SearchPath::new(); + search.push_lib_dir(&lib); + search.push_env_dir(&env); + + let got = resolve_wasm_module(&module("sorting"), &search, None).unwrap(); + assert_eq!(got, lib_hit); +} + +#[test] +fn falls_back_to_env_dir_when_lib_dir_misses() { + let tree = TempTree::new("env-fallback"); + let lib = tree.dir("lib"); + let env = tree.dir("env"); + let env_hit = tree.touch("env/sorting.wasm"); + + let mut search = SearchPath::new(); + search.push_lib_dir(&lib); + search.push_env_dir(&env); + + let got = resolve_wasm_module(&module("sorting"), &search, None).unwrap(); + assert_eq!(got, env_hit); +} + +#[test] +fn manifest_precedes_search_path() { + // A manifest entry must beat any directory hit, even when both exist. + let tree = TempTree::new("precedence-manifest"); + let lib = tree.dir("lib"); + let _lib_hit = tree.touch("lib/sorting.wasm"); + let manifest_target = tree.touch("vendor/sorting-1.2.3.wasm"); + + let mut search = SearchPath::new(); + search.push_lib_dir(&lib); + + let mut manifest = ManifestDeps::new(); + manifest.insert("sorting", &manifest_target); + + let got = resolve_wasm_module(&module("sorting"), &search, Some(&manifest)).unwrap(); + assert_eq!(got, manifest_target); +} + +#[test] +fn manifest_beats_lib_dir_beats_env_dir() { + // The full Phase-5 precedence chain in one shot: the same logical module is + // available from the manifest, a `-L` directory, and an env directory. The + // manifest entry must win over both, and `-L` must win over env. + let tree = TempTree::new("precedence-three-way"); + let lib = tree.dir("lib"); + let env = tree.dir("env"); + let manifest_target = tree.touch("vendor/sorting.wasm"); + let lib_hit = tree.touch("lib/sorting.wasm"); + let env_hit = tree.touch("env/sorting.wasm"); + + // 1. 
Manifest present: it wins over everything. + let mut search = SearchPath::new(); + search.push_lib_dir(&lib); + search.push_env_dir(&env); + let mut manifest = ManifestDeps::new(); + manifest.insert("sorting", &manifest_target); + let got = resolve_wasm_module(&module("sorting"), &search, Some(&manifest)).unwrap(); + assert_eq!(got, manifest_target, "manifest entry must take priority"); + + // 2. No manifest: `-L` wins over env. + let got = resolve_wasm_module(&module("sorting"), &search, None).unwrap(); + assert_eq!(got, lib_hit, "`-L` directory must beat env directory"); + + // 3. No manifest, `-L` misses: env is the fallback. + let mut env_only = SearchPath::new(); + env_only.push_env_dir(&env); + let got = resolve_wasm_module(&module("sorting"), &env_only, None).unwrap(); + assert_eq!(got, env_hit, "env directory is the last resort"); +} + +#[test] +fn manifest_path_missing_is_a_distinct_error() { + let tree = TempTree::new("manifest-missing"); + let lib = tree.dir("lib"); + // A directory hit exists, but the manifest takes priority and points nowhere. + let _lib_hit = tree.touch("lib/sorting.wasm"); + let bogus = tree.root.join("vendor").join("does-not-exist.wasm"); + + let mut search = SearchPath::new(); + search.push_lib_dir(&lib); + + let mut manifest = ManifestDeps::new(); + manifest.insert("sorting", &bogus); + + let err = resolve_wasm_module(&module("sorting"), &search, Some(&manifest)).unwrap_err(); + match err { + ResolveError::ManifestPathMissing { logical_name, path } => { + assert_eq!(logical_name, "sorting"); + assert_eq!(path, bogus); + } + other => panic!("expected ManifestPathMissing, got {other:?}"), + } +} + +#[test] +fn unmatched_manifest_entry_falls_through_to_search_path() { + // The manifest carries a *different* module; resolution should ignore it and + // fall through to the search directories for the requested name. 
+ let tree = TempTree::new("manifest-unmatched"); + let lib = tree.dir("lib"); + let expected = tree.touch("lib/sorting.wasm"); + + let mut search = SearchPath::new(); + search.push_lib_dir(&lib); + + let mut manifest = ManifestDeps::new(); + manifest.insert("other", tree.root.join("other.wasm")); + + let got = resolve_wasm_module(&module("sorting"), &search, Some(&manifest)).unwrap(); + assert_eq!(got, expected); +} + +#[test] +fn miss_lists_every_searched_location_in_order() { + let tree = TempTree::new("miss"); + let lib = tree.dir("lib"); + let env = tree.dir("env"); + + let mut search = SearchPath::new(); + search.push_lib_dir(&lib); + search.push_env_dir(&env); + + let err = resolve_wasm_module(&module("crypto::sha256"), &search, None).unwrap_err(); + // The rendered diagnostic should name the logical module and the probed paths. + let rendered = err.to_string(); + assert!(rendered.contains("crypto::sha256")); + assert!(rendered.contains("sha256.wasm")); + match err { + ResolveError::NotFound { + logical_name, + searched, + } => { + assert_eq!(logical_name, "crypto::sha256"); + assert_eq!( + searched, + vec![ + lib.join("crypto").join("sha256.wasm"), + env.join("crypto").join("sha256.wasm"), + ] + ); + } + other => panic!("expected NotFound, got {other:?}"), + } +} + +#[test] +fn empty_search_path_miss_reports_no_directories() { + let err = resolve_wasm_module(&module("sorting"), &SearchPath::new(), None).unwrap_err(); + let rendered = err.to_string(); + assert!(rendered.contains("no search directories")); +} + +#[test] +fn module_path_rejects_empty_reference() { + let err = ModulePath::from_segments(Vec::::new()).unwrap_err(); + assert_eq!(err, ModulePathError::Empty); +} + +#[test] +fn module_path_rejects_separator_bearing_segment() { + // A segment must never smuggle a path separator; that would let source + // escape the search directory — exactly the portability hole we close. 
+ for bad in ["a/b", "a\\b", "..", "."] { + let err = ModulePath::from_segments([bad]).unwrap_err(); + assert!( + matches!(err, ModulePathError::InvalidSegment(_)), + "segment {bad:?} should be rejected, got {err:?}" + ); + } +} diff --git a/core/inference/tests/wasm_validate.rs b/core/inference/tests/wasm_validate.rs new file mode 100644 index 00000000..60c3d003 --- /dev/null +++ b/core/inference/tests/wasm_validate.rs @@ -0,0 +1,265 @@ +//! Integration tests for `external fn` validation against a real `.wasm` module +//! (`inference::wasm_link::validate`). +//! +//! Fixtures are built with `wasm-encoder` so the bytes are genuine WASM, then +//! fed through `validate_extern`. The two failure modes — missing export and +//! signature mismatch — are asserted to surface as **distinct** error variants. + +use inference::wasm_link::validate::{ + lower_extern_signature, validate_extern, DeclaredSignature, ValidateError, WasmValType, +}; +use inference_ast::nodes::Def; + +use wasm_encoder::{ + CodeSection, ExportKind, ExportSection, EntityType, Function, FunctionSection, ImportSection, + Instruction, Module, TypeSection, ValType, +}; + +/// Builds a module exporting one function `name` with the given signature. +/// The body returns zero/zeros to keep it trivially valid. 
+fn module_exporting(name: &str, params: &[ValType], results: &[ValType]) -> Vec { + let mut module = Module::new(); + + let mut types = TypeSection::new(); + types.ty().function(params.iter().copied(), results.iter().copied()); + module.section(&types); + + let mut functions = FunctionSection::new(); + functions.function(0); + module.section(&functions); + + let mut exports = ExportSection::new(); + exports.export(name, ExportKind::Func, 0); + module.section(&exports); + + let mut code = CodeSection::new(); + let mut func = Function::new([]); + for result in results { + match result { + ValType::I64 => func.instruction(&Instruction::I64Const(0)), + _ => func.instruction(&Instruction::I32Const(0)), + }; + } + func.instruction(&Instruction::End); + code.function(&func); + module.section(&code); + + module.finish() +} + +/// Builds a module with one *imported* function and one local exported function, +/// so the exported function lives at function index 1 (imports occupy index 0). +/// Validation must follow the index space and read the *local* function's type. +fn module_with_import_then_export( + export_name: &str, + params: &[ValType], + results: &[ValType], +) -> Vec { + let mut module = Module::new(); + + let mut types = TypeSection::new(); + // type 0: the imported function (i32) -> () + types.ty().function([ValType::I32], []); + // type 1: the exported function + types.ty().function(params.iter().copied(), results.iter().copied()); + module.section(&types); + + let mut imports = ImportSection::new(); + imports.import("host", "log", EntityType::Function(0)); + module.section(&imports); + + let mut functions = FunctionSection::new(); + functions.function(1); + module.section(&functions); + + let mut exports = ExportSection::new(); + // imported func is index 0; the local func is index 1. 
+ exports.export(export_name, ExportKind::Func, 1); + module.section(&exports); + + let mut code = CodeSection::new(); + let mut func = Function::new([]); + for result in results { + match result { + ValType::I64 => func.instruction(&Instruction::I64Const(0)), + _ => func.instruction(&Instruction::I32Const(0)), + }; + } + func.instruction(&Instruction::End); + code.function(&func); + module.section(&code); + + module.finish() +} + +/// Builds a module that exports a *memory* (not a function) named `name`. +fn module_exporting_memory(name: &str) -> Vec { + let mut module = Module::new(); + + let mut memories = wasm_encoder::MemorySection::new(); + memories.memory(wasm_encoder::MemoryType { + minimum: 1, + maximum: Some(1), + memory64: false, + shared: false, + page_size_log2: None, + }); + module.section(&memories); + + let mut exports = ExportSection::new(); + exports.export(name, ExportKind::Memory, 0); + module.section(&exports); + + module.finish() +} + +fn sig(params: &[WasmValType], results: &[WasmValType]) -> DeclaredSignature { + DeclaredSignature { + params: params.to_vec(), + results: results.to_vec(), + } +} + +#[test] +fn accepts_matching_signature() { + let bytes = module_exporting("sum", &[ValType::I32, ValType::I32], &[ValType::I32]); + let declared = sig(&[WasmValType::I32, WasmValType::I32], &[WasmValType::I32]); + validate_extern(&bytes, "sum", &declared).expect("matching signature should validate"); +} + +#[test] +fn accepts_i64_and_void_signatures() { + let bytes = module_exporting("store", &[ValType::I64, ValType::I32], &[]); + let declared = sig(&[WasmValType::I64, WasmValType::I32], &[]); + validate_extern(&bytes, "store", &declared).expect("i64/void signature should validate"); +} + +#[test] +fn missing_export_is_distinct_error() { + let bytes = module_exporting("sum", &[ValType::I32], &[ValType::I32]); + let declared = sig(&[WasmValType::I32], &[WasmValType::I32]); + let err = validate_extern(&bytes, "product", &declared).unwrap_err(); + 
match err { + ValidateError::ExportNotFound { + export_field, + available_functions, + } => { + assert_eq!(export_field, "product"); + assert_eq!(available_functions, vec!["sum".to_string()]); + } + other => panic!("expected ExportNotFound, got {other:?}"), + } +} + +#[test] +fn non_function_export_of_same_name_is_export_not_found() { + // A memory named `sum` is not a function export; validation must report it as + // a missing *function* export, not a signature mismatch. + let bytes = module_exporting_memory("sum"); + let declared = sig(&[WasmValType::I32], &[WasmValType::I32]); + let err = validate_extern(&bytes, "sum", &declared).unwrap_err(); + assert!( + matches!(err, ValidateError::ExportNotFound { .. }), + "expected ExportNotFound, got {err:?}" + ); +} + +#[test] +fn mismatched_param_count_is_signature_mismatch() { + let bytes = module_exporting("sum", &[ValType::I32, ValType::I32], &[ValType::I32]); + // declares one parameter; module has two. + let declared = sig(&[WasmValType::I32], &[WasmValType::I32]); + let err = validate_extern(&bytes, "sum", &declared).unwrap_err(); + match err { + ValidateError::SignatureMismatch { export_field, mismatch } => { + assert_eq!(export_field, "sum"); + assert_eq!(mismatch.found_params, vec![WasmValType::I32, WasmValType::I32]); + } + other => panic!("expected SignatureMismatch, got {other:?}"), + } +} + +#[test] +fn mismatched_param_type_is_signature_mismatch() { + let bytes = module_exporting("sum", &[ValType::I64], &[ValType::I32]); + // declares i32 param; module has i64. + let declared = sig(&[WasmValType::I32], &[WasmValType::I32]); + let err = validate_extern(&bytes, "sum", &declared).unwrap_err(); + assert!( + matches!(err, ValidateError::SignatureMismatch { .. }), + "expected SignatureMismatch, got {err:?}" + ); +} + +#[test] +fn mismatched_return_type_is_signature_mismatch() { + let bytes = module_exporting("sum", &[ValType::I32], &[ValType::I64]); + // declares i32 return; module returns i64. 
+ let declared = sig(&[WasmValType::I32], &[WasmValType::I32]); + let err = validate_extern(&bytes, "sum", &declared).unwrap_err(); + assert!( + matches!(err, ValidateError::SignatureMismatch { .. }), + "expected SignatureMismatch, got {err:?}" + ); +} + +#[test] +fn validates_export_behind_imported_function_index() { + // The exported function is at index 1 (an import occupies index 0). The + // validator must read the *local* function's type, not the import's. + let bytes = module_with_import_then_export("sum", &[ValType::I32, ValType::I32], &[ValType::I32]); + let declared = sig(&[WasmValType::I32, WasmValType::I32], &[WasmValType::I32]); + validate_extern(&bytes, "sum", &declared) + .expect("export behind import index should validate against the local type"); +} + +#[test] +fn rejects_invalid_wasm_bytes() { + let err = validate_extern(b"not wasm at all", "sum", &sig(&[], &[])).unwrap_err(); + assert!(matches!(err, ValidateError::Parse(_)), "got {err:?}"); +} + +#[test] +fn lowers_extern_declaration_to_wasm_signature() { + // The signature comparison is only meaningful if the declared side is lowered + // exactly like codegen. Lower a real `external fn` and check the value types. + let arena = inference::parse( + "spec s { external fn mix(a: i32, b: i64, c: bool) -> u64; }", + ) + .expect("parse"); + + let extern_def = arena + .source_files() + .flat_map(|file| file.defs.iter().copied()) + .flat_map(|def_id| collect_externs(&arena, def_id)) + .next() + .expect("an external fn"); + + let Def::ExternFunction { args, returns, .. } = &arena[extern_def].kind else { + unreachable!("collect_externs only yields externs"); + }; + + let declared = lower_extern_signature(&arena, args, *returns).expect("lower"); + assert_eq!( + declared, + sig( + &[WasmValType::I32, WasmValType::I64, WasmValType::I32], + &[WasmValType::I64], + ) + ); +} + +/// Yields every `external fn` reachable from `def_id`, descending into specs. 
+fn collect_externs( + arena: &inference_ast::arena::AstArena, + def_id: inference_ast::ids::DefId, +) -> Vec { + match &arena[def_id].kind { + Def::ExternFunction { .. } => vec![def_id], + Def::Spec { defs, .. } => defs + .iter() + .flat_map(|&inner| collect_externs(arena, inner)) + .collect(), + _ => Vec::new(), + } +} diff --git a/core/parser/src/grammar.rs b/core/parser/src/grammar.rs index 2131ed0d..95f9bdef 100644 --- a/core/parser/src/grammar.rs +++ b/core/parser/src/grammar.rs @@ -46,7 +46,16 @@ pub fn source_file(p: &mut Parser) { let m = p.start(); while !p.at_eof() { if at_item_start(p) { + // Defense-in-depth: if a future `item` handler completes without + // consuming any token, the cursor is unchanged and this loop would + // spin forever (the fuel guard does not catch it, since completing a + // marker refills the fuel). Bump the offending token into an Error + // node so any non-advancing handler degrades to a recoverable error. + let before = p.pos(); item(p); + if p.pos() == before { + p.err_and_bump("expected an item"); + } } else { // An unexpected token at item position: consume it into an Error // node so the loop always advances, then retry from the next token. 
@@ -302,12 +311,23 @@ mod tests { } #[test] - fn use_from_literal() { - let src = "use { sort, hash } from \"./sort.rs\";"; + fn use_from_simple_name() { + let src = "use { sort, hash } from sorting;"; assert_clean(src); let u = first(src, SyntaxKind::UseDirective); - assert!(u.child(SyntaxKind::StringLiteral).is_some()); - assert_eq!(count_kind(&u, SyntaxKind::Identifier), 2); + assert!(u.child(SyntaxKind::StringLiteral).is_none()); + // two imported types plus one module-ref segment + assert_eq!(count_kind(&u, SyntaxKind::Identifier), 3); + } + + #[test] + fn use_from_path() { + let src = "use { hash } from crypto::sha256;"; + assert_clean(src); + let u = first(src, SyntaxKind::UseDirective); + assert!(u.child(SyntaxKind::StringLiteral).is_none()); + // one imported type plus two module-ref segments + assert_eq!(count_kind(&u, SyntaxKind::Identifier), 3); } // ---- types ---- @@ -814,6 +834,56 @@ mod tests { assert!(find(&root, SyntaxKind::FunctionDefinition).is_some()); } + #[test] + fn pub_external_fn_terminates_with_diagnostic() { + // Regression for C3: `pub external fn …` used to spin the source_file + // loop forever because the external handler never consumed the leading + // `pub`. The parser must now terminate (reaching this assertion proves + // it did) and emit a diagnostic, producing the external node. 
+ let (root, errors) = parse("pub external fn f();"); + assert!(errors > 0, "expected a diagnostic for the stray `pub`"); + assert_eq!(root.kind, SyntaxKind::SourceFile); + assert!( + find(&root, SyntaxKind::ExternalFunctionDefinition).is_some(), + "the external declaration should still be recognised:\n{}", + root.debug_tree("pub external fn f();") + ); + let e = first("pub external fn f();", SyntaxKind::ExternalFunctionDefinition); + assert!( + e.child(SyntaxKind::Visibility).is_some(), + "the stray `pub` is consumed as a Visibility node" + ); + } + + #[test] + fn pub_external_fn_with_return_terminates() { + // The `-> i32` form must also terminate cleanly (it shared the same + // non-advancing path before C3 was fixed). + let (root, errors) = parse("pub external fn f() -> i32;"); + assert!(errors > 0); + assert_eq!(root.kind, SyntaxKind::SourceFile); + assert!(find(&root, SyntaxKind::ExternalFunctionDefinition).is_some()); + } + + #[test] + fn spec_pub_external_fn_terminates_with_diagnostic() { + // Regression for C3 inside a spec body: the spec loop dispatches through + // `definition`, so a `pub external fn` there also has to terminate. 
+ let src = "spec S { pub external fn f(); }"; + let (root, errors) = parse(src); + assert!(errors > 0, "expected a diagnostic for the stray `pub`"); + assert_eq!(root.kind, SyntaxKind::SourceFile); + assert!( + find(&root, SyntaxKind::SpecDefinition).is_some(), + "the spec should still be recognised:\n{}", + root.debug_tree(src) + ); + assert!( + find(&root, SyntaxKind::ExternalFunctionDefinition).is_some(), + "the spec-inner external declaration should still be recognised" + ); + } + #[test] fn fuzz_lite_never_panics() { // A handful of garbage strings: the parser must never panic and must diff --git a/core/parser/src/grammar/items.rs b/core/parser/src/grammar/items.rs index c593a650..b3181d08 100644 --- a/core/parser/src/grammar/items.rs +++ b/core/parser/src/grammar/items.rs @@ -46,19 +46,20 @@ fn visibility(p: &mut Parser) { } } -/// `use ( path [ :: { types } ] | { types } from string ) ;` +/// `use ( path [ :: { types } ] | { types } from module_ref ) ;` /// (`use_directive`). The two forms are distinguished by whether the body starts -/// with `{`. +/// with `{`. In the `from` form, `module_ref` is a logical identifier path +/// (`name` or `a::b`) — not a filesystem string — so source stays portable. pub(crate) fn use_directive(p: &mut Parser) { let m = p.start(); p.bump(SyntaxKind::UseKw); if p.at(SyntaxKind::LBrace) { imported_type_list(p); p.expect(SyntaxKind::FromKw); - if p.at(SyntaxKind::String) { - expr::string_literal(p); + if p.at(SyntaxKind::Ident) { + module_ref(p); } else { - p.error("expected a string literal"); + p.error("expected a module name"); } } else { types::identifier(p); @@ -75,6 +76,21 @@ pub(crate) fn use_directive(p: &mut Parser) { m.complete(p, SyntaxKind::UseDirective); } +/// `ident ( :: ident )*` — the logical module reference of a `from` clause. +/// Emits one `Identifier` per path segment; segments are separated by `::`. 
+fn module_ref(p: &mut Parser) { + types::identifier(p); + while p.at(SyntaxKind::ColonColon) { + p.bump(SyntaxKind::ColonColon); + if p.at(SyntaxKind::Ident) { + types::identifier(p); + } else { + p.error("expected a module path segment"); + break; + } + } +} + /// `{ sep1(ident, ,) }` — the imported-type list shared by both use forms. fn imported_type_list(p: &mut Parser) { p.expect(SyntaxKind::LBrace); @@ -98,7 +114,15 @@ pub(crate) fn spec_definition(p: &mut Parser) { p.expect(SyntaxKind::LBrace); while !p.at(SyntaxKind::RBrace) && !p.at_eof() { if at_definition_start(p) { + // Defense-in-depth: a `definition` handler that consumes nothing + // (e.g. a future non-advancing routing) would spin this loop, since + // completing a marker refills the fuel guard. Detect the unchanged + // cursor and bump the offending token into an Error node. + let before = p.pos(); definition(p); + if p.pos() == before { + p.err_and_bump("expected a definition"); + } } else { // The `}` terminating this body is the recovery anchor, so consume // the offending token into an Error node to guarantee progress — @@ -147,9 +171,17 @@ pub(crate) fn function_definition(p: &mut Parser) { } /// `external fn ident argument_list [ -> _type ] ;` -/// (`external_function_definition`). No visibility is allowed. +/// (`external_function_definition`). No visibility is allowed: a stray leading +/// `pub` is a grammar error, so we report it and then consume it as a +/// `Visibility` node for resilience — mirroring every other definition handler — +/// so the cursor always advances past it (otherwise the `source_file` item loop +/// would spin on `pub external …`). 
pub(crate) fn external_function_definition(p: &mut Parser) { let m = p.start(); + if p.at(SyntaxKind::PubKw) { + p.error("`external` functions cannot be `pub`"); + visibility(p); + } p.expect(SyntaxKind::ExternalKw); p.expect(SyntaxKind::FnKw); types::identifier(p); @@ -170,6 +202,10 @@ pub(crate) fn struct_definition(p: &mut Parser) { types::identifier(p); p.expect(SyntaxKind::LBrace); while !p.at(SyntaxKind::RBrace) && !p.at_eof() { + // Defense-in-depth: capture the cursor so a member handler that consumes + // nothing degrades to a recoverable error instead of spinning the loop + // (completing a marker refills the fuel guard, so it cannot catch this). + let before = p.pos(); match p.current() { SyntaxKind::Ident => { struct_field(p); @@ -181,6 +217,9 @@ pub(crate) fn struct_definition(p: &mut Parser) { // leaving it via a recovery set (which could spin the loop). _ => p.err_and_bump("expected a struct field or method"), } + if p.pos() == before { + p.err_and_bump("expected a struct field or method"); + } } p.expect(SyntaxKind::RBrace); m.complete(p, SyntaxKind::StructDefinition); diff --git a/core/parser/src/lower.rs b/core/parser/src/lower.rs index 5517e725..3cfe2d72 100644 --- a/core/parser/src/lower.rs +++ b/core/parser/src/lower.rs @@ -22,8 +22,8 @@ use inference_ast::arena::AstArena; use inference_ast::ids::{BlockId, DefId, ExprId, IdentId, StmtId, TypeId}; use inference_ast::nodes::{ ArgData, ArgKind, BlockData, BlockKind, Def, DefData, Directive, Expr, ExprData, Field, Ident, - Location, OperatorKind, SimpleTypeKind, SourceFileData, Stmt, StmtData, TypeData, TypeNode, - UnaryOperatorKind, UseDirective, Visibility, + Location, ModuleRef, OperatorKind, SimpleTypeKind, SourceFileData, Stmt, StmtData, TypeData, + TypeNode, UnaryOperatorKind, UseDirective, Visibility, }; use crate::errors::ParseError; @@ -105,36 +105,49 @@ impl<'s> Lowering<'s> { use crate::syntax_tree::SyntaxElement; let location = node.loc; - let from = node - 
.child(SyntaxKind::StringLiteral) - .map(|from_literal| self.lower_string_literal_value(from_literal)); - - let mut before_brace: Vec<&SyntaxNode> = Vec::new(); - let mut after_brace: Vec<&SyntaxNode> = Vec::new(); + // Three identifier buckets, keyed by position relative to the `{ … }` + // import list and the `from` keyword: + // * inside braces → imported types + // * before any brace, no `from` → path-form module segments + // * after `from`, outside braces → `from`-clause module reference + let mut path_segments: Vec<&SyntaxNode> = Vec::new(); + let mut imported: Vec<&SyntaxNode> = Vec::new(); + let mut from_segments: Vec<&SyntaxNode> = Vec::new(); let mut in_braces = false; + let mut after_from = false; for element in &node.children { match element { SyntaxElement::Token(t) if t.kind == SyntaxKind::LBrace => in_braces = true, + SyntaxElement::Token(t) if t.kind == SyntaxKind::RBrace => in_braces = false, + SyntaxElement::Token(t) if t.kind == SyntaxKind::FromKw => after_from = true, SyntaxElement::Node(n) if n.kind == SyntaxKind::Identifier => { if in_braces { - after_brace.push(n); + imported.push(n); + } else if after_from { + from_segments.push(n); } else { - before_brace.push(n); + path_segments.push(n); } } _ => {} } } - let segments: Vec = if from.is_some() { - Vec::new() + let from = if from_segments.is_empty() { + None } else { - before_brace - .into_iter() - .map(|segment| self.lower_identifier(segment)) - .collect() + Some(ModuleRef { + segments: from_segments + .into_iter() + .map(|segment| self.lower_identifier(segment)) + .collect(), + }) }; - let imported_types: Vec = after_brace + let segments: Vec = path_segments + .into_iter() + .map(|segment| self.lower_identifier(segment)) + .collect(); + let imported_types: Vec = imported .into_iter() .map(|imported_type| self.lower_identifier(imported_type)) .collect(); @@ -1113,12 +1126,6 @@ impl<'s> Lowering<'s> { } } - /// Mirrors `Builder::build_string_literal_value`: the literal's raw source - /// 
text, quotes included (used for the `from` of a use directive). - fn lower_string_literal_value(&mut self, node: &SyntaxNode) -> String { - node.text(self.src).to_string() - } - /// Mirrors `Builder::build_type`, dispatching on node kind. Primitive type /// keywords map to `SimpleTypeKind`; arrays lower **element then length**; /// generics lower the base identifier then the argument identifiers; qualified @@ -1955,11 +1962,17 @@ mod tests { } #[test] - fn lowers_use_directive_from_form() { - let arena = lower("use { X, Y } from \"lib\";"); + fn lowers_use_directive_from_simple_name() { + let arena = lower("use { X, Y } from lib;"); let files: Vec<_> = arena.source_files().collect(); let Directive::Use(directive) = &files[0].directives[0]; - assert_eq!(directive.from.as_deref(), Some("\"lib\"")); + let from = directive.from.as_ref().expect("from module reference"); + let from_segments: Vec<&str> = from + .segments + .iter() + .map(|&s| arena.ident_name(s)) + .collect(); + assert_eq!(from_segments, ["lib"]); assert!(directive.segments.is_empty()); let imported: Vec<&str> = directive .imported_types @@ -1969,6 +1982,27 @@ mod tests { assert_eq!(imported, ["X", "Y"]); } + #[test] + fn lowers_use_directive_from_path() { + let arena = lower("use { hash } from crypto::sha256;"); + let files: Vec<_> = arena.source_files().collect(); + let Directive::Use(directive) = &files[0].directives[0]; + let from = directive.from.as_ref().expect("from module reference"); + let from_segments: Vec<&str> = from + .segments + .iter() + .map(|&s| arena.ident_name(s)) + .collect(); + assert_eq!(from_segments, ["crypto", "sha256"]); + assert!(directive.segments.is_empty()); + let imported: Vec<&str> = directive + .imported_types + .iter() + .map(|&t| arena.ident_name(t)) + .collect(); + assert_eq!(imported, ["hash"]); + } + // -- Statements ---------------------------------------------------------- #[test] diff --git a/core/parser/src/parser.rs b/core/parser/src/parser.rs index 
3871b092..2824f1da 100644 --- a/core/parser/src/parser.rs +++ b/core/parser/src/parser.rs @@ -123,6 +123,16 @@ impl<'i> Parser<'i> { self.at(SyntaxKind::Eof) } + /// The current meaningful-token position. + /// + /// Used by item loops to assert forward progress: a handler that completes + /// without consuming a token leaves this unchanged, which the loop detects + /// and recovers from rather than spinning forever. + #[must_use] + pub fn pos(&self) -> usize { + self.pos + } + /// Consumes the current token if it is `kind`, reporting whether it did. pub fn eat(&mut self, kind: SyntaxKind) -> bool { if self.at(kind) { diff --git a/core/parser/test_data/example.inf b/core/parser/test_data/example.inf index ed4baf25..e9548661 100644 --- a/core/parser/test_data/example.inf +++ b/core/parser/test_data/example.inf @@ -90,8 +90,8 @@ struct identity { //Use use inference::std; use inference::std::algorithms::sort; -use { sort } from "./sort.rs"; -use { sort, hash } from "./sort.rs"; +use { sort } from sorting; +use { sort, hash } from sorting; use inference::std::algorithms::{sort,hash}; //Binary Expression fn spec_assign() -> () { @@ -281,7 +281,7 @@ struct Account { } } use inference::std::algorithms::sort; -use { sort, hash } from "./sort.0.wasm"; +use { sort, hash } from sorting; use inference::std::algorithms::{sort, hash}; fn example() -> u32 { let a: u32 = 42; @@ -374,7 +374,7 @@ fn bubble_sort(arr: [i32;10], compare_function: fn(left: i32, right: i32) -> i32 } } } -use { hash } from "./cryptography.0.wasm"; +use { hash } from cryptography; spec HashContext { type HashFunction = fn([u8; 100]) -> [u8; 32]; fn verify_hash_transitivity(hash_f: HashFunction) -> () { diff --git a/core/type-checker/src/errors.rs b/core/type-checker/src/errors.rs index 5df89522..a7472007 100644 --- a/core/type-checker/src/errors.rs +++ b/core/type-checker/src/errors.rs @@ -559,6 +559,36 @@ pub enum TypeCheckError { variant_name: String, location: Location, }, + + /// An `external fn` is 
named by more than one `use … from ` clause, + /// each referring to a different module. + /// + /// Extern provenance must be unambiguous: the linker needs exactly one + /// source module per extern. List the offending modules and rename or + /// remove the conflicting `use` clauses to disambiguate. + #[error( + "{location}: external function `{name}` is bound to multiple modules ({modules}); each extern must come from exactly one module" + )] + AmbiguousExternModule { + name: String, + modules: String, + location: Location, + }, + + /// A `use { name } from ` clause names an import that has no + /// matching `external fn` declaration. + /// + /// A `from` import binds an extern to its source module; without a + /// corresponding `external fn name(...)` declaration there is nothing to + /// bind, so the import is dangling. Declare the extern or drop the import. + #[error( + "{location}: `use` imports `{name}` from module `{module}`, but no `external fn {name}` is declared" + )] + ExternImportNotDeclared { + name: String, + module: String, + location: Location, + }, } impl TypeCheckError { @@ -611,6 +641,8 @@ impl TypeCheckError { | TypeCheckError::ArrayLiteralSizeMismatch { location, .. } | TypeCheckError::DivisionByZero { location, .. } | TypeCheckError::DuplicateEnumVariant { location, .. } + | TypeCheckError::AmbiguousExternModule { location, .. } + | TypeCheckError::ExternImportNotDeclared { location, .. } | TypeCheckError::SpecFunctionShadowsTopLevel { location, .. 
} => location, } } @@ -1389,6 +1421,32 @@ mod tests { ); } + #[test] + fn display_ambiguous_extern_module() { + let err = TypeCheckError::AmbiguousExternModule { + name: "sort".to_string(), + modules: "`sorting`, `collections`".to_string(), + location: test_location(), + }; + assert_eq!( + err.to_string(), + "1:5: external function `sort` is bound to multiple modules (`sorting`, `collections`); each extern must come from exactly one module" + ); + } + + #[test] + fn display_extern_import_not_declared() { + let err = TypeCheckError::ExternImportNotDeclared { + name: "hash".to_string(), + module: "crypto".to_string(), + location: test_location(), + }; + assert_eq!( + err.to_string(), + "1:5: `use` imports `hash` from module `crypto`, but no `external fn hash` is declared" + ); + } + // Tests for CompoundReturnCallInAssignment and MethodCallChainOnCompoundReturn // migrated to analysis rules A017 and A018. } diff --git a/core/type-checker/src/lib.rs b/core/type-checker/src/lib.rs index 480d40a8..97f5aceb 100644 --- a/core/type-checker/src/lib.rs +++ b/core/type-checker/src/lib.rs @@ -108,7 +108,7 @@ mod type_checker; pub mod type_info; pub mod typed_context; -pub use symbol_table::{EnumInfo, StructFieldInfo, StructInfo}; +pub use symbol_table::{EnumInfo, ExternOrigin, StructFieldInfo, StructInfo}; pub use typed_context::MethodMetadata; /// Marker state indicating builder has not yet been initialized with an arena. diff --git a/core/type-checker/src/symbol_table.rs b/core/type-checker/src/symbol_table.rs index e7064103..9efb9ac5 100644 --- a/core/type-checker/src/symbol_table.rs +++ b/core/type-checker/src/symbol_table.rs @@ -31,6 +31,8 @@ //! represented as `TypeInfo { kind: TypeInfoKind::Unit, type_params: vec![] }`. 
use std::cell::RefCell; +use std::collections::BTreeSet; +use std::path::PathBuf; use std::sync::Weak; use std::sync::Arc; @@ -46,6 +48,53 @@ use rustc_hash::FxHashMap; pub(crate) type ScopeRef = Arc>; pub(crate) type WeakScopeRef = Weak>; +/// Provenance of an `external fn` declaration: the logical module that exports +/// it, the export field name to bind against, and (once the driver resolves it) +/// the concrete `.wasm` path. +/// +/// `logical_module` and `export_field` are platform-independent: they come from +/// the `use { field } from logical::module;` clause that names the extern, not +/// from any filesystem path. `resolved_path` stays `None` until the driver maps +/// the logical module to a file; later phases populate it for the linker. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct ExternOrigin { + /// Logical, `::`-joined module reference from the binding `use` clause + /// (e.g. `"crypto::sha256"`). Never a filesystem path. + pub logical_module: String, + /// Export field name to bind against in the resolved module. Equals the + /// extern's declared name; carried explicitly so renaming-on-import can + /// diverge the two in a later phase without changing the data model. + pub export_field: String, + /// The `external fn` declaration this binding attaches to. + /// + /// Two same-named externs (e.g. a top-level and a spec-inner `sort` with + /// divergent signatures) would otherwise collapse together when keyed by + /// bare name. Carrying the declaring [`DefId`] lets the driver recover the + /// exact declared signature to validate against — never a same-named + /// sibling — and lets analysis resolve each call to the specific extern it + /// names. + pub decl: DefId, + /// Concrete `.wasm` path once the driver resolves `logical_module`. + /// `None` during type checking; populated downstream. 
+ pub resolved_path: Option, +} + +/// Whether a registered function is local or an `external fn`, and — for an +/// extern — whether it was bound to a source module via a `use … from` clause. +/// +/// This discriminates the three states that otherwise collapse together: a +/// local function, an unbound extern (declared without a binding `use`), and a +/// bound extern (carrying its [`ExternOrigin`]). +#[derive(Debug, Clone, Default, PartialEq, Eq)] +pub(crate) enum FuncKind { + /// An ordinary function defined in this program. + #[default] + Local, + /// An `external fn`. `Some` once bound to a source module via a + /// `use … from` clause; `None` while unbound. + Extern(Option), +} + #[derive(Debug, Clone)] pub(crate) struct FuncInfo { pub(crate) name: String, @@ -54,6 +103,25 @@ pub(crate) struct FuncInfo { pub(crate) return_type: TypeInfo, pub(crate) visibility: Visibility, pub(crate) definition_scope_id: u32, + /// Local function, unbound extern, or bound extern. See [`FuncKind`]. + pub(crate) kind: FuncKind, +} + +impl FuncInfo { + /// Returns true if this is an `external fn`, bound or unbound. + #[must_use = "this is a pure check with no side effects"] + pub(crate) fn is_extern(&self) -> bool { + matches!(self.kind, FuncKind::Extern(_)) + } + + /// Returns the provenance of this function if it is a *bound* extern. + #[must_use = "this is a pure lookup with no side effects"] + pub(crate) fn extern_origin(&self) -> Option<&ExternOrigin> { + match &self.kind { + FuncKind::Extern(origin) => origin.as_ref(), + FuncKind::Local => None, + } + } } /// Information about a struct field. @@ -685,6 +753,49 @@ impl SymbolTable { param_types: Vec, return_type: TypeInfo, visibility: Visibility, + ) -> Result<(), String> { + self.insert_func_symbol( + name, + type_params, + param_types, + return_type, + visibility, + FuncKind::Local, + ) + } + + /// Registers an `external fn`, discriminating it from a local function. 
+ /// + /// `origin` carries the binding module and export field when the extern is + /// named by a `use … from` clause; it is `None` for an extern declared + /// without a binding `use`. Either way the function is recorded as + /// [`FuncKind::Extern`], so an unbound extern stays distinguishable from a + /// local function. + pub(crate) fn register_extern_function( + &mut self, + name: &str, + param_types: Vec, + return_type: TypeInfo, + origin: Option, + ) -> Result<(), String> { + self.insert_func_symbol( + name, + vec![], + param_types, + return_type, + Visibility::Private, + FuncKind::Extern(origin), + ) + } + + fn insert_func_symbol( + &mut self, + name: &str, + type_params: Vec, + param_types: Vec, + return_type: TypeInfo, + visibility: Visibility, + kind: FuncKind, ) -> Result<(), String> { if let Some(scope) = &self.current_scope { let scope_id = scope.borrow().id; @@ -698,6 +809,7 @@ impl SymbolTable { return_type: self.resolve_custom_type(return_type), visibility, definition_scope_id: scope_id, + kind, }; scope .borrow_mut() @@ -835,6 +947,87 @@ impl SymbolTable { None } + /// Collects the provenance of every **bound** `external fn` across all + /// scopes, deduplicated by `(logical_module, export_field)`. + /// + /// The driver consumes this to resolve and validate each external `.wasm` + /// before linking. Unbound bare externs (declared without a binding `use`) + /// carry no origin and are skipped — they never reach the linker. 
+ #[must_use = "this enumeration has no side effects"] + pub(crate) fn extern_origins(&self) -> Vec { + let mut seen: BTreeSet<(String, String)> = BTreeSet::new(); + let mut origins = Vec::new(); + let mut ids: Vec = self.scopes.keys().copied().collect(); + ids.sort_unstable(); + for id in ids { + let Some(scope) = self.scopes.get(&id) else { + continue; + }; + for symbol in scope.borrow().symbols.values() { + let Some(info) = symbol.as_function() else { + continue; + }; + let Some(origin) = info.extern_origin() else { + continue; + }; + let key = (origin.logical_module.clone(), origin.export_field.clone()); + if seen.insert(key) { + origins.push(origin.clone()); + } + } + } + origins + } + + /// Returns the provenance of the **bound** `external fn` declared by + /// `decl`, resolving strictly by declaration identity rather than by name. + /// + /// Two same-named externs (a top-level and a spec-inner `f`) register under + /// the same bare name in different scopes; a name keyed lookup would return + /// whichever the scope walk reaches first, masking which declaration is + /// actually bound. Keying on the declaring [`DefId`] lets a caller ask the + /// precise question "is *this* extern bound?" — the basis for resolving each + /// call site to the specific extern it names. + #[must_use = "this is a pure lookup with no side effects"] + pub(crate) fn extern_origin_by_decl(&self, decl: DefId) -> Option { + let mut ids: Vec = self.scopes.keys().copied().collect(); + ids.sort_unstable(); + for id in ids { + let Some(scope) = self.scopes.get(&id) else { + continue; + }; + for symbol in scope.borrow().symbols.values() { + let Some(info) = symbol.as_function() else { + continue; + }; + if let Some(origin) = info.extern_origin() + && origin.decl == decl + { + return Some(origin.clone()); + } + } + } + None + } + + /// Looks up a function by name across **all** registered scopes. 
Mirrors + /// [`Self::lookup_struct_anywhere`]; the returned [`FuncInfo`] carries the + /// extern provenance, so post-type-check phases can read it scope-agnostically. + #[must_use = "this is a pure lookup with no side effects"] + pub(crate) fn lookup_function_anywhere(&self, name: &str) -> Option { + let mut ids: Vec = self.scopes.keys().copied().collect(); + ids.sort_unstable(); + for id in ids { + if let Some(scope) = self.scopes.get(&id) + && let Some(symbol) = scope.borrow().lookup_symbol_local(name) + && let Some(info) = symbol.as_function() + { + return Some(info.clone()); + } + } + None + } + pub(crate) fn register_method( &mut self, type_name: &str, @@ -1019,7 +1212,7 @@ impl SymbolTable { for sf in arena.source_files() { for &def_id in &sf.defs { - self.register_definition_from_external(arena, def_id)?; + self.register_definition_from_external(module_name, arena, def_id)?; } } @@ -1029,9 +1222,14 @@ impl SymbolTable { } /// Register a definition from an external module into the current scope. + /// + /// `module_name` is the logical name of the module being loaded; an + /// `external fn` registered here is bound to it by construction, so its + /// [`ExternOrigin`] names this module. #[allow(dead_code)] fn register_definition_from_external( &mut self, + module_name: &str, arena: &AstArena, def_id: DefId, ) -> anyhow::Result<()> { @@ -1105,7 +1303,40 @@ impl SymbolTable { Def::TypeAlias { name, ty, .. } => { self.register_type(&arena[*name].name, Some(TypeInfo::from_type_id(arena, *ty)))?; } - Def::Constant { .. } | Def::ExternFunction { .. } | Def::Module { .. } => {} + Def::ExternFunction { + name, + args, + returns, + .. + } => { + let extern_name = arena[*name].name.clone(); + let param_types: Vec = args + .iter() + .filter_map(|a| match &a.kind { + ArgKind::SelfRef { .. } => None, + ArgKind::Named { ty, .. 
} + | ArgKind::Ignored { ty } + | ArgKind::TypeOnly(ty) => Some(TypeInfo::from_type_id(arena, *ty)), + }) + .collect(); + let return_type = returns + .map(|r| TypeInfo::from_type_id(arena, r)) + .unwrap_or_default(); + let origin = ExternOrigin { + logical_module: module_name.to_string(), + export_field: extern_name.clone(), + decl: def_id, + resolved_path: None, + }; + self.register_extern_function( + &extern_name, + param_types, + return_type, + Some(origin), + ) + .map_err(|e| anyhow::anyhow!(e))?; + } + Def::Constant { .. } | Def::Module { .. } => {} } Ok(()) } @@ -1492,6 +1723,7 @@ mod tests { return_type: TypeInfo::default(), visibility: Visibility::Private, definition_scope_id: 0, + kind: FuncKind::Local, }, visibility: Visibility::Private, scope_id: 0, @@ -1510,6 +1742,7 @@ mod tests { return_type: TypeInfo::default(), visibility: Visibility::Public, definition_scope_id: 0, + kind: FuncKind::Local, }, visibility: Visibility::Public, scope_id: 0, @@ -1529,6 +1762,7 @@ mod tests { return_type: TypeInfo::default(), visibility: Visibility::Public, definition_scope_id: 0, + kind: FuncKind::Local, }; let result = table.register_method("TestType", sig, Visibility::Public, true); assert!(result.is_ok()); @@ -1550,6 +1784,7 @@ mod tests { return_type: TypeInfo::default(), visibility: Visibility::Public, definition_scope_id: 0, + kind: FuncKind::Local, }; let result = table.register_method("TestType", sig, Visibility::Public, false); assert!(result.is_ok()); @@ -1570,6 +1805,7 @@ mod tests { return_type: TypeInfo::default(), visibility: Visibility::Private, definition_scope_id: 0, + kind: FuncKind::Local, }, visibility: Visibility::Private, scope_id: 0, @@ -1583,6 +1819,7 @@ mod tests { return_type: TypeInfo::default(), visibility: Visibility::Private, definition_scope_id: 0, + kind: FuncKind::Local, }, visibility: Visibility::Private, scope_id: 0, @@ -1624,4 +1861,138 @@ mod tests { assert_eq!(info.variant_index("Yellow"), None); } } + + mod extern_registration { + 
use super::*; + + fn i32_type() -> TypeInfo { + TypeInfo { + kind: TypeInfoKind::Number(NumberType::I32), + type_params: vec![], + } + } + + fn origin(module: &str, field: &str) -> ExternOrigin { + ExternOrigin { + logical_module: module.to_string(), + export_field: field.to_string(), + decl: inference_ast::ids::idx_from_u32(0), + resolved_path: None, + } + } + + #[test] + fn bound_extern_carries_origin_and_is_discriminated() { + let mut table = SymbolTable::default(); + table + .register_extern_function( + "sort", + vec![i32_type()], + i32_type(), + Some(origin("collections", "sort")), + ) + .expect("registering a bound extern should succeed"); + + let info = table + .lookup_function_anywhere("sort") + .expect("sort should be registered"); + assert!(info.is_extern(), "a registered extern must be discriminated"); + let found = info.extern_origin().expect("bound extern carries origin"); + assert_eq!(found.logical_module, "collections"); + assert_eq!(found.export_field, "sort"); + } + + #[test] + fn unbound_extern_is_extern_without_origin() { + let mut table = SymbolTable::default(); + table + .register_extern_function("add", vec![i32_type()], i32_type(), None) + .expect("registering an unbound extern should succeed"); + + let info = table + .lookup_function_anywhere("add") + .expect("add should be registered"); + assert!( + info.is_extern(), + "an unbound extern stays distinguishable from a local function" + ); + assert!( + info.extern_origin().is_none(), + "an unbound extern has no provenance" + ); + } + + #[test] + fn local_function_is_not_extern() { + let mut table = SymbolTable::default(); + table + .register_function("helper", vec![], vec![], i32_type()) + .expect("registering a local function should succeed"); + + let info = table + .lookup_function_anywhere("helper") + .expect("helper should be registered"); + assert!(!info.is_extern()); + assert!(info.extern_origin().is_none()); + } + + #[test] + fn extern_origins_collects_only_bound_externs() { + let mut 
table = SymbolTable::default(); + table + .register_extern_function( + "sort", + vec![i32_type()], + i32_type(), + Some(origin("collections", "sort")), + ) + .unwrap(); + table + .register_extern_function("unbound", vec![], i32_type(), None) + .unwrap(); + table + .register_function("helper", vec![], vec![], i32_type()) + .unwrap(); + + let origins = table.extern_origins(); + assert_eq!( + origins.len(), + 1, + "only the bound extern contributes an origin, got {origins:?}" + ); + assert_eq!(origins[0].logical_module, "collections"); + assert_eq!(origins[0].export_field, "sort"); + } + + #[test] + fn extern_origins_dedups_repeated_module_field_pairs() { + // Two distinct externs that name the same module+field collapse to a + // single resolution unit; the driver should resolve that `.wasm` once. + let mut table = SymbolTable::default(); + table + .register_extern_function( + "sort", + vec![i32_type()], + i32_type(), + Some(origin("collections", "sort")), + ) + .unwrap(); + let _ = table.enter_module("nested", Visibility::Public); + table + .register_extern_function( + "sort", + vec![i32_type()], + i32_type(), + Some(origin("collections", "sort")), + ) + .unwrap(); + + let origins = table.extern_origins(); + assert_eq!( + origins.len(), + 1, + "identical (module, field) pairs dedup to one origin, got {origins:?}" + ); + } + } } diff --git a/core/type-checker/src/type_checker.rs b/core/type-checker/src/type_checker.rs index b2d02f57..6c831902 100644 --- a/core/type-checker/src/type_checker.rs +++ b/core/type-checker/src/type_checker.rs @@ -42,7 +42,10 @@ use rustc_hash::{FxHashMap, FxHashSet}; use crate::{ errors::{DedupKind, RegistrationKind, TypeCheckError, TypeMismatchContext, VisibilityContext}, - symbol_table::{FuncInfo, Import, ImportItem, ImportKind, ResolvedImport, SymbolTable}, + symbol_table::{ + ExternOrigin, FuncInfo, FuncKind, Import, ImportItem, ImportKind, ResolvedImport, + SymbolTable, + }, type_info::{NumberType, TypeInfo, TypeInfoKind}, 
typed_context::TypedContext, }; @@ -57,6 +60,19 @@ pub(crate) struct TypeChecker { /// Set before walking the body, cleared after. Used by `infer_statement` to /// pass type param context to `validate_type` and `TypeInfo::from_type_id_with_type_params`. current_type_params: Vec, + /// Declaring extern [`DefId`] → provenance, derived from `use … from` + /// directives before externs are registered. + /// + /// Keyed by the *declaration*, not the bare name: a `use { f } from m;` + /// directive is file-global, so it binds only the **top-level** `external fn + /// f` and never a same-named extern declared inside a `spec` or `module`. + /// Keying by [`DefId`] keeps those scopes' externs unbound (and so + /// A024-rejected) even when they share a name with a bound top-level extern. + /// + /// Holds only unambiguously-bound externs; an extern named by conflicting + /// modules is reported as [`TypeCheckError::AmbiguousExternModule`] and + /// omitted here so it falls back to an unbound registration. + extern_module_bindings: FxHashMap, } /// RAII guard that enters a spec scope on construction and pops it on drop. @@ -128,6 +144,7 @@ impl TypeChecker { /// 5. Infer variable types in function bodies pub fn infer_types(&mut self, ctx: &mut TypedContext) -> anyhow::Result { self.process_directives(ctx); + self.collect_extern_bindings(ctx); self.register_types(ctx); self.resolve_imports(); self.collect_function_and_constant_definitions(ctx); @@ -311,6 +328,7 @@ impl TypeChecker { return_type, visibility: method_vis.clone(), definition_scope_id, + kind: FuncKind::Local, }; self.symbol_table @@ -716,6 +734,31 @@ impl TypeChecker { .. } => { let func_name = ctx.arena()[*name].name.clone(); + // Externs declare no type parameters, so every type in the + // signature must resolve against the surrounding scope. 
Validate + // them up front (mirroring `Def::Function`): an undeclared + // `Custom` type would otherwise pass the signature-only extern + // validator and `todo!()`-panic codegen (H6). A `self` receiver + // is meaningless on an extern and is rejected here (H7), matching + // how standalone functions reject it. + for arg in args { + match &arg.kind { + ArgKind::SelfRef { .. } => { + self.errors.push(TypeCheckError::SelfReferenceInFunction { + function_name: func_name.clone(), + location: arg.location, + }); + } + ArgKind::Named { ty, .. } + | ArgKind::Ignored { ty } + | ArgKind::TypeOnly(ty) => { + self.validate_type(ctx.arena(), *ty, &[]); + } + } + } + if let Some(return_type_id) = returns { + self.validate_type(ctx.arena(), *return_type_id, &[]); + } let param_types: Vec = args .iter() .filter_map(|a| match &a.kind { @@ -730,11 +773,12 @@ impl TypeChecker { let return_type = returns .map(|r| TypeInfo::from_type_id(ctx.arena(), r)) .unwrap_or_default(); - if let Err(err) = self.symbol_table.register_function( + let origin = self.extern_module_bindings.get(&def_id).cloned(); + if let Err(err) = self.symbol_table.register_extern_function( &func_name, - vec![], param_types, return_type, + origin, ) { self.errors.push(TypeCheckError::RegistrationFailed { kind: RegistrationKind::Function, @@ -2400,6 +2444,28 @@ impl TypeChecker { .. } => { let fn_name = arena[*ef_name].name.clone(); + // Same validation as the top-level extern arm: reject a + // `self` receiver (H7) and check every signature type so + // an undeclared `Custom` cannot reach codegen (H6). + // Externs carry no type parameters, hence the empty slice. + for arg in args { + match &arg.kind { + ArgKind::SelfRef { .. } => { + self.errors.push(TypeCheckError::SelfReferenceInFunction { + function_name: fn_name.clone(), + location: arg.location, + }); + } + ArgKind::Named { ty, .. 
} + | ArgKind::Ignored { ty } + | ArgKind::TypeOnly(ty) => { + self.validate_type(arena, *ty, &[]); + } + } + } + if let Some(return_type_id) = returns { + self.validate_type(arena, *return_type_id, &[]); + } let param_types: Vec = args .iter() .filter_map(|a| match &a.kind { @@ -2412,11 +2478,16 @@ impl TypeChecker { let return_type = returns .map(|r| TypeInfo::from_type_id(arena, r)) .unwrap_or_default(); - if let Err(err) = self.symbol_table.register_function( + // Module-inner externs are never bound by a (file-global) + // `use … from` clause: the binding map is keyed by + // top-level declaration only, so this lookup correctly + // misses, leaving the module-inner extern unbound. + let origin = self.extern_module_bindings.get(&inner_def_id).cloned(); + if let Err(err) = self.symbol_table.register_extern_function( &fn_name, - vec![], param_types, return_type, + origin, ) { self.errors.push(TypeCheckError::RegistrationFailed { kind: RegistrationKind::Function, @@ -2463,13 +2534,126 @@ impl TypeChecker { } } + /// Binds each `external fn` to the source module named by a `use … from` + /// clause, populating [`Self::extern_module_bindings`]. + /// + /// For every `use { fields } from module;` directive, each field is paired + /// with `module`. The resulting bindings are validated: + /// + /// - A field imported from two or more distinct modules is reported as + /// [`TypeCheckError::AmbiguousExternModule`] and left unbound. + /// - A field imported from a module but never declared as an `external fn` + /// is reported as [`TypeCheckError::ExternImportNotDeclared`]. + /// - A field imported from exactly one module and declared as an extern is + /// recorded as a bound [`ExternOrigin`]. + /// + /// An `external fn` with no binding `use` is left unbound (no error): a bare + /// extern declaration is valid; analysis rule A024 governs whether *calling* + /// an unlinked extern is allowed. 
+ fn collect_extern_bindings(&mut self, ctx: &TypedContext) { + let arena = ctx.arena(); + + let extern_decls = Self::collect_top_level_extern_decls(arena); + + // field name → (distinct modules in first-seen order, first import location) + let mut imports: FxHashMap, Location)> = FxHashMap::default(); + for sf in arena.source_files() { + for directive in &sf.directives { + let Directive::Use(use_dir) = directive; + let Some(module_ref) = &use_dir.from else { + continue; + }; + let module = module_ref + .segments + .iter() + .map(|s| arena[*s].name.as_str()) + .collect::>() + .join("::"); + for &field_id in &use_dir.imported_types { + let field = arena[field_id].name.clone(); + let entry = imports + .entry(field) + .or_insert_with(|| (Vec::new(), use_dir.location)); + if !entry.0.contains(&module) { + entry.0.push(module.clone()); + } + } + } + } + + for (field, (modules, location)) in imports { + let Some(&decl) = extern_decls.get(&field) else { + self.errors.push(TypeCheckError::ExternImportNotDeclared { + name: field, + module: modules.join(", "), + location, + }); + continue; + }; + if modules.len() > 1 { + let module_list = modules + .iter() + .map(|m| format!("`{m}`")) + .collect::>() + .join(", "); + self.errors.push(TypeCheckError::AmbiguousExternModule { + name: field, + modules: module_list, + location, + }); + continue; + } + let logical_module = modules.into_iter().next().expect("one module"); + self.extern_module_bindings.insert( + decl, + ExternOrigin { + logical_module, + export_field: field, + decl, + resolved_path: None, + }, + ); + } + } + + /// Collects every **top-level** `external fn` declaration, mapping its name + /// to its declaring [`DefId`]. + /// + /// A `use … from` clause is file-global and binds only top-level externs, + /// so this deliberately does **not** descend into `spec` or `module` bodies: + /// a same-named extern declared in a spec or module is left out, stays + /// unbound, and remains A024-rejected when called. 
Descending here (the prior + /// behavior) let a top-level `use` silently bind a spec-inner extern, + /// suppressing A024 and miscompiling proof-mode codegen. + fn collect_top_level_extern_decls(arena: &AstArena) -> FxHashMap { + let mut decls = FxHashMap::default(); + for sf in arena.source_files() { + for &def_id in &sf.defs { + if let Def::ExternFunction { name, .. } = &arena[def_id].kind { + decls.insert(arena[*name].name.clone(), def_id); + } + } + } + decls + } + /// Process a use statement (Phase A: registration only). /// Converts UseDirective AST to Import and registers in current scope. + /// + /// A `use … from <module>` clause binds an `external fn` to its source + /// module; it is not a symbol import to resolve against the local scope + /// tree. Such directives are handled by [`Self::collect_extern_bindings`] + /// and skipped here, so their imported names are not mistaken for dangling + /// path imports. fn process_use_statement( &mut self, arena: &AstArena, use_stmt: &inference_ast::nodes::UseDirective, ) -> anyhow::Result<()> { + if use_stmt.from.is_some() { + return Ok(()); + } + let path: Vec = use_stmt .segments .iter() diff --git a/core/type-checker/src/typed_context.rs b/core/type-checker/src/typed_context.rs index c660eda2..9b4753e6 100644 --- a/core/type-checker/src/typed_context.rs +++ b/core/type-checker/src/typed_context.rs @@ -4,7 +4,7 @@ //! type information for all value expressions in the AST after type checking completes. use crate::{ - symbol_table::{EnumInfo, StructInfo, SymbolTable}, + symbol_table::{EnumInfo, ExternOrigin, StructInfo, SymbolTable}, type_info::{NumberType, TypeInfo, TypeInfoKind}, }; @@ -172,6 +172,50 @@ impl TypedContext { }) } + /// Returns the provenance of an `external fn`, or `None` for a local + /// function or an unbound extern (one declared without a binding `use`). + /// + /// The returned [`ExternOrigin`] gives the logical source module and export + /// field for the named extern.
WASM code generation consumes this per call + /// site to emit an import and lower the call to its import index. + #[must_use = "this is a pure lookup with no side effects"] + pub fn extern_origin(&self, name: &str) -> Option { + self.symbol_table + .lookup_function_anywhere(name) + .and_then(|info| info.extern_origin().cloned()) + } + + /// Returns the provenance of the **bound** `external fn` declared by + /// `decl`, resolved by declaration identity rather than by name. + /// + /// Analysis uses this to decide whether a specific call resolves to a bound + /// or unbound extern when two same-named externs exist (e.g. a top-level + /// and a spec-inner `f`): a name-keyed query cannot tell them apart, but the + /// declaring [`DefId`] can. + #[must_use = "this is a pure lookup with no side effects"] + pub fn extern_origin_by_decl(&self, decl: DefId) -> Option { + self.symbol_table.extern_origin_by_decl(decl) + } + + /// Returns true if the named function is an `external fn` (bound or unbound). + #[must_use = "this is a pure check with no side effects"] + pub fn is_extern_function(&self, name: &str) -> bool { + self.symbol_table + .lookup_function_anywhere(name) + .is_some_and(|info| info.is_extern()) + } + + /// Returns the provenance of every **bound** `external fn` in the program, + /// deduplicated by `(logical_module, export_field)`. + /// + /// The build driver consumes this to resolve and validate each external + /// `.wasm` once before linking. Unbound bare externs carry no origin and do + /// not appear here.
+ #[must_use = "this enumeration has no side effects"] + pub fn extern_origins(&self) -> Vec { + self.symbol_table.extern_origins() + } + pub(crate) fn set_node_typeinfo(&mut self, node_id: NodeId, type_info: TypeInfo) { self.node_types.insert(node_id, type_info); } @@ -191,7 +235,7 @@ impl TypedContext { #[cfg(test)] mod tests { use super::*; - use crate::symbol_table::FuncInfo; + use crate::symbol_table::{FuncInfo, FuncKind}; use crate::type_info::{NumberType, TypeInfo, TypeInfoKind}; use inference_ast::nodes::Visibility; @@ -219,6 +263,7 @@ mod tests { return_type, visibility: visibility.clone(), definition_scope_id: 0, + kind: FuncKind::Local, }; ctx.symbol_table .register_method(type_name, sig, visibility, has_self) @@ -330,6 +375,7 @@ mod tests { return_type: make_i32_type(), visibility: Visibility::Public, definition_scope_id: 0, + kind: FuncKind::Local, }; ctx.symbol_table .register_method("Point", sig_get_x, Visibility::Public, true) @@ -345,6 +391,7 @@ mod tests { }, visibility: Visibility::Public, definition_scope_id: 0, + kind: FuncKind::Local, }; ctx.symbol_table .register_method("Point", sig_get_y, Visibility::Public, true) @@ -381,6 +428,7 @@ mod tests { return_type: make_i32_type(), visibility: Visibility::Public, definition_scope_id: 0, + kind: FuncKind::Local, }; ctx.symbol_table .register_method("Point", sig_point, Visibility::Public, true) @@ -396,6 +444,7 @@ mod tests { }, visibility: Visibility::Private, definition_scope_id: 0, + kind: FuncKind::Local, }; ctx.symbol_table .register_method("Vector", sig_vector, Visibility::Private, false) diff --git a/core/wasm-codegen/README.md b/core/wasm-codegen/README.md index 2ab53a53..ec6b98fc 100644 --- a/core/wasm-codegen/README.md +++ b/core/wasm-codegen/README.md @@ -21,13 +21,18 @@ Typed AST (TypedContext) ### Compilation Phases 1. **AST Traversal** - Walk typed AST and visit function definitions -2. 
**Function and method name pre-scan** - Build `func_name_to_idx` map from function - and method names to WASM function section indices before the main compilation pass. Two - sub-steps run: `build_func_name_to_idx` registers top-level functions, then - `build_method_name_to_idx` registers struct methods under mangled names - (`"{StructName}.{method_name}"`). This enables forward references — a caller defined - before its callee in source can still emit a valid `call` instruction. Method indices - follow top-level function indices in the WASM function section. +2. **Import reservation + function index pre-scan** - Build the complete WASM function + index space before any body is compiled, in three ordered sub-steps: + (a) `register_imports` assigns indices `0..N` to every `external fn` declaration bound + via `use … from <module>`, populating `extern_name_to_idx` and recording the + `(logical_module, export_field, type_idx)` tuple needed for the import section; + (b) `build_func_name_to_idx` assigns indices `N..N+K` to top-level local functions + (shifted past the imports by `set_local_func_base(N)`); + (c) `build_method_name_to_idx` assigns indices beyond that for struct methods under + mangled names (`"{StructName}.{method_name}"`). This three-stage registration ensures + all callee indices — imports, locals, and methods — are known before the first `call` + instruction is emitted. Extern calls lower to `call <import_idx>` identically to local + calls. See [docs/function-calls-lowering.md](docs/function-calls-lowering.md). 3. **Compound Frame Layout** - For functions with array- or struct-typed variables or parameters, compute a stack frame layout by walking the entire function body and collecting array and struct @@ -97,10 +102,13 @@ Typed AST (TypedContext) Non-void functions emit an `unreachable` instruction before the function `end` to satisfy the WASM validator when all paths exit through explicit `return` instructions.
See [docs/conditionals-lowering.md](docs/conditionals-lowering.md). -6. **Module Assembly** - Assemble TypeSection, FunctionSection, ExportSection, CodeSection, - NameSection, and (if any function uses linear memory) MemorySection and GlobalSection into - a complete WASM binary. Memory and globals are only emitted when at least one function uses - arrays or structs. +6. **Module Assembly** - Assemble sections in WASM-required order into a complete binary: + TypeSection first, then ImportSection (only if at least one bound `external fn` is present; + sits between Type and Function per WASM spec), FunctionSection, MemorySection and + GlobalSection (only when at least one function uses arrays or structs), ExportSection, + CodeSection, NameSection, and custom spec sections. The import section placement is + mandatory because imported functions occupy the lowest indices and the section ordering + is enforced by the binary format. ## Non-Deterministic Extensions @@ -267,9 +275,10 @@ Detailed design documents live in `docs/`: - [docs/assignment-lowering.md](docs/assignment-lowering.md) - How assignment statements (`x = expr;`) are lowered to WASM local.set instructions, local index resolution, and current limitations on target forms. -- [docs/function-calls-lowering.md](docs/function-calls-lowering.md) - Forward-reference - pre-scan, parameter index interlock with locals, call lowering pipeline, drop emission - rules, and known limitations. +- [docs/function-calls-lowering.md](docs/function-calls-lowering.md) - Three-stage index + pre-scan (import reservation, top-level functions, methods), import section emission, + extern call lowering, parameter index interlock with locals, the call lowering pipeline, + drop emission rules, and known limitations. - [docs/conditionals-lowering.md](docs/conditionals-lowering.md) - How `if`/`else` statements are lowered to WASM structured control flow and why `unreachable` is emitted before the `end` of every non-void function.
@@ -442,6 +451,18 @@ Test data includes: array; validated and executed via wasmtime - `enum_in_struct.inf` - Enum-typed struct field: struct literal with an enum field, reading the field and comparing it to a variant; validated and executed via wasmtime +- Extern import test fixtures in `tests/test_data/codegen/wasm/extern_import/` + (tests in `tests/src/codegen/wasm/extern_import.rs`): + - `single_import.inf` - One `external fn` bound to a module via `use … from`; verifies + import occupies index 0 and the local function shifts to index 1; golden WAT validates + import section content and call target + - `multi_import.inf` - Two externs from the same module; both imports at indices 0 and 1; + the local function shifts to index 2; verifies nested call order in the body + - `import_with_locals.inf` - One import and two local functions; all locals shift past the + import; verifies that cross-local calls use local indices and the extern call uses the + import index + - `import_dedup.inf` - Two externs with an identical `(i32) -> i32` signature share one + type entry; verifies import-against-import type deduplication - Loop test fixtures in `tests/test_data/codegen/wasm/loops/`: - `simple_loop.inf` - Basic conditional loops (`loop COND { body }`) with counter patterns - `infinite_loop_break.inf` - Infinite loops (`loop { body }`) with `break` exit diff --git a/core/wasm-codegen/docs/function-calls-lowering.md b/core/wasm-codegen/docs/function-calls-lowering.md index c0811199..8d378ead 100644 --- a/core/wasm-codegen/docs/function-calls-lowering.md +++ b/core/wasm-codegen/docs/function-calls-lowering.md @@ -27,39 +27,69 @@ Inference allows forward references: a caller can appear before its callee in th file. A single-pass compiler that emits `call` instructions as it encounters calls would not yet know the index of a callee defined later. 
-The compiler solves this with a two-stage pre-scan in `lib.rs`: first top-level functions -are indexed, then struct methods are indexed with mangled names. +The compiler solves this with a three-stage index registration pass in `lib.rs` +(`register_function_indices`). Importantly, **imported functions occupy the lowest indices +first**, so all local-function indices must be offset by the import count. -Stage 1 registers all top-level functions: +### Stage 0 — Import reservation (`register_imports`) + +`external fn` declarations bound to a source module via `use … from <module>` are +emitted as WASM function imports. They are registered before any local function so +they occupy indices `0..N`. `set_local_func_base(N)` then seeds the local-function +index counter past the imports. + +```text +register_imports(arena, extern_def_ids, ctx) + extern_name_to_idx["sum"] = 0 (import at index 0) + extern_name_to_idx["neg"] = 1 (import at index 1) + returns N = 2 (import count) + +set_local_func_base(2) (locals now start at 2) +``` + +### Stage 1 — Top-level function registration (`build_func_name_to_idx`) + +Local top-level functions are assigned indices starting at `N` (the import count +returned by Stage 0, passed as `base_idx`): ```text -build_func_name_to_idx(arena, func_def_ids, ctx) - func_name_to_idx["foo"] = 0, ["bar"] = 1, ... +build_func_name_to_idx(arena, func_def_ids, ctx, base_idx=N) + func_name_to_idx["foo"] = N+0, ["bar"] = N+1, ...
``` -Stage 2 registers methods under mangled names (`"{StructName}.{method_name}"`): +### Stage 2 — Method registration (`build_method_name_to_idx`) + +Struct methods are indexed under mangled names (`"{StructName}.{method_name}"`) starting +after all top-level functions: ```text -build_method_name_to_idx(arena, method_defs, ctx, base_idx) - func_name_to_idx["Point.new"] = base_idx + 0 - func_name_to_idx["Point.translate"] = base_idx + 1 +build_method_name_to_idx(arena, method_defs, ctx, base_idx=N+toplevel_count) + func_name_to_idx["Point.new"] = N + toplevel + 0 + func_name_to_idx["Point.translate"] = N + toplevel + 1 method_mangled_names[("Point", "new")] = "Point.new" method_mangled_names[("Point", "translate")] = "Point.translate" ``` -Both stages run before any body is compiled, so all callee names resolve correctly -regardless of definition order in the source file. +All three stages run before any body is compiled, so all callee names resolve correctly +regardless of definition order in the source file and regardless of whether the callee +is an import or a local. ### Diagram ```text -traverse_t_ast_with_compiler +register_function_indices + | + +---> register_imports(extern_def_ids) // Stage 0 + | extern_name_to_idx["sum"] = 0, ... + | returns N = import_count | - +---> build_func_name_to_idx(func_def_ids) - | func_name_to_idx["foo"] = 0, ["bar"] = 1, ... + +---> set_local_func_base(N) // seeds func_idx = N | - +---> build_method_name_to_idx(method_defs, base_idx=N) - | func_name_to_idx["Point.new"] = N+0, ... + +---> build_func_name_to_idx(func_def_ids, base_idx=N) // Stage 1 + | func_name_to_idx["foo"] = N+0, ["bar"] = N+1, ... + | + +---> build_method_name_to_idx(method_defs, base_idx=N+toplevel) // Stage 2 + | func_name_to_idx["Point.new"] = N+toplevel+0, ... | method_mangled_names[("Point","new")] = "Point.new" | +---> visit_function_definition(func_def_ids[0], None) // "foo" @@ -67,8 +97,8 @@ traverse_t_ast_with_compiler +---> ... 
+---> visit_function_definition(method_def_ids[0], Some("Point")) // "Point.new" | - | lower_function_call can look up any index - | regardless of definition order + | lower_function_call / lower_extern_call can look up any index + | regardless of definition order or import vs local ``` ## How Parameter Indices Interlock with Local Indices @@ -207,15 +237,73 @@ Statement::Expression(expression) => { | non-void | RHS of `let` | No | `local.set` consumes the value (different code path) | | non-void | RHS of `return` | No | `return` consumes the value | +## Extern Function Calls + +An `external fn` declaration bound to a source module is an import: it has no +local body to compile, but it does have a WASM function index (assigned by Stage +0) and a WASM type signature derived from the declared Inference parameter and +return types. + +When `lower_function_call` resolves the callee and finds it in +`extern_name_to_idx`, it emits `call <import_idx>` via the same +`Instruction::Call` path used for local functions. The only difference is which +index table the lookup hits. + +### Example + +```inference +external fn sum(a: i32, b: i32) -> i32; +use { sum } from arith; + +pub fn add_three(x: i32) -> i32 { + return sum(x, 3); +} +``` + +After Stage 0, `sum` is at import index `0`; after Stage 1, `add_three` is at +local index `1`. The generated WAT: + +```wat +(module + (type (;0;) (func (param i32 i32) (result i32))) + (type (;1;) (func (param i32) (result i32))) + (import "arith" "sum" (func (;0;) (type 0))) + (func $add_three (;1;) (type 1) (param $x i32) (result i32) + local.get $x + i32.const 3 + call 0 + return + unreachable) + (export "add_three" (func 1))) +``` + +### Import Section Emission + +The import section is emitted in `finish_and_take` between the Type section and +the Function section (the WASM section ordering mandate). It is guarded by +`cov_mark::hit!(wasm_codegen_emit_import_section)` and omitted entirely when +there are no bound externs.
Each entry carries the logical module name, the export +field name, and the type index from `intern_type`. + +### Type Deduplication + +`intern_type` deduplicates function signatures before assigning a type index: an +import and a local function (or two imports) with the same parameter and result +types share one type entry in the type section. This keeps the type section +compact even when multiple externs share a common signature. + ## Supported vs Unsupported Callee Kinds -Three callee forms are now supported: +Four callee forms are now supported: ```inference -// Supported: plain identifier +// Supported: plain identifier (local) let x = foo(1, 2); return bar(); +// Supported: extern call +let y = sum(x, 3); // sum is an external fn + // Supported: associated function call let p = Point::new(1, 2); @@ -289,18 +377,27 @@ resolved until multi-file compilation is implemented (currently `todo!()` in `co | `wasm_codegen_emit_function_params` | 7 | 7 parameters across all functions in `fn_params.inf` | | `wasm_codegen_emit_function_call` | 5 | 5 call sites in `fn_calls.inf` | | `wasm_codegen_emit_self_copy_on_entry` | varies | `mut self` frame copy emitted for each method with mutable receiver | +| `wasm_codegen_emit_import_section` | 1+ | Import section emitted (fires whenever at least one bound `external fn` is present) | +| `wasm_codegen_emit_extern_call` | 1+ | Extern call lowered to `call <import_idx>` (fires in `single_import_test`) | The `fn_params_test` verifies `wasm_codegen_emit_function_params` fires exactly 7 times (matching `fn_params.inf`: 1+1+1+2+2 params). The `fn_calls_test` verifies -`wasm_codegen_emit_function_call` fires exactly 5 times. +`wasm_codegen_emit_function_call` fires exactly 5 times. The `single_import_test` checks +both import-related marks (`wasm_codegen_emit_import_section` and `wasm_codegen_emit_extern_call`) together.
## Related Files -- `core/wasm-codegen/src/compiler.rs` — `build_func_name_to_idx`, `build_method_name_to_idx`, `resolve_function_callee`, `lower_function_call` +- `core/wasm-codegen/src/compiler.rs` — `register_imports`, `build_func_name_to_idx`, `build_method_name_to_idx`, `resolve_function_callee`, `lower_function_call`, `finish_and_take` (import section emission) +- `core/wasm-codegen/src/lib.rs` — `register_function_indices`, `traverse_t_ast_with_compiler`, `collect_emittable_functions` (extern fn routed to imports bucket) - `core/wasm-codegen/src/errors.rs` — `CodegenError` enum -- `core/wasm-codegen/src/lib.rs` — `traverse_t_ast_with_compiler` (where pre-scan is called) - `core/wasm-codegen/README.md` — Crate-level overview and compilation phases - `core/wasm-codegen/docs/local-variables-lowering.md` — Local variable lowering (prerequisite) +- `core/wasm-linker/README.md` — How the linked output is produced from the import-bearing intermediate module +- `tests/test_data/codegen/wasm/extern_import/single_import/single_import.inf` — Minimal one-import fixture +- `tests/test_data/codegen/wasm/extern_import/multi_import/multi_import.inf` — Two imports, index shift +- `tests/test_data/codegen/wasm/extern_import/import_with_locals/import_with_locals.inf` — Import plus two local functions +- `tests/test_data/codegen/wasm/extern_import/import_dedup/import_dedup.inf` — Two same-signature imports sharing one type +- `tests/src/codegen/wasm/extern_import.rs` — Structural and golden tests for import emission - `tests/test_data/codegen/wasm/base/fn_params/fn_params.inf` — Parameter test fixture - `tests/test_data/codegen/wasm/base/fn_calls/fn_calls.inf` — Function call test fixture - `tests/test_data/codegen/wasm/base/method_assoc/method_assoc.inf` — Associated function call fixture diff --git a/core/wasm-codegen/src/compiler.rs b/core/wasm-codegen/src/compiler.rs index f18e5f42..33aa1444 100644 --- a/core/wasm-codegen/src/compiler.rs +++ 
b/core/wasm-codegen/src/compiler.rs @@ -63,7 +63,8 @@ use rustc_hash::FxHashMap; use inference_ast::arena::AstArena; use inference_ast::ids::{BlockId, DefId, ExprId, IdentId, NodeId, StmtId, TypeId}; use inference_ast::nodes::{ - ArgKind, BlockKind, Def, Expr, OperatorKind, SimpleTypeKind, Stmt, TypeNode, UnaryOperatorKind, + ArgData, ArgKind, BlockKind, Def, Expr, OperatorKind, SimpleTypeKind, Stmt, TypeNode, + UnaryOperatorKind, Visibility, }; use inference_type_checker::{ @@ -71,9 +72,9 @@ use inference_type_checker::{ typed_context::TypedContext, }; use wasm_encoder::{ - BlockType as WasmBlockType, CodeSection, ConstExpr, ExportKind, ExportSection, Function, - FunctionSection, GlobalSection, GlobalType, IndirectNameMap, Instruction, MemorySection, - MemoryType, Module, NameMap, NameSection, TypeSection, ValType, + BlockType as WasmBlockType, CodeSection, ConstExpr, EntityType, ExportKind, ExportSection, + Function, FunctionSection, GlobalSection, GlobalType, ImportSection, IndirectNameMap, + Instruction, MemorySection, MemoryType, Module, NameMap, NameSection, TypeSection, ValType, }; use crate::memory::{ @@ -267,6 +268,21 @@ struct LoopContext { loop_exit_depths: Vec, } +/// A single WASM function import emitted for an `external fn`. +/// +/// `module` / `field` are the two-level WASM import name. `module` is the +/// logical, platform-independent module reference (`ExternOrigin::logical_module`, +/// `::`-joined), and `field` is the export field the linker satisfies. `type_idx` +/// indexes the shared [`Compiler::types`] table; identical signatures dedup onto +/// the same entry. Imports occupy WASM function indices `0..N`, ahead of every +/// locally defined function (see [`Compiler::register_imports`]). +#[derive(Debug, Clone)] +struct ImportEntry { + module: String, + field: String, + type_idx: u32, +} + /// Metadata about a function that returns an array type. 
/// /// Populated during `build_func_name_to_idx` so that callers and callees @@ -369,6 +385,14 @@ pub(crate) struct Compiler { module_name: String, /// Maps function keys to their WASM function section indices. func_name_to_idx: FxHashMap, + /// Function imports emitted for `external fn` declarations, in + /// registration order. Each occupies WASM function index `i` for its + /// position `i` in this vector (imports come before all local functions). + imports: Vec, + /// Maps an `external fn` name to its WASM import function index (`0..N`). + /// Calls to an extern lower to `call ` rather than a local + /// function index. Populated alongside [`Self::imports`] during Stage 1. + extern_name_to_idx: FxHashMap, /// Sticky flag: set to `true` when any function requires linear memory. has_memory: bool, /// Maps function keys to their array return type metadata. @@ -443,6 +467,8 @@ impl Compiler { has_main: false, module_name: module_name.to_string(), func_name_to_idx: FxHashMap::default(), + imports: Vec::new(), + extern_name_to_idx: FxHashMap::default(), has_memory: false, func_array_returns: FxHashMap::default(), func_struct_returns: FxHashMap::default(), @@ -491,6 +517,13 @@ impl Compiler { .or_default(); } + /// Borrows the recorded `(spec_name -> [func_idx])` map so a caller can + /// validate it before [`Self::finish_and_take`] consumes the compiler and + /// emits the `inference.spec_funcs` section. + pub(crate) fn spec_func_indices(&self) -> &FxHashMap> { + &self.spec_func_indices_by_spec + } + fn func(&mut self) -> &mut Function { self.func .as_mut() @@ -554,10 +587,137 @@ impl Compiler { Ok(()) } + /// Registers a function import for every `external fn` reachable from this + /// source file, assigning each WASM function index `0..N` ahead of all local + /// functions. + /// + /// For each extern this: + /// 1. 
lowers its declared signature to a WASM `(params, results)` type and + /// dedups it into [`Self::types`] (identical signatures share one entry); + /// 2. records an [`ImportEntry`] carrying the logical module / export field + /// from the Phase 1 provenance ([`TypedContext::extern_origin`]); + /// 3. maps the extern's name to its import function index so call lowering can + /// emit `call `. + /// + /// Externs without provenance (a bare `external fn` with no binding `use`) are + /// skipped: they cannot be emitted as a well-formed two-level import, and + /// analysis rule A024 already gates *calling* an unlinked extern. Returns the + /// number of imports registered, which is the base index for local functions. + /// + /// Must run before [`Self::build_func_name_to_idx`] so local indices follow + /// the imports. + #[allow(clippy::cast_possible_truncation)] + pub(crate) fn register_imports( + &mut self, + arena: &AstArena, + extern_def_ids: &[DefId], + ctx: &TypedContext, + ) -> Result { + for &def_id in extern_def_ids { + let Def::ExternFunction { + name, + args, + returns, + .. + } = &arena[def_id].kind + else { + continue; + }; + let extern_name = arena[*name].name.clone(); + // Resolve provenance by the declaring `DefId`, not by name. Two + // same-named externs can coexist (a bound top-level `f` and an + // unbound spec-inner `f`); a name-keyed lookup cannot tell them apart + // and would bind the unbound declaration to the bound one's origin, + // registering a spurious/duplicate import. The decl-keyed query + // returns `None` for the unbound one, so it is correctly skipped. + let Some(origin) = ctx.extern_origin_by_decl(def_id) else { + continue; + }; + + let params = Self::import_param_types(arena, args, ctx)?; + let results = match returns { + Some(ty_id) => Self::val_type_from_type_id(arena, *ty_id, ctx)? 
+ .into_iter() + .collect::>(), + None => Vec::new(), + }; + let type_idx = self.intern_type(params, results); + + let import_idx = self.imports.len() as u32; + self.imports.push(ImportEntry { + module: origin.logical_module, + field: origin.export_field, + type_idx, + }); + self.extern_name_to_idx.insert(extern_name, import_idx); + } + + Ok(self.imports.len() as u32) + } + + /// Lowers an extern's declared parameter types to WASM value types. An + /// ignored parameter (`external fn f(_: i32)`) still occupies an ABI slot: + /// the call site pushes the argument and the real `.wasm` export declares + /// that parameter, so it is lowered as a real param just like a named or + /// type-only one. This keeps the import signature in lock-step with the + /// validator's `lower_extern_signature`. A `unit` parameter cannot reach + /// this point: the validator rejects it (`LowerSignatureError::UnitParameter`) + /// earlier in the pipeline. + fn import_param_types( + arena: &AstArena, + args: &[ArgData], + ctx: &TypedContext, + ) -> Result, CodegenError> { + let mut params = Vec::with_capacity(args.len()); + for arg in args { + let ty = match &arg.kind { + ArgKind::Named { ty, .. } + | ArgKind::TypeOnly(ty) + | ArgKind::Ignored { ty } => *ty, + // The type-checker now rejects `self` on an extern, so this is + // unreachable from valid source; drop it to match codegen, which + // emits no receiver param for an import. + ArgKind::SelfRef { .. } => continue, + }; + if let Some(val) = Self::val_type_from_type_id(arena, ty, ctx)? { + params.push(val); + } + } + Ok(params) + } + + /// Returns the index of `(params, results)` in [`Self::types`], appending a + /// new entry only when no identical signature is already present. Used so an + /// import and a local function (or two imports) with the same signature share + /// one type entry. 
+ fn intern_type(&mut self, params: Vec<ValType>, results: Vec<ValType>) -> u32 { + if let Some(idx) = self + .types + .iter() + .position(|(p, r)| p == &params && r == &results) + { + #[allow(clippy::cast_possible_truncation)] + return idx as u32; + } + #[allow(clippy::cast_possible_truncation)] + let idx = self.types.len() as u32; + self.types.push((params, results)); + idx + } + + /// Sets the WASM function index at which body compilation begins. Imports + /// occupy `0..base`, so the first locally defined function body is index + /// `base`. Called once after [`Self::register_imports`], before any body is + /// compiled. + pub(crate) fn set_local_func_base(&mut self, base: u32) { + self.func_idx = base; + } + /// Builds the function name-to-WASM-index map from the source file's function definitions. /// /// `base_idx` is the WASM function index assigned to `func_def_ids[0]`. Top-level - /// functions pass 0; spec-originated functions are routed through + /// functions pass the import count `N` (so locals follow the imports); spec-originated + /// functions are routed through /// [`Self::build_func_name_to_idx_with_spec_names`] instead. /// /// Must be called before `visit_function_definition` so that forward references @@ -715,9 +875,9 @@ impl Compiler { arena: &AstArena, ty_id: TypeId, ctx: &TypedContext, - ) -> Option<ValType> { + ) -> Result<Option<ValType>, CodegenError> { match &arena[ty_id].kind { - TypeNode::Simple(SimpleTypeKind::Unit) => None, + TypeNode::Simple(SimpleTypeKind::Unit) => Ok(None), TypeNode::Simple( SimpleTypeKind::Bool | SimpleTypeKind::I8 @@ -727,8 +887,8 @@ impl Compiler { | SimpleTypeKind::I32 | SimpleTypeKind::U32, ) - | TypeNode::Array { .. } => Some(ValType::I32), - TypeNode::Simple(SimpleTypeKind::I64 | SimpleTypeKind::U64) => Some(ValType::I64), + | TypeNode::Array { .. } => Ok(Some(ValType::I32)), + TypeNode::Simple(SimpleTypeKind::I64 | SimpleTypeKind::U64) => Ok(Some(ValType::I64)), TypeNode::Generic { .. } => todo!(), TypeNode::Function { .. 
} => todo!(), TypeNode::QualifiedName { .. } => todo!(), @@ -736,9 +896,15 @@ impl Compiler { TypeNode::Custom(ident_id) => { let name = &arena[*ident_id].name; if ctx.lookup_struct(name).is_some() || ctx.lookup_enum(name).is_some() { - Some(ValType::I32) + Ok(Some(ValType::I32)) } else { - todo!("Unsupported custom type in WASM codegen: {name}") + // The type-checker rejects an unknown type before codegen, so + // this is unreachable from a well-formed pipeline. Returning an + // error rather than `todo!()` keeps a malformed type from + // panicking the compiler (H6 defense-in-depth). + Err(CodegenError::UnsupportedType { + rendered: name.clone(), + }) } } } @@ -818,10 +984,12 @@ impl Compiler { let results: Vec = if is_sret { vec![] } else { - returns - .and_then(|ty_id| Self::val_type_from_type_id(arena, ty_id, ctx)) - .into_iter() - .collect() + match returns { + Some(ty_id) => Self::val_type_from_type_id(arena, ty_id, ctx)? + .into_iter() + .collect(), + None => vec![], + } }; let mut params: Vec = vec![]; @@ -841,7 +1009,7 @@ impl Compiler { match &arg.kind { ArgKind::Named { name, ty, .. } => { cov_mark::hit!(wasm_codegen_emit_function_params); - let vt = Self::val_type_from_type_id(arena, *ty, ctx) + let vt = Self::val_type_from_type_id(arena, *ty, ctx)? .expect("Function parameter type must not be unit"); params.push(vt); let arg_name = arena[*name].name.clone(); @@ -2170,6 +2338,15 @@ impl Compiler { self.lower_expression(arena, *arg_expr_id, ctx, None); } + // An `external fn` call targets its import index (0..N) rather than a + // local function index. Imports never participate in spec-mangled + // lookup, so this probe precedes the free-callee resolution. 
+ if let Some(&import_idx) = self.extern_name_to_idx.get(callee_name) { + cov_mark::hit!(wasm_codegen_emit_extern_call); + self.func().instruction(&Instruction::Call(import_idx)); + return Ok(()); + } + let func_idx = self .resolve_free_callee_idx(callee_name) .ok_or_else(|| CodegenError::UnknownFunction(callee_name.to_owned()))?; @@ -4273,6 +4450,22 @@ impl Compiler { } module.section(&type_section); + // Import section sits between Type and Function (WASM section order). + // Imported functions occupy the lowest function indices, so emitting it + // here is what makes the local `func_idx` base reservation correct. + if !self.imports.is_empty() { + cov_mark::hit!(wasm_codegen_emit_import_section); + let mut import_section = ImportSection::new(); + for import in &self.imports { + import_section.import( + &import.module, + &import.field, + EntityType::Function(import.type_idx), + ); + } + module.section(&import_section); + } + let mut function_section = FunctionSection::new(); for &type_idx in &self.functions { function_section.function(type_idx); diff --git a/core/wasm-codegen/src/errors.rs b/core/wasm-codegen/src/errors.rs index 37eb48ee..1ce03f4c 100644 --- a/core/wasm-codegen/src/errors.rs +++ b/core/wasm-codegen/src/errors.rs @@ -42,4 +42,20 @@ pub(crate) enum CodegenError { outer_spec: String, inner_spec: String, }, + /// A type in a signature has no WASM value-type representation. The + /// type-checker rejects unknown types before codegen, so reaching this is a + /// defense-in-depth failure rather than a normal diagnostic path; emitting + /// an error keeps codegen from `todo!()`-panicking on a malformed type. + #[error("unsupported type in WASM codegen: {rendered}")] + UnsupportedType { rendered: String }, + /// A spec name exceeds the byte cap that both `inference.spec_funcs` + /// decoders enforce (the linker and the Rocq translator). 
Emitting it would + /// produce a `.wasm` artifact that fails its own downstream link/translate + /// step, so codegen refuses up front with an actionable diagnostic. + #[error("spec name is {len} bytes, which exceeds the maximum of {max} bytes: '{name}'")] + SpecNameTooLong { + name: String, + len: usize, + max: usize, + }, } diff --git a/core/wasm-codegen/src/lib.rs b/core/wasm-codegen/src/lib.rs index b37c407d..2f6297e3 100644 --- a/core/wasm-codegen/src/lib.rs +++ b/core/wasm-codegen/src/lib.rs @@ -144,6 +144,20 @@ pub fn codegen( traverse_t_ast_with_compiler(typed_context, &mut compiler, mode)?; } + // Reject any spec name that would overflow the byte cap both + // `inference.spec_funcs` decoders enforce, before the section is emitted. + // Surfacing it here yields a clean codegen diagnostic instead of an + // artifact that fails its own downstream link/translate step. + if let Err(too_long) = spec_section::check_spec_name_lengths(compiler.spec_func_indices()) { + cov_mark::hit!(wasm_codegen_spec_name_too_long); + return Err(CodegenError::SpecNameTooLong { + name: too_long.name, + len: too_long.len, + max: spec_section::MAX_SPEC_NAME_LEN, + } + .into()); + } + // Snapshot `has_main` before `finish_and_take` consumes the compiler: // the section is emitted in a single pass that moves out the recorded // spec map alongside the WASM bytes. @@ -239,19 +253,27 @@ fn traverse_t_ast_with_compiler( /// Stage 1: register every WASM function index up front so forward references /// resolve correctly during body compilation. Index order: -/// regular fns (base 0) → regular methods → spec fns → spec methods. +/// imports (base 0) → regular fns → regular methods → spec fns → spec methods. +/// +/// Imported `external fn`s occupy the lowest WASM function indices, so every +/// local function is shifted by the import count. `set_local_func_base` seeds the +/// body-compilation index counter past the imports to keep it in lockstep with +/// the `func_name_to_idx` entries. 
fn register_function_indices( arena: &AstArena, compiler: &mut Compiler, typed_context: &TypedContext, buckets: &EmittableFunctions, ) -> Result<(), CodegenError> { + let import_count = compiler.register_imports(arena, &buckets.imports, typed_context)?; + compiler.set_local_func_base(import_count); + let toplevel_count = u32::try_from(buckets.funcs.len()).expect("more than u32::MAX top-level functions"); let method_count = u32::try_from(buckets.methods.len()).expect("more than u32::MAX top-level methods"); - compiler.build_func_name_to_idx(arena, &buckets.funcs, typed_context, 0)?; + compiler.build_func_name_to_idx(arena, &buckets.funcs, typed_context, import_count)?; let method_base_idx = compiler.func_idx_after_toplevel(toplevel_count); compiler.build_method_name_to_idx( arena, @@ -260,7 +282,7 @@ fn register_function_indices( method_base_idx, )?; - let spec_func_base = toplevel_count + method_count; + let spec_func_base = import_count + toplevel_count + method_count; let spec_func_indices = compiler.build_func_name_to_idx_with_spec_names( arena, &buckets.spec_funcs, @@ -322,6 +344,10 @@ fn register_function_indices( } struct EmittableFunctions { + /// Top-level `external fn` declarations, emitted as WASM function imports + /// at indices `0..N` ahead of every local function (see + /// [`Compiler::register_imports`]). + imports: Vec, funcs: Vec, methods: Vec<(String, DefId)>, /// Each entry: `(spec_name, def_id)`. @@ -335,12 +361,13 @@ struct EmittableFunctions { visited_spec_names: Vec, } -/// Sorts top-level defs into the four buckets used by Stage 1 registration. +/// Sorts top-level defs into the five buckets used by Stage 1 registration. /// -/// `Def::ExternFunction` is intentionally skipped — extern functions are not currently -/// emitted to the WASM import section (top-level or spec-inner). When extern-fn -/// emission lands, spec-inner externs will need to either join `spec_funcs` or be -/// surfaced in a sibling `_spec_imports` list in the Rocq output. 
+/// Top-level `Def::ExternFunction` declarations land in the `imports` bucket and +/// are emitted as WASM function imports at indices `0..N` ahead of every local +/// function. Spec-inner externs are still skipped — when they are wired through, +/// they will need to either join `spec_funcs` or be surfaced in a sibling +/// `_spec_imports` list in the Rocq output. /// /// In `compile` mode the spec buckets stay empty (specs are stripped). In `proof` /// mode, top-level `Def::Spec.defs` is recursed one level deep to surface inner @@ -352,6 +379,7 @@ fn collect_emittable_functions( mode: CompilationMode, ) -> Result { let mut buckets = EmittableFunctions { + imports: Vec::new(), funcs: Vec::new(), methods: Vec::new(), spec_funcs: Vec::new(), @@ -361,6 +389,7 @@ fn collect_emittable_functions( for &def_id in defs { match &arena[def_id].kind { + Def::ExternFunction { .. } => buckets.imports.push(def_id), Def::Function { .. } => buckets.funcs.push(def_id), Def::Struct { name, methods, .. } => { let struct_name = arena[*name].name.clone(); diff --git a/core/wasm-codegen/src/spec_section.rs b/core/wasm-codegen/src/spec_section.rs index 60552bd7..5913841c 100644 --- a/core/wasm-codegen/src/spec_section.rs +++ b/core/wasm-codegen/src/spec_section.rs @@ -44,6 +44,53 @@ pub const SECTION_NAME: &str = "inference.spec_funcs"; /// `wasm-to-v` decoder so encoder and decoder share a single source of truth. pub const SECTION_VERSION: u32 = 1; +/// Upper bound, in bytes, on a single spec name embedded in the +/// `inference.spec_funcs` payload. +/// +/// Both decoders reject any longer name: the linker +/// (`core/wasm-linker/src/spec_funcs.rs`) and the Rocq translator +/// (`core/wasm-to-v/src/wasm_parser.rs`) each cap at the same value, the +/// latter inheriting it from `validate_rocq_identifier`'s `TooLong` rule. +/// Enforcing the cap here keeps codegen from emitting an artifact that would +/// fail its own downstream link/translate step. 
+pub(crate) const MAX_SPEC_NAME_LEN: usize = 255; + +/// Verifies that every spec name in `map` fits within [`MAX_SPEC_NAME_LEN`]. +/// +/// The encoder writes names verbatim, so an over-long name would produce a +/// `.wasm` artifact that both downstream decoders reject. Checking here lets +/// codegen surface a clean diagnostic instead of deferring the failure to the +/// linker or translator. +/// +/// # Errors +/// +/// Returns the offending name and its byte length when any name exceeds the +/// cap, sorted-first by name for a deterministic message. +pub(crate) fn check_spec_name_lengths( + map: &FxHashMap>, +) -> Result<(), SpecNameTooLong> { + let mut over_long: Vec<&str> = map + .keys() + .filter(|name| name.len() > MAX_SPEC_NAME_LEN) + .map(String::as_str) + .collect(); + over_long.sort_unstable(); + match over_long.first() { + Some(name) => Err(SpecNameTooLong { + name: (*name).to_string(), + len: name.len(), + }), + None => Ok(()), + } +} + +/// A spec name exceeded [`MAX_SPEC_NAME_LEN`] bytes during codegen. +#[derive(Debug, Clone, PartialEq, Eq)] +pub(crate) struct SpecNameTooLong { + pub(crate) name: String, + pub(crate) len: usize, +} + /// Encodes the spec map into the canonical payload bytes. 
pub(crate) fn encode_payload(map: &FxHashMap>) -> Vec { let mut entries: Vec<(&str, &[u32])> = map @@ -137,6 +184,46 @@ mod tests { assert_eq!(payload, vec![1, 2, 1, b'A', 1, 2, 1, b'B', 1, 5]); } + #[test] + fn name_within_cap_passes_check() { + let mut map: FxHashMap> = FxHashMap::default(); + map.insert("a".repeat(MAX_SPEC_NAME_LEN), vec![0]); + assert_eq!(check_spec_name_lengths(&map), Ok(())); + } + + #[test] + fn over_long_name_is_rejected() { + let mut map: FxHashMap> = FxHashMap::default(); + let name = "a".repeat(MAX_SPEC_NAME_LEN + 1); + map.insert(name.clone(), vec![0]); + assert_eq!( + check_spec_name_lengths(&map), + Err(SpecNameTooLong { + name, + len: MAX_SPEC_NAME_LEN + 1, + }) + ); + } + + #[test] + fn reports_first_over_long_name_deterministically() { + let mut map: FxHashMap> = FxHashMap::default(); + // Two names share the over-cap length; the lexicographically smaller + // one must be reported so the diagnostic is stable across hash orders. + let long_b = format!("b{}", "x".repeat(MAX_SPEC_NAME_LEN)); + let long_a = format!("a{}", "x".repeat(MAX_SPEC_NAME_LEN)); + map.insert(long_b, vec![0]); + map.insert(long_a.clone(), vec![1]); + let err = check_spec_name_lengths(&map).expect_err("over-long names must reject"); + assert_eq!(err.name, long_a); + } + + #[test] + fn cap_matches_decoder_contract() { + // Mirrors the cap both `inference.spec_funcs` decoders enforce. 
+ assert_eq!(MAX_SPEC_NAME_LEN, 255); + } + #[test] fn payload_starts_with_version_byte() { let map: FxHashMap> = FxHashMap::default(); diff --git a/core/wasm-linker/Cargo.toml b/core/wasm-linker/Cargo.toml new file mode 100644 index 00000000..8d381fe4 --- /dev/null +++ b/core/wasm-linker/Cargo.toml @@ -0,0 +1,22 @@ +[package] +name = "inference-wasm-linker" +version = { workspace = true } +edition = { workspace = true } +license = { workspace = true } +homepage = { workspace = true } +repository = { workspace = true } +description = "Static-merge linker: folds external `.wasm` function bodies into the main module so no cross-module imports remain" +keywords = ["compiler", "webassembly", "linker", "wasm"] +categories = ["compilers", "wasm"] + +[dependencies] +wasm-encoder = "0.249.0" +inf-wasmparser.workspace = true +thiserror.workspace = true + +[dev-dependencies] +wat = "1.225.0" +wasm-encoder = "0.249.0" +inf-wasmparser = { workspace = true } +inference-wasm-to-v-translator = { workspace = true } +rustc-hash = { workspace = true } diff --git a/core/wasm-linker/README.md b/core/wasm-linker/README.md new file mode 100644 index 00000000..ad5fe057 --- /dev/null +++ b/core/wasm-linker/README.md @@ -0,0 +1,313 @@ +# inference-wasm-linker + +Static-merge linker for the Inference compiler: folds external `.wasm` function +bodies into the main module so no cross-module imports remain in the output. + +## Overview + +When an Inference program declares `external fn` bindings and calls them, the +compiler emits the main module with `(import …)` entries — one per external +function — at the lowest function indices. `inference-wasm-linker` consumes that +intermediate module plus the resolved external `.wasm` binaries and produces a +**single self-contained module** with those imports satisfied and removed. +The result has no dangling cross-module imports and flows directly into +`wasm-to-v` for Rocq translation. + +This approach mirrors `wasm-ld`: compile first, link second. 
Keeping the link +pass in a separate crate makes it testable in isolation and reusable for the +C-library half of issue #9. + +## How the Merge Works + +For each import in the main module the linker performs these steps: + +```text +1. Find which external module exports a function of that name +2. Compute the transitive closure of that export inside its source module + (the functions it calls, recursively, plus any helpers) +3. Classify the closure's feasibility tier (A, B, or C — see below) +4. Dedup the closure's function types into the output type section +5. Append the closure's bodies after the main module's local functions, + rewriting every index-bearing instruction into the unified index space +6. Remove the satisfied import and redirect the main module's calls + from the old import index onto the merged body's new index +``` + +### Index Space After Merging + +The output module defines a single function index space: + +```text +[0 .. main_local_count) main module's local functions (imports removed) +[main_local_count .. total) merged external functions, in closure order +``` + +Every `call`, `ref.func`, and `call_indirect` type index in all copied bodies is +rewritten through the `rewrite` module to land in this space. + +### Operator Re-encoding + +The `rewrite` module walks each copied body's operator stream and re-encodes only +the index-bearing operators (`Call`, `ReturnCall`, `RefFunc`, `CallIndirect`, +`ReturnCallIndirect`, block/loop/if when carrying a function type index). Every +other operator is copied verbatim from the source bytes, so the output is +byte-identical to the input wherever no index changes. + +### Type Deduplication + +Two functions with identical signatures share one type entry in the output type +section. The deduplication key is a byte-packed encoding of the parameter and +result value types. This prevents the type section from growing with duplicate +entries as more external closures are merged in. 
Only **type-section entries** +(signature declarations) are deduplicated — function bodies are never +deduplicated or dropped by this step. Unreachable functions are excluded earlier +by the transitive closure walk, before any output index is committed. + +### Name Section + +The linker preserves the `name` custom section so the Rocq translator emits +named `Definition`s rather than opaque `func_<idx>` placeholders: + +- Main module local functions keep their source debug names (re-indexed onto the + import-free output space). +- Every merged external function is named under its source's logical module, + using a `module.field` form: + - A merged closure **root** is named `<module>.<field>` — a closure that + satisfies import `sum` bound under logical module `mathlib` becomes + `mathlib.sum`. + - A merged **inner callee** the source module named keeps that name, prefixed: + `mathlib.helper`. + - A **nameless** inner callee (an external stripped of its name section) is + given a deterministic fallback derived from its output index, prefixed the + same way: `mathlib.func_<idx>`. +- If no function carries a name, the name section is omitted entirely. + +The module prefix is collision-free by construction: two externals bound under +different logical modules may export — and internally call — functions of the +same field, and an unprefixed scheme would let those names collide in the name +section, forcing the Rocq translator down its index-suffix disambiguation +(`sum` vs `sum_2`), which is index-dependent and shifts across merges. The `.` +separator matches Inference's `Type.method` naming convention. The Rocq +translator (`core/wasm-to-v/src/rocq_names.rs`) sanitizes every non-alphanumeric +to `_`, so `mathlib.sum` reads as `Definition mathlib_sum` in the `.v`. A residual +name collision after sanitization (e.g. 
two distinct logical modules that +sanitize to the same identifier) is still disambiguated by the translator's index +suffix; the module prefix removes the common case rather than every possible one. + +## Feasibility Tiers + +Whether an external function can be merged depends on what its transitive closure +touches. The tier model ships the common cases first and gates the hard case +behind a clear error rather than attempting an unsound merge. + +### Tier A — Pure Functions + +No memory accesses, no globals, no data segments, no tables. Examples: `sum`, +`sub`, `abs`, any function that only reads its parameters and does arithmetic. + +Merge cost: copy the body, dedup the type, rewrite `call` targets. No address +relocation needed. + +```wat +;; Tier A: pure arithmetic — trivially mergeable +(func $sum (param i32 i32) (result i32) + local.get 0 + local.get 1 + i32.add) +``` + +### Tier B — Memory Through Caller-Passed Pointers + +The closure loads or stores through addresses the caller supplies, but defines no +static data of its own, no mutable globals, and no table or element entries. +Examples: `sort(ptr, len)`, `memcpy(dst, src, n)`. + +Merge cost: same as Tier A. The function shares the single linear memory the main +module owns; no address relocation is required because all addresses are +caller-supplied at runtime. + +```wat +;; Tier B: writes to a caller-supplied address — mergeable +(func $store_at (param $addr i32) (param $val i32) + local.get $addr + local.get $val + i32.store) +``` + +### Tier C — Own Static Data, Globals, or Tables + +The closure carries its own baked-in data segments (lookup tables, string +constants), defines or accesses module globals (per-module mutable state), or +uses table and element entries for indirect calls. Merging these without +relocation metadata would silently produce an incorrect module because the +absolute addresses and per-module state would alias unpredictably with the main +module. 
+ +The linker rejects Tier-C inputs with `LinkError::RequiresRelocatableBuild` and +a list of specific reasons. Build the external module with a +relocatable/position-independent toolchain to enable future Tier-C support. + +```text +error: external function `lookup` requires a relocatable build: + defines or initializes its own static data segments +``` + +### Classification Logic + +The `tier` module collects "Tier-C reasons" by inspecting the parsed module +structure and the closure's `ClosureEffects`: + +| Signal | Tier-C reason | +|--------|---------------| +| `module.data_count > 0` or closure uses `memory.init` / `data.drop` | own static data segments | +| `!module.globals.is_empty()` or closure uses `global.get` / `global.set` | defines or accesses module globals | +| `!module.tables.is_empty()` or `module.element_count > 0` or closure uses `call_indirect` / `table.*` / `ref.func` / `elem.drop` | uses a table or element segment | + +If no Tier-C reasons are collected, the closure is Tier B when any body accesses +memory (load/store/copy/fill/size/grow), and Tier A otherwise. + +## Entry Point + +```rust +use inference_wasm_linker::{link, LinkError}; + +let unified: Vec<u8> = link( + main_wasm, + &[("arith", arith_wasm), ("crypto", crypto_wasm)], +)?; +``` + +`link` takes the main module bytes and a slice of `(logical_module, bytes)` +pairs — each external is tagged with the logical module name codegen emitted for +it. It returns the unified module bytes, or a `LinkError` if any module fails to +parse, a merged closure reaches a transitive host import, or a closure is Tier C. + +Every import in the main module must be satisfiable by one of the supplied +external modules. The match is by **both** the logical module name and the export +field name: `find_export` (in `src/merge.rs`) only considers externals whose +`logical_module` equals the import's module, then matches the field. 
So an import +`("arith", "sum")` binds to the `sum` export of the external tagged `arith` — not +to a same-named `sum` exported by a different module. + +## Error Reference + +| Error | Meaning | +|-------|---------| +| `LinkError::Parse(msg)` | A module's bytes could not be parsed as valid WASM | +| `LinkError::UnsatisfiedImport { field }` | No external module exports a function named `field` | +| `LinkError::TransitiveHostImport { module, field }` | A body inside the merged closure calls one of the external module's own imports; there is no body to copy for it | +| `LinkError::RequiresRelocatableBuild { field, reasons }` | The closure for `field` is Tier C; `reasons` lists the specific signals | +| `LinkError::UnsupportedConstruct(msg)` | A body contains an unmergeable construct: any floating-point instruction (diagnosed with the exact mnemonic, e.g. `floating-point instruction 'f32.add' is not supported`), a float or `v128` value type in a merged signature/local/block type, a reference-typed value, a tail call (`return_call`/`return_call_indirect`), a sign-extension op, a segment-indexed table op (`table.init`/`elem.drop`/`table.copy`), a verification-only non-det or uzumaki opcode in an external body, or the external module importing its environment (non-function imports). Also raised when the main module carries a section the merge cannot preserve: a start function, a table section, non-function imports, or data/element segments. The message names the specific construct. | +| `LinkError::UnsupportedWasmFeature { module, details }` | The external module is well-formed WASM but uses a feature outside the supported subset: any floating-point type or instruction, sign-extension, saturating float-to-int, reference types, SIMD, atomics, exceptions, `memory64`, multi-memory, multi-value, GC, or tail calls. The `details` field carries the validator's feature-named diagnostic. 
| + +## Supported Subset + +The linker accepts only the following WebAssembly feature set (see `SUPPORTED_WASM_FEATURES` in `src/lib.rs`): + +- Integer core: `i32`/`i64` value types, all integer arithmetic, comparisons, loads/stores, and the three integer width conversions (`i32.wrap_i64`, `i64.extend_i32_s/u`). +- Mutable globals and bulk memory (`memory.copy`/`memory.fill`). + +Rejected at the feature gate (external modules using any of these produce `UnsupportedWasmFeature`): + +- **Floats** — `f32`/`f64` value types in any signature, local, or global; any float instruction. The Inference language has no `f32`/`f64` types and the Rocq translator models none. +- **Sign-extension** (`i32.extend8_s`, `i64.extend32_s`, etc.) — the Rocq translator has no lowering. +- **Saturating float-to-int** (`i32.trunc_sat_f32_s`, etc.) — the Rocq translator has no lowering. +- Reference types, SIMD, atomics/threads, exceptions, `memory64`, multi-memory, multi-value, GC, tail calls. + +The safety allow-list (`src/safety.rs`) provides an independent per-opcode backstop. It additionally rejects, as `UnsupportedConstruct`: + +- Tail calls (`return_call`/`return_call_indirect`) — the Rocq translator has no lowering. +- Segment-indexed table ops (`table.init`/`elem.drop`/`table.copy`) — carry element segments the merge cannot relocate, and the Rocq translator has no lowering. +- Float instructions that reach the allow-list from the main-module re-encode path (which bypasses the feature gate), diagnosed with the exact mnemonic. +- Verification-only constructs (`forall`/`exists`/`assume`/`unique` blocks, `i32.uzumaki`/`i64.uzumaki`) in an external body — they have no executable semantics. + +## Current Limitations + +- Only Tier-A and Tier-B external functions merge. Tier-C inputs produce a clear + `RequiresRelocatableBuild` error until a follow-on adds relocation metadata + support. 
+- An external module that itself imports its host environment (non-function + imports — memory, global, tag) is rejected as `UnsupportedConstruct`. A module + importing only other functions from its host is rejected as + `TransitiveHostImport` when the closure reaches one of those imports. +- Reference-typed values (`funcref`, `externref`) and `v128` in merged signatures or bodies + are rejected as `UnsupportedConstruct`. The Inference codegen output uses only `i32`/`i64`, + so this limit does not affect Inference-generated main modules. +- The main module must not declare a start function, a table section, data or element + segments, or non-function imports — the static merge does not preserve these sections, so + each is rejected up front rather than silently dropped. Inference codegen emits none of + them; the guards apply to hand-built or third-party main modules fed to the public `link()`. +- One `.wasm` library version per logical name. Multi-version resolution is + deferred to the manifest layer (issue #96). 
+ +## Module Organization + +| File | Responsibility | +|------|---------------| +| `src/lib.rs` | Public API (`link`, `LinkError`), crate-level documentation | +| `src/parse.rs` | `ParsedModule` — section-by-section owned representation; `ParsedModule::parse` | +| `src/closure.rs` | `compute` — transitive closure via BFS; `ClosureEffects` for tier classification | +| `src/tier.rs` | `classify` — Tier A/B/C feasibility decision | +| `src/merge.rs` | `Plan::build` + `Plan::emit` — the full merge pass; index allocation, type dedup, body re-encoding, name section | +| `src/rewrite.rs` | `reencode_body` — operator-level re-encoding under a new index space | +| `tests/link.rs` | Integration tests: Tier A, Tier B, Tier C rejection, transitive closure, type dedup, name section, multiple externals, diamond closure | + +## Testing + +The integration tests in `tests/link.rs` build all fixtures from inline WAT via +the `wat` crate and assert on the linked module structure via `inf-wasmparser`: + +```bash +cargo test -p inference-wasm-linker +``` + +Test coverage includes: + +- **Tier A** — two pure functions (`sum`, `sub`) merged from one external +- **Tier A call targets** — `call` operands in the main body repoint to merged indices +- **Name section** — merged closure roots named after satisfied import fields; main names survive +- **Type dedup** — shared `(i32,i32)->i32` signature collapses to one type entry +- **Transitive closure** — `sum` delegates to an unexported `add_impl`; both are merged +- **Dead-code exclusion** — an unreferenced `unused` function is not merged +- **Tier B** — `store_at` writes to a caller address; merge succeeds; memory export survives +- **Tier C (data segment)** — `lookup` using `memory.init` is rejected with a data-segment reason +- **Tier C (global)** — `counter` accessing a module global is rejected with a global reason +- **Tier C (indirect call)** — `call_indirect` use is rejected with a table/element reason +- **Multiple externals** — 
`sum` from one library and `sub` from another; both satisfied +- **Unsatisfied import** — missing `sub` fails with `UnsatisfiedImport` +- **No-import passthrough** — self-contained module links without modification +- **Transitive host import** — a closure body that calls its own module's import is rejected +- **Body re-encoding** — locals, value-typed blocks, mixed types, `return_call`, `call_indirect` +- **Diamond closure** — two roots sharing one internal callee; merged exactly once +- **Main globals** — main module globals and global exports survive the merge +- **Environment import** — external module importing its host environment is rejected +- **Adversarial robustness** — a hand-seeded corpus of malformed/adversarial + externals (one per confirmed Issue #9 robustness-audit defect) plus a + deterministic byte-mutation sweep is fed through `link` by + `adversarial_corpus_never_panics_and_only_emits_valid_modules`, asserting the + contract on every input: `link` returns `Err` **or** a validator-clean module, + and never panics, hangs, or emits a silently-invalid artifact + +## Fuzzing + +A coverage-guided `cargo-fuzz` target over `link` lives in `fuzz/`, a crate +detached from the main workspace (so `cargo build`/`cargo test` never touch it). +`cargo-fuzz` and nightly are not part of the default build; where they are +available: + +```bash +cargo install cargo-fuzz +cargo +nightly fuzz run link +``` + +The deterministic property test above mirrors the fuzzer's invariant and seed +corpus, so the seam is exercised under stable `cargo test` even without +`cargo-fuzz`. See `fuzz/README.md` for details. 
+ +## Related Resources + +- `core/wasm-codegen` — emits the intermediate module with `(import …)` entries consumed by this crate +- `core/inference/src/lib.rs` — driver entry points (`codegen`, `link`, `wasm_to_v`) +- Master plan: `.claude/docs/issues/9/master_plan.md` — design decisions and phase scope +- [WebAssembly binary format](https://webassembly.github.io/spec/core/binary/index.html) — section ordering, index spaces +- [WASM name custom section](https://github.com/WebAssembly/extended-name-section/blob/main/proposals/extended-name-section/Overview.md) — function debug names diff --git a/core/wasm-linker/fuzz/Cargo.toml b/core/wasm-linker/fuzz/Cargo.toml new file mode 100644 index 00000000..412a5bba --- /dev/null +++ b/core/wasm-linker/fuzz/Cargo.toml @@ -0,0 +1,33 @@ +[package] +name = "inference-wasm-linker-fuzz" +version = "0.0.0" +edition = "2024" +publish = false + +# Detach this crate from the parent Inference workspace. `cargo fuzz` requires +# a `cargo-fuzz`/nightly toolchain that is not part of the default build, so the +# fuzz target must never be pulled into `cargo build`/`cargo test`. The empty +# `[workspace]` table makes this directory its own workspace root; the parent's +# `core/*` glob only matches direct children of `core/`, so `core/wasm-linker/fuzz` +# is already excluded — this is belt-and-braces. +[workspace] + +[package.metadata] +cargo-fuzz = true + +[dependencies] +libfuzzer-sys = "0.4" +inference-wasm-linker = { path = ".." } +inf-wasmparser = { path = "../../../tools/inf-wasmparser" } + +# Each fuzz target is its own binary. `test = false`/`doc = false` keep the +# harness out of the normal test/doc build path. 
+[[bin]] +name = "link" +path = "fuzz_targets/link.rs" +test = false +doc = false +bench = false + +[profile.release] +debug = 1 diff --git a/core/wasm-linker/fuzz/README.md b/core/wasm-linker/fuzz/README.md new file mode 100644 index 00000000..a5d4cccc --- /dev/null +++ b/core/wasm-linker/fuzz/README.md @@ -0,0 +1,57 @@ +# `inference-wasm-linker` fuzz targets + +Coverage-guided fuzzing over the static-merge linker's public entry point, +[`inference_wasm_linker::link`]. Under the Issue #9 threat model the external +`.wasm` bytes handed to `link` are arbitrary / third-party / adversarial, so the +linker must never panic, hang, or out-of-memory on any input, and a successful +merge must always produce a structurally valid module. + +This crate is **detached from the main Inference workspace** (it declares its own +`[workspace]` table) because `cargo-fuzz` and a nightly toolchain are not part of +the default build. `cargo build` / `cargo test` at the repo root never touch it. + +## Targets + +- **`link`** — splits each input into a main module plus a list of externals and + feeds them to `link`. A panic/abort is a crash; an `Ok` whose merged bytes fail + `inf_wasmparser::validate` is also a crash (a silently-invalid merged artifact + is the worst-case outcome for the verification pipeline). + +## Running + +```sh +cargo install cargo-fuzz +cargo +nightly fuzz run link core/wasm-linker/fuzz/seeds/link +``` + +## Seed corpus + +`seeds/link/` holds a committed seed corpus of the audit reproductions — the +round-2 control-flow-join (C-1), param-nulling-arithmetic (C-2), call-laundering +(C-3), memory64 (C-4), deep-nesting (H-3), over-declared-locals (M-1), and +main-data-segment (M-2) cases, plus a positive control that must merge. Each seed +imports from the empty module `""` so the target's first-external binding +satisfies it and the seed reaches the real closure / provenance / merge logic. 
+ +The seeds are reproducible and continuously verified by two tests in +`core/wasm-linker/tests/fuzz_seeds.rs`: + +- `regenerate_fuzz_seeds` (`#[ignore]`d) rebuilds the corpus from source — + `cargo test -p inference-wasm-linker --test fuzz_seeds regenerate -- --ignored`; +- `committed_fuzz_seeds_reach_link_cleanly` (runs on every `cargo test`) replays + each committed seed through this target's exact wire-format `split` and module + rotation, asserting it never panics, never yields a silently-invalid `Ok`, and + that each round-2 seed still reaches the specific guard it was built to exercise. + +The same reproductions are also carried inline by the property test +`adversarial_corpus_never_panics_and_only_emits_valid_modules` in +`core/wasm-linker/tests/link.rs`, so the seam runs under stable `cargo test` even +without `cargo-fuzz`. + +## Relationship to the regression suite + +The fuzzer is the *generative* guard; the integration tests in +`core/wasm-linker/tests/link.rs` are the *deterministic* guard. Every confirmed +robustness-audit issue (round-1 C1–C4 / H1–H26 / L1–L2 and round-2 C-1–C-4 / +H-1–H-4 / M-1–M-2 / L-1) has a hand-written regression test asserting a clean +outcome; the fuzzer exists to surface the *next* such defect before it ships. diff --git a/core/wasm-linker/fuzz/fuzz_targets/link.rs b/core/wasm-linker/fuzz/fuzz_targets/link.rs new file mode 100644 index 00000000..54c1d285 --- /dev/null +++ b/core/wasm-linker/fuzz/fuzz_targets/link.rs @@ -0,0 +1,106 @@ +//! libFuzzer harness over [`inference_wasm_linker::link`]. +//! +//! The static-merge linker consumes the codegen-produced "main" module plus one +//! or more **resolved external `.wasm` binaries**, which under the Issue #9 +//! threat model are arbitrary / third-party / adversarial bytes. The robustness +//! contract is absolute: `link` must **never** panic, hang, or out-of-memory on +//! any input — every failure is a returned [`inference_wasm_linker::LinkError`]. +//! +//! 
This target stresses that contract directly. Each fuzzer input is split into +//! a main module and a sequence of externals; the harness feeds them to `link` +//! and lets libFuzzer treat any panic / abort as a crash. It additionally +//! asserts the *soundness* half of the contract — when `link` returns `Ok`, the +//! merged bytes must pass the in-tree WASM validator, so a silently-invalid +//! merged artifact (the worst-case outcome for a verification toolchain) is also +//! a fuzzer crash rather than a persisted bad module. +//! +//! ## Running +//! +//! `cargo-fuzz` and a nightly toolchain are required and are intentionally *not* +//! part of the default workspace (this crate declares its own `[workspace]`): +//! +//! ```text +//! cargo install cargo-fuzz +//! cargo +nightly fuzz run link +//! ``` +//! +//! ## Seed corpus +//! +//! A committed seed corpus of the audit reproductions (the round-2 +//! control-flow-join / param-nulling / call-laundering / memory64 / deep-nesting +//! / over-declared-locals / main-data-segment cases, plus a positive control) +//! lives at `core/wasm-linker/fuzz/seeds/link/`. Start the fuzzer from it for +//! fast coverage: +//! +//! ```text +//! cargo +nightly fuzz run link core/wasm-linker/fuzz/seeds/link +//! ``` +//! +//! Those seeds are reproducible: `regenerate_fuzz_seeds` in +//! `core/wasm-linker/tests/fuzz_seeds.rs` rebuilds them from source, and +//! `committed_fuzz_seeds_reach_link_cleanly` (run on every stable `cargo test`) +//! replays each through this target's exact `split` + module rotation, asserting +//! the same panic-free / `Ok ⇒ valid` invariant and that each round-2 seed still +//! reaches its intended rejection. The broader property test +//! `adversarial_corpus_never_panics_and_only_emits_valid_modules` in +//! `core/wasm-linker/tests/link.rs` carries the same reproductions inline, so the +//! seam is exercised even where `cargo-fuzz` is unavailable. 
+ +#![no_main] + +use libfuzzer_sys::fuzz_target; + +/// Splits the fuzzer-supplied bytes into a main module and a list of externals. +/// +/// The wire format is deliberately simple so the structure-aware fuzzer can +/// reach the linker quickly: a leading byte `n` (reduced modulo 5 to `0..=4`) is the +/// external count, then `n` length-prefixed (`u16` little-endian) external +/// blobs, then the remainder as the main module. Truncated inputs degrade +/// gracefully — a length that overruns the buffer just yields the rest of it. +fn split(data: &[u8]) -> (Vec<u8>, Vec<Vec<u8>>) { + let Some((&count_byte, rest)) = data.split_first() else { + return (Vec::new(), Vec::new()); + }; + let count = (count_byte % 5) as usize; + let mut externals = Vec::with_capacity(count); + let mut cursor = rest; + for _ in 0..count { + if cursor.len() < 2 { + break; + } + let len = u16::from_le_bytes([cursor[0], cursor[1]]) as usize; + cursor = &cursor[2..]; + let take = len.min(cursor.len()); + externals.push(cursor[..take].to_vec()); + cursor = &cursor[take..]; + } + (cursor.to_vec(), externals) +} + +fuzz_target!(|data: &[u8]| { + let (main, externals) = split(data); + + // Pair each external, by index rotation, with one of a small set of logical + // module names. Codegen records imports as `(module, field)`, so resolution + // keys on the module; exercising several names probes the binding path + // (C4 / AmbiguousImport) rather than only the all-empty-name case. + let module_names = ["", "mathlib", "crypto::sha256", "a"]; + let pairs: Vec<(&str, &[u8])> = externals + .iter() + .enumerate() + .map(|(i, bytes)| (module_names[i % module_names.len()], bytes.as_slice())) + .collect(); + + match inference_wasm_linker::link(&main, &pairs) { + // A returned error is the contractually-correct outcome for malformed or + // unsupported input. Nothing more to check. + Err(_) => {} + // A successful merge must be a structurally valid module.
A silently + // invalid merged artifact is the worst-case failure for the verification + // pipeline, so treat it as a fuzzer crash. + Ok(merged) => { + inf_wasmparser::validate(&merged) + .expect("link returned Ok but the merged module fails WASM validation"); + } + } +}); diff --git a/core/wasm-linker/fuzz/seeds/link/c1_control_flow_join b/core/wasm-linker/fuzz/seeds/link/c1_control_flow_join new file mode 100644 index 00000000..b2be844b Binary files /dev/null and b/core/wasm-linker/fuzz/seeds/link/c1_control_flow_join differ diff --git a/core/wasm-linker/fuzz/seeds/link/c2_param_nulling_arith b/core/wasm-linker/fuzz/seeds/link/c2_param_nulling_arith new file mode 100644 index 00000000..4285857c Binary files /dev/null and b/core/wasm-linker/fuzz/seeds/link/c2_param_nulling_arith differ diff --git a/core/wasm-linker/fuzz/seeds/link/c2b_add_side_cancellation b/core/wasm-linker/fuzz/seeds/link/c2b_add_side_cancellation new file mode 100644 index 00000000..e7bd1423 Binary files /dev/null and b/core/wasm-linker/fuzz/seeds/link/c2b_add_side_cancellation differ diff --git a/core/wasm-linker/fuzz/seeds/link/c3_call_laundered b/core/wasm-linker/fuzz/seeds/link/c3_call_laundered new file mode 100644 index 00000000..edddf911 Binary files /dev/null and b/core/wasm-linker/fuzz/seeds/link/c3_call_laundered differ diff --git a/core/wasm-linker/fuzz/seeds/link/c4_memory64 b/core/wasm-linker/fuzz/seeds/link/c4_memory64 new file mode 100644 index 00000000..8fb8b341 Binary files /dev/null and b/core/wasm-linker/fuzz/seeds/link/c4_memory64 differ diff --git a/core/wasm-linker/fuzz/seeds/link/h3_deep_nesting b/core/wasm-linker/fuzz/seeds/link/h3_deep_nesting new file mode 100644 index 00000000..8dc5174f Binary files /dev/null and b/core/wasm-linker/fuzz/seeds/link/h3_deep_nesting differ diff --git a/core/wasm-linker/fuzz/seeds/link/m1_over_declared_locals b/core/wasm-linker/fuzz/seeds/link/m1_over_declared_locals new file mode 100644 index 00000000..c043ff46 Binary files /dev/null 
and b/core/wasm-linker/fuzz/seeds/link/m1_over_declared_locals differ diff --git a/core/wasm-linker/fuzz/seeds/link/m2_main_data_segment b/core/wasm-linker/fuzz/seeds/link/m2_main_data_segment new file mode 100644 index 00000000..6690fa92 Binary files /dev/null and b/core/wasm-linker/fuzz/seeds/link/m2_main_data_segment differ diff --git a/core/wasm-linker/fuzz/seeds/link/pure_control_merges b/core/wasm-linker/fuzz/seeds/link/pure_control_merges new file mode 100644 index 00000000..7eb55d00 Binary files /dev/null and b/core/wasm-linker/fuzz/seeds/link/pure_control_merges differ diff --git a/core/wasm-linker/src/closure.rs b/core/wasm-linker/src/closure.rs new file mode 100644 index 00000000..c4315b1a --- /dev/null +++ b/core/wasm-linker/src/closure.rs @@ -0,0 +1,344 @@ +//! Transitive closure of a satisfied import. +//! +//! Given the function an external module exports to satisfy a main-module +//! import, this computes the set of *everything that function transitively +//! depends on* inside its own module: the functions it calls (directly or +//! indirectly), and — recorded for tier classification — whether any body in +//! the closure touches memory, globals, tables, or data/element segments. +//! +//! Only locally-defined functions enter the closure. If a closure body calls +//! one of the external module's *own* imports, that is surfaced so the linker +//! can reject it: a static merge cannot satisfy a transitive host import. + +use std::collections::{BTreeSet, VecDeque}; + +use inf_wasmparser::{BinaryReader, FunctionBody, Operator}; + +use crate::parse::ParsedModule; +use crate::safety::{check_operator, opens_control_frame, MAX_CONTROL_DEPTH}; +use crate::LinkError; + +/// What a closure's bodies touch, used by tier classification. +#[derive(Debug, Default, Clone)] +pub(crate) struct ClosureEffects { + /// Any body reads or writes linear memory (load/store/copy/fill/size/grow). + pub uses_memory: bool, + /// Any body grows linear memory (`memory.grow`). 
Tracked separately so the + /// merge can reconcile growth against the reconciled output memory maximum. + pub uses_memory_grow: bool, + /// Any body reads or writes a global. + pub uses_globals: bool, + /// Any body refers to a data segment (`memory.init` / `data.drop`). + pub uses_data_segments: bool, + /// Any body performs an indirect call or otherwise touches the table / + /// element space (`call_indirect`, `table.*`, `ref.func`, `elem.drop`). + pub uses_tables: bool, +} + +/// The result of closing over an exported function. +#[derive(Debug, Clone)] +pub(crate) struct Closure { + /// Local function indices to copy, in ascending order (deterministic). + pub local_func_indices: Vec<u32>, + pub effects: ClosureEffects, +} + +/// Computes the transitive closure of the function at `root_func_idx` inside +/// `module`. +/// +/// # Errors +/// +/// Returns [`LinkError::TransitiveHostImport`] if the closure reaches one of +/// the module's own imported functions — a static merge has no body to copy +/// for it. +pub(crate) fn compute( + module: &ParsedModule, + root_func_idx: u32, +) -> Result<Closure, LinkError> { + let import_count = module.local_func_base(); + let mut visited: BTreeSet<u32> = BTreeSet::new(); + let mut queue: VecDeque<u32> = VecDeque::new(); + let mut effects = ClosureEffects::default(); + + queue.push_back(root_func_idx); + + while let Some(func_idx) = queue.pop_front() { + if func_idx < import_count { + // The root export is guaranteed local by the caller; reaching an + // import here means a body inside the closure called one.
+ let import = &module.imported_funcs[func_idx as usize]; + return Err(LinkError::TransitiveHostImport { + module: import.module.clone(), + field: import.field.clone(), + }); + } + if !visited.insert(func_idx) { + continue; + } + + let local = module + .local_funcs + .get((func_idx - import_count) as usize) + .ok_or_else(|| { + LinkError::Parse(format!( + "function body references function index {func_idx}, which is out of range" + )) + })?; + scan_body(&local.body, &mut effects, |callee| { + queue.push_back(callee); + })?; + } + + Ok(Closure { + local_func_indices: visited.into_iter().collect(), + effects, + }) +} + +/// Walks a function body's operators, recording effects and reporting every +/// directly-called function index through `on_call`. +/// +/// Every operator is gated through the fail-closed allow-list +/// ([`check_operator`]): an operator the static merge does not model — an +/// atomic, a SIMD op, an exception-handling instruction, a typed reference, a +/// multi-memory access — is rejected here, before its closure is committed, +/// rather than copied verbatim into a structurally-invalid output. +fn scan_body( + body: &[u8], + effects: &mut ClosureEffects, + mut on_call: impl FnMut(u32), +) -> Result<(), LinkError> { + let reader = BinaryReader::new(body, 0); + let func_body = FunctionBody::new(reader); + let ops = func_body + .get_operators_reader() + .map_err(|e| LinkError::Parse(e.to_string()))?; + + let mut control_depth: usize = 0; + for op in ops { + let op = op.map_err(|e| LinkError::Parse(e.to_string()))?; + + // Bound structured-control-flow nesting so the downstream wasm-to-v + // translator (which recurses one frame per level) cannot be driven to + // stack exhaustion by an adversarially deep external body. An `End` + // closes the innermost frame; a `block`/`loop`/`if`/non-det op opens a + // new one. 
This scan gates external bodies; the main module's body is + // bounded by the matching cap in `crate::rewrite::reencode_body`, so an + // over-nested body is kept out of the merged module whatever its origin. + if opens_control_frame(&op) { + control_depth += 1; + // Reject at `>=` so a body nested exactly `MAX_CONTROL_DEPTH` deep is + // rejected by *both* this scan and the wasm-to-v translator, which + // itself rejects at `depth >= 256`. With a strict `>` the two caps + // disagreed: a body at exactly the cap would link here but then abort + // the `-v` translator that admits only `depth < 256`. + if control_depth >= MAX_CONTROL_DEPTH { + return Err(LinkError::UnsupportedConstruct(format!( + "external function body nests structured control flow at least {MAX_CONTROL_DEPTH} levels deep" + ))); + } + } else if matches!(op, Operator::End) { + control_depth = control_depth.saturating_sub(1); + } + + let effect = check_operator(&op)?; + effects.uses_memory |= effect.uses_memory; + effects.uses_memory_grow |= effect.uses_memory_grow; + effects.uses_globals |= effect.uses_globals; + effects.uses_data_segments |= effect.uses_data_segments; + effects.uses_tables |= effect.uses_tables; + + // Calls drag their target into the closure. `ref.func` also references a + // function (and marks table use, surfaced by `check_operator`). + match op { + Operator::Call { function_index } + | Operator::ReturnCall { function_index } + | Operator::RefFunc { function_index } => { + on_call(function_index); + } + _ => {} + } + } + + Ok(()) +} + +#[cfg(test)] +mod tests { + //! Unit tests for closure effect scanning over hand-built modules. + //! + //! Effects from table/element operators and `ref.func` mark a closure as + //! touching the table space (Tier C). The `link` API rejects such modules at + //! tier classification, so these scan-level effects are asserted directly by + //! computing the closure of a module that uses them. 
+ + use super::*; + use crate::parse::ParsedModule; + + fn parse(wat: &str) -> ParsedModule { + let bytes = wat::parse_str(wat).expect("valid WAT"); + ParsedModule::parse(&bytes).expect("parse") + } + + #[test] + fn ref_func_marks_table_use_and_enqueues_target() { + // `root` takes a reference to internal `target` via `ref.func`. The scan + // must mark table use *and* drag `target` into the closure. + let module = parse( + r#" + (module + (type (;0;) (func)) + (func (;0;) (type 0) + ref.func 1 + drop) + (func (;1;) (type 0)) + (export "root" (func 0))) + "#, + ); + let root = module.exported_func_index("root").unwrap(); + let cl = compute(&module, root).expect("closure computes"); + assert!(cl.effects.uses_tables, "ref.func must mark table use"); + assert_eq!( + cl.local_func_indices, + vec![0, 1], + "ref.func target must be pulled into the closure" + ); + } + + #[test] + fn call_indirect_marks_table_use() { + let module = parse( + r#" + (module + (type (;0;) (func)) + (table (;0;) 1 funcref) + (func (;0;) (type 0) + i32.const 0 + call_indirect (type 0)) + (export "root" (func 0))) + "#, + ); + let root = module.exported_func_index("root").unwrap(); + let cl = compute(&module, root).expect("closure computes"); + assert!(cl.effects.uses_tables, "call_indirect must mark table use"); + } + + #[test] + fn table_size_marks_table_use() { + let module = parse( + r#" + (module + (type (;0;) (func (result i32))) + (table (;0;) 1 funcref) + (func (;0;) (type 0) (result i32) + table.size 0) + (export "root" (func 0))) + "#, + ); + let root = module.exported_func_index("root").unwrap(); + let cl = compute(&module, root).expect("closure computes"); + assert!(cl.effects.uses_tables, "table.size must mark table use"); + } + + #[test] + fn global_access_marks_global_use() { + let module = parse( + r#" + (module + (type (;0;) (func (result i32))) + (global (;0;) i32 (i32.const 3)) + (func (;0;) (type 0) (result i32) + global.get 0) + (export "root" (func 0))) + "#, + ); + let root 
= module.exported_func_index("root").unwrap(); + let cl = compute(&module, root).expect("closure computes"); + assert!(cl.effects.uses_globals, "global.get must mark global use"); + } + + #[test] + fn out_of_range_call_index_is_a_clean_error() { + // A body that calls a function index past the module's function count + // must yield a `LinkError::Parse`, never index `local_funcs` out of + // bounds and panic. `wat` assembles a numeric `call N` without resolving + // it, so the out-of-range index reaches the closure walk. + let module = parse( + r#" + (module + (type (;0;) (func)) + (func (;0;) (type 0) + call 99) + (export "root" (func 0))) + "#, + ); + let root = module.exported_func_index("root").unwrap(); + let err = compute(&module, root).expect_err("out-of-range call must error"); + assert!( + matches!(err, LinkError::Parse(_)), + "expected Parse, got {err:?}" + ); + } + + /// Builds a single-function module whose body nests `depth` empty `block`s, + /// exported as `root`, for the depth-cap boundary tests. + fn module_nested(depth: usize) -> ParsedModule { + let mut body = String::new(); + for _ in 0..depth { + body.push_str("block "); + } + for _ in 0..depth { + body.push_str("end "); + } + parse(&format!( + r#"(module (func (;0;) (export "root") {body}))"# + )) + } + + #[test] + fn nesting_exactly_at_the_cap_is_the_first_rejected_depth() { + // D1: the closure scan rejects at `control_depth >= MAX_CONTROL_DEPTH`, so + // a body nested exactly `MAX_CONTROL_DEPTH` deep is rejected — matching the + // wasm-to-v translator, which itself rejects at `depth >= 256`. One level + // shallower must still merge, so the cap is exact, not off-by-one. 
+ let at_cap = module_nested(MAX_CONTROL_DEPTH); + let root = at_cap.exported_func_index("root").unwrap(); + let err = compute(&at_cap, root) + .expect_err("a body nested exactly at the cap must be rejected"); + assert!( + matches!(&err, LinkError::UnsupportedConstruct(msg) if msg.contains("nests structured control flow")), + "expected an UnsupportedConstruct naming the nesting limit, got {err:?}" + ); + + let below_cap = module_nested(MAX_CONTROL_DEPTH - 1); + let root = below_cap.exported_func_index("root").unwrap(); + assert!( + compute(&below_cap, root).is_ok(), + "a body nested one level below the cap must still merge" + ); + } + + #[test] + fn shared_callee_is_visited_once() { + // `root` calls `shared` twice; the re-visit guard (`visited.insert`) must + // keep the closure to two distinct functions, not loop or duplicate. + let module = parse( + r#" + (module + (type (;0;) (func)) + (func (;0;) (type 0) + call 1 + call 1) + (func (;1;) (type 0)) + (export "root" (func 0))) + "#, + ); + let root = module.exported_func_index("root").unwrap(); + let cl = compute(&module, root).expect("closure computes"); + assert_eq!( + cl.local_func_indices, + vec![0, 1], + "a doubly-called callee must appear exactly once" + ); + } +} diff --git a/core/wasm-linker/src/lib.rs b/core/wasm-linker/src/lib.rs new file mode 100644 index 00000000..db9c0386 --- /dev/null +++ b/core/wasm-linker/src/lib.rs @@ -0,0 +1,239 @@ +//! Static-merge WASM linker. +//! +//! Inference compiles a program that `use`s functions from an external `.wasm` +//! module into an intermediate module whose extern calls lower to `(import …)` +//! entries (see `core/wasm-codegen` Phase 2). This crate consumes that +//! intermediate module plus the resolved external `.wasm` binaries and produces +//! **one self-contained module** with those imports *satisfied and removed* — +//! the external function bodies are merged in and re-indexed so the output has +//! no dangling cross-module imports. +//! +//! 
## What the merge does +//! +//! For each import the main module declares, the linker: +//! +//! 1. finds the external module that exports a function of that name, +//! 2. computes the **transitive closure** of that export inside its module (the +//! functions it calls, the types they reference), +//! 3. classifies the closure into a **feasibility tier** (see [`tier`]), +//! 4. **dedups** the closure's function types into the output type section, +//! 5. **appends** the closure's bodies after the main module's, rewriting every +//! internal index reference (`call`, `call_indirect` type, …) into the +//! unified index space, +//! 6. **removes** the satisfied import and redirects the main module's calls to +//! it onto the merged body. +//! +//! ## Feasibility tiers +//! +//! - **Tier A** — pure functions (no memory, globals, data, tables). Merged. +//! - **Tier B** — memory through caller-passed pointers only. Merged onto the +//! single shared linear memory; no address relocation needed. +//! - **Tier C** — own static data, mutable globals, or absolute addresses. +//! Merging would require relocation metadata the static merge does not +//! consume, so it is **rejected** with +//! [`LinkError::RequiresRelocatableBuild`] rather than producing an unsound +//! module. +//! +//! ## Entry point +//! +//! [`link`] takes the main module bytes and the external module bytes and +//! returns the unified module bytes. + +mod closure; +mod merge; +mod parse; +mod provenance; +mod rewrite; +mod safety; +mod spec_funcs; +mod tier; + +use inf_wasmparser::WasmFeatures; +use thiserror::Error; + +/// The WebAssembly feature subset the static-merge linker supports. +/// +/// The merge copies external function bodies verbatim onto a single shared +/// linear memory, re-indexing only the handful of index-bearing operators, and +/// the paired Rocq translator (`wasm-to-v`) models exactly this machine. 
That is +/// sound only for the integer **WebAssembly 1.0** core (the MVP plus +/// `mutable-global`) and the single scalar post-MVP addition the merge models: +/// +/// - **bulk memory** (`memory.copy` / `memory.fill` over the single memory). +/// +/// Every other *proposal* — reference types, multi-value, tail calls, SIMD, +/// threads/atomics, exception handling, `memory64`, multi-memory, the GC +/// proposal, stack switching, sign-extension, and saturating float-to-int — is +/// **off**. An external using any of them is rejected up front at the link gate +/// with a feature-named [`LinkError::UnsupportedWasmFeature`], rather than late +/// and indirectly when a specific unmodeled opcode happens to reach the merge. +/// +/// ## No floating point, anywhere +/// +/// The Inference language has no `f32`/`f64` types: its codegen never emits a +/// float operator, a float value type, or a float constant, and the Rocq +/// translator models none of them. Floats are therefore deliberately excluded +/// at the gate. In this `inf-wasmparser` fork, `WasmFeatures::WASM1` bundles +/// `FLOATS` (the baseline float value-type/operator flag) into the MVP set, so +/// this gate cannot name `WASM1` directly: it lists the baseline value-type +/// flags it *does* need and leaves `FLOATS` out. With `FLOATS` off the validator +/// rejects, at the feature pass, any float instruction ("floating-point +/// instruction disallowed") and any float value type in a signature, local, or +/// global ("floating-point support is disabled"). The gate thus encodes a single +/// rule — no floats anywhere, neither operators nor types — enforced before a +/// body is ever copied. 
+/// +/// `GC_TYPES` and `MUTABLE_GLOBAL` are the fork's internal *baseline value-type* +/// flags (`GC_TYPES` gates the GC reference types `externref`/`anyref` — `funcref` +/// is *not* gated by it; `MUTABLE_GLOBAL` admits mutable globals), not WebAssembly +/// proposals, and the validator needs them on to accept ordinary MVP modules. +/// They are therefore deliberately retained. Crucially, `GC_TYPES` being on does +/// **not** admit the GC *proposal*: a GC reference type (`externref`/`anyref`) +/// additionally requires `REFERENCE_TYPES` *and* `GC` (`1 << 19`), neither of +/// which is in this set, and no GC/reference *instruction* survives the allow-list +/// in [`safety`] — every one rejects as an [`LinkError::UnsupportedConstruct`] if +/// it reaches the merge. +/// `STACK_SWITCHING` is likewise off (and defaults off in the fork). +/// +/// Sign-extension and saturating float-to-int are *not* in this set even though +/// they are scalar integer-adjacent proposals: the Rocq translator does not +/// model them (it has no lowering for `i32.extend8_s` or `i32.trunc_sat_f32_s`), +/// and Inference codegen emits neither, so admitting them at the gate would let a +/// third-party external carry an opcode the `-v` proof path cannot render. An +/// external using either is rejected at this gate with the validator's +/// feature-named diagnostic. +/// +/// This is the linker's explicit, enforced supported-version contract: a feature +/// added to the parser later cannot quietly become linkable. +pub const SUPPORTED_WASM_FEATURES: WasmFeatures = WasmFeatures::GC_TYPES + .union(WasmFeatures::MUTABLE_GLOBAL) + .union(WasmFeatures::BULK_MEMORY); + +/// Why a static merge could not be produced. +#[derive(Debug, Error, Clone, PartialEq, Eq)] +pub enum LinkError { + /// A module's bytes could not be parsed as WASM. 
+ #[error("failed to parse WASM module: {0}")] + Parse(String), + + /// An external module is well-formed WebAssembly but uses a feature outside + /// the supported [`SUPPORTED_WASM_FEATURES`] subset (e.g. any floating-point + /// type or instruction, reference types, SIMD, atomics, exceptions, + /// `memory64`, multi-memory, multi-value, tail calls, sign-extension, or + /// saturating float-to-int). The merge cannot soundly fold such a module onto + /// the single shared memory the output models — and the Rocq translator does + /// not model these constructs — so it is rejected at the link gate with the + /// validator's feature-named diagnostic rather than later, per unmodeled + /// opcode. + #[error( + "external module `{module}` uses a WebAssembly feature beyond the supported WASM 1.0 subset: {details}" + )] + UnsupportedWasmFeature { module: String, details: String }, + + /// A required export was not found in any supplied external module. + #[error("no external module exports a function named `{field}`")] + UnsatisfiedImport { field: String }, + + /// A function in a merged closure calls one of its own module's imports, + /// which a static merge has no body to satisfy. + #[error("merged function transitively imports `{module}::{field}`, which has no body to merge")] + TransitiveHostImport { module: String, field: String }, + + /// The external function requires relocation support (Tier C): it carries + /// its own static data, globals, or table/element entries, so merging it + /// into the shared memory would need relocation metadata. + #[error( + "external function `{field}` requires a relocatable build: {}", + .reasons.join("; ") + )] + RequiresRelocatableBuild { field: String, reasons: Vec }, + + /// A WASM construct the static merge does not model (e.g. a reference-typed + /// value, a non-constant global initializer, or a transitively-imported + /// environment). 
+ #[error("unsupported WASM construct for static merge: {0}")] + UnsupportedConstruct(String), + + /// More than one supplied external module exports a function of the same + /// field name an import requests, so the body to merge is ambiguous. + #[error( + "import `{module}::{field}` is ambiguous: more than one external module exports `{field}`" + )] + AmbiguousImport { module: String, field: String }, + + /// The merged module failed structural validation. This is a guard against + /// every effect-scanner gap that would otherwise persist a silently-invalid + /// artifact: rather than write WASM no runtime accepts, the merge fails with + /// the validator's diagnostic. + #[error("merged module failed WASM validation: {0}")] + InvalidMergedModule(String), + + /// The linear memories of the main module and a merged external could not be + /// reconciled into one shared output memory. The merge folds every body onto + /// a single memory; if the modules' memory requirements (minimum pages, + /// maximum pages, or growth) cannot be satisfied by one memory, the merge + /// fails rather than emit a module that traps at runtime. + #[error("cannot reconcile linear memory for `{field}`: {reason}")] + IncompatibleMemory { field: String, reason: String }, +} + +/// Merges the satisfiable imports of `main_wasm` from `externals`, returning a +/// single self-contained module with those imports removed. +/// +/// Each external is supplied as `(logical_module, bytes)`: the logical, +/// `::`-joined module name the front end bound it under, paired with its `.wasm` +/// bytes. Codegen records every import's `(module, field)` pair, so the merge +/// resolves each import against the external whose logical module matches — +/// never the first external that merely exports the same field name. Two +/// libraries exporting the same field but bound under different logical modules +/// are thereby disambiguated rather than conflated. 
+/// +/// Every import of `main_wasm` must be satisfiable by some external: the merge +/// is **fail-closed**, so an import no external exports is a hard +/// [`LinkError::UnsatisfiedImport`], never a survivor left intact in the output. +/// (The Inference codegen output resolves all its imports before linking, so the +/// live pipeline never trips this; it guards the public API against an +/// unresolved import.) +/// +/// Every external is structurally validated (`inf_wasmparser::validate`) at +/// entry, before any closure or provenance work, so this entry point is +/// self-defending against a malformed or adversarial external even when the +/// caller did not pre-validate it. +/// +/// # Errors +/// +/// Returns a [`LinkError`] if any module fails to parse or fails structural +/// validation ([`LinkError::Parse`]), an external uses a feature outside +/// [`SUPPORTED_WASM_FEATURES`] ([`LinkError::UnsupportedWasmFeature`]), an +/// import has no matching external export ([`LinkError::UnsatisfiedImport`]), +/// a merged closure reaches a host import +/// ([`LinkError::TransitiveHostImport`]), a closure falls into the unsupported +/// Tier C ([`LinkError::RequiresRelocatableBuild`]), a module carries a +/// construct the merge does not model ([`LinkError::UnsupportedConstruct`]), +/// the linear memories cannot be reconciled ([`LinkError::IncompatibleMemory`]), +/// more than one external is bound under the same `(module, field)` pair an +/// import names ([`LinkError::AmbiguousImport`]), or the merged output fails +/// validation ([`LinkError::InvalidMergedModule`]). +pub fn link(main_wasm: &[u8], externals: &[(&str, &[u8])]) -> Result, LinkError> { + merge::link(main_wasm, externals) +} + +/// Validates one external `.wasm` against the linker's supported-version +/// contract, the same two-pass gate [`link`] applies to every external before it +/// is merged. +/// +/// The check runs in two passes so the diagnostic is precise: +/// +/// 1. **Structural** validation under the parser's default features distinguishes +/// genuinely malformed bytes ([`LinkError::Parse`]) from a well-formed module +/// that merely uses a newer feature. +/// 2. **Feature** validation under [`SUPPORTED_WASM_FEATURES`] rejects a +/// well-formed module that uses any proposal beyond the supported WASM 1.0 +/// subset ([`LinkError::UnsupportedWasmFeature`], whose message names the +/// feature).
+/// +/// Exposed so the CLI driver can reject a non-1.0 external at the earliest point +/// with the *same* feature-named diagnostic the linker uses — keeping the gate a +/// single source of truth rather than two divergent validations. +/// +/// # Errors +/// +/// Returns [`LinkError::Parse`] for structurally invalid bytes, or +/// [`LinkError::UnsupportedWasmFeature`] for a well-formed module outside the +/// supported subset. +pub fn validate_external(logical_module: &str, bytes: &[u8]) -> Result<(), LinkError> { + merge::validate_external(logical_module, bytes) +} diff --git a/core/wasm-linker/src/merge.rs b/core/wasm-linker/src/merge.rs new file mode 100644 index 00000000..5114a70c --- /dev/null +++ b/core/wasm-linker/src/merge.rs @@ -0,0 +1,1591 @@ +//! The static-merge pass: fold satisfied imports' closures into the main +//! module and rebuild a single self-contained module. +//! +//! ## Index spaces +//! +//! The merge defines one new function index space for the output. Every import +//! must be satisfied (an unsatisfiable one is a hard [`LinkError::UnsatisfiedImport`] +//! — the merge is fail-closed and never carries a surviving import), so the +//! output has no import section and: +//! +//! 1. The main module's local functions occupy the lowest indices, starting at +//! 0 (every satisfied import is removed, so there is no import block above +//! them). +//! 2. Each merged external function is appended after the main locals. +//! +//! Every `call`, `ref.func`, and `call_indirect` type index inside a copied +//! body is rewritten through [`crate::rewrite`] to land in this space. The main +//! module's own bodies are re-encoded too, because removing imports shifts +//! their local-function indices and redirects their calls to satisfied imports +//! onto the merged bodies. 
+ +use std::cell::RefCell; +use std::collections::BTreeMap; + +use inf_wasmparser::ExternalKind; +use wasm_encoder::{ + CodeSection, ConstExpr, ExportKind, ExportSection, Function, FunctionSection, GlobalSection, + GlobalType as EncGlobalType, MemorySection, MemoryType as EncMemoryType, Module, NameMap, + NameSection, TypeSection, ValType as EncValType, +}; + +use crate::closure; +use crate::parse::{FuncSig, GlobalDef, GlobalInit, ParsedModule, TypeEntry}; +use crate::rewrite::{reencode_body, BodyOrigin, IndexMap}; +use crate::tier::{self, Tier}; +use crate::LinkError; + +/// Resolves and merges every satisfiable import of `main` from the supplied +/// external modules, returning the unified module bytes. +/// +/// Each external arrives as `(logical_module, bytes)` so the merge can match an +/// import's recorded `(module, field)` against the external's logical module. +pub(crate) fn link( + main_bytes: &[u8], + externals: &[(&str, &[u8])], +) -> Result, LinkError> { + // Structural validation of the main module on entry. The main module is the + // linker's own codegen output on the live CLI pipeline, but the public + // library API (`inference_wasm_linker::link`, `inference::link`) accepts + // arbitrary `main_bytes`, so this entry point must never panic on a hostile + // main. Without this gate a main whose FunctionSection names an out-of-range + // type index, or any other structural corruption, would reach a raw + // main-derived slice index in `emit`/the re-encoder and abort *before* the + // post-merge gate ever runs. Validating structurally here (under the parser's + // default features, the same validation the post-merge gate applies to the + // merged module — which embeds these same main bodies — so no legitimate + // proof-mode main is regressed) turns that panic into a clean `Parse` error. 
+ inf_wasmparser::validate(main_bytes) + .map_err(|e| LinkError::Parse(format!("main module is invalid WASM: {e}")))?; + + // Two-pass entry gate over every external, before any closure or provenance + // work touches its bytes. The CLI driver validates each resolved external + // (`wasm_link/driver.rs`) before it reaches this crate, but the public + // library API is an entry point in its own right whose contract previously + // only *assumed* pre-validated input. Validating here makes that backstop + // universal: a structurally-invalid or adversarially-crafted external (e.g. + // an over-declared locals count) is rejected as a clean `Parse`, and a + // well-formed but post-1.0 external is rejected up front with a feature-named + // `UnsupportedWasmFeature` rather than late, when a specific unmodeled opcode + // happens to reach the merge. + for (logical_module, bytes) in externals { + validate_external(logical_module, bytes)?; + } + + let main = ParsedModule::parse(main_bytes)?; + let externals = externals + .iter() + .map(|(logical_module, bytes)| ParsedModule::parse_external(bytes, logical_module)) + .collect::, _>>()?; + + let plan = Plan::build(&main, &externals)?; + let merged = plan.emit(&main, &externals)?; + + // Post-merge validation gate. The effect scanner is an allow-list and can + // never be proven complete against an adversarial external `.wasm`; this + // final check ensures the merge never persists a structurally-invalid + // artifact (the input to formal verification), converting every effect- + // scanner gap into a clean diagnostic instead of a silent miscompile. + inf_wasmparser::validate(&merged) + .map_err(|e| LinkError::InvalidMergedModule(e.to_string()))?; + + Ok(merged) +} + +/// Validates one external against the linker's supported-version contract in two +/// passes, so the diagnostic distinguishes a malformed module from a well-formed +/// but unsupported one. +/// +/// 1. 
**Structural** pass under the parser's default features: a failure here is +/// genuinely malformed or adversarial bytes, surfaced as +/// [`LinkError::Parse`]. This keeps the prior universal pre-validation +/// behavior (a structurally-invalid external is rejected before the permissive +/// `parse_external` reader or the provenance interpreter sees it). +/// 2. **Feature** pass under [`crate::SUPPORTED_WASM_FEATURES`]: a failure here +/// means the module is valid WebAssembly but uses a proposal beyond the +/// supported WASM 1.0 subset, surfaced as +/// [`LinkError::UnsupportedWasmFeature`] with the validator's feature-named +/// message. +/// +/// Running structural-first is deliberate: a malformed module reported by the +/// restricted-feature pass alone could mask the real defect behind a feature +/// name, so the broad pass classifies malformedness first and the narrow pass +/// classifies version. +pub(crate) fn validate_external(logical_module: &str, bytes: &[u8]) -> Result<(), LinkError> { + inf_wasmparser::validate(bytes).map_err(|e| { + LinkError::Parse(format!("external module `{logical_module}` is invalid WASM: {e}")) + })?; + + inf_wasmparser::Validator::new_with_features(crate::SUPPORTED_WASM_FEATURES) + .validate_all(bytes) + .map_err(|e| LinkError::UnsupportedWasmFeature { + module: logical_module.to_string(), + details: e.to_string(), + })?; + + Ok(()) +} + +/// One merged external function, ready to be appended to the output. +struct MergedFunc { + /// Index of the source external module within the `externals` slice. + external_idx: usize, + /// The function's index within that external module. + source_func_idx: u32, + /// The function's type index within the *output* type section. + out_type_idx: u32, + /// The name to record for this function in the output `name` section, so + /// the Rocq translator emits a `Definition ` rather than an opaque + /// `func_`. 
A closure root takes the satisfied import field; an inner + /// callee keeps its own debug name when the source module carried one. + name: Option, +} + +/// The fully-resolved merge plan: which imports are satisfied, the output type +/// table, and the output index of every function. +struct Plan { + /// Output type section: the main module's types followed by the deduped + /// external function types pulled in by closures. + out_types: Vec, + /// For each main-module type index, its index in `out_types`. + main_type_remap: Vec, + /// `satisfied main import index -> output function index of its body`. + import_target: BTreeMap, + /// Output function index of the first main local function. + main_local_base: u32, + /// The merged external functions, in output order (appended after main + /// locals). + merged: Vec, + /// `(external_idx, source_func_idx) -> output function index`. + merged_index: BTreeMap<(usize, u32), u32>, + /// Per external module: `source_type_idx -> output type idx` for the types + /// its merged closure references. + external_type_remap: Vec>, + /// The single shared linear memory the output declares, reconciled across + /// the main module and every memory-using merged external. `None` when no + /// module needs a memory (a fully pure merge). + reconciled_memory: Option, +} + +impl Plan { + fn build(main: &ParsedModule, externals: &[ParsedModule]) -> Result { + // 0. Reject a main module that carries its own data or element segments. + // `emit` rebuilds the main module section-by-section and emits no + // `DataSection`/`ElementSection`, so a main-side data segment would be + // silently dropped (its memory initializer lost — a valid-but-wrong + // `.wasm`/`.v`) and a main-side element segment would survive as an + // orphaned table reference. Until full preservation-and-reindexing of + // these sections exists, reject up front with a clean diagnostic, + // mirroring the external-side Tier-C reasons. 
Today Inference codegen + // emits neither section, so this guards the public library API rather + // than the live CLI pipeline. + if main.data_count > 0 { + return Err(LinkError::UnsupportedConstruct(format!( + "main module declares {} data segment(s); the static merge does not yet \ + preserve and re-index main-side data segments", + main.data_count + ))); + } + if main.element_count > 0 { + return Err(LinkError::UnsupportedConstruct(format!( + "main module declares {} element segment(s); the static merge does not yet \ + preserve and re-index main-side element segments", + main.element_count + ))); + } + // A main-side start function runs side-effecting initialization that + // `emit` rebuilds no `StartSection` for — so it would be silently dropped, + // losing its initializer effects in a valid-but-wrong `.wasm`/`.v`. Reject + // it up front, mirroring the external-side start guard. Inference codegen + // emits no start section, so this guards the public library API. + if main.start.is_some() { + return Err(LinkError::UnsupportedConstruct( + "main module declares a start function; the static merge does not \ + preserve the start section" + .into(), + )); + } + // `emit` writes no import section: every function import is satisfied and + // removed, and the merge models *function* imports only. A main-side + // non-function import (global/memory/table) would be silently dropped, and + // a body's `global.get`/etc. would then rebind to the first *defined* + // entity — a wrong value in a valid-but-wrong output, with no diagnostic. + // Reject it up front. 
+ if main.non_func_imports > 0 { + return Err(LinkError::UnsupportedConstruct(format!( + "main module imports {} non-function (global/memory/table) entit{} from its \ + environment; the static merge models function imports only", + main.non_func_imports, + if main.non_func_imports == 1 { "y" } else { "ies" } + ))); + } + // `emit` writes no `TableSection`, so a main-side table is silently + // dropped; a surviving `call_indirect`/`table.*` then fails *after* the + // merge as `InvalidMergedModule("unknown table 0")`, blaming the linker's + // own output rather than naming the unsupported construct. Reject the + // table section up front so the diagnostic names the real cause. + if !main.tables.is_empty() { + return Err(LinkError::UnsupportedConstruct(format!( + "main module declares {} table(s); the static merge does not preserve tables", + main.tables.len() + ))); + } + // The output declares a single shared linear memory. The parser keeps only + // the first declared memory, so a second main-side memory would be silently + // dropped and a body's memarg over it would rebind to memory 0 — a + // valid-but-wrong output. Reject up front, mirroring the external-side + // multi-memory guard below. + if main.memory_count > 1 { + return Err(LinkError::UnsupportedConstruct(format!( + "main module declares {} memories; the static merge models a single shared memory", + main.memory_count + ))); + } + + // 1. Seed the output type table with the main module's function types, + // recording where each main type index lands. + let mut out_types: Vec = Vec::new(); + let mut sig_to_out: BTreeMap, u32> = BTreeMap::new(); + let mut main_type_remap = vec![0u32; main.types.len()]; + for (i, entry) in main.types.iter().enumerate() { + if let TypeEntry::Func(sig) = entry { + let out_idx = intern_sig(&mut out_types, &mut sig_to_out, sig)?; + main_type_remap[i] = out_idx; + } + } + + // 2. Resolve each satisfied import to an external export and close over + // it. 
An import is satisfiable when some external module exports a + // function of the import's field name and is bound under the + // import's logical module: `find_export` is handed both, so the + // merge keys on the `(module, field)` pair the codegen import + // contract records, not on the field alone. + let main_import_count = main.imported_funcs.len() as u32; + let mut import_target = BTreeMap::new(); + let mut merged: Vec = Vec::new(); + let mut merged_index: BTreeMap<(usize, u32), u32> = BTreeMap::new(); + let mut external_type_remap: Vec> = + externals.iter().map(|_| BTreeMap::new()).collect(); + + // Every import of the main module must be satisfiable: the driver + // resolves all extern bindings before linking, so an unsatisfied import + // is a real error rather than a survivor to keep. Resolving them all up + // front also lets every main local function start at index 0. + let mut satisfied: Vec<(usize, u32)> = Vec::with_capacity(main_import_count as usize); + for import in &main.imported_funcs { + let Some((ext_idx, root)) = + find_export(externals, &import.module, &import.field)? + else { + return Err(LinkError::UnsatisfiedImport { + field: import.field.clone(), + }); + }; + satisfied.push((ext_idx, root)); + } + + // With every import removed, main locals occupy indices `0..`, and + // merged functions follow them. + let main_local_base = 0u32; + let mut next_output_idx = main.local_funcs.len() as u32; + + // The output declares one shared linear memory, reconciled across the + // main module and every memory-using external. Seed it with the main + // module's memory (if any); each satisfied external folds its memory and + // memory-effect requirements in below. + let mut memory = MemoryReconciler::new(main.memory.as_ref())?; + + // 3. For every satisfied import, compute its closure, classify the tier, + // and allocate output indices + output types for the whole closure.
+ for (import_idx, &(ext_idx, root)) in satisfied.iter().enumerate() { + let external = &externals[ext_idx]; + + if external.non_func_imports > 0 { + return Err(LinkError::UnsupportedConstruct(format!( + "external module providing `{}` imports its environment", + main.imported_funcs[import_idx].field + ))); + } + + // A start function runs side-effecting initialization (e.g. + // `__wasm_call_ctors`) whose closure the merge never folds in. Were + // it silently dropped, those effects would vanish and a host import + // reachable only via the start function would bypass the + // `TransitiveHostImport` gate. Reject rather than miscompile. + if external.start.is_some() { + return Err(LinkError::UnsupportedConstruct(format!( + "external module providing `{}` declares a start function, which the static merge cannot run", + main.imported_funcs[import_idx].field + ))); + } + + // The output has a single shared linear memory. An external with + // more than one memory would carry memargs naming memories the + // output lacks; keeping only the first (the prior behavior) silently + // miscompiled. Reject the whole module. + if external.memory_count > 1 { + return Err(LinkError::UnsupportedConstruct(format!( + "external module providing `{}` declares {} memories; the static merge supports a single shared memory", + main.imported_funcs[import_idx].field, external.memory_count + ))); + } + + let cl = closure::compute(external, root)?; + // Tier C is rejected here, before any output index is committed. The + // classifier runs the address-provenance analysis for memory-using + // closures, so an absolute-address access is rejected as Tier C. + let _tier: Tier = + tier::classify(external, &cl, root, &main.imported_funcs[import_idx].field)?; + + // Reconcile this external's memory into the shared output memory: + // fold in its declared limits (widening minimum/maximum) and check + // its memory effects against the reconciled result. 
This folds an + // external memory onto a memoryless main (H24), keeps the merged + // minimum large enough for every module's static range (H15), and + // rejects growth the reconciled maximum cannot satisfy. Incompatible + // fundamental shapes (`memory64`/`shared`/page size) are rejected. + memory.fold( + external.memory.as_ref(), + cl.effects.uses_memory, + cl.effects.uses_memory_grow, + &main.imported_funcs[import_idx].field, + )?; + + for &src_func in &cl.local_func_indices { + let key = (ext_idx, src_func); + if merged_index.contains_key(&key) { + continue; + } + // Allocate the output type for this function (deduped). + let sig = external + .func_sig(src_func) + .ok_or_else(|| LinkError::Parse(format!( + "external function {src_func} has no function type" + )))? + .clone(); + let out_type_idx = intern_sig(&mut out_types, &mut sig_to_out, &sig)?; + let local = external + .local_funcs + .get((src_func - external.local_func_base()) as usize) + .ok_or_else(|| { + LinkError::Parse(format!( + "external function index {src_func} is out of range" + )) + })?; + let src_type_idx = local.type_idx; + external_type_remap[ext_idx].insert(src_type_idx, out_type_idx); + + // Make the type remap total: a body's function-typed blocks and + // indirect calls reference type indices other than the + // function's own, which the re-encoder must remap. Intern each + // referenced signature now so re-encoding never hits an unmapped + // index (the prior `.expect()` panic, H2) — and an out-of-range + // source type index surfaces as a clean parse error. + for type_idx in scan_body_type_indices(&local.body)? 
{ + if external_type_remap[ext_idx].contains_key(&type_idx) { + continue; + } + let referenced = match external.types.get(type_idx as usize) { + Some(TypeEntry::Func(s)) => s.clone(), + _ => { + return Err(LinkError::Parse(format!( + "merged body references type index {type_idx}, which is not a function type" + ))); + } + }; + let out_idx = intern_sig(&mut out_types, &mut sig_to_out, &referenced)?; + external_type_remap[ext_idx].insert(type_idx, out_idx); + } + + let out_func_idx = next_output_idx; + next_output_idx += 1; + merged_index.insert(key, out_func_idx); + // Prefix the merged inner callee's debug name with its logical + // module (`mathlib.helper`). Two externals bound under different + // logical modules may export — and internally call — functions of + // the same name; without the prefix those names would collide in + // the output name section and force wasm-to-v's index-suffix + // disambiguation (`helper` vs `helper_2`), which is index- + // dependent and shifts across merges. The prefix keeps each merged + // function traceable to its source module and makes the *wasm-level* + // names distinct. It is not a hard collision guarantee at the Rocq + // level: wasm-to-v sanitizes `.` (and other non-identifier bytes) + // to `_`, so two distinct sources can still sanitize to the same + // Rocq identifier (e.g. via `__` runs); wasm-to-v's index suffix + // remains the final disambiguator. The `.` separator matches + // Inference's `Type.method` convention. 
+ merged.push(MergedFunc { + external_idx: ext_idx, + source_func_idx: src_func, + out_type_idx, + name: external + .func_name(src_func) + .map(|name| format!("{}.{name}", external.logical_module)), + }); + } + + let root_output = merged_index[&(ext_idx, root)]; + import_target.insert(import_idx as u32, root_output); + + // The closure root satisfies this import: name it after the import + // field, prefixed with the external's logical module + // (`mathlib.sum`), so the merged function reads as an ordinary, named + // definition that is traceable to its source module. The field alone + // is not unique: two externals bound under different logical modules + // may satisfy imports of the same field, and their roots would then + // collide in the output name section, forcing wasm-to-v's index- + // suffix disambiguation (`sum` vs `sum_2`), which is index-dependent + // across merges. The module prefix makes the wasm-level names distinct; + // it is not a hard Rocq-level collision guarantee, since wasm-to-v + // sanitizes `.` to `_` and two distinct sources can still sanitize to + // the same Rocq identifier (`__` runs), with wasm-to-v's index suffix + // as the final disambiguator. The `.` separator matches Inference's + // `Type.method` convention. An explicit debug name on the source module + // would otherwise win, but a codegen-produced external typically + // exports the field under that same name, so this is stable. + let external = &externals[ext_idx]; + let field = &main.imported_funcs[import_idx].field; + if let Some(root_merged) = merged.iter_mut().find(|m| { + merged_index.get(&(m.external_idx, m.source_func_idx)) == Some(&root_output) + }) { + root_merged.name = Some(format!("{}.{field}", external.logical_module)); + } + } + + // Give every still-nameless merged inner callee a name derived from its + // output function index, prefixed with its logical module + // (`lib.func_5`). 
An external stripped of its `name` section (third-party + // / `wasm-tools`-stripped) leaves inner callees with `name: None`; + // without a name `build_func_names` emits no name-section entry, and + // `wasm-to-v` then falls back to a per-process random UUID `Definition` + // name, making the `.v` non-reproducible for byte-identical input. Naming + // each from its deterministic output index keeps the name section + // complete and the proof artifact reproducible. The module prefix keeps + // the synthesized name in the same `module.field` namespace as the named + // roots and callees above, so two stripped externals can never produce + // the same fallback name for distinct functions. The `.` separator + // matches Inference's `Type.method` convention and sanitizes to `_` in + // the Rocq name. + let merged_base = main_local_base + main.local_funcs.len() as u32; + for (i, m) in merged.iter_mut().enumerate() { + if m.name.is_none() { + let logical_module = &externals[m.external_idx].logical_module; + m.name = Some(format!("{}.func_{}", logical_module, merged_base + i as u32)); + } + } + + Ok(Plan { + out_types, + main_type_remap, + import_target, + main_local_base, + merged, + merged_index, + external_type_remap, + reconciled_memory: memory.finish(), + }) + } + + /// Maps a main-module function index into the output index space. + /// + /// Every import is satisfied and removed, so an import index maps to its + /// merged body's output index, and a main local shifts down by the + /// (now fully removed) import count onto `main_local_base`. + /// + /// The local index is bounds-checked against the main module's local + /// function count. 
Most callers feed indices the parser already validated + /// (a body's `call` targets, an export), but `remap_spec_funcs` feeds indices + /// straight from the `inference.spec_funcs` custom section — which the + /// post-merge `inf_wasmparser::validate` treats as opaque, so a garbage or + /// out-of-range spec index would otherwise be silently remapped onto the + /// wrong or a nonexistent function and emitted into the Rocq proof obligation. + /// Rejecting an out-of-range local here keeps that verification deliverable + /// honest. + fn map_main_func(&self, main: &ParsedModule, idx: u32) -> Result { + let import_count = main.imported_funcs.len() as u32; + if idx < import_count { + return self.import_target.get(&idx).copied().ok_or_else(|| { + LinkError::Parse(format!( + "main function index {idx} references an unsatisfied import" + )) + }); + } + let local_idx = idx - import_count; + if local_idx as usize >= main.local_funcs.len() { + return Err(LinkError::Parse(format!( + "function index {idx} out of range" + ))); + } + Ok(self.main_local_base + local_idx) + } + + /// Emits the unified module bytes. + fn emit( + &self, + main: &ParsedModule, + externals: &[ParsedModule], + ) -> Result, LinkError> { + let mut module = Module::new(); + + // Type section. + let mut types = TypeSection::new(); + for sig in &self.out_types { + let params = sig + .params + .iter() + .map(map_val_type) + .collect::, _>>()?; + let results = sig + .results + .iter() + .map(map_val_type) + .collect::, _>>()?; + types.ty().function(params, results); + } + module.section(&types); + + // No import section: every import is satisfied and removed. The merge is + // fail-closed (an unsatisfiable import is rejected in `Plan::build` as + // `UnsatisfiedImport`), so no import can survive to be re-emitted here. + + // Function section: main locals (remapped types) then merged functions. 
+ let mut functions = FunctionSection::new(); + for local in &main.local_funcs { + // Checked lookup, mirroring the `reencode_main_body` `ty` closure: a + // main FunctionSection naming an out-of-range type index must surface + // as a clean error rather than panic on a raw slice index (S3). The + // entry-side structural validation already rejects such a main, but + // this keeps the index access self-defending in its own right. + let out_type = self + .main_type_remap + .get(local.type_idx as usize) + .copied() + .ok_or_else(|| { + LinkError::Parse(format!( + "main function references type index {} out of range", + local.type_idx + )) + })?; + functions.function(out_type); + } + for m in &self.merged { + functions.function(m.out_type_idx); + } + module.section(&functions); + + // Memory section: the single shared linear memory reconciled across the + // main module and every memory-using merged external. + if let Some(mem) = &self.reconciled_memory { + let mut memory = MemorySection::new(); + memory.memory(*mem); + module.section(&memory); + } + + // Global section (main globals only; external globals are Tier C). + if !main.globals.is_empty() { + let mut globals = GlobalSection::new(); + for g in &main.globals { + globals.global(map_global_type(g)?, &map_global_init(g.init)); + } + module.section(&globals); + } + + // Export section: rewrite function-export indices into the output space. + if !main.exports.is_empty() { + let mut exports = ExportSection::new(); + for export in &main.exports { + let (kind, index) = match export.kind { + ExternalKind::Func => { + (ExportKind::Func, self.map_main_func(main, export.index)?) 
+ } + ExternalKind::Memory => (ExportKind::Memory, export.index), + ExternalKind::Global => (ExportKind::Global, export.index), + ExternalKind::Table => (ExportKind::Table, export.index), + ExternalKind::Tag => (ExportKind::Tag, export.index), + }; + exports.export(&export.name, kind, index); + } + module.section(&exports); + } + + // Code section: re-encode every main body, then every merged body, each + // under its own index map. + let mut code = CodeSection::new(); + for local in &main.local_funcs { + let body = self.reencode_main_body(main, &local.body)?; + code.function(&body); + } + for m in &self.merged { + let external = &externals[m.external_idx]; + let local = external + .local_funcs + .get((m.source_func_idx - external.local_func_base()) as usize) + .ok_or_else(|| { + LinkError::Parse(format!( + "merged external function index {} is out of range", + m.source_func_idx + )) + })?; + let body = self.reencode_external_body(m.external_idx, &local.body)?; + code.function(&body); + } + module.section(&code); + + // Name section: preserve sane debug names so the Rocq translator emits + // named `Definition`s. Subsections must appear in ascending id order: + // module (0), then functions (1), then locals (2). Without this section + // every function — main locals included — would translate to an opaque + // `func_`, and the module/local debug names would be lost. 
+ let func_names = self.build_func_names(main); + let local_names = self.build_local_names(main); + if main.module_name.is_some() || func_names.is_some() || local_names.is_some() { + let mut name_section = NameSection::new(); + if let Some(module_name) = &main.module_name { + name_section.module(module_name); + } + if let Some(names) = &func_names { + name_section.functions(names); + } + if let Some(locals) = &local_names { + name_section.locals(locals); + } + module.section(&name_section); + } + + // `inference.spec_funcs` section: rewrite each recorded spec function + // index into the post-link output space and re-emit it. Codegen records + // these indices in the pre-link space (which includes the now-removed + // imports); without this rewrite a bare linked `.wasm` would name the + // wrong functions in its proof obligations (C1), or — were the section + // simply dropped (H25) — carry no obligations at all. + if let Some(spec_funcs) = &main.spec_funcs { + let remapped = self.remap_spec_funcs(main, spec_funcs)?; + let payload = crate::spec_funcs::encode(&remapped); + module.section(&wasm_encoder::CustomSection { + name: crate::spec_funcs::SECTION_NAME.into(), + data: (&payload[..]).into(), + }); + } + + Ok(module.finish()) + } + + /// Rewrites every recorded spec-function index from the pre-link space into + /// the post-link output space via [`Self::map_main_func`]. + /// + /// Each index names a main-module function (a spec function is emitted by + /// codegen as an ordinary local function), so the same import-removal shift + /// that re-indexes calls applies here. 
+ fn remap_spec_funcs( + &self, + main: &ParsedModule, + spec_funcs: &[(String, Vec)], + ) -> Result)>, LinkError> { + spec_funcs + .iter() + .map(|(name, indices)| { + let mapped = indices + .iter() + .map(|&idx| self.map_main_func(main, idx)) + .collect::, _>>()?; + Ok((name.clone(), mapped)) + }) + .collect() + } + + /// Builds the output `name`-section local map: each main local function's + /// local-variable names, re-indexed onto the import-free output space. The + /// local indices within a function are unchanged by the merge; only the + /// enclosing function index shifts. Returns `None` when no local carries a + /// name. + fn build_local_names(&self, main: &ParsedModule) -> Option { + let import_count = main.imported_funcs.len() as u32; + let mut entries: Vec<(u32, &Vec<(u32, String)>)> = Vec::new(); + for (local_idx, _) in main.local_funcs.iter().enumerate() { + let source_idx = import_count + local_idx as u32; + if let Some(locals) = main.local_names.get(&source_idx) { + entries.push((self.main_local_base + local_idx as u32, locals)); + } + } + if entries.is_empty() { + return None; + } + entries.sort_unstable_by_key(|(idx, _)| *idx); + let mut indirect = wasm_encoder::IndirectNameMap::new(); + for (func_idx, locals) in entries { + let mut map = NameMap::new(); + for (local_idx, name) in locals { + map.append(*local_idx, name); + } + indirect.append(func_idx, &map); + } + Some(indirect) + } + + /// Builds the output `name`-section function map: main locals keep their + /// source debug names (re-indexed onto the import-free output space), and + /// each merged function takes the name resolved at plan-build time. Returns + /// `None` when no function carries a name, leaving the section out entirely. 
fn build_func_names(&self, main: &ParsedModule) -> Option<NameMap> {
    let import_count = main.imported_funcs.len() as u32;
    let local_count = main.local_funcs.len() as u32;

    let mut labelled: Vec<(u32, &str)> = Vec::new();

    // Main locals: looked up by pre-link index, recorded under the output index.
    for position in 0..main.local_funcs.len() {
        let position = position as u32;
        if let Some(label) = main.func_name(import_count + position) {
            labelled.push((self.main_local_base + position, label));
        }
    }
    // Merged functions sit directly after the main locals in the output space.
    labelled.extend(self.merged.iter().enumerate().filter_map(|(position, m)| {
        let label = m.name.as_deref()?;
        Some((self.main_local_base + local_count + position as u32, label))
    }));

    if labelled.is_empty() {
        return None;
    }
    labelled.sort_unstable_by_key(|&(idx, _)| idx);

    let mut names = NameMap::new();
    for (idx, label) in labelled {
        names.append(idx, label);
    }
    Some(names)
}

/// Re-encodes one main-module function body into the output index space.
///
/// Function indices are mapped through [`Self::map_main_func`] and type
/// indices through `main_type_remap`.
///
/// # Errors
///
/// Returns the error from `reencode_body` if re-encoding fails; otherwise the
/// first function-index mapping failure recorded while re-encoding, if any.
fn reencode_main_body(
    &self,
    main: &ParsedModule,
    body: &[u8],
) -> Result<wasm_encoder::Function, LinkError> {
    // The `func` callback cannot return an error through `IndexMap`'s `Fn`
    // signature, so the first failure is parked in a `RefCell` (which keeps
    // the closure `Fn`), a placeholder index 0 is handed back, and the parked
    // error is surfaced once `reencode_body` has returned.
    let first_failure: RefCell<Option<LinkError>> = RefCell::new(None);
    let func = |idx: u32| {
        self.map_main_func(main, idx).unwrap_or_else(|e| {
            first_failure.borrow_mut().get_or_insert(e);
            0
        })
    };
    let ty = |idx: u32| {
        self.main_type_remap
            .get(idx as usize)
            .copied()
            .ok_or_else(|| {
                LinkError::Parse(format!("main body references type index {idx} out of range"))
            })
    };
    let map = IndexMap {
        func: &func,
        ty: &ty,
    };
    let function = reencode_body(body, &map, BodyOrigin::Main)?;
    match first_failure.into_inner() {
        Some(e) => Err(e),
        None => Ok(function),
    }
}

/// Re-encodes one merged external function body into the output index space.
///
/// Function indices are looked up in `merged_index` under
/// `(external_idx, idx)`, and type indices in that external's entry of
/// `external_type_remap`.
///
/// # Errors
///
/// Returns the error from `reencode_body` if re-encoding fails (including an
/// unmapped type index); otherwise the first missing function-index mapping
/// recorded while re-encoding, if any.
fn reencode_external_body(
    &self,
    external_idx: usize,
    body: &[u8],
) -> Result<wasm_encoder::Function, LinkError> {
    // Same parking scheme as `reencode_main_body`: a missing function-index
    // mapping is recorded and surfaced after re-encoding rather than panicking.
    let first_failure: RefCell<Option<LinkError>> = RefCell::new(None);
    let func = |idx: u32| {
        if let Some(&mapped) = self.merged_index.get(&(external_idx, idx)) {
            return mapped;
        }
        first_failure.borrow_mut().get_or_insert(LinkError::Parse(format!(
            "merged body references function index {idx} not in its closure"
        )));
        0
    };
    let type_remap = &self.external_type_remap[external_idx];
    let ty = |idx: u32| {
        type_remap.get(&idx).copied().ok_or_else(|| {
            LinkError::UnsupportedConstruct(format!(
                "merged body references an unmapped type index {idx}"
            ))
        })
    };
    let map = IndexMap {
        func: &func,
        ty: &ty,
    };
    let function = reencode_body(body, &map, BodyOrigin::External)?;
    match first_failure.into_inner() {
        Some(e) => Err(e),
        None => Ok(function),
    }
}
} // closes the enclosing `impl` block, which opens before this chunk

/// Interns a signature into `out_types`, returning its index. A signature
/// whose key is already in `cache` reuses the existing entry, so two functions
/// with identical signatures share one type entry (type dedup).
///
/// # Errors
///
/// Returns [`LinkError::UnsupportedConstruct`] if [`sig_key`] rejects the
/// signature, i.e. if any parameter or result is a value type
/// [`val_type_tag`] has no tag for (floating-point, `v128`, or reference
/// types). Rejecting here, at the single interning chokepoint, keeps every
/// merged signature representable.
fn intern_sig(
    out_types: &mut Vec<FuncSig>,
    cache: &mut BTreeMap<Vec<u8>, u32>,
    sig: &FuncSig,
) -> Result<u32, LinkError> {
    use std::collections::btree_map::Entry;

    match cache.entry(sig_key(sig)?) {
        Entry::Occupied(known) => Ok(*known.get()),
        Entry::Vacant(fresh) => {
            let idx = out_types.len() as u32;
            out_types.push(sig.clone());
            fresh.insert(idx);
            Ok(idx)
        }
    }
}

/// A stable byte key for a signature, used for dedup. Value types are encoded
/// as their discriminant; a `0xFF` separator distinguishes params from results.
///
/// Fails if any value type is a reference type, so a ref-typed signature can
/// never be interned and silently emitted.
+fn sig_key(sig: &FuncSig) -> Result, LinkError> { + let mut key = Vec::with_capacity(sig.params.len() + sig.results.len() + 1); + for ty in &sig.params { + key.push(val_type_tag(*ty)?); + } + key.push(0xFF); + for ty in &sig.results { + key.push(val_type_tag(*ty)?); + } + Ok(key) +} + +/// A dedup discriminant for a supported value type. Floating-point, SIMD, and +/// reference types have no tag: each is an unsupported construct, surfaced as a +/// clean error (a float because the Inference language has no `f32`/`f64` types; +/// a `v128` because it has no SIMD types and every SIMD operator is rejected; a +/// reference rather than the prior `Ref(_) => I32` collapse). This is the +/// signature-axis chokepoint, paired with the operator-stream gate in +/// [`crate::safety`]. +fn val_type_tag(ty: inf_wasmparser::ValType) -> Result { + use inf_wasmparser::ValType::*; + Ok(match ty { + I32 => 0, + I64 => 1, + F32 | F64 => { + return Err(LinkError::UnsupportedConstruct( + "floating-point value type (f32/f64) in merged function signature: \ + the Inference language has no f32/f64 types" + .into(), + )); + } + V128 => { + return Err(LinkError::UnsupportedConstruct( + "v128 value type in merged function signature: \ + the Inference language has no SIMD types" + .into(), + )); + } + Ref(_) => { + return Err(LinkError::UnsupportedConstruct( + "reference-typed value in merged function signature".into(), + )); + } + }) +} + +/// Finds the external module bound under `module` that exports a function named +/// `field`, returning `(external_idx, func_idx)`. +/// +/// Matches on the full `(module, field)` pair codegen records for every import, +/// not the field alone: an external is a candidate only when its logical module +/// equals `module`. This disambiguates two libraries that export the same field +/// but were bound under different logical modules — the earlier behavior, which +/// matched on field alone, let the path-sort order decide which body was merged. 
+/// +/// Returns `Ok(None)` when no external bound under `module` exports `field`, and +/// [`LinkError::AmbiguousImport`] when more than one external is bound under the +/// same `(module, field)` pair, in which case the merge cannot soundly choose a +/// body and fails rather than silently linking the first. +fn find_export( + externals: &[ParsedModule], + module: &str, + field: &str, +) -> Result, LinkError> { + let mut found: Option<(usize, u32)> = None; + for (i, ext) in externals.iter().enumerate() { + if ext.logical_module != module { + continue; + } + if let Some(idx) = ext.exported_func_index(field) { + if found.is_some() { + return Err(LinkError::AmbiguousImport { + module: module.to_string(), + field: field.to_string(), + }); + } + found = Some((i, idx)); + } + } + Ok(found) +} + +/// Collects the type indices a body references through function-typed +/// `block`/`loop`/`if` and `call_indirect`/`return_call_indirect`, so the merge +/// can intern each signature and keep the type remap total. +/// +/// Every operator is also gated through the fail-closed allow-list, matching +/// the closure scanner: a body reaching here has been closure-scanned already, +/// but re-checking keeps this walk self-contained. The verification-only non-det +/// blocks share the `blockty` payload, but they are rejected by the allow-list +/// (they have no executable semantics, so an external body that carries one is +/// not mergeable), so this walk never interns a type index on their behalf. 
+fn scan_body_type_indices(body: &[u8]) -> Result, LinkError> { + use inf_wasmparser::{BinaryReader, BlockType, FunctionBody, Operator}; + + let func_body = FunctionBody::new(BinaryReader::new(body, 0)); + let ops = func_body + .get_operators_reader() + .map_err(|e| LinkError::Parse(e.to_string()))?; + + let mut indices = Vec::new(); + for op in ops { + let op = op.map_err(|e| LinkError::Parse(e.to_string()))?; + crate::safety::check_operator(&op)?; + match op { + Operator::Block { + blockty: BlockType::FuncType(idx), + } + | Operator::Loop { + blockty: BlockType::FuncType(idx), + } + | Operator::If { + blockty: BlockType::FuncType(idx), + } + | Operator::CallIndirect { + type_index: idx, .. + } + | Operator::ReturnCallIndirect { + type_index: idx, .. + } => indices.push(idx), + _ => {} + } + } + Ok(indices) +} + +/// Maps a value type into the encoder equivalent, rejecting floating-point and +/// reference types. +/// +/// A float value type cannot appear in a merged signature: the Inference language +/// has no `f32`/`f64` types. A `v128` likewise cannot: the language has no SIMD +/// types and every SIMD operator is rejected, so the type axis must stay +/// consistent rather than carry the SIMD type into the output. A reference-typed +/// value cannot be soundly emitted either: the static merge models no reference +/// types, and collapsing `Ref(_)` to `i32` (the prior behavior) silently produced +/// a module whose bodies still operate on the reference, which no runtime +/// accepts. Surface each as a clean error. This duplicates the rejection in +/// [`val_type_tag`] as defense in depth: the two functions are reached on +/// independent paths (dedup keying vs. type emission), so each must guard the +/// unsupported value-type axes itself. 
+fn map_val_type(ty: &inf_wasmparser::ValType) -> Result { + use inf_wasmparser::ValType::*; + Ok(match ty { + I32 => EncValType::I32, + I64 => EncValType::I64, + F32 | F64 => { + return Err(LinkError::UnsupportedConstruct( + "floating-point value type (f32/f64) in merged function signature: \ + the Inference language has no f32/f64 types" + .into(), + )); + } + V128 => { + return Err(LinkError::UnsupportedConstruct( + "v128 value type in merged function signature: \ + the Inference language has no SIMD types" + .into(), + )); + } + Ref(_) => { + return Err(LinkError::UnsupportedConstruct( + "reference-typed value in merged function signature".into(), + )); + } + }) +} + +/// Reconciles the linear memories of the main module and every memory-using +/// merged external into one shared output memory. +/// +/// The merge folds every body onto a *single* memory, so the output's memory +/// must satisfy all of them at once. This accumulator folds each module's +/// memory in turn: +/// +/// - **Fundamental shape** (`memory64`, `shared`, page size) must match across +/// every memory: a memory64 body addresses with i64, a shared body needs an +/// atomic memory, and a custom page size changes the address-to-page mapping — +/// none can be folded onto a differently-shaped memory. A mismatch is a clean +/// [`LinkError::IncompatibleMemory`]. +/// - **Minimum** is widened to the maximum of every module's minimum, so the +/// output reserves enough pages for every module's static range (closing the +/// out-of-bounds miscompile, H15). +/// - **Maximum** is widened (a larger or unbounded maximum is the +/// least-restrictive choice), and a module that grows memory forces the +/// maximum to admit growth or the merge rejects it (H15). 
+/// - A **memoryless main** with a memory-using external synthesizes an output +/// memory from the external's declaration (H24); a memory-using external with +/// *no* memory declaration of its own and a memoryless main is irreconcilable +/// (there is nothing to address), so it is rejected (the guard, part C). +struct MemoryReconciler { + /// The reconciled memory so far, or `None` if no module has contributed one. + current: Option, + /// Whether the reconciled memory is required (some closure uses memory), + /// even if no module declared one — which is then an error. + required: bool, +} + +impl MemoryReconciler { + /// Seeds the reconciler with the main module's memory, if it has one. + /// + /// The main memory's shape is rejected here for the same reasons an + /// external's is rejected in [`MemoryReconciler::fold`]: the output models a + /// single 32-bit, non-shared, default-page-size memory, and wasm-to-v encodes + /// only that model. A `memory64`, `shared`, or custom-page-size main memory + /// would be merged into an output the translator silently re-encodes as + /// 32-bit, so it is rejected absolutely rather than on the reconcile path + /// alone (audit C-4/L-1). + fn new(main_mem: Option<&inf_wasmparser::MemoryType>) -> Result { + if let Some(main_mem) = main_mem { + reject_unsupported_memory_shape(main_mem, "
")?; + } + Ok(MemoryReconciler { + current: main_mem.map(to_enc_memory), + required: false, + }) + } + + /// Folds one external's memory and memory effects into the reconciliation. + /// + /// `uses_memory`/`uses_memory_grow` are the external closure's effects, used + /// to decide whether a memory is required at all and whether growth must be + /// admitted. + fn fold( + &mut self, + ext_mem: Option<&inf_wasmparser::MemoryType>, + uses_memory: bool, + uses_memory_grow: bool, + field: &str, + ) -> Result<(), LinkError> { + if uses_memory { + self.required = true; + } + + if let Some(ext_mem) = ext_mem { + // Reject an unsupported memory shape for *every* contributed external + // memory, including the `None => ext` adopt path onto a memoryless + // main — otherwise a memory64/shared/custom-page external would be + // adopted verbatim and wasm-to-v would silently re-encode it as a + // 32-bit memory (audit C-4/L-1). + reject_unsupported_memory_shape(ext_mem, field)?; + let ext = to_enc_memory(ext_mem); + self.current = Some(match self.current { + None => ext, + Some(cur) => reconcile_pair(cur, ext, field)?, + }); + } + + if uses_memory_grow { + self.admit_growth(field)?; + } + + // A closure that uses memory but no module supplies one to address has + // no valid shared memory to fold onto — reject rather than emit a body + // that references a memory the output lacks (the guard, part C). + if self.required && self.current.is_none() { + return Err(LinkError::IncompatibleMemory { + field: field.to_string(), + reason: "the external accesses linear memory, but neither it nor the main module \ + declares a memory to share" + .to_string(), + }); + } + + Ok(()) + } + + /// Verifies the reconciled memory can actually grow: its maximum must + /// exceed its minimum (or be unbounded). 
When the reconciled memory is + /// pinned (`max == min`), a `memory.grow` always fails at runtime (returning + /// -1), so the merge rejects it with a clear diagnostic rather than emit a + /// module that silently mis-grows. Widening main's fixed maximum is avoided + /// deliberately: an external must not silently relax the host program's own + /// memory bound. A memoryless reconciliation that needs to grow is rejected + /// by the caller's required-memory guard before reaching here. + fn admit_growth(&self, field: &str) -> Result<(), LinkError> { + let Some(mem) = self.current.as_ref() else { + return Ok(()); + }; + if let Some(max) = mem.maximum + && max <= mem.minimum + { + return Err(LinkError::IncompatibleMemory { + field: field.to_string(), + reason: format!( + "the external grows linear memory, but the reconciled memory's maximum \ + ({max} pages) does not exceed its minimum ({} pages)", + mem.minimum + ), + }); + } + Ok(()) + } + + /// Returns the reconciled memory to emit, or `None` when no module needs one. + fn finish(self) -> Option { + self.current + } +} + +/// Reconciles the **anchor** memory `a` (the main module's declared memory, or +/// the accumulator already reconciled with it) with a contributing external +/// memory `b`, or returns a clean [`LinkError::IncompatibleMemory`]. +/// +/// `a`'s declared maximum is *authoritative* and is **never relaxed upward** by +/// `b`: the output keeps `a`'s maximum unchanged. Widening the output maximum to +/// the larger bound or to unbounded (the prior behavior) silently relaxed a main +/// that declared `(memory 1 1)` to admit an external's looser cap — +/// contradicting [`MemoryReconciler::admit_growth`]'s own refusal to relax the +/// host's memory bound, and removing the runtime backstop that would otherwise +/// trap an over-long fill early. 
An external declaring a larger or unbounded +/// maximum is *clamped* to the anchor's bound, not rejected: the external's +/// declared maximum only caps growth, and folding it under main's stricter cap is +/// a more-restrictive (sound) runtime. (A closure that actually grows memory is +/// gated separately by [`MemoryReconciler::admit_growth`] against the kept +/// maximum.) +/// +/// The minimum is widened to `max(a.min, b.min)` to reserve enough pages for +/// every module's static range. The one reconciliation that *cannot* be honored +/// is a reserved minimum that exceeds the anchor's maximum — the external's +/// static footprint does not fit under the host's declared cap — which is +/// rejected rather than emitting an invalid `min > max` memory. +fn reconcile_pair( + a: EncMemoryType, + b: EncMemoryType, + field: &str, +) -> Result { + if a.memory64 != b.memory64 { + return Err(LinkError::IncompatibleMemory { + field: field.to_string(), + reason: format!( + "memory64 mismatch (one memory is memory64={}, the other memory64={})", + a.memory64, b.memory64 + ), + }); + } + if a.shared != b.shared { + return Err(LinkError::IncompatibleMemory { + field: field.to_string(), + reason: format!( + "shared mismatch (one memory is shared={}, the other shared={})", + a.shared, b.shared + ), + }); + } + if a.page_size_log2 != b.page_size_log2 { + return Err(LinkError::IncompatibleMemory { + field: field.to_string(), + reason: "custom page sizes differ between the two memories".to_string(), + }); + } + + // Widen the minimum to satisfy both modules' static ranges, but keep the + // anchor's maximum: the main module's declared cap is never relaxed upward. + let minimum = a.minimum.max(b.minimum); + let maximum = a.maximum; + + // The external's static footprint must fit under the host's declared cap; a + // reserved minimum above it cannot be honored without relaxing the cap. 
+ if let Some(anchor_max) = maximum + && minimum > anchor_max + { + return Err(LinkError::IncompatibleMemory { + field: field.to_string(), + reason: format!( + "the reconciled minimum ({minimum} pages) exceeds the declared maximum \ + ({anchor_max} pages) of the memory it is merged into; the kept memory bound \ + is not relaxed" + ), + }); + } + + Ok(EncMemoryType { + minimum, + maximum, + memory64: a.memory64, + shared: a.shared, + page_size_log2: a.page_size_log2, + }) +} + +/// Rejects a memory whose fundamental shape the static merge and the Rocq +/// translator cannot model: a `memory64` (i64-addressed) memory, a `shared` +/// memory, or a memory with a non-default page size. +/// +/// The output module declares a single 32-bit, non-shared, default-page-size +/// memory, and wasm-to-v encodes exactly that model (`Mm {|lim_min; lim_max|}`, +/// with no `memory64`/`shared`/page-size field). Adopting any other shape would +/// produce a `.wasm` whose machine the paired `.v` silently misdescribes — the +/// worst failure class for a verification-first toolchain. Every contributed +/// memory (the main module's and each external's) is checked, so the rejection +/// is absolute rather than reachable only on the two-memory reconcile path +/// (audit C-4/L-1). 
+fn reject_unsupported_memory_shape( + mem: &inf_wasmparser::MemoryType, + field: &str, +) -> Result<(), LinkError> { + let reason = if mem.memory64 { + "the memory is `memory64` (i64-addressed); the static merge models only a 32-bit memory \ + and would require a relocatable build" + } else if mem.shared { + "the memory is `shared`; the static merge models only a non-shared memory" + } else if mem.page_size_log2.is_some() { + "the memory declares a custom page size; the static merge models only the default page size" + } else { + return Ok(()); + }; + Err(LinkError::IncompatibleMemory { + field: field.to_string(), + reason: reason.to_string(), + }) +} + +fn to_enc_memory(mem: &inf_wasmparser::MemoryType) -> EncMemoryType { + EncMemoryType { + minimum: mem.initial, + maximum: mem.maximum, + memory64: mem.memory64, + shared: mem.shared, + page_size_log2: mem.page_size_log2, + } +} + +fn map_global_type(g: &GlobalDef) -> Result { + Ok(EncGlobalType { + val_type: map_val_type(&g.ty.content_type)?, + mutable: g.ty.mutable, + shared: g.ty.shared, + }) +} + +fn map_global_init(init: GlobalInit) -> ConstExpr { + match init { + GlobalInit::I32(v) => ConstExpr::i32_const(v), + GlobalInit::I64(v) => ConstExpr::i64_const(v), + } +} + +#[cfg(test)] +mod tests { + //! Direct unit tests for memory reconciliation paths the public `link` API + //! cannot reach through valid WAT — notably the guard for a memory-using + //! closure with no memory to address, which would require a structurally + //! invalid external the `wat` assembler refuses to build. + + use super::*; + use inf_wasmparser::MemoryType; + + fn mem(initial: u64, maximum: Option) -> MemoryType { + MemoryType { + memory64: false, + shared: false, + initial, + maximum, + page_size_log2: None, + } + } + + #[test] + fn memory_using_closure_without_any_memory_is_rejected() { + // The guard (part C): a closure that touches memory while no module — + // neither main nor the external — declares one has nothing to address. 
+ let mut r = MemoryReconciler::new(None).expect("a memoryless main is supported"); + let err = r + .fold(None, true, false, "f") + .expect_err("a memory-using closure with no memory must be rejected"); + assert!(matches!(err, LinkError::IncompatibleMemory { .. }), "got {err:?}"); + } + + #[test] + fn pure_closure_without_memory_is_fine() { + // No memory effect, no memory declared: a pure merge needs no memory. + let mut r = MemoryReconciler::new(None).expect("a memoryless main is supported"); + r.fold(None, false, false, "f").expect("pure closure needs no memory"); + assert!(r.finish().is_none(), "no memory is emitted for a pure merge"); + } + + #[test] + fn minimum_is_widened_to_the_larger_of_two_memories() { + let mut r = + MemoryReconciler::new(Some(&mem(1, Some(20)))).expect("a 32-bit main is supported"); + r.fold(Some(&mem(10, Some(20))), true, false, "f") + .expect("compatible memories reconcile"); + let out = r.finish().expect("a memory is emitted"); + assert_eq!(out.minimum, 10, "reconciled minimum is the larger of 1 and 10"); + assert_eq!(out.maximum, Some(20)); + } + + #[test] + fn an_unbounded_external_maximum_does_not_relax_a_bounded_main() { + // S4: a main that declared `(memory 1 5)` must NOT be relaxed to unbounded + // by an external with no maximum. The external's static footprint (min 2) + // fits under the cap, so the merge succeeds — but the output maximum stays + // the main's declared 5, never silently unbounded. 
+ let mut r = + MemoryReconciler::new(Some(&mem(1, Some(5)))).expect("a 32-bit main is supported"); + r.fold(Some(&mem(2, None)), true, false, "f") + .expect("an unbounded external fits under the main's cap and clamps to it"); + let out = r.finish().expect("a memory is emitted"); + assert_eq!( + out.maximum, + Some(5), + "the output maximum stays the main's declared cap, not unbounded" + ); + assert_eq!(out.minimum, 2, "the minimum widens to the external's larger footprint"); + } + + #[test] + fn a_larger_external_maximum_is_clamped_to_the_main_cap() { + // S4: an external declaring a larger maximum (9) than the main's cap (5) + // is clamped down to the main's bound, not widened up to the external's. + // The external's static minimum (1) fits, so the merge succeeds with the + // main's maximum preserved. + let mut r = + MemoryReconciler::new(Some(&mem(1, Some(5)))).expect("a 32-bit main is supported"); + r.fold(Some(&mem(1, Some(9))), true, false, "f") + .expect("a larger external maximum is clamped to the main's cap"); + let out = r.finish().expect("a memory is emitted"); + assert_eq!(out.maximum, Some(5), "the output maximum stays the main's declared cap"); + } + + #[test] + fn an_external_minimum_above_the_main_cap_is_rejected() { + // S4: when the external's static footprint (min 9) exceeds the main's cap + // (5), the reservation cannot be honored without relaxing the host's + // declared maximum — reject rather than emit an invalid `min > max`. + let mut r = + MemoryReconciler::new(Some(&mem(1, Some(5)))).expect("a 32-bit main is supported"); + let err = r + .fold(Some(&mem(9, None)), true, false, "f") + .expect_err("an external footprint above the main's cap must be rejected"); + assert!(matches!(err, LinkError::IncompatibleMemory { .. 
}), "got {err:?}"); + } + + #[test] + fn grow_against_a_fixed_reconciled_memory_is_rejected() { + let mut r = + MemoryReconciler::new(Some(&mem(1, Some(1)))).expect("a 32-bit main is supported"); + let err = r + .fold(Some(&mem(1, Some(1))), true, true, "f") + .expect_err("growth against a pinned memory must reject"); + assert!(matches!(err, LinkError::IncompatibleMemory { .. }), "got {err:?}"); + } + + #[test] + fn a_smaller_bounded_external_keeps_the_main_maximum() { + // Reconciling a `(memory 10 10)` main with a `(memory 1 3)` external: the + // minimum widens to 10 (the larger), and the maximum stays the main's + // declared 10 (the external's smaller 3 fits under it), so the result is a + // valid `10..10` memory, not an invalid `min > max` and not a relaxed cap. + let mut r = + MemoryReconciler::new(Some(&mem(10, Some(10)))).expect("a 32-bit main is supported"); + r.fold(Some(&mem(1, Some(3))), true, false, "f") + .expect("a smaller external maximum keeps the memory valid"); + let out = r.finish().expect("a memory is emitted"); + assert_eq!(out.minimum, 10); + assert_eq!(out.maximum, Some(10), "the main's declared maximum is preserved"); + } + + #[test] + fn a_no_max_external_keeps_a_pinned_main_cap_not_unbounded() { + // S4 (the audit's named case): main `(memory 1 1)` + external `(memory 1)` + // (no maximum). The external's static footprint (min 1) fits under the + // pinned cap, so the merge succeeds — and the output maximum stays the + // main's pinned 1, never silently unbounded. 
+ let mut r = + MemoryReconciler::new(Some(&mem(1, Some(1)))).expect("a 32-bit main is supported"); + r.fold(Some(&mem(1, None)), true, false, "f") + .expect("a no-max external fits under the pinned cap"); + let out = r.finish().expect("a memory is emitted"); + assert_eq!( + out.maximum, + Some(1), + "the output maximum stays the main's pinned cap, NOT silently unbounded" + ); + } + + fn memory64(initial: u64, maximum: Option<u64>) -> MemoryType { + MemoryType { memory64: true, ..mem(initial, maximum) } + } + + fn shared(initial: u64, maximum: Option<u64>) -> MemoryType { + MemoryType { shared: true, ..mem(initial, maximum) } + } + + fn custom_page(initial: u64, maximum: Option<u64>) -> MemoryType { + MemoryType { page_size_log2: Some(0), ..mem(initial, maximum) } + } + + /// Asserts the reconciler's `new` rejected the main memory by shape. `new` + /// returns `Result<MemoryReconciler, LinkError>`, whose `Ok` arm is not `Debug`, so + /// we match the `Err` directly rather than calling `expect_err`. + fn assert_new_rejects(main_mem: &MemoryType) { + let result = MemoryReconciler::new(Some(main_mem)); + assert!( + matches!(result, Err(LinkError::IncompatibleMemory { .. })), + "expected IncompatibleMemory, got {:?}", + result.err() + ); + } + + #[test] + fn a_memory64_main_is_rejected_absolutely() { + // C-4: the main memory's shape is checked in `new`, so a 64-bit main can + // never reach the output the translator re-encodes as 32-bit. + assert_new_rejects(&memory64(1, Some(1))); + } + + #[test] + fn a_shared_main_is_rejected_absolutely() { + // L-1: a bare `shared` main memory (no atomic op) is rejected by shape. 
+ assert_new_rejects(&shared(1, Some(1))); + } + + #[test] + fn a_custom_page_main_is_rejected_absolutely() { + assert_new_rejects(&custom_page(1, Some(1))); + } + + #[test] + fn a_memory64_external_on_a_memoryless_main_is_rejected_on_the_adopt_path() { + // C-4: the `None => ext` adopt path must reject too, so a 64-bit external + // forwarded by a memoryless main is never silently adopted as 32-bit. + let mut r = MemoryReconciler::new(None).expect("a memoryless main is supported"); + let err = r + .fold(Some(&memory64(1, Some(1))), true, false, "f") + .expect_err("a memory64 external must be rejected on adoption"); + assert!(matches!(err, LinkError::IncompatibleMemory { .. }), "got {err:?}"); + } + + #[test] + fn a_shared_external_on_a_memoryless_main_is_rejected_on_the_adopt_path() { + // L-1: a bare `shared` external (non-atomic body) onto a memoryless main + // is rejected by shape on the adopt path, not just on reconcile. + let mut r = MemoryReconciler::new(None).expect("a memoryless main is supported"); + let err = r + .fold(Some(&shared(1, Some(1))), true, false, "f") + .expect_err("a shared external must be rejected on adoption"); + assert!(matches!(err, LinkError::IncompatibleMemory { .. }), "got {err:?}"); + } + + #[test] + fn a_custom_page_external_on_a_memoryless_main_is_rejected_on_the_adopt_path() { + let mut r = MemoryReconciler::new(None).expect("a memoryless main is supported"); + let err = r + .fold(Some(&custom_page(1, Some(1))), true, false, "f") + .expect_err("a custom-page external must be rejected on adoption"); + assert!(matches!(err, LinkError::IncompatibleMemory { .. }), "got {err:?}"); + } + + #[test] + fn a_memory64_external_against_a_32_bit_main_is_rejected_before_reconcile() { + // The fold-path shape guard runs before `reconcile_pair`, so the + // rejection reason names the unsupported shape, not a `memory64` mismatch. 
+ let mut r = + MemoryReconciler::new(Some(&mem(1, Some(1)))).expect("a 32-bit main is supported"); + let err = r + .fold(Some(&memory64(1, Some(1))), true, false, "f") + .expect_err("a memory64 external must be rejected"); + let LinkError::IncompatibleMemory { reason, .. } = &err else { + panic!("got {err:?}"); + }; + assert!(reason.contains("memory64"), "reason names the unsupported shape: {reason}"); + } + + #[test] + fn ref_typed_signature_is_rejected_at_intern_time() { + // Defense-in-depth behind the WASM 1.0 feature gate: the gate rejects a + // ref-typed external up front, but `intern_sig` is the chokepoint every + // merged signature passes through, so it must independently reject a + // reference type rather than collapse it to `i32` (the prior silent + // miscompile). This is the unit-level coverage for the layer the + // integration test + // `reference_typed_parameter_signature_is_rejected_at_the_feature_gate` + // can no longer reach (the gate fronts it). + use inf_wasmparser::{RefType, ValType}; + + let ref_param = FuncSig { + params: vec![ValType::Ref(RefType::FUNCREF)], + results: vec![], + }; + let err = sig_key(&ref_param).expect_err("a ref-typed param must not be interned"); + assert!( + matches!(&err, LinkError::UnsupportedConstruct(msg) if msg.contains("reference-typed")), + "expected an UnsupportedConstruct naming reference types, got {err:?}" + ); + + let ref_result = FuncSig { + params: vec![], + results: vec![ValType::Ref(RefType::FUNCREF)], + }; + let mut out_types = Vec::new(); + let mut cache = std::collections::BTreeMap::new(); + let err = intern_sig(&mut out_types, &mut cache, &ref_result) + .expect_err("a ref-typed result must not be interned"); + assert!( + matches!(err, LinkError::UnsupportedConstruct(_)), + "expected an UnsupportedConstruct, got {err:?}" + ); + assert!(out_types.is_empty(), "no signature is committed on rejection"); + } + + #[test] + fn v128_signature_is_rejected_at_intern_time() { + // The Inference language has 
no SIMD types, and every SIMD operator is + // rejected, so a `v128` in a function signature must be rejected on the + // signature axis too rather than carried through into the merged type + // table. `sig_key` is the dedup chokepoint every signature passes through; + // `intern_sig` reaches it. This parallels the float/reference rejections. + use inf_wasmparser::ValType; + + let v128_param = FuncSig { + params: vec![ValType::V128], + results: vec![], + }; + let err = sig_key(&v128_param).expect_err("a v128 param must not be interned"); + assert!( + matches!(&err, LinkError::UnsupportedConstruct(msg) if msg.contains("v128")), + "expected an UnsupportedConstruct naming v128, got {err:?}" + ); + + let v128_result = FuncSig { + params: vec![], + results: vec![ValType::V128], + }; + let mut out_types = Vec::new(); + let mut cache = std::collections::BTreeMap::new(); + let err = intern_sig(&mut out_types, &mut cache, &v128_result) + .expect_err("a v128 result must not be interned"); + assert!( + matches!(&err, LinkError::UnsupportedConstruct(msg) if msg.contains("v128")), + "expected an UnsupportedConstruct naming v128, got {err:?}" + ); + assert!(out_types.is_empty(), "no signature is committed on rejection"); + } +} diff --git a/core/wasm-linker/src/parse.rs b/core/wasm-linker/src/parse.rs new file mode 100644 index 00000000..f128edbf --- /dev/null +++ b/core/wasm-linker/src/parse.rs @@ -0,0 +1,704 @@ +//! Owned, section-by-section representation of a parsed WASM module. +//! +//! The linker needs to inspect and re-emit both the main module and each +//! external module. `inf-wasmparser` yields borrowed views into the original +//! bytes; this module copies the parts the linker manipulates into owned +//! structures so the borrow does not outlive a single parse pass. +//! +//! The main module is rebuilt section-by-section after merging, so its +//! exports, memory, globals, and name/custom sections are all retained. An +//! 
external module is only mined for the closure of a satisfied export, so for +//! those only the type table, import/local function split, exports, and bodies +//! matter — but the same structure is reused for both. + +use std::collections::BTreeMap; + +use inf_wasmparser::{ + CompositeInnerType, CustomSectionReader, Export, ExternalKind, FuncType, GlobalType, Import, + KnownCustom, MemoryType, Name, Operator, Parser, Payload, RecGroup, TableType, TypeRef, ValType, +}; + +use crate::LinkError; + +/// A WASM function signature, owned so it survives the parse borrow. +#[derive(Debug, Clone, PartialEq, Eq)] +pub(crate) struct FuncSig { + pub params: Vec<ValType>, + pub results: Vec<ValType>, +} + +impl FuncSig { + fn from_func_type(ty: &FuncType) -> Self { + FuncSig { + params: ty.params().to_vec(), + results: ty.results().to_vec(), + } + } +} + +/// A type-section entry. Non-function composite types are retained as `Other` +/// so that type indices stay aligned with the section they came from; the +/// merge pass only ever copies function types into the output. +#[derive(Debug, Clone, PartialEq, Eq)] +pub(crate) enum TypeEntry { + Func(FuncSig), + Other, +} + +/// An imported function: its `(module, field)` pair and the type index it +/// references. +#[derive(Debug, Clone)] +pub(crate) struct ImportedFunc { + pub module: String, + pub field: String, + pub type_idx: u32, +} + +/// An exported entity and the index it names, retaining its kind so the main +/// module's `memory` / `__stack_pointer` exports survive the rebuild. +#[derive(Debug, Clone)] +pub(crate) struct ExportEntry { + pub name: String, + pub kind: ExternalKind, + pub index: u32, +} + +/// A locally-defined function: its type index plus the verbatim body bytes. 
+#[derive(Debug, Clone)] +pub(crate) struct LocalFunc { + pub type_idx: u32, + /// Raw body bytes: the locals vector and operator stream, *without* the + /// leading body byte-length prefix — what `wasm-encoder::Function::raw` + /// consumes and what the rewrite pass walks. + pub body: Vec<u8>, +} + +/// A global definition, captured with the operators of its (constant) +/// initializer so it can be re-emitted faithfully. +#[derive(Debug, Clone)] +pub(crate) struct GlobalDef { + pub ty: GlobalType, + /// The constant initializer as `i32.const` / `i64.const` style operators. + /// The main module only ever emits a single `i32.const` here. + pub init: GlobalInit, +} + +/// The constant initializer of a global, restricted to the forms the codegen +/// output produces. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub(crate) enum GlobalInit { + I32(i32), + I64(i64), +} + +/// Whether a parsed module is the main module being linked or an external +/// dependency merged into it. Controls whether the `inference.spec_funcs` +/// custom section is decoded: the main module's drives proof-mode translation +/// and is re-emitted, while an external's is verification-only scaffolding that +/// the merge strips, so it is skipped (and never fails the link if malformed). +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum ModuleRole { + Main, + External, +} + +/// The subset of a WASM module the static-merge linker manipulates. +#[derive(Debug, Clone, Default)] +pub(crate) struct ParsedModule { + pub types: Vec<TypeEntry>, + pub imported_funcs: Vec<ImportedFunc>, + /// Count of non-function imports — a self-contained module has none. + pub non_func_imports: usize, + pub exports: Vec<ExportEntry>, + /// Locally-defined functions in function-index order (imports occupy the + /// indices below `imported_funcs.len()`). + pub local_funcs: Vec<LocalFunc>, + pub globals: Vec<GlobalDef>, + pub tables: Vec<TableType>, + pub element_count: usize, + pub data_count: usize, + pub memory: Option<MemoryType>, + /// The number of memories the module declares. 
A module with more than one + /// memory cannot be merged: the body's memargs would name memories the + /// single shared output memory does not have. Counted (rather than inferred + /// from `memory.is_some()`) so the merge can reject multi-memory modules + /// explicitly instead of silently dropping every memory past the first. + pub memory_count: usize, + /// The start-function index, if the module declares a start section. A + /// merged external must not declare one: its initialization closure is never + /// folded in, so the merge rejects it rather than silently dropping the + /// side-effects. + pub start: Option<u32>, + /// Debug names from the `name` custom section, keyed by global function + /// index. Retained so the merged output keeps sane function names (which the + /// Rocq translator reads to name its `Definition`s); merged external bodies + /// that carry no name fall back to their satisfied export field. + pub func_names: BTreeMap<u32, String>, + /// The module name from the `name` custom section's module subsection, if + /// present. Re-emitted so the Rocq translator's `Definition <module_name>` survives + /// the merge. + pub module_name: Option<String>, + /// Per-function local names from the `name` section's local subsection, + /// keyed by global function index, each carrying `(local_idx, name)` pairs. + /// The function index shifts with the merge; the local indices within a + /// function do not. Retained so the proof artifact keeps local debug names. + pub local_names: BTreeMap<u32, Vec<(u32, String)>>, + /// The decoded `inference.spec_funcs` custom section: `spec_name -> + /// [func_idx]` in the *pre-link* index space. The merge rewrites each index + /// into the output space and re-emits the section, so a bare linked `.wasm` + /// still names the correct spec functions (the input to formal verification). + /// Only the main module carries one; externals never do. + pub spec_funcs: Option<Vec<(String, Vec<u32>)>>, + /// The logical, `::`-joined module reference this module was bound under + /// (e.g. 
`"crypto::sha256"`), for an external; empty for the main module. + /// The merge matches each main-module import's recorded `(module, field)` + /// against this, so two externals exporting the same field but bound under + /// different logical modules can be disambiguated. + pub logical_module: String, +} + +impl ParsedModule { + /// The function index of the first locally-defined function. + pub(crate) fn local_func_base(&self) -> u32 { + self.imported_funcs.len() as u32 + } + + /// Returns the type signature for a function by its global function index. + pub(crate) fn func_sig(&self, func_idx: u32) -> Option<&FuncSig> { + let type_idx = if (func_idx as usize) < self.imported_funcs.len() { + self.imported_funcs[func_idx as usize].type_idx + } else { + self.local_funcs + .get(func_idx as usize - self.imported_funcs.len())? + .type_idx + }; + match self.types.get(type_idx as usize)? { + TypeEntry::Func(sig) => Some(sig), + TypeEntry::Other => None, + } + } + + /// The debug name recorded for a function by its global function index, + /// if the source module carried a `name` custom section entry for it. + pub(crate) fn func_name(&self, func_idx: u32) -> Option<&str> { + self.func_names.get(&func_idx).map(String::as_str) + } + + /// The function index an export of this name resolves to, if it is a + /// function export. + pub(crate) fn exported_func_index(&self, name: &str) -> Option<u32> { + self.exports + .iter() + .find(|e| e.name == name && e.kind == ExternalKind::Func) + .map(|e| e.index) + } + + /// Parses `bytes` into the owned representation, recording `logical_module` + /// as the logical name the module was bound under (empty for the main + /// module). The merge uses it to disambiguate two externals that export the + /// same field but were bound from different logical modules. 
+ /// + /// An external module's `inference.spec_funcs` custom section — and any spec + /// functions it names — are *not* merged into the executable output: only + /// the executable closure of the satisfied export crosses the merge. So the + /// section is skipped here rather than decoded, and a malformed one in an + /// external never fails the link (the section is irrelevant to the merge). + pub(crate) fn parse_external(bytes: &[u8], logical_module: &str) -> Result<Self, LinkError> { + let mut module = Self::parse_with_role(bytes, ModuleRole::External)?; + module.logical_module = logical_module.to_string(); + Ok(module) + } + + /// Parses the main module's `bytes`, decoding its `inference.spec_funcs` + /// section (a verification deliverable the merge re-emits, re-indexed). + pub(crate) fn parse(bytes: &[u8]) -> Result<Self, LinkError> { + Self::parse_with_role(bytes, ModuleRole::Main) + } + + /// Parses `bytes` into the owned representation under the given `role`, which + /// decides whether the `inference.spec_funcs` custom section is decoded (main + /// module) or skipped (external module). + fn parse_with_role(bytes: &[u8], role: ModuleRole) -> Result<Self, LinkError> { + let mut module = ParsedModule::default(); + + // Running cursor into `local_funcs` for code-section assignment. Code + // bodies arrive in function-declaration order, so the i-th body fills + // slot i; tracking the next slot makes parsing O(N) instead of the + // O(N^2) linear scan a wide external module would otherwise incur. 
+ let mut next_body_idx = 0usize; + + for payload in Parser::new(0).parse_all(bytes) { + let payload = payload.map_err(|e| LinkError::Parse(e.to_string()))?; + match payload { + Payload::TypeSection(reader) => { + for group in reader { + let group = group.map_err(|e| LinkError::Parse(e.to_string()))?; + collect_types(&group, &mut module.types); + } + } + Payload::ImportSection(reader) => { + for import in reader { + let import = import.map_err(|e| LinkError::Parse(e.to_string()))?; + collect_import(&import, &mut module); + } + } + Payload::FunctionSection(reader) => { + for type_idx in reader { + let type_idx = type_idx.map_err(|e| LinkError::Parse(e.to_string()))?; + module.local_funcs.push(LocalFunc { + type_idx, + body: Vec::new(), + }); + } + } + Payload::ExportSection(reader) => { + for export in reader { + let Export { name, kind, index } = + export.map_err(|e| LinkError::Parse(e.to_string()))?; + module.exports.push(ExportEntry { + name: name.to_string(), + kind, + index, + }); + } + } + Payload::GlobalSection(reader) => { + for global in reader { + let global = global.map_err(|e| LinkError::Parse(e.to_string()))?; + module.globals.push(collect_global(&global)?); + } + } + Payload::TableSection(reader) => { + for table in reader { + let table = table.map_err(|e| LinkError::Parse(e.to_string()))?; + module.tables.push(table.ty); + } + } + Payload::ElementSection(reader) => { + module.element_count += reader.count() as usize; + } + Payload::DataSection(reader) => { + module.data_count += reader.count() as usize; + } + Payload::MemorySection(reader) => { + for memory in reader { + let memory = memory.map_err(|e| LinkError::Parse(e.to_string()))?; + module.memory_count += 1; + if module.memory.is_none() { + module.memory = Some(memory); + } + } + } + Payload::StartSection { func, .. 
} => { + module.start = Some(func); + } + Payload::CodeSectionEntry(body) => { + assign_body(&mut module, &mut next_body_idx, &body)?; + } + Payload::CustomSection(reader) => { + collect_custom_section(&reader, &mut module, role)?; + } + _ => {} + } + } + + Ok(module) + } +} + +fn collect_types(group: &RecGroup, out: &mut Vec<TypeEntry>) { + for sub_type in group.types() { + match &sub_type.composite_type.inner { + CompositeInnerType::Func(func_type) => { + out.push(TypeEntry::Func(FuncSig::from_func_type(func_type))); + } + _ => out.push(TypeEntry::Other), + } + } +} + +/// Mines a custom section for everything the merge must carry through: the +/// `name` section's module/function/local subsections, and (for the main module +/// only) the `inference.spec_funcs` section that drives proof-mode translation. +/// +/// The `name` subsections are best-effort (an unparseable one is skipped). The +/// main module's `inference.spec_funcs` payload, by contrast, is a verification +/// deliverable: a malformed one is a hard [`LinkError`], never silently dropped. +/// An external module's spec section is verification-only scaffolding the merge +/// strips, so it is skipped here without decoding — its presence never fails the +/// link, and a malformed one in an irrelevant external cannot block the merge. +fn collect_custom_section( + custom: &CustomSectionReader, + module: &mut ParsedModule, + role: ModuleRole, +) -> Result<(), LinkError> { + if custom.name() == crate::spec_funcs::SECTION_NAME { + if role == ModuleRole::External { + return Ok(()); + } + // A second spec_funcs section would silently discard the first under a + // last-wins assignment, dropping its proof obligations. Since the section + // is a verification deliverable, reject the duplicate with a clean error + // rather than vanish the earlier obligations. 
+ if module.spec_funcs.is_some() { + return Err(LinkError::Parse( + "main module declares more than one inference.spec_funcs section; \ + its proof obligations would be silently dropped" + .into(), + )); + } + let decoded = crate::spec_funcs::decode(custom.data())?; + module.spec_funcs = Some(decoded); + return Ok(()); + } + + let KnownCustom::Name(names) = custom.as_known() else { + return Ok(()); + }; + for subsection in names { + let Ok(subsection) = subsection else { + continue; + }; + match subsection { + Name::Module { name, .. } => module.module_name = Some(name.to_string()), + Name::Function(func_names) => { + for naming in func_names { + let Ok(naming) = naming else { + continue; + }; + module.func_names.insert(naming.index, naming.name.to_string()); + } + } + Name::Local(indirect) => { + for per_func in indirect { + let Ok(per_func) = per_func else { + continue; + }; + let mut locals = Vec::new(); + for naming in per_func.names { + let Ok(naming) = naming else { + continue; + }; + locals.push((naming.index, naming.name.to_string())); + } + if !locals.is_empty() { + module.local_names.insert(per_func.index, locals); + } + } + } + _ => {} + } + } + Ok(()) +} + +fn collect_import(import: &Import, module: &mut ParsedModule) { + match import.ty { + TypeRef::Func(type_idx) => module.imported_funcs.push(ImportedFunc { + module: import.module.to_string(), + field: import.name.to_string(), + type_idx, + }), + TypeRef::Global(_) | TypeRef::Table(_) | TypeRef::Memory(_) | TypeRef::Tag(_) => { + module.non_func_imports += 1; + } + } +} + +fn collect_global(global: &inf_wasmparser::Global) -> Result<GlobalDef, LinkError> { + let mut ops = global.init_expr.get_operators_reader(); + let first = ops + .read() + .map_err(|e| LinkError::Parse(e.to_string()))?; + let init = match first { + Operator::I32Const { value } => GlobalInit::I32(value), + Operator::I64Const { value } => GlobalInit::I64(value), + // Only the two integer constant initializers are modeled. 
A float + // initializer (`f32.const`/`f64.const`) is the most likely "other" here — + // the Inference language has no `f32`/`f64` types — so the catch-all names + // what is supported rather than mislabeling a constant `f32.const` as + // "non-constant". A float global is also rejected up front by the feature + // gate; this is the chokepoint for the main-module path that bypasses it. + other => { + return Err(LinkError::UnsupportedConstruct(format!( + "unsupported global initializer for the static merge: {other:?} \ + (only i32.const/i64.const are modeled)" + ))); + } + }; + Ok(GlobalDef { + ty: global.ty, + init, + }) +} + +/// Stores a code-section body against the local function at `next_body_idx`, +/// then advances the cursor. Bodies arrive in function-declaration order, so +/// this assigns body `i` to local function `i` in a single linear pass. +fn assign_body( + module: &mut ParsedModule, + next_body_idx: &mut usize, + body: &inf_wasmparser::FunctionBody, +) -> Result<(), LinkError> { + let Some(slot) = module.local_funcs.get_mut(*next_body_idx) else { + return Err(LinkError::Parse( + "code section has more bodies than declared functions".into(), + )); + }; + slot.body = body.as_bytes().to_vec(); + *next_body_idx += 1; + Ok(()) +} + +#[cfg(test)] +mod tests { + //! Parser unit tests for sections and constructs the `link` API only sees + //! indirectly: non-function imports, element/data/table counting, and the + //! function-signature lookup that bridges the import/local index split. + + use super::*; + + fn parse(wat: &str) -> ParsedModule { + let bytes = wat::parse_str(wat).expect("valid WAT"); + ParsedModule::parse(&bytes).expect("parse") + } + + #[test] + fn counts_non_function_imports_separately() { + // A memory import and a global import are non-function imports; only the + // function import enters `imported_funcs`, the rest bump the count. 
+ let module = parse( + r#" + (module + (type (;0;) (func)) + (import "env" "memory" (memory (;0;) 1)) + (import "env" "g" (global (;0;) i32)) + (import "env" "log" (func (;0;) (type 0))) + (func (;1;) (type 0)) + (export "f" (func 1))) + "#, + ); + assert_eq!(module.imported_funcs.len(), 1, "one function import"); + assert_eq!(module.imported_funcs[0].field, "log"); + assert_eq!( + module.non_func_imports, 2, + "the memory and global imports are counted as non-function imports" + ); + } + + #[test] + fn counts_table_element_and_data_sections() { + let module = parse( + r#" + (module + (type (;0;) (func)) + (table (;0;) 1 1 funcref) + (memory (;0;) 1) + (func (;0;) (type 0)) + (elem (;0;) (i32.const 0) func 0) + (data (;0;) (i32.const 0) "ab") + (export "f" (func 0))) + "#, + ); + assert_eq!(module.tables.len(), 1, "one table"); + assert_eq!(module.element_count, 1, "one element segment"); + assert_eq!(module.data_count, 1, "one data segment"); + assert!(module.memory.is_some(), "memory captured"); + } + + #[test] + fn func_sig_reads_imported_and_local_function_types() { + // `func_sig` must follow the index space: function 0 is the import, 1 is + // the local — and both share the same type. 
+ let module = parse( + r#" + (module + (type (;0;) (func (param i32) (result i32))) + (import "env" "ext" (func (;0;) (type 0))) + (func (;1;) (type 0) (param i32) (result i32) + local.get 0) + (export "f" (func 1))) + "#, + ); + let import_sig = module.func_sig(0).expect("imported function has a type"); + let local_sig = module.func_sig(1).expect("local function has a type"); + assert_eq!(import_sig.params, vec![ValType::I32]); + assert_eq!(import_sig.results, vec![ValType::I32]); + assert_eq!(import_sig, local_sig, "import and local share one type here"); + assert!( + module.func_sig(99).is_none(), + "an out-of-range function index has no signature" + ); + } + + #[test] + fn captures_function_names_from_the_name_section() { + // The `name` custom section's function subsection must be mined so merged + // and main functions keep sane names for the Rocq translator. + let module = parse( + r#" + (module + (type (;0;) (func)) + (func $entry (;0;) (type 0)) + (export "f" (func 0))) + "#, + ); + assert_eq!( + module.func_name(0), + Some("entry"), + "the $entry debug name must be captured" + ); + assert_eq!(module.func_name(1), None, "no name for an unnamed index"); + } + + #[test] + fn exported_func_index_ignores_non_function_exports() { + // A memory export named `shared` must not be mistaken for a function of + // that name. 
+ let module = parse( + r#" + (module + (type (;0;) (func)) + (memory (;0;) 1) + (func (;0;) (type 0)) + (export "shared" (memory 0)) + (export "run" (func 0))) + "#, + ); + assert_eq!( + module.exported_func_index("shared"), + None, + "a memory export is not a function export" + ); + assert_eq!(module.exported_func_index("run"), Some(0)); + } + + #[test] + fn rejects_invalid_bytes() { + let err = ParsedModule::parse(b"definitely not wasm").unwrap_err(); + assert!(matches!(err, LinkError::Parse(_)), "got {err:?}"); + } + + #[test] + fn captures_the_start_function_index() { + // The start section must be captured so the merge can reject modules that + // declare one (their initialization closure is never folded in). + let module = parse( + r#" + (module + (type (;0;) (func)) + (func (;0;) (type 0)) + (func (;1;) (type 0)) + (start 1) + (export "f" (func 0))) + "#, + ); + assert_eq!(module.start, Some(1), "the start function index is captured"); + } + + #[test] + fn no_start_section_leaves_start_none() { + let module = parse( + r#" + (module + (type (;0;) (func)) + (func (;0;) (type 0)) + (export "f" (func 0))) + "#, + ); + assert_eq!(module.start, None, "no start section means no start index"); + } + + #[test] + fn code_bodies_are_assigned_in_function_order() { + // The running-cursor assignment must place each body on its own function + // in declaration order, so a function's `type_idx` and `body` agree. 
+ let module = parse( + r#" + (module + (type (;0;) (func (result i32))) + (type (;1;) (func (result i64))) + (func (;0;) (type 0) (result i32) i32.const 1) + (func (;1;) (type 1) (result i64) i64.const 2) + (export "a" (func 0)) + (export "b" (func 1))) + "#, + ); + assert_eq!(module.local_funcs.len(), 2); + assert_eq!(module.local_funcs[0].type_idx, 0, "first body -> function 0"); + assert_eq!(module.local_funcs[1].type_idx, 1, "second body -> function 1"); + assert!( + module.local_funcs.iter().all(|f| !f.body.is_empty()), + "every function received a body" + ); + } + + /// A minimal one-function module carrying an `inference.spec_funcs` custom + /// section with the given payload. + fn module_with_spec_section(payload: &[u8]) -> Vec<u8> { + use wasm_encoder::{ + CodeSection, CustomSection, ExportKind, ExportSection, Function, FunctionSection, Module, + TypeSection, + }; + let mut module = Module::new(); + let mut types = TypeSection::new(); + types.ty().function([], []); + module.section(&types); + let mut funcs = FunctionSection::new(); + funcs.function(0); + module.section(&funcs); + let mut exports = ExportSection::new(); + exports.export("f", ExportKind::Func, 0); + module.section(&exports); + let mut code = CodeSection::new(); + let mut body = Function::new([]); + body.instruction(&wasm_encoder::Instruction::End); + code.function(&body); + module.section(&code); + module.section(&CustomSection { + name: crate::spec_funcs::SECTION_NAME.into(), + data: payload.into(), + }); + module.finish() + } + + #[test] + fn main_decodes_the_spec_section_external_skips_it() { + // The main module's spec section is a verification deliverable: decode it. + // An external's is verification-only scaffolding the merge strips: skip + // it so the external never even materialises a `spec_funcs` field. + // version=1, count=1, name_len=1 'S', idx_count=1, idx=0. 
+ let payload = [1u8, 1, 1, b'S', 1, 0]; + let bytes = module_with_spec_section(&payload); + + let main = ParsedModule::parse(&bytes).expect("main parse"); + assert_eq!( + main.spec_funcs, + Some(vec![("S".to_string(), vec![0])]), + "the main module must decode its spec section" + ); + + let external = ParsedModule::parse_external(&bytes, "lib").expect("external parse"); + assert_eq!( + external.spec_funcs, None, + "an external's spec section must be skipped, not decoded" + ); + } + + #[test] + fn a_malformed_spec_section_fails_main_but_not_external() { + // A malformed spec section (version byte 0xff) is a hard error for the + // main module (a verification deliverable), but for an external — which + // strips it — it must not fail the parse at all. + let bytes = module_with_spec_section(&[0xffu8, 0xff, 0xff]); + + assert!( + matches!(ParsedModule::parse(&bytes), Err(LinkError::Parse(_))), + "a malformed main spec section must be a hard parse error" + ); + assert!( + ParsedModule::parse_external(&bytes, "lib").is_ok(), + "a malformed external spec section must not fail the parse" + ); + } +} diff --git a/core/wasm-linker/src/provenance.rs b/core/wasm-linker/src/provenance.rs new file mode 100644 index 00000000..ff77671e --- /dev/null +++ b/core/wasm-linker/src/provenance.rs @@ -0,0 +1,1517 @@ +//! Address-provenance analysis for Tier-B soundness. +//! +//! Tier B's contract is that a merged external touches the single shared memory +//! *only through addresses the caller passes in* — never an address it +//! fabricates from a constant or reads from its own global/stack. Such a +//! fabricated address would alias the host program's own linear memory at a +//! fixed offset, a silent miscompile the static merge cannot detect by section +//! inspection alone (the body validates and the export signature matches). +//! +//! This module proves the contract by a **sound, flow-sensitive, +//! interprocedural** abstract interpretation over the whole closure. The single +//! 
trusted source of addresses is the **closure root's** parameters — whatever +//! pointer the caller passes the satisfied export, the caller owns. Every memory +//! access, in the root or any function it transitively calls, must address +//! memory through a value that provably derives from a *trusted* parameter on +//! every reachable control-flow path. Anything not proven safe rejects the whole +//! closure as Tier C ([`LinkError::RequiresRelocatableBuild`]). Fail closed. +//! +//! For a bulk-memory op (`memory.fill`/`memory.copy`/`memory.init`) the **size / +//! extent** operand carries the *same* caller-derivation requirement as an +//! address. Such an op touches the contiguous region `[address, address + size)`, +//! so proving only the start caller-relative is not enough: a constant or global +//! extent would let the op clobber or read an unbounded span above a caller +//! pointer (`memory.fill(base, v, 0x8000)` scorches host memory the caller never +//! exposed) — the same unbounded-clobber the rejected counted-loop form achieves, +//! one instruction at a time. Modeling the extent with the address rule keeps the +//! realistic caller-owns-`(ptr, len)` pattern linkable while closing that escape. +//! +//! ## The lattice +//! +//! Every operand-stack slot and every local carries one of three provenance tags: +//! +//! - [`Prov::Param`] — the value provably flows from one or more function +//! parameters (the carried [`ParamMask`] records *which* of this function's +//! parameters) through operations that cannot erase the caller's pointer +//! (`local.get` of a `Param` slot, `add` of a `Param` with a `Param` or a +//! proven `Const`, `sub` of a `Param` minuend by a non-`Param` subtrahend). +//! - [`Prov::Const`] — the value provably is a compile-time constant +//! (caller-independent), produced by a `*.const` literal or by `add`/`sub` of +//! two `Const`s. This tag exists solely so a `Param + Const` (a struct-field or +//! 
array-element offset) can stay `Param`, while a `Param + NotParam` cannot — +//! a `NotParam` addend is only *not provably param-derived*, so it may secretly +//! hold a negated parameter (`C - p`) that cancels the `Param` operand back to a +//! caller-independent constant (`(C - p) + p == C`). A constant used *directly* +//! as a memory address is still rejected (it is not `Param`). +//! - [`Prov::NotParam`] — every other producer: a global, a call result, the +//! table space, *any* multiplicative/bitwise/shift op (each can cancel the +//! caller contribution: `param*0`, `param&0`, `param^param`), every unary op, +//! any other binary op whose operands are not both proven constant, or any +//! source the analysis cannot prove parameter-derived. The fail-closed default +//! for an uninitialized local, a stack underflow, and any unmodeled situation. +//! +//! The lattice join is the must-join: a value stays `Param` only when it is +//! `Param` on *all* incoming paths (the carried mask is then the **union** of +//! the per-path masks — on every path it derives from *some* parameter, so on the +//! merged path it derives from one of the union), and it widens to `NotParam` the +//! moment any incoming path is `Const` or `NotParam` (a value that is `Const` on +//! *all* incoming paths stays `Const`). The lattice has no value +//! identity, so it can never prove two `Param` operands unequal — which is why +//! every operator that can cancel two equal `Param` inputs to a constant (`sub`, +//! `xor`, `and`, `mul`, …) treats its result as `NotParam`. +//! +//! ### Why the mask join is a union, and verification is `⊆` +//! +//! A `select(p0, p1)` or a two-armed `if` yields `p0` on one path and `p1` on the +//! other; at runtime the address is *either* parameter. The merged value derives +//! from `{p0} ∪ {p1}`, and it is a safe address only when **every** parameter it +//! might resolve to is caller-supplied — so verification requires the access's +//! mask to be a **subset** of the trusted-parameter set, not merely to intersect +//! it.
(The same `⊆` rule is required for an `add(p0, p1)`: trusting only one +//! operand is not enough, because an untrusted operand may hold `C - p0` and +//! cancel the trusted one to a fixed address — see the next section — so every +//! parameter in the mask must be trusted.) +//! +//! ### Why `add` is not symmetric in `Param` +//! +//! `add` propagates `Param` only when the *other* operand is a `Param` or a +//! proven `Const`, never when it is a general `NotParam`. Tagging `Param + X` +//! as `Param` whenever either operand is `Param` is **unsound**: `NotParam` means +//! "not provably parameter-derived", *not* "constant". A `NotParam` operand may +//! hold `C - p` (the `sub` rule correctly demotes `const - param` to +//! `NotParam`), and `(C - p) + p == C` is a fixed, caller-independent absolute +//! address. Restricting the non-`Param` addend to a proven `Const` closes this: +//! `caller_base + fixed_offset` provably still varies with the caller's pointer. +//! +//! ## Control flow +//! +//! The analysis is a structured forward abstract interpretation over the WASM +//! structured-control tree (`block`/`loop`/`if`/`else`/`end` and the four +//! Inference non-det blocks), with [`State::join`] of the per-local state at +//! every merge point (`end`, `else`, branch target) and a loop fixpoint over +//! back-edges. A local is `Param` at a use only if `Param` on *every* reaching +//! path, so a `Param` tag written on one branch or one loop iteration cannot +//! survive a merge with a path that leaves it `NotParam`. +//! +//! ## Interprocedural policy (the sound call-graph fixpoint) +//! +//! Each function is summarised *once* against a fixed seed: parameter `i` seeds +//! `Param({i})`. From that single pass the analysis records, per function, the +//! address mask of every memory access and, per call site, the argument mask of +//! every argument (each in the calling function's own parameter terms). A +//! greatest-fixpoint pass over the call graph then computes, for every function +//!
`g`, the set `trusted[g]` of `g`'s parameters that are provably caller-derived: +//! +//! - `trusted[root]` starts as **all** of the root's parameters (the external +//! caller owns them), but the root is not exempt from its own internal call +//! sites: a self- or mutually-recursive call that passes an unjustified +//! argument for a root parameter demotes that parameter. +//! - a parameter `j` of a non-root `g` is trusted iff at *every* recorded call +//! site `f → g` the argument in position `j` is itself param-derived from +//! `trusted[f]` — i.e. its mask is non-empty and a subset of `trusted[f]`. +//! +//! Starting from "all parameters trusted" and iteratively removing any parameter +//! contradicted at a call site converges (a finite lattice, monotone descent), +//! handling self- and mutual recursion. A function reachable only through a table +//! (no direct call site) keeps its default-untrusted parameters, so a dereference +//! of an unjustified parameter is rejected. Call/`call_indirect` results are +//! always `NotParam`. Finally every memory access is verified against its +//! function's `trusted` set (`mask ⊆ trusted`); any access that fails rejects the +//! whole closure. + +use inf_wasmparser::{BinaryReader, BlockType, FunctionBody, Operator}; + +use crate::parse::{FuncSig, ParsedModule}; +use crate::LinkError; + +/// A set of a single function's parameter indices, as a 64-bit bitset. +/// +/// WebAssembly permits more parameters than 64, but a function with that many is +/// neither produced by Inference codegen nor a realistic shared-memory helper; +/// any parameter whose index is `>= 64` cannot be represented, so it is treated +/// as **never trusted** — its bit is simply absent, a value deriving solely from +/// it stays `NotParam`, and a dereference through it is rejected. This is a sound +/// over-rejection at the high-arity tail, never an unsound admission. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] +struct ParamMask(u64); + +impl ParamMask { + /// The empty mask: derives from no parameter. +
+ const EMPTY: ParamMask = ParamMask(0); + + /// The mask of the single parameter `index`, or the empty mask when `index` + /// exceeds the representable range (so a high-arity parameter is never + /// trusted). + fn single(index: usize) -> ParamMask { + if index < 64 { + ParamMask(1 << index) + } else { + ParamMask::EMPTY + } + } + + /// The mask of the parameters `0..count`, saturating at the 64-bit range. + fn first_n(count: usize) -> ParamMask { + if count >= 64 { + ParamMask(u64::MAX) + } else if count == 0 { + ParamMask::EMPTY + } else { + ParamMask((1u64 << count) - 1) + } + } + + fn is_empty(self) -> bool { + self.0 == 0 + } + + fn union(self, other: ParamMask) -> ParamMask { + ParamMask(self.0 | other.0) + } + + /// Whether every parameter in `self` is also in `other` (`self ⊆ other`). + fn is_subset_of(self, other: ParamMask) -> bool { + self.0 & !other.0 == 0 + } + + fn without(self, index: usize) -> ParamMask { + if index < 64 { + ParamMask(self.0 & !(1 << index)) + } else { + self + } + } +} + +/// Upper bound on declared locals, mirroring `inf_wasmparser`'s +/// `MAX_WASM_FUNCTION_LOCALS` (the validator's own cap, which the driver's +/// pre-link validation already enforces for the CLI path). Re-stated here as a +/// private constant because that limit is not re-exported from the parser, and +/// the locals cap below combines it with the body length so the public library +/// API is self-defending even without the driver's validation gate. Each +/// declared local costs at least one byte in the locals encoding, so a body of +/// `B` bytes can never legitimately declare more than `B` locals; a count +/// exceeding `min(this, B)` is a malformed/adversarial group, rejected as a +/// clean [`LinkError::Parse`] rather than a multi-gigabyte allocation. +const MAX_WASM_FUNCTION_LOCALS: usize = 50_000; + +/// Maximum structured-block nesting the analysis recurses into before failing +/// closed. 
The function-size cap bounds total operators, but a deeply-nested +/// body recurses one stack frame per level (`interpret` → `run_block` → +/// `interpret`), so past this depth the body is conservatively rejected (Tier C) +/// rather than risk an abort. The bound is kept well under what the smallest +/// stack the analysis runs on (a 2 MiB test thread) can hold, so the guard fires +/// long before a real overflow. +const MAX_ANALYSIS_DEPTH: usize = 256; + +/// The provenance lattice for a value (an operand-stack slot or a local). +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum Prov { + /// The value provably derives from one or more of *this function's* + /// parameters (recorded in the [`ParamMask`]) and cannot have been cancelled + /// to a caller-independent constant. The mask is always non-empty. + Param(ParamMask), + /// The value provably is a compile-time constant (a `*.const` literal, or + /// `add`/`sub` of two `Const`s). Caller-independent: never a valid memory + /// address on its own, but a valid *offset* to add to a `Param` base. + Const, + /// The value derives from a global, call result, a parameter-cancelling + /// operator, a non-constant binary op, or any source the analysis cannot + /// prove parameter-derived or constant. The fail-closed default. + NotParam, +} + +impl Prov { + /// The must-join: a value stays `Param` only when *both* operands are + /// `Param`, with the **union** of their masks (on every path it derives from + /// some parameter, so on the merged path it derives from one of the union). + /// Two `Const`s stay `Const`; anything else widens to `NotParam`. Used to + /// merge a local (or a stack slot) across control-flow paths. + fn join(self, other: Prov) -> Prov { + match (self, other) { + (Prov::Param(a), Prov::Param(b)) => Prov::Param(a.union(b)), + (Prov::Const, Prov::Const) => Prov::Const, + _ => Prov::NotParam, + } + } + + /// The mask of parameters this value derives from, empty for a non-`Param`. 
+ fn mask(self) -> ParamMask { + match self { + Prov::Param(m) => m, + _ => ParamMask::EMPTY, + } + } +} + +/// The abstract state at a program point: the provenance of every local and of +/// every operand-stack slot in the current block. +#[derive(Debug, Clone, PartialEq, Eq)] +struct State { + /// Per-local provenance, length = the (capped) local count. + locals: Vec, + /// The abstract operand stack within the current structured block. + stack: Vec, +} + +impl State { + /// Elementwise must-join of two states. The `locals` vectors always share + /// one length. When the two operand stacks have equal height (the case at + /// every real merge point a valid body produces) they join elementwise; + /// when they differ the merged stack is widened to all-`NotParam` of the + /// taller height — failing closed, never accepting a stale `Param`. + fn join(&self, other: &State) -> State { + let locals = self + .locals + .iter() + .zip(&other.locals) + .map(|(a, b)| a.join(*b)) + .collect(); + + let stack = if self.stack.len() == other.stack.len() { + self.stack + .iter() + .zip(&other.stack) + .map(|(a, b)| a.join(*b)) + .collect() + } else { + vec![Prov::NotParam; self.stack.len().max(other.stack.len())] + }; + + State { locals, stack } + } +} + +/// A direct `call` site recorded during a function's summary pass: the callee's +/// global function index and the provenance mask of each argument, expressed in +/// the *calling* function's own parameter terms. +#[derive(Debug, Clone)] +struct CallSite { + callee: u32, + arg_masks: Vec, +} + +/// The flow-sensitive summary of one function, computed *once* with each +/// parameter `i` seeded `Param({i})`. The interprocedural fixpoint reads these +/// summaries; nothing in them depends on which parameters are trusted, so the +/// per-function abstract interpretation never re-runs. +#[derive(Debug, Default, Clone)] +struct FunctionSummary { + /// The number of leading locals that are parameters. 
+ param_count: usize, + /// One entry per memory access: the provenance mask of its required address + /// operand(s) (`memory.copy` records one entry each for its dest and src so + /// the single subset check covers both). For a bulk-memory op the **size / + /// extent** operand is recorded as its own entry too: the op touches the + /// whole region `[address, address + size)`, so a caller-bounded start is not + /// enough — the extent must be caller-derived as well, or a constant size + /// could clobber an unbounded region above a caller pointer. An empty mask is + /// an address or extent the pass could not prove parameter-derived — it can + /// never satisfy the subset check, so it rejects unconditionally. + accesses: Vec, + /// One entry per direct `call` site, in body order. + calls: Vec, +} + +/// Verifies that every memory access across the whole closure addresses memory +/// through a value derived from the closure **root's** parameters, propagated +/// across internal `call`s that pass param-derived arguments. Returns `Ok(())` +/// when the closure is provably parameter-addressing (sound for Tier B), or +/// [`LinkError::RequiresRelocatableBuild`] naming `field` when any function +/// performs a memory access whose address cannot be proven caller-supplied. +/// +/// `func_indices` are the global function indices of the closure (as produced by +/// [`crate::closure::compute`], ascending); `root` is the satisfied export, whose +/// parameters are the trusted caller pointers. Each function is summarised once, +/// then a greatest-fixpoint over the call graph computes which of every +/// function's parameters are trusted, and finally every access is checked against +/// its function's trusted set. +pub(crate) fn verify_param_addressing( + module: &ParsedModule, + func_indices: &[u32], + root: u32, + field: &str, +) -> Result<(), LinkError> { + let base = module.local_func_base(); + + // 1. Summarise every closure function once (param i seeded Param({i})). 
+ // `summaries` is keyed by global function index for ordered, O(log n) + // fixpoint lookup. + let mut summaries: std::collections::BTreeMap<u32, FunctionSummary> = + std::collections::BTreeMap::new(); + for &func_idx in func_indices { + let param_count = module + .func_sig(func_idx) + .map(|sig| sig.params.len()) + .ok_or_else(|| { + LinkError::Parse(format!( + "closure function {func_idx} has no function type for provenance analysis" + )) + })?; + // `checked_sub` turns a closure index below the local-function base into + // the clean parse error below instead of a `u32` underflow, which would + // panic in a debug build before the range check could run. + let local = func_idx + .checked_sub(base) + .and_then(|local_idx| module.local_funcs.get(local_idx as usize)) + .ok_or_else(|| { + LinkError::Parse(format!( + "closure function {func_idx} is out of range for provenance analysis" + )) + })?; + summaries.insert(func_idx, summarize_function(module, &local.body, param_count)?); + } + + // 2. Greatest fixpoint: trusted[g] starts as all of g's params (the root's are + // justified by its external caller), and any param contradicted at a call + // site — the root's own recursive call sites included — is removed until + // the assignment stabilises. + let trusted = compute_trusted_params(&summaries, root); + + // 3. Verify every access: its address mask must be a non-empty subset of its + // function's trusted set. + for (&func_idx, summary) in &summaries { + let trust = trusted.get(&func_idx).copied().unwrap_or(ParamMask::EMPTY); + for &access in &summary.accesses { + if access.is_empty() || !access.is_subset_of(trust) { + return Err(reject( + field, + "accesses memory at an address not derived from the exported function's \ + parameters (a constant, a module-internal address, or an argument a \ + caller never supplied); a relocatable build is required to place its \ + data safely", + )); + } + } + } + + Ok(()) +} + +/// Computes, for every summarised function, the subset of its parameters that are +/// provably caller-derived (the *greatest* fixpoint). +/// +/// The root has an implicit **external** call site — the host that calls the +/// exported function and supplies its pointer arguments — that justifies all of +/// the root's parameters.
Every other function is justified only by the *internal* +/// `call` sites inside the closure. A parameter of a function `g` is trusted only +/// when it is justified at the external site (root only) *and* at every internal +/// call site `f → g`: the argument in that position must be non-empty and a subset +/// of `f`'s own trusted set. Crucially the root is **not** exempt from its own +/// internal call sites: a self- or mutually-recursive call that passes a constant +/// for a root parameter demotes it, because the recursive invocation re-enters the +/// root with an argument the host never supplied. +/// +/// Starting from "all parameters trusted" and removing any parameter a reachable +/// call site contradicts is monotone (a parameter never re-enters once removed) +/// over a finite lattice, so the iteration converges — handling self- and mutual +/// recursion. A non-root function with no internal call site (reachable only +/// through a table, or unreachable directly) is left with the empty trusted set, +/// so a dereference of its parameter is rejected. +fn compute_trusted_params( + summaries: &std::collections::BTreeMap, + root: u32, +) -> std::collections::BTreeMap { + // Optimistic seed: every function starts all-trusted and the fixpoint pares + // each one down against the current assignment until it stabilises. + let mut trusted: std::collections::BTreeMap = summaries + .iter() + .map(|(&idx, summary)| (idx, ParamMask::first_n(summary.param_count))) + .collect(); + + loop { + let mut changed = false; + let mut next = trusted.clone(); + for (&callee, summary) in summaries { + // The root's external caller justifies all params and always counts as + // a caller; a non-root is justified only by internal call sites. 
+ let mut justified = ParamMask::first_n(summary.param_count); + let mut has_caller = callee == root; + + for (&caller, caller_summary) in summaries { + let caller_trust = trusted.get(&caller).copied().unwrap_or(ParamMask::EMPTY); + for call in &caller_summary.calls { + if call.callee != callee { + continue; + } + has_caller = true; + for j in 0..summary.param_count { + let arg = call.arg_masks.get(j).copied().unwrap_or(ParamMask::EMPTY); + let arg_trusted = !arg.is_empty() && arg.is_subset_of(caller_trust); + if !arg_trusted { + justified = justified.without(j); + } + } + } + } + + let result = if has_caller { + justified + } else { + ParamMask::EMPTY + }; + if next.get(&callee).copied() != Some(result) { + next.insert(callee, result); + changed = true; + } + } + trusted = next; + if !changed { + break; + } + } + + trusted +} + +/// Builds the Tier-C rejection error naming `field` with a single `reason`. +fn reject(field: &str, reason: &str) -> LinkError { + LinkError::RequiresRelocatableBuild { + field: field.to_string(), + reasons: vec![reason.to_string()], + } +} + +/// Abstractly interprets one function body once, seeding each parameter `i` with +/// `Param({i})`, and returns its [`FunctionSummary`]: the address mask of every +/// memory access and the argument masks of every direct call site. A hard parse +/// failure or adversarial deep nesting surfaces as an `Err`; an address the pass +/// cannot prove parameter-derived is recorded as an empty access mask (which the +/// verifier then rejects), never as a silent success. 
+fn summarize_function( + module: &ParsedModule, + body: &[u8], + param_count: usize, +) -> Result<FunctionSummary, LinkError> { + let func_body = FunctionBody::new(BinaryReader::new(body, 0)); + + let local_count = count_and_cap_locals(&func_body, param_count, body.len())?; + let mut locals = vec![Prov::NotParam; local_count]; + // Seed only the parameters the 64-bit mask can represent. A parameter at + // index >= 64 has no bit, so it must stay `NotParam`: seeding it as + // `Param(EMPTY)` would break the non-empty-mask invariant of `Prov::Param` + // and let a join or `add` with a trusted parameter produce a non-empty mask + // that passes the subset check although the value may be the untracked + // parameter. + for (i, slot) in locals.iter_mut().take(param_count).enumerate() { + let mask = ParamMask::single(i); + if !mask.is_empty() { + *slot = Prov::Param(mask); + } + } + + let ops = collect_operators(&func_body)?; + + // A function body's operator stream ends with the function-terminating `End`. + // That `End` closes the implicit function frame, not a structured block, so + // the analyzed region excludes it; analyzing it as a block terminator would + // spuriously reject every body. + let body_end = match ops.last() { + Some(Operator::End) => ops.len() - 1, + // A body without a trailing `End` is malformed; the parser would have + // already rejected it, but fail closed rather than under-run the slice. + _ => ops.len(), + }; + + let mut summary = FunctionSummary { + param_count, + ..FunctionSummary::default() + }; + let mut interp = Interp { + module, + ops: &ops, + summary: &mut summary, + }; + let entry = State { + locals, + stack: Vec::new(), + }; + // The whole function body is one region. Deep nesting or a bracket mismatch + // makes `interpret` return `None`; that is a hard fail-closed signal (the body + // is recorded as having an unprovable access so the closure rejects). Branches + // that exit the function body do not feed any access, so the region result is + // not inspected further. + if interp.interpret(0, body_end, entry, 0)?.is_none() { + // Record an unprovable access so the verifier rejects this function even + // if it had no explicit memory op (a deep-nesting / structural reject).
+ summary.accesses.push(ParamMask::EMPTY); + } + Ok(summary) +} + +/// Single-function convenience used by the unit tests: runs [`summarize_function`] +/// over one body, treating that function as its own closure root (all parameters +/// trusted), and reports whether every memory access is provably parameter-derived. +#[cfg(test)] +fn function_is_param_addressing( + module: &ParsedModule, + body: &[u8], + param_count: usize, +) -> Result { + let summary = summarize_function(module, body, param_count)?; + let trusted = ParamMask::first_n(param_count); + Ok(summary + .accesses + .iter() + .all(|access| !access.is_empty() && access.is_subset_of(trusted))) +} + +/// Collects the body's operator stream into an owned vector. The body length is +/// already bounded by the parser's function-size cap, so this is a bounded +/// allocation; an owned stream lets the structured walk re-run loop body regions +/// to a fixpoint without re-decoding. +fn collect_operators<'a>(body: &FunctionBody<'a>) -> Result>, LinkError> { + body.get_operators_reader() + .map_err(|e| LinkError::Parse(e.to_string()))? + .into_iter() + .map(|op| op.map_err(|e| LinkError::Parse(e.to_string()))) + .collect() +} + +/// Counts the total local slots (parameters + declared locals) and rejects an +/// over-declared count *before* any per-local allocation. The declared-locals +/// vector lists `(count, type)` groups; parameters are not in it, so they are +/// added explicitly. A single malformed group can claim `u32::MAX` locals, so +/// the running sum is capped against both the WASM cap and the body length +/// (locals cost ≥ 1 byte each) and rejected early as a clean [`LinkError`]. 
+fn count_and_cap_locals( + body: &FunctionBody, + param_count: usize, + body_len: usize, +) -> Result { + let cap = MAX_WASM_FUNCTION_LOCALS.min(body_len); + + let mut reader = body + .get_locals_reader() + .map_err(|e| LinkError::Parse(e.to_string()))?; + let groups = reader.get_count(); + let mut declared: usize = 0; + for _ in 0..groups { + let (n, _ty) = reader.read().map_err(|e| LinkError::Parse(e.to_string()))?; + declared = declared.saturating_add(n as usize); + if declared > cap { + return Err(too_many_locals()); + } + } + let total = param_count.saturating_add(declared); + if total > cap.saturating_add(param_count) { + return Err(too_many_locals()); + } + Ok(total) +} + +fn too_many_locals() -> LinkError { + LinkError::Parse("function declares too many locals for provenance analysis".to_string()) +} + +/// The structured abstract interpreter over one function's operator stream. +struct Interp<'a, 'b> { + /// The source module, for resolving callee signatures (to pop/push the right + /// number of `NotParam` call results). + module: &'b ParsedModule, + /// The full operator stream of the function under analysis. + ops: &'b [Operator<'a>], + /// The summary being built: every memory access records its address mask and + /// every direct `call` records its argument masks here. Loop bodies are + /// re-interpreted to a fixpoint, so an access inside a loop can be recorded + /// more than once; recording the *same or weaker* mask on a re-run is sound + /// (the verifier rejects on any empty/non-subset access), so the duplicates + /// are harmless. + summary: &'b mut FunctionSummary, +} + +/// The control-flow effect of interpreting a structured region: where control +/// arrives at the region's end, and how branches targeting outer blocks +/// contribute their state. +struct RegionResult { + /// The state at the region's normal fall-through end, if it is reachable. + /// `None` when every path out of the region branched away or terminated. 
+ fallthrough: Option, + /// Per-outer-frame branch accumulators: `branch_acc[d]` joins the state of + /// every branch that targeted the frame `d` levels *outside* this region + /// (relative depth `d + region_depth`). Index 0 is the immediately enclosing + /// frame. Threaded outward so an enclosing `end` can merge them. + branch_acc: Vec>, +} + +impl<'a, 'b> Interp<'a, 'b> { + /// Interprets `[start, end)` from `entry`. Returns `Ok(Some(region))` + /// describing where control leaves the region, or `Ok(None)` as a structural + /// fail-closed signal: nesting deeper than `MAX_ANALYSIS_DEPTH`, a stray + /// `else`/`end` inside the region, or a nested block that itself returned + /// `None`. An unprovable memory address does *not* return `None`; it is + /// recorded in the summary as an empty access mask for the verifier to reject. + /// + /// The region is assumed to be one structured block's body: every `block`/ + /// `loop`/`if`/non-det block opened inside it is matched by an `end` inside + /// it, and the region's own terminating `end`/`else` (the enclosing block's) + /// lies at `end`. + fn interpret( + &mut self, + start: usize, + end: usize, + entry: State, + depth: usize, + ) -> Result, LinkError> { + if depth > MAX_ANALYSIS_DEPTH { + // Fail closed on adversarial deep nesting rather than overflow the + // analysis stack: return `None` so the caller rejects the closure. + return Ok(None); + } + + let mut state = entry; + let mut reachable = true; + // branch_acc[d]: branches targeting the frame d levels outside this + // region (the enclosing block is d = 0). Grown on demand.
+ let mut branch_acc: Vec> = Vec::new(); + + let mut i = start; + while i < end { + let op = &self.ops[i]; + + match op { + Operator::Block { blockty } + | Operator::Forall { blockty } + | Operator::Exists { blockty } + | Operator::Assume { blockty } + | Operator::Unique { blockty } => { + let body_end = self.match_end(i, end)?; + if reachable { + let outcome = + self.run_block(*blockty, i + 1, body_end, &state, depth)?; + let Some((exit, inner_acc)) = outcome else { + return Ok(None); + }; + state = exit; + merge_outer(&mut branch_acc, inner_acc); + } + i = body_end + 1; + continue; + } + Operator::Loop { blockty } => { + let body_end = self.match_end(i, end)?; + if reachable { + let outcome = + self.run_loop(*blockty, i + 1, body_end, &state, depth)?; + let Some((exit, inner_acc)) = outcome else { + return Ok(None); + }; + state = exit; + merge_outer(&mut branch_acc, inner_acc); + } + i = body_end + 1; + continue; + } + Operator::If { blockty } => { + let if_end = self.match_end(i, end)?; + if reachable { + let outcome = self.run_if(*blockty, i + 1, if_end, &state, depth)?; + let Some((exit, inner_acc)) = outcome else { + return Ok(None); + }; + state = exit; + merge_outer(&mut branch_acc, inner_acc); + } + i = if_end + 1; + continue; + } + Operator::Else | Operator::End => { + // The region terminator of the enclosing block. The caller + // (`run_block`/`run_if`/`run_loop`) drove `end` to exactly + // this region's bound, so an `end`/`else` at `end` is consumed + // by the caller, not here. Reaching one before `end` means the + // bracket matcher disagreed with the stream; fail closed. + return Ok(None); + } + _ => { + if reachable { + // Borrow the three disjoint fields explicitly so the + // straight-line step can record into `summary` while `op` + // and `module`/`ops` stay borrowed immutably. + match Self::step_straight_line( + self.module, + self.summary, + op, + &mut state, + &mut branch_acc, + )? 
{ + StepOutcome::Continue => {} + StepOutcome::Unreachable => reachable = false, + } + } + i += 1; + } + } + } + + let fallthrough = if reachable { Some(state) } else { None }; + Ok(Some(RegionResult { + fallthrough, + branch_acc, + })) + } + + /// Interprets a `block`/non-det block body `[body_start, body_end)` opened in + /// state `outer`, returning the post-block state and the branch accumulators + /// for frames *outside* the block, or `None` when the body fails closed + /// structurally (`interpret` returned `None`). + /// + /// A forward branch targeting this block (relative depth 0 inside it) merges + /// with the block's normal fall-through at `end`. The block's params are the + /// top `param_arity` operand slots of `outer`; its `result_arity` results take + /// their place on `outer`'s stack after the block. + #[allow(clippy::type_complexity)] + fn run_block( + &mut self, + blockty: BlockType, + body_start: usize, + body_end: usize, + outer: &State, + depth: usize, + ) -> Result>)>, LinkError> { + let (param_arity, result_arity) = self.block_arity(blockty); + let entry = self.block_entry_state(outer, param_arity); + + let Some(region) = self.interpret(body_start, body_end, entry, depth + 1)? else { + return Ok(None); + }; + + // The block's exit = join of the normal fall-through end-state and every + // forward branch that targeted this block (its own branch_acc[0]). + let mut self_acc = region.branch_acc; + let target0 = if self_acc.is_empty() { + None + } else { + self_acc.remove(0) + }; + let exit_inner = join_opt(region.fallthrough, target0); + + let exit = match exit_inner { + Some(inner) => self.block_exit_state(outer, &inner, param_arity, result_arity), + // No path reaches the block's end (every path branched out or + // terminated). Control continues after the block with whatever the + // branches that skipped past it carry; model the post-block state as + // the outer state minus params plus NotParam results (fail-closed).
+ None => self.unreachable_exit_state(outer, param_arity, result_arity), + }; + + // self_acc now holds branches targeting frames *outside* this block, with + // depth shifted down by one (the block frame was index 0). + Ok(Some((exit, self_acc))) + } + + /// Interprets a `loop` body to a fixpoint over its back-edges. A branch + /// targeting the loop (relative depth 0 inside it) re-enters the loop header, + /// so the header entry state is the join of the loop's outer entry and every + /// back-edge, iterated until it stabilizes or the round cap is reached. + /// Returns `None` when the body fails closed structurally. + #[allow(clippy::type_complexity)] + fn run_loop( + &mut self, + blockty: BlockType, + body_start: usize, + body_end: usize, + outer: &State, + depth: usize, + ) -> Result>)>, LinkError> { + let (param_arity, result_arity) = self.block_arity(blockty); + let mut header_in = self.block_entry_state(outer, param_arity); + + // Fixpoint: re-run the body until the header entry state stops changing. + // Each round joins the back-edge state into the header, which can only + // move a slot up the lattice (widen a `Param` mask by union, or demote + // `Param`/`Const` to `NotParam`); `max_rounds` is a hard cap on the rounds. + // NOTE(review): the cap is sized from the slot count and does not count + // mask widening. If it were ever hit before the header stabilises, the + // last (non-converged) region is used as-is — consider failing closed + // (`Ok(None)`) in that case. + let max_rounds = header_in.locals.len() + header_in.stack.len() + 2; + let mut final_region; + let mut rounds = 0; + loop { + let Some(region) = self.interpret(body_start, body_end, header_in.clone(), depth + 1)? + else { + return Ok(None); + }; + + // Back-edges target the loop header (its own branch_acc[0]). + let back = region.branch_acc.first().cloned().flatten(); + let next_header = match back { + Some(b) => header_in.join(&b), + None => header_in.clone(), + }; + + final_region = region; + rounds += 1; + if next_header == header_in || rounds >= max_rounds { + break; + } + header_in = next_header; + } + + // The loop's normal exit is its body's fall-through end (a loop's label + // is a back-edge, so forward exits leave via the fall-through `end`).
+ let exit = match final_region.fallthrough.take() { + Some(inner) => self.block_exit_state(outer, &inner, param_arity, result_arity), + None => self.unreachable_exit_state(outer, param_arity, result_arity), + }; + + // Drop the loop's own frame (index 0); the rest are outer branches. + let mut self_acc = final_region.branch_acc; + if !self_acc.is_empty() { + self_acc.remove(0); + } + Ok(Some((exit, self_acc))) + } + + /// Interprets an `if` (and optional `else`) over `[body_start, if_end)`, + /// where `if_end` is the `if`'s matching `end`. On entry the `i32` condition + /// is still on top of `outer`'s stack; it is popped here (from a clone of + /// `outer`) before the block entry state is built from the remaining slots. + /// Both arms are interpreted from the same entry state; the `if`'s + /// exit is the join of the true-arm end, the false/implicit-else-arm end, and + /// any forward branch targeting the `if` block. Returns `None` when either + /// arm fails closed structurally. + #[allow(clippy::type_complexity)] + fn run_if( + &mut self, + blockty: BlockType, + body_start: usize, + if_end: usize, + outer: &State, + depth: usize, + ) -> Result>)>, LinkError> { + let (param_arity, result_arity) = self.block_arity(blockty); + + // The `if` consumes the condition (1 slot) plus `param_arity` block + // params from `outer`'s stack. Pop the condition first, then build the + // block entry state from the remaining stack. + let mut after_cond = outer.clone(); + after_cond.stack.pop(); // the i32 condition + let entry = self.block_entry_state(&after_cond, param_arity); + + // Split the arms at the `else` matching this `if` (if any). + let else_idx = self.find_else(body_start, if_end); + let (true_range, false_range) = match else_idx { + Some(e) => ((body_start, e), Some((e + 1, if_end))), + None => ((body_start, if_end), None), + }; + + let Some(true_region) = self.interpret(true_range.0, true_range.1, entry.clone(), depth + 1)? + else { + return Ok(None); + }; + + let false_region = match false_range { + Some((fs, fe)) => { + let Some(r) = self.interpret(fs, fe, entry.clone(), depth + 1)?
else { + return Ok(None); + }; + r + } + None => { + // No `else`: the implicit false arm is the entry state unchanged + // (the block body did not run), contributing `entry` as its + // fall-through with no branches. + RegionResult { + fallthrough: Some(entry.clone()), + branch_acc: Vec::new(), + } + } + }; + + // Merge both arms' branch accumulators; the `if` block is each arm's + // frame 0, so its own branches fold into the exit join. + let mut merged_acc = true_region.branch_acc; + merge_outer(&mut merged_acc, false_region.branch_acc); + let target0 = if merged_acc.is_empty() { + None + } else { + merged_acc.remove(0) + }; + + // The `if` exit = join of the true-arm fall-through, the false-arm + // fall-through, and any branch targeting the `if`. + let arms = join_opt(true_region.fallthrough, false_region.fallthrough); + let exit_inner = join_opt(arms, target0); + + let exit = match exit_inner { + Some(inner) => self.block_exit_state(&after_cond, &inner, param_arity, result_arity), + None => self.unreachable_exit_state(&after_cond, param_arity, result_arity), + }; + + Ok(Some((exit, merged_acc))) + } + + /// Applies one straight-line (non-structured-control) operator to `state`, + /// reporting whether control continues or becomes unreachable. A memory + /// access records its address mask into `summary` (an empty mask for an + /// unprovable address, which the verifier later rejects); a direct `call` + /// records its argument masks. Branches record their state into `branch_acc` + /// at their target's relative depth. + /// + /// An associated function (no `self`) so the caller can hand it the three + /// disjoint borrows it needs — `module` immutably, `summary` mutably, and the + /// borrowed `op` — without re-borrowing the whole interpreter. 
+ fn step_straight_line( + module: &ParsedModule, + summary: &mut FunctionSummary, + op: &Operator, + state: &mut State, + branch_acc: &mut Vec>, + ) -> Result { + use Operator::*; + + match op { + // -- Locals -- + LocalGet { local_index } => { + state.stack.push(local_prov(&state.locals, *local_index)); + } + LocalSet { local_index } => { + let v = pop(state); + set_local(&mut state.locals, *local_index, v); + } + LocalTee { local_index } => { + let v = state.stack.last().copied().unwrap_or(Prov::NotParam); + set_local(&mut state.locals, *local_index, v); + } + + // -- Constant literals: caller-independent constants (a valid offset + // to add to a Param base, never a valid address on their own). -- + I32Const { .. } | I64Const { .. } | F32Const { .. } | F64Const { .. } => { + state.stack.push(Prov::Const); + } + + // -- Sources that are neither parameter-derived nor proven constant: + // a global is runtime-mutable; an uzumaki is non-deterministic. -- + I32Uzumaki { .. } | I64Uzumaki { .. } | GlobalGet { .. } => { + state.stack.push(Prov::NotParam); + } + + // -- Loads: pop the address, record its mask, push NotParam contents -- + I32Load { .. } | I64Load { .. } | F32Load { .. } | F64Load { .. } + | I32Load8S { .. } | I32Load8U { .. } | I32Load16S { .. } + | I32Load16U { .. } | I64Load8S { .. } | I64Load8U { .. } + | I64Load16S { .. } | I64Load16U { .. } | I64Load32S { .. } + | I64Load32U { .. } => { + let addr = pop(state); + record_access(summary, addr.mask()); + state.stack.push(Prov::NotParam); + } + + // -- Stores: pop value then address, record the address mask -- + I32Store { .. } | I64Store { .. } | F32Store { .. } | F64Store { .. } + | I32Store8 { .. } | I32Store16 { .. } | I64Store8 { .. } + | I64Store16 { .. } | I64Store32 { .. } => { + pop(state); // the stored value + let addr = pop(state); + record_access(summary, addr.mask()); + } + + // -- Bulk memory: both the address AND the extent operand must be + // parameter-derived. 
A bulk-memory op touches the contiguous + // region `[address, address + size)`, so a caller-bounded *start* + // is not enough: a constant or global `size` lets the op clobber + // or read an unbounded region above a caller pointer (e.g. + // `memory.fill(base, v, 0x8000)` scorches host memory the caller + // never exposed). The extent therefore carries the same + // caller-derivation requirement as an address — a `Param` size + // (`fill(ptr, v, len)` with a trusted `len`) is admitted, a + // `Const`/`NotParam` size (empty mask) fails the subset check and + // rejects the whole closure. -- + MemoryFill { .. } => { + // Stack: [dest, value, size]. The size bounds the clobbered + // extent, so it must be caller-derived; the value is the fill + // byte (neither an address nor an extent) and is discarded. + let size = pop(state); + record_access(summary, size.mask()); + pop(state); // value (the fill byte) + let dest = pop(state); + record_access(summary, dest.mask()); + } + MemoryCopy { .. } => { + // Stack: [dest, src, size]; both dest and src are addresses and + // the size bounds the copied extent, so all three must be + // trusted. Each is recorded as its own access; the verifier + // rejects if any is empty or not a subset of the trusted set. + let size = pop(state); + record_access(summary, size.mask()); + let src = pop(state); + let dest = pop(state); + record_access(summary, dest.mask()); + record_access(summary, src.mask()); + } + MemoryInit { .. } => { + // Stack: [dest, offset, size]; dest is the address and size + // bounds the written extent (both caller-derived). The offset is + // a data-segment offset, not a linear-memory address, so it is + // discarded. (memory.init also implies a data segment -> already + // Tier C; this is defense-in-depth on the destination and extent.) 
+ let size = pop(state); + record_access(summary, size.mask()); + pop(state); // offset (into the data segment, not linear memory) + let dest = pop(state); + record_access(summary, dest.mask()); + } + + // -- memory.size / memory.grow yield page counts, never addresses -- + MemorySize { .. } => { + state.stack.push(Prov::NotParam); + } + MemoryGrow { .. } => { + pop(state); // delta + state.stack.push(Prov::NotParam); + } + + // -- Parametric -- + Drop => { + pop(state); + } + Select | TypedSelect { .. } => { + // Pops condition + two values, pushes their join. Param only if + // both value operands are Param. + pop(state); // condition + let a = pop(state); + let b = pop(state); + state.stack.push(a.join(b)); + } + + // -- Calls: record the per-argument provenance masks so the + // interprocedural fixpoint can decide which callee parameters are + // trusted, then pop the callee's params and push NotParam results + // (a call result is never trusted). -- + Call { function_index } => { + let sig = module.func_sig(*function_index).cloned(); + if let Some(sig) = sig.as_ref() { + let arg_masks = top_arg_masks(state, sig.params.len()); + summary.calls.push(CallSite { + callee: *function_index, + arg_masks, + }); + } + apply_call(sig.as_ref(), state); + } + ReturnCall { .. } => { + // Tail call terminates this path locally. + return Ok(StepOutcome::Unreachable); + } + CallIndirect { type_index, .. } => { + // An indirect call dispatches through the table; its result is + // never trusted, and no callee parameter can be justified through + // it (the callee is not statically known). Pop the table index and + // the callee params, push NotParam results. + pop(state); // the table index operand + let sig = type_sig(module, *type_index).cloned(); + apply_call(sig.as_ref(), state); + } + ReturnCallIndirect { .. } => { + return Ok(StepOutcome::Unreachable); + } + + // -- Branches: record state at the target, end reachability where the + // branch is unconditional. 
-- + Br { relative_depth } => { + accumulate(branch_acc, *relative_depth, state); + return Ok(StepOutcome::Unreachable); + } + BrIf { relative_depth } => { + pop(state); // the i32 condition + accumulate(branch_acc, *relative_depth, state); + // The false edge falls through; reachability continues. + } + BrTable { targets } => { + pop(state); // the i32 index + accumulate(branch_acc, targets.default(), state); + for target in targets.targets() { + let target = target.map_err(|e| LinkError::Parse(e.to_string()))?; + accumulate(branch_acc, target, state); + } + return Ok(StepOutcome::Unreachable); + } + Return | Unreachable => { + return Ok(StepOutcome::Unreachable); + } + Nop => {} + + // -- Arithmetic: only `add` and the constrained `sub` propagate Param; + // every other binary and every unary op produces NotParam (each can + // cancel the caller contribution). -- + _ if is_add(op) => { + let a = pop(state); + let b = pop(state); + state.stack.push(add_prov(a, b)); + } + _ if is_sub(op) => { + // WASM stack for `b - a` is [b, a] with `a` (subtrahend) on top. + let a = pop(state); // subtrahend (top) + let b = pop(state); // minuend + state.stack.push(sub_prov(b, a)); + } + _ if is_other_binary(op) => { + pop(state); + pop(state); + state.stack.push(Prov::NotParam); + } + _ if is_unary(op) => { + pop(state); + state.stack.push(Prov::NotParam); + } + + // -- Any operator whose precise stack effect the analysis does not + // model: widen the stack to empty so later pops read the + // fail-closed NotParam default. The safety allow-list has already + // confined the operator set, so this is unreachable for a body that + // passed `check_operator`; it is defense in depth. -- + _ => { + state.stack.clear(); + } + } + + Ok(StepOutcome::Continue) + } + + /// The `(param_arity, result_arity)` of a block type: how many operand slots + /// it consumes at entry and leaves at `end`. 
A `FuncType` index is resolved + /// against the module's type section; an unresolvable one fails closed to + /// `(0, 0)` (the surrounding stack is then widened by the result model). + fn block_arity(&self, blockty: BlockType) -> (usize, usize) { + match blockty { + BlockType::Empty => (0, 0), + BlockType::Type(_) => (0, 1), + BlockType::FuncType(t) => match type_sig(self.module, t) { + Some(sig) => (sig.params.len(), sig.results.len()), + None => (0, 0), + }, + } + } + + /// The entry state of a structured block: `outer`'s locals, with the top + /// `param_arity` operand slots carried in as the block's initial stack. + fn block_entry_state(&self, outer: &State, param_arity: usize) -> State { + let take = param_arity.min(outer.stack.len()); + let params = outer.stack[outer.stack.len() - take..].to_vec(); + State { + locals: outer.locals.clone(), + stack: params, + } + } + + /// The state after a structured block exits normally: `outer`'s stack with + /// the block's params popped and `result_arity` result slots pushed (each the + /// inner end-state's corresponding result slot, or `NotParam` if the inner + /// stack is shorter than the declared result arity), and the block's merged + /// locals. + fn block_exit_state( + &self, + outer: &State, + inner_end: &State, + param_arity: usize, + result_arity: usize, + ) -> State { + let mut stack = outer.stack.clone(); + for _ in 0..param_arity.min(stack.len()) { + stack.pop(); + } + let results = result_tail(&inner_end.stack, result_arity); + stack.extend(results); + State { + locals: inner_end.locals.clone(), + stack, + } + } + + /// The post-block state when no path reaches the block's end. Control resumes + /// after the block (carried by branches that skipped it), so the stack shape + /// must still be correct: pop the params, push `NotParam` results, and widen + /// every local to `NotParam` (no path's locals are known). Fail closed. 
+    fn unreachable_exit_state(
+        &self,
+        outer: &State,
+        param_arity: usize,
+        result_arity: usize,
+    ) -> State {
+        let mut stack = outer.stack.clone();
+        for _ in 0..param_arity.min(stack.len()) {
+            stack.pop();
+        }
+        stack.extend(std::iter::repeat_n(Prov::NotParam, result_arity));
+        State {
+            locals: vec![Prov::NotParam; outer.locals.len()],
+            stack,
+        }
+    }
+
+    /// Finds the index of the `else` matching the `if` whose body is
+    /// `[body_start, if_end)`, skipping nested structured blocks. `None` when the
+    /// `if` has no `else`.
+    fn find_else(&self, body_start: usize, if_end: usize) -> Option<usize> {
+        let mut nesting = 0usize;
+        let mut i = body_start;
+        while i < if_end {
+            match &self.ops[i] {
+                Operator::Block { .. }
+                | Operator::Loop { .. }
+                | Operator::If { .. }
+                | Operator::Forall { .. }
+                | Operator::Exists { .. }
+                | Operator::Assume { .. }
+                | Operator::Unique { .. } => nesting += 1,
+                Operator::End => {
+                    // An `End` here closes a nested block; the `if`'s own `End` is
+                    // at `if_end`, outside this range.
+                    nesting = nesting.saturating_sub(1);
+                }
+                Operator::Else if nesting == 0 => return Some(i),
+                _ => {}
+            }
+            i += 1;
+        }
+        None
+    }
+
+    /// Returns the index of the `End` closing the structured block opened at
+    /// `open`, searching within `[open, limit)`. For an `if`, an intervening
+    /// `Else` is skipped, so the result is the `End` matching the `if` itself.
+    ///
+    /// Fails closed with a `Parse` error when the brackets are unbalanced (a
+    /// valid body never is): either no matching `End` occurs before `limit`, or
+    /// an `End` is seen while no block is open, which means `open` did not point
+    /// at a block opener.
+    fn match_end(&self, open: usize, limit: usize) -> Result<usize, LinkError> {
+        let unbalanced = || {
+            LinkError::Parse("unbalanced structured control flow in function body".to_string())
+        };
+        let mut nesting = 0usize;
+        let mut i = open;
+        while i < limit {
+            match &self.ops[i] {
+                Operator::Block { .. }
+                | Operator::Loop { .. }
+                | Operator::If { .. }
+                | Operator::Forall { .. }
+                | Operator::Exists { .. }
+                | Operator::Assume { .. }
+                | Operator::Unique { .. } => nesting += 1,
+                Operator::End => {
+                    // Checked rather than `-= 1`: a stray `End` must surface as
+                    // the `Parse` error, not as a `usize` underflow (a panic in
+                    // debug builds, a wrapped counter in release builds).
+                    let Some(remaining) = nesting.checked_sub(1) else {
+                        return Err(unbalanced());
+                    };
+                    nesting = remaining;
+                    if nesting == 0 {
+                        return Ok(i);
+                    }
+                }
+                _ => {}
+            }
+            i += 1;
+        }
+        Err(unbalanced())
+    }
+}
+
+/// The outcome of stepping one straight-line operator.
+enum StepOutcome {
+    /// Control continues to the next operator.
+    Continue,
+    /// Control becomes unreachable for the rest of the block (`br`/`return`/
+    /// `unreachable`/tail call).
+    Unreachable,
+}
+
+/// Records one memory access's address mask into `summary`. `memarg.offset` is
+/// deliberately not consulted: a `param + N` effective address still varies with
+/// the caller's pointer and can never reach a caller-independent host location,
+/// so the offset cannot turn a trusted base into an untrusted one. An empty mask
+/// (an address not provably parameter-derived) is recorded as-is; the verifier
+/// rejects it.
+fn record_access(summary: &mut FunctionSummary, mask: ParamMask) {
+    summary.accesses.push(mask);
+}
+
+/// The provenance masks of the top `count` operand-stack slots, deepest-first
+/// (so index `j` is the `j`-th call argument). Underflow slots default to the
+/// empty mask (fail closed).
+fn top_arg_masks(state: &State, count: usize) -> Vec<ParamMask> {
+    let depth = state.stack.len();
+    (0..count)
+        .map(|j| {
+            // Argument j sits `count - j` slots below the top of the stack.
+            depth
+                .checked_sub(count - j)
+                .and_then(|idx| state.stack.get(idx))
+                .map(|prov| prov.mask())
+                .unwrap_or(ParamMask::EMPTY)
+        })
+        .collect()
+}
+
+/// Pops a callee's parameters and pushes one `NotParam` per result, modeling a
+/// `call`/`call_indirect` whose results are never trusted. With no resolvable
+/// signature the stack is cleared (fail closed: later pops read `NotParam`).
+fn apply_call(sig: Option<&FuncSig>, state: &mut State) {
+    let Some(sig) = sig else {
+        // Unresolvable callee: widen the whole stack (fail closed).
+        state.stack.clear();
+        return;
+    };
+    // Drop the arguments in one step; `saturating_sub` keeps the same
+    // tolerance of an underflowing stack that popping one slot at a time has.
+    let kept = state.stack.len().saturating_sub(sig.params.len());
+    state.stack.truncate(kept);
+    // A call result is never trusted, whatever the callee computes.
+    state
+        .stack
+        .extend(std::iter::repeat_n(Prov::NotParam, sig.results.len()));
+}
+
+/// The function signature a type index names, if it is a function type in the
+/// module's type section. `None` for an out-of-range index or a non-function
+/// type entry.
+fn type_sig(module: &ParsedModule, type_index: u32) -> Option<&FuncSig> {
+    match module.types.get(type_index as usize)? {
+        crate::parse::TypeEntry::Func(sig) => Some(sig),
+        crate::parse::TypeEntry::Other => None,
+    }
+}
+
+/// Pops the top of the operand stack, reading the fail-closed `NotParam` on
+/// underflow (which a valid body never produces, but the analysis must survive).
+fn pop(state: &mut State) -> Prov {
+    state.stack.pop().unwrap_or(Prov::NotParam)
+}
+
+/// The provenance of a local, reading the fail-closed `NotParam` for an
+/// out-of-range index (which a valid body never produces).
+fn local_prov(locals: &[Prov], index: u32) -> Prov {
+    locals
+        .get(index as usize)
+        .copied()
+        .unwrap_or(Prov::NotParam)
+}
+
+/// Writes `prov` to a local, ignoring an out-of-range index.
+fn set_local(locals: &mut [Prov], index: u32, prov: Prov) {
+    if let Some(slot) = locals.get_mut(index as usize) {
+        *slot = prov;
+    }
+}
+
+/// Joins two optional states: `Some` only when at least one is `Some`, and the
+/// `join` of both when both are present.
+fn join_opt(a: Option<State>, b: Option<State>) -> Option<State> {
+    match (a, b) {
+        (Some(a), Some(b)) => Some(a.join(&b)),
+        // At most one side is present: pass it through unchanged.
+        (a, b) => a.or(b),
+    }
+}
+
+/// Records a branch's `state` into `acc` at relative `depth`, joining with any
+/// branch already targeting that frame. Grows `acc` to cover the depth.
+fn accumulate(acc: &mut Vec<Option<State>>, depth: u32, state: &State) {
+    let d = depth as usize;
+    // Make sure the slot for this target frame exists before touching it.
+    if acc.len() <= d {
+        acc.resize(d + 1, None);
+    }
+    acc[d] = match acc[d].take() {
+        Some(existing) => Some(existing.join(state)),
+        None => Some(state.clone()),
+    };
+}
+
+/// Merges an inner region's outer-frame branch accumulators (already shifted so
+/// index 0 is the frame enclosing the inner region) into `outer`, joining
+/// per-depth.
+fn merge_outer(outer: &mut Vec<Option<State>>, inner: Vec<Option<State>>) {
+    // Grow first so the `zip` below visits every entry of `inner`.
+    if outer.len() < inner.len() {
+        outer.resize(inner.len(), None);
+    }
+    for (slot, contrib) in outer.iter_mut().zip(inner) {
+        if let Some(contrib) = contrib {
+            *slot = match slot.take() {
+                Some(existing) => Some(existing.join(&contrib)),
+                None => Some(contrib),
+            };
+        }
+    }
+}
+
+/// The top `result_arity` slots of `stack` (the block's result values), padded
+/// with `NotParam` when the stack is shorter than the declared arity (fail
+/// closed).
+fn result_tail(stack: &[Prov], result_arity: usize) -> Vec<Prov> {
+    if stack.len() >= result_arity {
+        stack[stack.len() - result_arity..].to_vec()
+    } else {
+        // Pad at the front, so the slots that do exist stay on top.
+        let mut v = vec![Prov::NotParam; result_arity - stack.len()];
+        v.extend_from_slice(stack);
+        v
+    }
+}
+
+/// The provenance of `a + b`. `add` is commutative, so the rule is symmetric in
+/// its operands.
+///
+/// - `Param + Param`: two genuine caller values; the result varies with the
+///   caller's inputs (`a6`/`a13`). The mask is the **union**: the sum derives
+///   from every parameter either operand does. `Param`.
+/// - `Param + Const` / `Const + Param`: `caller_base + fixed_offset` provably
+///   still varies with the caller's pointer (the struct-field / array-element
+///   case `a2`/`a5`). The `Param` mask carries through unchanged.
+/// - `Param + NotParam`: **unsound to keep `Param`.** `NotParam` means *not
+///   provably parameter-derived*, not *constant*; it may hold `C - p`, and
+///   `(C - p) + p == C` is a fixed, caller-independent absolute address. Demote
+///   to `NotParam`.
+/// - `Const + Const`: a constant. `Const`. +/// - anything else: `NotParam`. +fn add_prov(a: Prov, b: Prov) -> Prov { + match (a, b) { + (Prov::Param(x), Prov::Param(y)) => Prov::Param(x.union(y)), + (Prov::Param(m), Prov::Const) | (Prov::Const, Prov::Param(m)) => Prov::Param(m), + (Prov::Const, Prov::Const) => Prov::Const, + _ => Prov::NotParam, + } +} + +/// The provenance of `b - a` (minuend `b`, subtrahend `a`). +/// +/// - `Param - Const`: `caller_base - fixed_offset` provably still varies with the +/// caller's pointer (a struct field below the pointer, `a7`). The `Param` mask +/// carries through unchanged. +/// - `Param - NotParam`: **unsound to keep `Param`**, the exact mirror of the +/// `add` cancellation. `NotParam` means *not provably parameter-derived*, not +/// *constant*; the subtrahend may itself hold `p - C`, and `p - (p - C) == C` +/// is a fixed, caller-independent absolute address. Demote to `NotParam`. +/// - `Param - Param`: may be `b - b == 0`, a caller-independent constant +/// (`n1`/`n6`). `NotParam`. +/// - `Const - Const`: a constant. `Const`. +/// - anything else (including `Const - Param`, which negates the caller +/// contribution to `C - p` that a later `add` must not re-promote): `NotParam`. +fn sub_prov(b: Prov, a: Prov) -> Prov { + match (b, a) { + (Prov::Param(m), Prov::Const) => Prov::Param(m), + (Prov::Const, Prov::Const) => Prov::Const, + _ => Prov::NotParam, + } +} + +/// Whether `op` is an `add`. +fn is_add(op: &Operator) -> bool { + use Operator::*; + matches!(op, I32Add | I64Add | F32Add | F64Add) +} + +/// Whether `op` is a `sub`. +fn is_sub(op: &Operator) -> bool { + use Operator::*; + matches!(op, I32Sub | I64Sub | F32Sub | F64Sub) +} + +/// Whether `op` is a two-operand numeric instruction *other than* add/sub: a +/// multiply, divide, remainder, bitwise op, shift, rotate, float min/max/ +/// copysign, or any comparison. 
Each can cancel the caller contribution to a +/// caller-independent value, so its result is unconditionally `NotParam`. +fn is_other_binary(op: &Operator) -> bool { + use Operator::*; + matches!( + op, + // comparisons + I32Eq | I32Ne | I32LtS | I32LtU | I32GtS | I32GtU | I32LeS | I32LeU | I32GeS | I32GeU + | I64Eq | I64Ne | I64LtS | I64LtU | I64GtS | I64GtU | I64LeS | I64LeU | I64GeS + | I64GeU | F32Eq | F32Ne | F32Lt | F32Gt | F32Le | F32Ge | F64Eq | F64Ne | F64Lt + | F64Gt | F64Le | F64Ge + // i32 / i64 multiplicative, bitwise, shift, rotate + | I32Mul | I32DivS | I32DivU | I32RemS | I32RemU | I32And | I32Or | I32Xor | I32Shl + | I32ShrS | I32ShrU | I32Rotl | I32Rotr | I64Mul | I64DivS | I64DivU | I64RemS + | I64RemU | I64And | I64Or | I64Xor | I64Shl | I64ShrS | I64ShrU | I64Rotl | I64Rotr + // float multiplicative / min / max / copysign + | F32Mul | F32Div | F32Min | F32Max | F32Copysign | F64Mul | F64Div | F64Min | F64Max + | F64Copysign + ) +} + +/// Whether `op` is a single-operand numeric instruction: a unary arithmetic, a +/// test, a conversion, a reinterpret, an extend, or a saturating truncation. +/// Every unary op produces `NotParam`: the tagless lattice cannot distinguish a +/// value-preserving width conversion from a value-destroying op like `eqz` +/// (which yields `0`/`1`), so all unary ops fail closed. 
+fn is_unary(op: &Operator) -> bool { + use Operator::*; + matches!( + op, + I32Eqz | I64Eqz | I32Clz | I32Ctz | I32Popcnt | I64Clz | I64Ctz | I64Popcnt | F32Abs + | F32Neg | F32Ceil | F32Floor | F32Trunc | F32Nearest | F32Sqrt | F64Abs | F64Neg + | F64Ceil | F64Floor | F64Trunc | F64Nearest | F64Sqrt | I32WrapI64 | I32TruncF32S + | I32TruncF32U | I32TruncF64S | I32TruncF64U | I64ExtendI32S | I64ExtendI32U + | I64TruncF32S | I64TruncF32U | I64TruncF64S | I64TruncF64U | F32ConvertI32S + | F32ConvertI32U | F32ConvertI64S | F32ConvertI64U | F32DemoteF64 | F64ConvertI32S + | F64ConvertI32U | F64ConvertI64S | F64ConvertI64U | F64PromoteF32 + | I32ReinterpretF32 | I64ReinterpretF64 | F32ReinterpretI32 | F64ReinterpretI64 + | I32Extend8S | I32Extend16S | I64Extend8S | I64Extend16S | I64Extend32S + | I32TruncSatF32S | I32TruncSatF32U | I32TruncSatF64S | I32TruncSatF64U + | I64TruncSatF32S | I64TruncSatF32U | I64TruncSatF64S | I64TruncSatF64U + ) +} + +#[cfg(test)] +mod tests; diff --git a/core/wasm-linker/src/provenance/tests.rs b/core/wasm-linker/src/provenance/tests.rs new file mode 100644 index 00000000..98949ee4 --- /dev/null +++ b/core/wasm-linker/src/provenance/tests.rs @@ -0,0 +1,1396 @@ +//! Unit tests for the sound address-provenance analysis. +//! +//! The matrix below is the design's full adversarial test set. Each case is a +//! single-function WAT body (or, for the interprocedural cases, a whole module) +//! checked against the analysis. The naming convention follows the design: +//! +//! - **ACCEPT** — every address operand is provably param-derived on every path. +//! - **REJECT** — at least one address operand may be a fabricated, caller- +//! independent value; the closure is Tier C. +//! +//! The legitimate cases (`8a`) must stay accepted; every laundering case +//! (`8b`–`8h`) must be rejected. Over-rejections that the design documents as +//! sound (a unary-converted pointer, alignment masking) are asserted REJECT. 
+ +use super::*; +use inf_wasmparser::{Parser, Payload}; + +/// Assembles `wat` and returns the raw body bytes of its first function. +fn first_body(wat: &str) -> Vec { + let bytes = wat::parse_str(wat).expect("valid WAT"); + for payload in Parser::new(0).parse_all(&bytes) { + if let Payload::CodeSectionEntry(body) = payload.expect("payload") { + return body.as_bytes().to_vec(); + } + } + panic!("no code section"); +} + +/// Parses `wat` into the linker's owned module representation. +fn module(wat: &str) -> ParsedModule { + let bytes = wat::parse_str(wat).expect("valid WAT"); + ParsedModule::parse(&bytes).expect("parse") +} + +/// Runs the single-function analysis over the first function of `wat`. An empty +/// module is used for call resolution; the call cases that need a resolvable +/// callee use [`accepts_in`] with an explicit module. +fn accepts(wat: &str, params: usize) -> bool { + let m = ParsedModule::default(); + let body = first_body(wat); + function_is_param_addressing(&m, &body, params).expect("analysis runs") +} + +/// Parses `module_wat` and runs the analysis over function `func_index`'s body +/// with `params` leading parameters, so its calls resolve against the module. 
+fn accepts_in(module_wat: &str, func_index: u32, params: usize) -> bool { + let m = module(module_wat); + let local = &m.local_funcs[(func_index - m.local_func_base()) as usize]; + function_is_param_addressing(&m, &local.body, params).expect("analysis runs") +} + +// =========================================================================== +// 8a — MUST-ACCEPT: legitimate, sound param-addressing +// =========================================================================== + +#[test] +fn a1_direct_param_load() { + assert!(accepts( + r#"(module (memory 1) (func (param i32) (result i32) + local.get 0 i32.load) (export "f" (func 0)))"#, + 1, + )); +} + +#[test] +fn a2_param_plus_const_struct_field() { + assert!(accepts( + r#"(module (memory 1) (func (param i32) (result i32) + local.get 0 i32.const 8 i32.add i32.load) (export "f" (func 0)))"#, + 1, + )); +} + +#[test] +fn a3_param_base_with_nonzero_memarg_offset() { + assert!(accepts( + r#"(module (memory 1) (func (param i32) (result i32) + local.get 0 i32.load offset=12) (export "f" (func 0)))"#, + 1, + )); +} + +#[test] +fn a4_store_through_param() { + assert!(accepts( + r#"(module (memory 1) (func (param i32 i32) + local.get 0 local.get 1 i32.store) (export "f" (func 0)))"#, + 2, + )); +} + +#[test] +fn a5_param_plus_const_store_with_memarg() { + assert!(accepts( + r#"(module (memory 1) (func (param i32 i32) + local.get 0 i32.const 16 i32.add local.get 1 i32.store offset=4) + (export "f" (func 0)))"#, + 2, + )); +} + +#[test] +fn a6_ptr_plus_param_len_add_propagates() { + // The headline ptr+len case: both `add` operands are Param, so the sum + // stays Param (a `Param + NotParam` sum would be demoted to NotParam). + assert!(accepts( + r#"(module (memory 1) (func (param i32 i32) (result i32) + local.get 0 local.get 1 i32.add i32.load) (export "f" (func 0)))"#, + 2, + )); +} + +#[test] +fn a7_param_minus_const() { + // `param - const`: minuend Param, subtrahend Const => Param. 
+ assert!(accepts( + r#"(module (memory 1) (func (param i32) (result i32) + local.get 0 i32.const 4 i32.sub i32.load) (export "f" (func 0)))"#, + 1, + )); +} + +#[test] +fn a8_param_copied_through_scratch_local() { + assert!(accepts( + r#"(module (memory 1) (func (param i32) (result i32) (local i32) + local.get 0 local.set 1 local.get 1 i32.load) (export "f" (func 0)))"#, + 1, + )); +} + +#[test] +fn a9_param_through_local_tee() { + // `local.tee` re-pushes the Param value, which then addresses the load. + assert!(accepts( + r#"(module (memory 1) (func (param i32) (result i32) (local i32) + local.get 0 local.tee 1 i32.load) (export "f" (func 0)))"#, + 1, + )); +} + +#[test] +fn a11_select_of_two_params() { + // select(param0, param1) => join(Param, Param) = Param. + assert!(accepts( + r#"(module (memory 1) (func (param i32 i32) (result i32) + local.get 0 local.get 1 i32.const 1 select i32.load) + (export "f" (func 0)))"#, + 2, + )); +} + +#[test] +fn a12_memory_fill_at_param() { + assert!(accepts( + r#"(module (memory 1) (func (param i32 i32 i32) + local.get 0 local.get 1 local.get 2 memory.fill) + (export "f" (func 0)))"#, + 3, + )); +} + +#[test] +fn a13_memory_copy_both_params() { + assert!(accepts( + r#"(module (memory 1) (func (param i32 i32 i32) + local.get 0 local.get 1 local.get 2 memory.copy) + (export "f" (func 0)))"#, + 3, + )); +} + +#[test] +fn a14_param_as_block_result() { + // A param produced inside a block and left as the block's single result must + // survive as Param into the enclosing load. + assert!(accepts( + r#"(module (memory 1) (func (param i32) (result i32) + (block (result i32) local.get 0) i32.load) (export "f" (func 0)))"#, + 1, + )); +} + +#[test] +fn a15_if_both_arms_param() { + // Both arms yield the param => join keeps Param. 
+ assert!(accepts( + r#"(module (memory 1) (func (param i32 i32) (result i32) + local.get 1 + (if (result i32) (then local.get 0) (else local.get 0)) + i32.load) (export "f" (func 0)))"#, + 2, + )); +} + +#[test] +fn a16_loop_result_stays_param() { + // A degenerate loop whose body yields the param; the fixpoint keeps Param. + assert!(accepts( + r#"(module (memory 1) (func (param i32) (result i32) + (loop (result i32) local.get 0) i32.load) (export "f" (func 0)))"#, + 1, + )); +} + +#[test] +fn a17_pure_function_no_memory() { + // No memory access at all: trivially safe (Tier A in practice). + assert!(accepts( + r#"(module (func (param i32 i32) (result i32) + local.get 0 local.get 1 i32.add) (export "f" (func 0)))"#, + 2, + )); +} + +// =========================================================================== +// 8b — MUST-REJECT: C-2 param-nulling arithmetic +// =========================================================================== + +#[test] +fn n1_param_minus_param_is_zero() { + // (param - param) + const = const: sub with a Param subtrahend is NotParam. + assert!(!accepts( + r#"(module (memory 1) (func (param i32 i32) + local.get 0 local.get 0 i32.sub i32.const 65536 i32.add + local.get 1 i32.store) (export "f" (func 0)))"#, + 2, + )); +} + +#[test] +fn n2_param_times_zero() { + assert!(!accepts( + r#"(module (memory 1) (func (param i32 i32) + local.get 0 i32.const 0 i32.mul i32.const 4096 i32.add + local.get 1 i32.store) (export "f" (func 0)))"#, + 2, + )); +} + +#[test] +fn n3_param_and_zero() { + assert!(!accepts( + r#"(module (memory 1) (func (param i32 i32 i32) + local.get 0 i32.const 0 i32.and i32.const 32768 i32.add + local.get 1 local.get 2 memory.fill) (export "f" (func 0)))"#, + 3, + )); +} + +#[test] +fn n4_param_xor_param_via_tee() { + // param ^ param = 0, laundered through local.tee, then + const. 
+ assert!(!accepts( + r#"(module (memory 1) (func (param i32) (result i32) (local i32) + local.get 0 local.tee 1 local.get 1 i32.xor + i32.const 49152 i32.add i32.load) (export "f" (func 0)))"#, + 1, + )); +} + +#[test] +fn n5_param_shl_then_and() { + assert!(!accepts( + r#"(module (memory 1) (func (param i32) (result i32) + local.get 0 i32.const 5 i32.shl i32.const 1024 i32.and i32.load) + (export "f" (func 0)))"#, + 1, + )); +} + +#[test] +fn n6_param_div_param_is_one() { + assert!(!accepts( + r#"(module (memory 1) (func (param i32) (result i32) + local.get 0 local.get 0 i32.div_u i32.load) (export "f" (func 0)))"#, + 1, + )); +} + +#[test] +fn n7_param_eqz() { + // eqz yields 0/1, a caller-independent address. + assert!(!accepts( + r#"(module (memory 1) (func (param i32) (result i32) + local.get 0 i32.eqz i32.load) (export "f" (func 0)))"#, + 1, + )); +} + +#[test] +fn n8_param_wrap_i64_unary_over_rejection() { + // Documented sound over-rejection: a width conversion erases Param. + assert!(!accepts( + r#"(module (memory 1) (func (param i64) (result i32) + local.get 0 i32.wrap_i64 i32.load) (export "f" (func 0)))"#, + 1, + )); +} + +// =========================================================================== +// 8b' — MUST-REJECT: add-side algebraic cancellation `(C - p) + p == C` +// +// The round-2 `sub` rule correctly demotes `const - param` to NotParam, but the +// value it produces is `C - p` (a *negated* parameter), not a constant. Adding +// `p` back recovers the caller-independent constant `C`. The `add` rule must +// therefore never re-promote a `Param + NotParam` to `Param`; only a proven +// `Const` addend keeps the base `Param`. Every case below stores/loads at a +// fixed absolute address regardless of the caller's pointer and MUST reject. +// (The mirror `(C + p) - p` was already correctly rejected by the `sub` rule; +// `cancel7` re-asserts that to pin the symmetry.) 
+// =========================================================================== + +#[test] +fn cancel1_const_minus_param_plus_param_store() { + // (C - p) + p == C. `i32.const 65536; local.get 0; i32.sub` = C - p + // (NotParam), then `local.get 0; i32.add` re-adds p. Must NOT re-promote. + assert!(!accepts( + r#"(module (memory 1) (func (param i32 i32) + i32.const 65536 local.get 0 i32.sub local.get 0 i32.add + local.get 1 i32.store) (export "f" (func 0)))"#, + 2, + )); +} + +#[test] +fn cancel2_param_plus_const_minus_param_store() { + // p + (C - p) == C. The commuted operand order: the param is the first + // `add` operand and the `(C - p)` NotParam is on top. + assert!(!accepts( + r#"(module (memory 1) (func (param i32 i32) + local.get 0 i32.const 65536 local.get 0 i32.sub i32.add + local.get 1 i32.store) (export "f" (func 0)))"#, + 2, + )); +} + +#[test] +fn cancel3_bulk_memory_fill_variant() { + // (C - p) + p == C addressing a memory.fill destination. + assert!(!accepts( + r#"(module (memory 1) (func (param i32 i32 i32) + i32.const 65536 local.get 0 i32.sub local.get 0 i32.add + local.get 1 local.get 2 memory.fill) (export "f" (func 0)))"#, + 3, + )); +} + +#[test] +fn cancel4_const_minus_param_laundered_through_local() { + // (C - p) parked in local 2, then `local.get 2; local.get 0; i32.add` + // reconstitutes the constant. The local must carry NotParam, not Param. + assert!(!accepts( + r#"(module (memory 1) (func (param i32 i32) (result i32) (local i32) + i32.const 65536 local.get 0 i32.sub local.set 2 + local.get 2 local.get 0 i32.add i32.load) (export "f" (func 0)))"#, + 2, + )); +} + +#[test] +fn cancel5_two_param_slots_store_at_other() { + // (C - p0) + p0 == C is the store address and p1 is only the value stored + // there: the cancelled address is independent of BOTH params; only the value path uses a genuine param. 
+ assert!(!accepts( + r#"(module (memory 1) (func (param i32 i32) + i32.const 65536 local.get 0 i32.sub local.get 0 i32.add + local.get 1 i32.store) (export "f" (func 0)))"#, + 2, + )); +} + +#[test] +fn cancel6_const_minus_param_load_directly() { + // The `(C - p)` value itself is NotParam, so loading through it is rejected + // even without the re-adding `add` — pins the `sub`-side classification. + assert!(!accepts( + r#"(module (memory 1) (func (param i32) (result i32) + i32.const 65536 local.get 0 i32.sub i32.load) (export "f" (func 0)))"#, + 1, + )); +} + +#[test] +fn cancel7_mirror_const_plus_param_minus_param_rejected() { + // The already-sound mirror `(C + p) - p == C`: `i32.const C; local.get 0; + // i32.add` = Param, then `local.get 0; i32.sub` = Param - Param = NotParam. + // Asserted to lock the symmetry the `add` fix restores. + assert!(!accepts( + r#"(module (memory 1) (func (param i32 i32) + i32.const 65536 local.get 0 i32.add local.get 0 i32.sub + local.get 1 i32.store) (export "f" (func 0)))"#, + 2, + )); +} + +#[test] +fn cancel8_param_plus_const_offset_still_accepted() { + // Soundness must not over-reject the legitimate `param + const` it protects: + // a genuine struct-field offset stays Param. (Mirrors a2, re-asserted in the + // cancellation family so a future regression here is caught alongside it.) + assert!(accepts( + r#"(module (memory 1) (func (param i32) (result i32) + local.get 0 i32.const 12 i32.add i32.load) (export "f" (func 0)))"#, + 1, + )); +} + +// =========================================================================== +// 8b'' — MUST-REJECT: sub-side algebraic cancellation `p - (p - C) == C` +// +// The mirror of the add-side cancellation family above. `Param - NotParam` must +// NOT preserve `Param`: `NotParam` means *not provably constant*, so the +// subtrahend may itself be a negated/offset parameter such as `p - C`, and +// `p - (p - C) == C` is a fixed, caller-independent absolute address. 
Only a +// proven `Const` subtrahend keeps the minuend's param-derivation. Each case +// below addresses a constant regardless of the caller's pointer and MUST reject. +// =========================================================================== + +#[test] +fn cancel9_param_minus_param_times_one_is_zero() { + // `p - (p * 1) == 0`. `p * 1` is a multiply, classified NotParam, but its + // runtime value is exactly the caller pointer, so the subtraction cancels to + // the absolute address 0. `Param - NotParam` must NOT re-promote to Param. + assert!(!accepts( + r#"(module (memory 1) (func (param i32) (result i32) + local.get 0 local.get 0 i32.const 1 i32.mul i32.sub i32.load) + (export "f" (func 0)))"#, + 1, + )); +} + +#[test] +fn cancel10_param_minus_notparam_offset_is_const() { + // `p - ((p * 1) - C) == C`, the laundering wholly within one function. The + // subtrahend `(p * 1) - C` is genuinely NotParam (a multiply makes `p * 1` + // NotParam, and `NotParam - Const` stays NotParam), yet its runtime value is + // `p - C`. The outer `p - (p - C)` recovers the constant `C` as a store + // address. `Param - NotParam` must NOT re-promote to Param. + assert!(!accepts( + r#"(module (memory 1) (func (param i32 i32) + local.get 0 + local.get 0 i32.const 1 i32.mul i32.const 4096 i32.sub + i32.sub + local.get 1 i32.store) (export "f" (func 0)))"#, + 2, + )); +} + +#[test] +fn cancel11_param_minus_helper_result_is_const_store() { + // The interprocedural form with a STORE: `$s(p) = p - 4096`. A call result is + // modeled NotParam, so `p - $s(p) == 4096` is a fixed absolute store address + // that the closure root's caller never supplies. The whole closure must + // reject rather than admit a fabricated host-memory write as Tier B. 
+ let m = module( + r#" + (module + (memory (;0;) 1) + (type (;0;) (func (param i32))) + (type (;1;) (func (param i32) (result i32))) + (func (;0;) (type 0) (param i32) + local.get 0 + local.get 0 + call 1 + i32.sub + i32.const 1234 + i32.store) + (func (;1;) (type 1) (param i32) (result i32) + local.get 0 i32.const 4096 i32.sub) + (export "writer" (func 0))) + "#, + ); + let err = verify_param_addressing(&m, &[0, 1], 0, "writer") + .expect_err("p - (p - C) laundered through a call must be rejected"); + assert!( + matches!(err, LinkError::RequiresRelocatableBuild { .. }), + "{err:?}" + ); +} + +#[test] +fn cancel12_param_minus_const_offset_still_accepted() { + // The positive control: the fix must not over-reject the legitimate + // `param - const` it protects. A negative offset into the caller's buffer + // (`p - 8`, a struct field below the pointer) stays Param and Tier B. Only a + // *provable* Const subtrahend keeps param-derivation, which this exercises. + assert!(accepts( + r#"(module (memory 1) (func (param i32) (result i32) + local.get 0 i32.const 8 i32.sub i32.load) (export "f" (func 0)))"#, + 1, + )); +} + +// =========================================================================== +// 8c — MUST-REJECT: C-1 control-flow-laundered absolute address +// =========================================================================== + +#[test] +fn f1_if_then_partial_write_skip_keeps_const() { + // The headline C-1: local 2 = join(const 1024 on skip, param0 on taken) = + // NotParam. The skip path leaves the const, which addresses the load. 
+ assert!(!accepts( + r#"(module (memory 1) (func (param i32 i32) (result i32) (local i32) + i32.const 1024 local.set 2 + (block local.get 1 (if (then local.get 0 local.set 2))) + local.get 2 i32.load) (export "f" (func 0)))"#, + 2, + )); +} + +#[test] +fn f2_if_else_one_arm_const() { + assert!(!accepts( + r#"(module (memory 1) (func (param i32 i32) (result i32) (local i32) + local.get 1 + (if (then local.get 0 local.set 2) (else i32.const 2048 local.set 2)) + local.get 2 i32.load) (export "f" (func 0)))"#, + 2, + )); +} + +#[test] +fn f3_single_arm_fallthrough_default() { + // The skip path leaves local 2 at its default (NotParam); join => NotParam. + assert!(!accepts( + r#"(module (memory 1) (func (param i32 i32) (result i32) (local i32) + local.get 1 (if (then local.get 0 local.set 2)) + local.get 2 i32.load) (export "f" (func 0)))"#, + 2, + )); +} + +#[test] +fn f4_loop_back_edge_clobber() { + // A later iteration overwrites local 2 with a const; the fixpoint joins the + // back-edge and demotes local 2 to NotParam. + assert!(!accepts( + r#"(module (memory 1) (func (param i32 i32) (result i32) (local i32) + local.get 0 local.set 2 + (loop + local.get 1 + (if (then i32.const 4096 local.set 2 br 1)) + local.get 1 br_if 0) + local.get 2 i32.load) (export "f" (func 0)))"#, + 2, + )); +} + +#[test] +fn f5_br_if_guarded_param_write() { + // br_if skips the param write on one path; the merge demotes local 2. + assert!(!accepts( + r#"(module (memory 1) (func (param i32 i32) (result i32) (local i32) + i32.const 8192 local.set 2 + (block local.get 1 br_if 0 local.get 0 local.set 2) + local.get 2 i32.load) (export "f" (func 0)))"#, + 2, + )); +} + +#[test] +fn f6_br_table_skips_param_write() { + // One table edge skips the param write into local 2. 
+ assert!(!accepts( + r#"(module (memory 1) (func (param i32 i32) (result i32) (local i32) + i32.const 16 local.set 2 + (block (block local.get 1 br_table 0 1) local.get 0 local.set 2) + local.get 2 i32.load) (export "f" (func 0)))"#, + 2, + )); +} + +#[test] +fn f7_param_on_stack_does_not_cross_into_block() { + // A param left on the operand stack before a block is not threaded in as the + // block's param unless the block type declares it; conservative reject. + assert!(!accepts( + r#"(module (memory 1) (func (param i32) (result i32) + local.get 0 (block (result i32) i32.load)) (export "f" (func 0)))"#, + 1, + )); +} + +#[test] +fn f8_control_laundered_store() { + // F1's join, but the demoted local 2 addresses a store instead of a load. + assert!(!accepts( + r#"(module (memory 1) (func (param i32 i32) (local i32) + i32.const 1024 local.set 2 + (block local.get 1 (if (then local.get 0 local.set 2))) + local.get 2 local.get 0 i32.store) (export "f" (func 0)))"#, + 2, + )); +} + +// =========================================================================== +// 8d — MUST-REJECT: straight-line constant / global regression guards +// =========================================================================== + +#[test] +fn s1_const_load_no_params() { + assert!(!accepts( + r#"(module (memory 1) (func (result i32) + i32.const 1024 i32.load) (export "f" (func 0)))"#, + 0, + )); +} + +#[test] +fn s2_store_at_const() { + assert!(!accepts( + r#"(module (memory 1) (func (param i32) + i32.const 4096 local.get 0 i32.store) (export "f" (func 0)))"#, + 1, + )); +} + +#[test] +fn s3_global_address_load() { + assert!(!accepts( + r#"(module (memory 1) (global i32 (i32.const 0)) (func (result i32) + global.get 0 i32.load) (export "f" (func 0)))"#, + 0, + )); +} + +#[test] +fn s4_const_in_scratch_local() { + assert!(!accepts( + r#"(module (memory 1) (func (param i32) (result i32) (local i32) + i32.const 2048 local.set 1 local.get 1 i32.load) (export "f" (func 0)))"#, + 1, + )); +} + 
+#[test] +fn s5_memory_fill_at_const() { + assert!(!accepts( + r#"(module (memory 1) (func (param i32 i32) + i32.const 0 local.get 0 local.get 1 memory.fill) (export "f" (func 0)))"#, + 2, + )); +} + +#[test] +fn s6_memory_grow_result_is_not_an_address() { + assert!(!accepts( + r#"(module (memory 1) (func (param i32) (result i32) + local.get 0 memory.grow i32.load) (export "f" (func 0)))"#, + 1, + )); +} + +#[test] +fn s7_memory_size_result_is_not_an_address() { + assert!(!accepts( + r#"(module (memory 1) (func (result i32) + memory.size i32.load) (export "f" (func 0)))"#, + 0, + )); +} + +// =========================================================================== +// 8e — C-3 call boundaries: the SOUND interprocedural analysis. A constant +// laundered through a `call` rejects (the callee's param is untrusted at that +// site); a param-derived argument threaded through a `call` is accepted (the +// callee's param is trusted at every site). +// =========================================================================== + +#[test] +fn c3a_const_arg_through_helper_call_is_rejected() { + // $sum: const 1024; call $g $g: param0 load. The only call site passes a + // constant for $g's param 0, so param 0 is NOT trusted in $g, and $g's load + // through it is rejected interprocedurally. + let m = module( + r#" + (module + (memory (;0;) 1) + (type (;0;) (func (param i32 i32) (result i32))) + (type (;1;) (func (param i32) (result i32))) + (func (;0;) (type 0) (param i32 i32) (result i32) + i32.const 1024 call 1) + (func (;1;) (type 1) (param i32) (result i32) + local.get 0 i32.load) + (export "sum" (func 0))) + "#, + ); + let err = verify_param_addressing(&m, &[0, 1], 0, "sum") + .expect_err("a const arg laundered through a call must be rejected"); + assert!( + matches!(err, LinkError::RequiresRelocatableBuild { .. 
}), + "{err:?}" + ); +} + +#[test] +fn c3b_param_arg_through_helper_is_accepted() { + // The legitimate factored helper: $sum passes its own param 0 to $g, which + // loads through its (now trusted) param 0. The sound interprocedural fixpoint + // accepts this — the call-laundering stopgap no longer over-rejects it. + let m = module( + r#" + (module + (memory (;0;) 1) + (type (;0;) (func (param i32 i32) (result i32))) + (type (;1;) (func (param i32) (result i32))) + (func (;0;) (type 0) (param i32 i32) (result i32) + local.get 0 call 1) + (func (;1;) (type 1) (param i32) (result i32) + local.get 0 i32.load) + (export "sum" (func 0))) + "#, + ); + assert!( + verify_param_addressing(&m, &[0, 1], 0, "sum").is_ok(), + "a param-derived arg threaded through a call must be accepted" + ); +} + +#[test] +fn c3c_call_result_used_as_address_is_rejected() { + // The call result is NotParam, so using it as an address is rejected even at + // the single-function level. + assert!(!accepts_in( + r#" + (module + (memory (;0;) 1) + (type (;0;) (func (result i32))) + (func (;0;) (type 0) (result i32) + call 1 i32.load) + (func (;1;) (type 0) (result i32) + i32.const 1024) + (export "f" (func 0))) + "#, + 0, + 0, + )); +} + +#[test] +fn single_function_memory_closure_is_still_analyzed() { + // A single-function closure IS its own root, so its parameters seed the + // trusted set and the analysis proves it precisely (the n=1 case unchanged). + let m = module( + r#"(module (memory 1) (func (param i32) (result i32) + local.get 0 i32.load) (export "f" (func 0)))"#, + ); + assert!(verify_param_addressing(&m, &[0], 0, "f").is_ok()); +} + +// =========================================================================== +// 8i — SOUND interprocedural address-provenance. The closure root's parameters +// are the only caller-supplied pointers; an inner function's parameter is +// trusted only when *every* reachable call site passes it a param-derived +// argument. 
Each case is a whole module with one or more functions sharing the +// one memory. +// =========================================================================== + +/// Runs the interprocedural verifier over a whole `module_wat`, treating +/// `root` as the closure root and `func_indices` as the functions in the closure. +fn verify(module_wat: &str, func_indices: &[u32], root: u32) -> Result<(), LinkError> { + let m = module(module_wat); + verify_param_addressing(&m, func_indices, root, "export") +} + +#[test] +fn ip1_sort_calls_swap_with_param_derived_pointer_accepts() { + // The headline case (a): `sort(ptr,len)` calls `swap(p,a,b)` with a + // param-derived `ptr` argument; `swap` dereferences its pointer param. + // `swap`'s param 0 is trusted at the only call site (it is `sort`'s ptr), so + // the whole closure is accepted. + assert!(verify( + r#" + (module + (memory (;0;) 1) + (type (;0;) (func (param i32 i32))) + (type (;1;) (func (param i32 i32 i32))) + (func (;0;) (type 0) (param i32 i32) + local.get 0 local.get 0 local.get 1 call 1) + (func (;1;) (type 1) (param i32 i32 i32) + local.get 0 local.get 1 i32.store + local.get 0 local.get 2 i32.store) + (export "sort" (func 0))) + "#, + &[0, 1], + 0, + ) + .is_ok()); +} + +#[test] +fn ip2_helper_called_with_constant_address_rejects() { + // Case (b): a helper `g(addr)` that loads through its param, called with a + // *constant* argument. `g`'s param 0 is untrusted (the const arg), so its + // load is rejected. + assert!(verify( + r#" + (module + (memory (;0;) 1) + (type (;0;) (func (result i32))) + (type (;1;) (func (param i32) (result i32))) + (func (;0;) (type 0) (result i32) + i32.const 1024 call 1) + (func (;1;) (type 1) (param i32) (result i32) + local.get 0 i32.load) + (export "root" (func 0))) + "#, + &[0, 1], + 0, + ) + .is_err()); +} + +#[test] +fn ip3_helper_called_from_two_sites_one_const_rejects() { + // Case (c): a helper called from two sites — one param-derived, one constant. 
+ // The must-join over call sites demotes the helper's param to untrusted, so + // its dereference is rejected. + assert!(verify( + r#" + (module + (memory (;0;) 1) + (type (;0;) (func (param i32))) + (type (;1;) (func (param i32))) + (func (;0;) (type 0) (param i32) + local.get 0 call 2 + i32.const 4096 call 2) + (func (;1;) (type 0) (param i32) + local.get 0 call 2) + (func (;2;) (type 1) (param i32) + local.get 0 i32.const 0 i32.store) + (export "root" (func 0))) + "#, + &[0, 1, 2], + 0, + ) + .is_err()); +} + +#[test] +fn ip3b_helper_called_from_two_param_derived_sites_accepts() { + // Control for (c): the same two-call-site shape, but *both* sites pass a + // param-derived argument. The helper's param stays trusted; accepted. + assert!(verify( + r#" + (module + (memory (;0;) 1) + (type (;0;) (func (param i32 i32))) + (type (;1;) (func (param i32))) + (func (;0;) (type 0) (param i32 i32) + local.get 0 call 1 + local.get 1 call 1) + (func (;1;) (type 1) (param i32) + local.get 0 i32.const 0 i32.store) + (export "root" (func 0))) + "#, + &[0, 1], + 0, + ) + .is_ok()); +} + +#[test] +fn ip4a_self_recursion_passing_param_accepts() { + // Case (d): self-recursion passing a param-derived argument (`f(p)` calls + // `f(p+1)`), dereferencing its param. The greatest fixpoint keeps the param + // trusted across the back-edge; accepted. + assert!(verify( + r#" + (module + (memory (;0;) 1) + (type (;0;) (func (param i32))) + (func (;0;) (type 0) (param i32) + local.get 0 i32.const 0 i32.store + local.get 0 i32.const 1 i32.add call 0) + (export "f" (func 0))) + "#, + &[0], + 0, + ) + .is_ok()); +} + +#[test] +fn ip4b_self_recursion_passing_const_rejects() { + // Case (d): self-recursion passing a *constant* argument that the function + // dereferences. The fixpoint removes the param from the trusted set (a const + // reaches it on the recursive path), so its dereference is rejected. 
+ assert!(verify( + r#" + (module + (memory (;0;) 1) + (type (;0;) (func (param i32))) + (func (;0;) (type 0) (param i32) + local.get 0 i32.load drop + i32.const 2048 call 0) + (export "f" (func 0))) + "#, + &[0], + 0, + ) + .is_err()); +} + +#[test] +fn ip5_mutual_recursion_param_derived_accepts() { + // Case (e): mutual recursion `a(p) -> b(p) -> a(p)`, each dereferencing its + // param, every call threading a param-derived argument. The fixpoint keeps + // both params trusted; accepted. + assert!(verify( + r#" + (module + (memory (;0;) 1) + (type (;0;) (func (param i32))) + (func (;0;) (type 0) (param i32) + local.get 0 i32.const 0 i32.store + local.get 0 call 1) + (func (;1;) (type 0) (param i32) + local.get 0 i32.const 0 i32.store + local.get 0 call 0) + (export "a" (func 0))) + "#, + &[0, 1], + 0, + ) + .is_ok()); +} + +#[test] +fn ip5b_mutual_recursion_one_const_arg_rejects() { + // Case (e): mutual recursion where one leg passes a constant to the other, + // which dereferences it. The const poisons the callee's param; rejected. + assert!(verify( + r#" + (module + (memory (;0;) 1) + (type (;0;) (func (param i32))) + (func (;0;) (type 0) (param i32) + i32.const 512 call 1) + (func (;1;) (type 0) (param i32) + local.get 0 i32.const 0 i32.store + local.get 0 call 0) + (export "a" (func 0))) + "#, + &[0, 1], + 0, + ) + .is_err()); +} + +#[test] +fn ip6_call_indirect_result_as_address_rejects() { + // Case (f): a `call_indirect` whose result feeds an address. The result is + // NotParam (no callee param is trusted through an indirect dispatch), so the + // dereference is rejected. (A table use also marks the closure Tier C + // upstream; this pins the provenance-level conservatism directly.) 
+ assert!(!accepts_in( + r#" + (module + (memory (;0;) 1) + (table (;0;) 1 funcref) + (type (;0;) (func (result i32))) + (func (;0;) (type 0) (result i32) + i32.const 0 call_indirect (type 0) i32.load) + (export "f" (func 0))) + "#, + 0, + 0, + )); +} + +#[test] +fn ip7_root_param_is_trusted_even_when_a_callsite_passes_const() { + // The root's parameters are seeded trusted unconditionally (the caller owns + // the shared memory). A const passed to a *helper* poisons only the helper's + // param, never the root's own dereference. Here the root dereferences its own + // param 0 directly and also calls a const-fed helper that does NOT touch + // memory — the root access stays accepted. + assert!(verify( + r#" + (module + (memory (;0;) 1) + (type (;0;) (func (param i32) (result i32))) + (type (;1;) (func (param i32) (result i32))) + (func (;0;) (type 0) (param i32) (result i32) + i32.const 9 call 1 drop + local.get 0 i32.load) + (func (;1;) (type 1) (param i32) (result i32) + local.get 0 i32.const 1 i32.add) + (export "root" (func 0))) + "#, + &[0, 1], + 0, + ) + .is_ok()); +} + +#[test] +fn ip8_callee_reached_only_via_table_param_is_untrusted() { + // Fail-closed (f): a function dereferences its param but is reachable only + // through the table (no direct `call` site records an argument). With no + // call site to justify trusting its param, the param defaults untrusted and + // its dereference is rejected. Modeled here as an inner function present in + // the closure with no direct caller. 
+ assert!(verify( + r#" + (module + (memory (;0;) 1) + (type (;0;) (func (param i32))) + (func (;0;) (type 0) (param i32) + local.get 0 i32.const 0 i32.store) + (func (;1;) (type 0) (param i32) + local.get 0 i32.load drop) + (export "root" (func 0))) + "#, + &[0, 1], + 0, + ) + .is_err()); +} + +#[test] +fn ip9_diamond_all_param_derived_accepts() { + // A diamond: root calls two mids, both of which call one shared leaf with a + // param-derived pointer; the leaf dereferences its param. Every call site is + // param-derived, so the leaf's param is trusted; accepted. + assert!(verify( + r#" + (module + (memory (;0;) 1) + (type (;0;) (func (param i32))) + (func (;0;) (type 0) (param i32) + local.get 0 call 1 + local.get 0 call 2) + (func (;1;) (type 0) (param i32) + local.get 0 call 3) + (func (;2;) (type 0) (param i32) + local.get 0 call 3) + (func (;3;) (type 0) (param i32) + local.get 0 i32.const 0 i32.store) + (export "root" (func 0))) + "#, + &[0, 1, 2, 3], + 0, + ) + .is_ok()); +} + +#[test] +fn ip10_diamond_one_leg_const_rejects() { + // The same diamond, but one mid passes a constant to the shared leaf. The + // must-join over the leaf's two call sites demotes its param; rejected. + assert!(verify( + r#" + (module + (memory (;0;) 1) + (type (;0;) (func (param i32))) + (func (;0;) (type 0) (param i32) + local.get 0 call 1 + local.get 0 call 2) + (func (;1;) (type 0) (param i32) + local.get 0 call 3) + (func (;2;) (type 0) (param i32) + i32.const 64 call 3) + (func (;3;) (type 0) (param i32) + local.get 0 i32.const 0 i32.store) + (export "root" (func 0))) + "#, + &[0, 1, 2, 3], + 0, + ) + .is_err()); +} + +#[test] +fn ip11_non_root_export_position_seeds_only_the_root() { + // The root is whichever function satisfies the export, not function 0. Here + // function 1 is the root; it calls function 0 (the helper) with a constant. 
+ // The helper's param is untrusted; its dereference is rejected — proving the + // seed follows the `root` argument, not the lowest index. + assert!(verify( + r#" + (module + (memory (;0;) 1) + (type (;0;) (func (param i32))) + (func (;0;) (type 0) (param i32) + local.get 0 i32.load drop) + (func (;1;) (type 0) (param i32) + i32.const 7 call 0) + (export "root" (func 1))) + "#, + &[0, 1], + 1, + ) + .is_err()); +} + +// =========================================================================== +// 8f — MUST-REJECT: memory.copy / multi-operand partial-param +// =========================================================================== + +#[test] +fn mc1_copy_src_is_const() { + assert!(!accepts( + r#"(module (memory 1) (func (param i32 i32) + local.get 0 i32.const 0 local.get 1 memory.copy) (export "f" (func 0)))"#, + 2, + )); +} + +#[test] +fn mc2_copy_dest_is_const() { + assert!(!accepts( + r#"(module (memory 1) (func (param i32 i32) + i32.const 0 local.get 0 local.get 1 memory.copy) (export "f" (func 0)))"#, + 2, + )); +} + +#[test] +fn mc3_copy_both_params() { + assert!(accepts( + r#"(module (memory 1) (func (param i32 i32 i32) + local.get 0 local.get 1 local.get 2 memory.copy) (export "f" (func 0)))"#, + 3, + )); +} + +#[test] +fn mc4_copy_src_is_param_plus_zero() { + // src = param1 + 0 => add => Param; dest = param0 => Param. ACCEPT. + assert!(accepts( + r#"(module (memory 1) (func (param i32 i32 i32) + local.get 0 local.get 1 i32.const 0 i32.add local.get 2 memory.copy) + (export "f" (func 0)))"#, + 3, + )); +} + +// =========================================================================== +// 8f' — S1: the bulk-memory SIZE / extent operand must be caller-derived too. +// +// A bulk-memory op touches `[address, address + size)`. Modeling only the start +// address let an external clobber/read an unbounded region above a caller +// pointer with a constant extent (`memory.fill(param, v, 0x8000)`). 
The extent +// now carries the same caller-derivation requirement as an address: a constant +// or global size (empty mask) REJECTS, a caller-passed size (Param) ADMITS. +// =========================================================================== + +#[test] +fn ext1_fill_param_dest_const_size_rejected() { + // `memory.fill(param0, v, 0x8000)`: dest is caller-derived, but the constant + // extent could scorch host memory above the pointer. REJECT. + assert!(!accepts( + r#"(module (memory 1) (func (param i32 i32) + local.get 0 local.get 1 i32.const 32768 memory.fill) + (export "f" (func 0)))"#, + 2, + )); +} + +#[test] +fn ext2_fill_param_dest_param_size_accepted() { + // `memory.fill(param0, v, param2)`: both the destination and the extent are + // caller-supplied, so the clobber is bounded by a value the caller owns. + // ACCEPT. + assert!(accepts( + r#"(module (memory 1) (func (param i32 i32 i32) + local.get 0 local.get 1 local.get 2 memory.fill) + (export "f" (func 0)))"#, + 3, + )); +} + +#[test] +fn ext3_copy_params_const_size_rejected() { + // `memory.copy(param0, param1, 0x8000)`: both ends are caller-derived, but + // the constant extent is unbounded relative to the caller's pointers. REJECT. + assert!(!accepts( + r#"(module (memory 1) (func (param i32 i32) + local.get 0 local.get 1 i32.const 32768 memory.copy) + (export "f" (func 0)))"#, + 2, + )); +} + +#[test] +fn ext4_copy_all_params_accepted() { + // `memory.copy(dst_param, src_param, len_param)`: dest, src, and extent are + // all caller-supplied. ACCEPT. + assert!(accepts( + r#"(module (memory 1) (func (param i32 i32 i32) + local.get 0 local.get 1 local.get 2 memory.copy) + (export "f" (func 0)))"#, + 3, + )); +} + +#[test] +fn ext5_fill_const_size_via_local_rejected() { + // The extent laundered through a scratch local is still a constant: the + // local carries `Const`, whose empty mask rejects. REJECT. 
+ assert!(!accepts( + r#"(module (memory 1) (func (param i32 i32) (result i32) (local i32) + i32.const 32768 local.set 2 + local.get 0 local.get 1 local.get 2 memory.fill + local.get 0) (export "f" (func 0)))"#, + 2, + )); +} + +#[test] +fn ext6_copy_param_extent_plus_const_accepted() { + // `len = param2 + const` stays Param, so a caller-bounded extent adjusted by + // a fixed offset is still admitted (the realistic `len - 1` / `len + 1` + // pattern). ACCEPT. + assert!(accepts( + r#"(module (memory 1) (func (param i32 i32 i32) + local.get 0 local.get 1 local.get 2 i32.const 1 i32.add memory.copy) + (export "f" (func 0)))"#, + 3, + )); +} + +// =========================================================================== +// 8g — select-laundered & nested-block edge cases +// =========================================================================== + +#[test] +fn sl1_select_param_and_const() { + // select(param, const) => join(Param, NotParam) = NotParam. + assert!(!accepts( + r#"(module (memory 1) (func (param i32) (result i32) + local.get 0 i32.const 1024 i32.const 1 select i32.load) + (export "f" (func 0)))"#, + 1, + )); +} + +#[test] +fn sl2_select_param_and_param() { + // select(param0, param1) => Param (= a11). + assert!(accepts( + r#"(module (memory 1) (func (param i32 i32) (result i32) + local.get 0 local.get 1 i32.const 1 select i32.load) + (export "f" (func 0)))"#, + 2, + )); +} + +#[test] +fn nb1_param_threaded_through_nested_blocks() { + assert!(accepts( + r#"(module (memory 1) (func (param i32) (result i32) + (block (result i32) + (block (result i32) + (block (result i32) local.get 0))) + i32.load) (export "f" (func 0)))"#, + 1, + )); +} + +#[test] +fn nb2_inner_if_writes_const_demotes_outward() { + // An inner if conditionally writes a const into the address local; the join + // demotes it, and the demotion propagates out of the nested blocks. 
+ assert!(!accepts( + r#"(module (memory 1) (func (param i32 i32) (result i32) (local i32) + local.get 0 local.set 2 + (block + (block + local.get 1 + (if (then i32.const 256 local.set 2)))) + local.get 2 i32.load) (export "f" (func 0)))"#, + 2, + )); +} + +#[test] +fn tee1_local_tee_const_under_control_flow() { + // local.tee writes a const on the taken arm; merge with the skip-path entry + // (local 1 at default NotParam) => NotParam. + assert!(!accepts( + r#"(module (memory 1) (func (param i32) (result i32) (local i32) + local.get 0 + (if (then i32.const 100 local.tee 1 drop)) + local.get 1 i32.load) (export "f" (func 0)))"#, + 1, + )); +} + +// =========================================================================== +// 8h — M-1 resource guard: over-declared locals must not OOM +// =========================================================================== + +#[test] +fn r1_over_declared_locals_rejected_without_huge_alloc() { + // A tiny body whose single locals group claims u32::MAX locals must be + // rejected as a clean LinkError::Parse before any per-local allocation, + // never driving a multi-gigabyte `vec!`. + let body = over_declared_locals_body(u32::MAX); + let m = ParsedModule::default(); + let err = function_is_param_addressing(&m, &body, 0) + .expect_err("over-declared locals must be rejected"); + assert!( + matches!(err, LinkError::Parse(msg) if msg.contains("too many locals")), + "expected a clean Parse rejection for the over-declared locals count" + ); +} + +#[test] +fn r2_locals_exceeding_body_length_rejected() { + // A locals group declaring more locals than the body has bytes is malformed + // (each local costs >= 1 byte); reject before allocation. 
+ let body = over_declared_locals_body(1_000_000); + let m = ParsedModule::default(); + let err = function_is_param_addressing(&m, &body, 0) + .expect_err("locals exceeding body length must be rejected"); + assert!(matches!(err, LinkError::Parse(_)), "{err:?}"); +} + +#[test] +fn r3_locals_under_the_cap_are_analyzed() { + // A modest, legitimate locals count runs the analysis to completion. + assert!(accepts( + r#"(module (memory 1) (func (param i32) (result i32) + (local i32 i32 i32) + local.get 0 i32.load) (export "f" (func 0)))"#, + 1, + )); +} + +// =========================================================================== +// Deep nesting: the analysis must fail closed, never overflow its own stack +// =========================================================================== + +#[test] +fn deeply_nested_blocks_fail_closed_without_aborting() { + // A body nested far past the analysis depth cap must be rejected as a normal + // `Ok(false)` (Tier C), never recurse until the analysis stack overflows. + let depth = super::MAX_ANALYSIS_DEPTH + 50; + let mut wat = String::from( + "(module (memory 1) (func (param i32) (result i32) local.get 0 ", + ); + for _ in 0..depth { + wat.push_str("(block (result i32) "); + } + wat.push_str("i32.load"); + for _ in 0..depth { + wat.push(')'); + } + wat.push_str(") (export \"f\" (func 0)))"); + + // Past the depth cap the analysis fails closed (rejects), and it must return + // a verdict rather than abort. + assert!(!accepts(&wat, 1)); +} + +/// Builds a raw function body whose single locals group declares `count` locals +/// of type `i32`, followed by `i32.const 0; i32.load; drop; end` (a memory-using +/// body). The body is hand-encoded because `wat` would reject a u32::MAX locals +/// count; this exercises the analysis's own pre-allocation cap. +fn over_declared_locals_body(count: u32) -> Vec<u8> { + let mut body = Vec::new(); + // locals: one group of (count, i32). count is a LEB128 u32; i32 == 0x7F. 
+ body.push(0x01); // one locals group + write_leb_u32(&mut body, count); + body.push(0x7F); // i32 + // i32.const 0 + body.push(0x41); + body.push(0x00); + // i32.load (align=2, offset=0) + body.push(0x28); + body.push(0x02); + body.push(0x00); + // drop + body.push(0x1A); + // end + body.push(0x0B); + body +} + +/// Writes `value` as unsigned LEB128. +fn write_leb_u32(out: &mut Vec, mut value: u32) { + loop { + let mut byte = (value & 0x7F) as u8; + value >>= 7; + if value != 0 { + byte |= 0x80; + } + out.push(byte); + if value == 0 { + break; + } + } +} diff --git a/core/wasm-linker/src/rewrite.rs b/core/wasm-linker/src/rewrite.rs new file mode 100644 index 00000000..48d75192 --- /dev/null +++ b/core/wasm-linker/src/rewrite.rs @@ -0,0 +1,922 @@ +//! Re-encoding a copied function body under a new index space. +//! +//! When a function body is moved from an external module into the main module, +//! every index it references shifts: a `call N` now means a different function, +//! a `call_indirect (type T)` a different type, and so on. This pass walks the +//! operator stream and re-emits it, copying each operator's bytes verbatim +//! *except* the index-bearing operators, which it re-encodes with the remapped +//! index via `wasm-encoder` (so the opcode encoding stays canonical). +//! +//! The verbatim-copy default keeps the body byte-identical wherever no index +//! changes, which both minimizes surface area and makes round-trips exact for +//! the common (Tier-A/B) case where the only remapped operator is `call`. + +use inf_wasmparser::{BinaryReader, FunctionBody, Operator, ValType}; +use wasm_encoder::{Encode, Function, Instruction}; + +use crate::safety::{check_operator, is_verification_only, opens_control_frame, MAX_CONTROL_DEPTH}; +use crate::LinkError; + +/// Where a body being re-encoded comes from, which decides how the +/// verification-only non-det/uzumaki opcodes are treated. 
+/// +/// The main module in proof mode legitimately carries these opcodes as Rocq +/// proof scaffolding, so they are copied through verbatim. An external module's +/// body is merged into an executable binary, where the same opcodes have no +/// runtime meaning; they are rejected rather than copied. (The external closure +/// scan rejects them first via [`check_operator`], so this arm is defence in +/// depth for any external body re-encoded directly.) +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub(crate) enum BodyOrigin { + /// The main module's own body. Verification-only opcodes are proof + /// scaffolding and pass through unaltered. + Main, + /// An external module's body merged into the output. Verification-only + /// opcodes are rejected as non-executable. + External, +} + +/// `0xfc`-prefix byte shared by the Inference non-deterministic block opcodes +/// (`forall`/`exists`/`assume`/`unique`), mirroring the codegen encoding. +const NONDET_OPCODE_PREFIX: u8 = 0xfc; + +/// `0xfc` sub-opcode for `forall`, matching the codegen and `inf-wasmparser` +/// decoder. +const FORALL_SUBOPCODE: u8 = 0x3a; +/// `0xfc` sub-opcode for `exists`. +const EXISTS_SUBOPCODE: u8 = 0x3b; +/// `0xfc` sub-opcode for `assume`. +const ASSUME_SUBOPCODE: u8 = 0x3c; +/// `0xfc` sub-opcode for `unique`. +const UNIQUE_SUBOPCODE: u8 = 0x3d; + +/// Maps an index from the source module's space into the merged module's space. +pub(crate) struct IndexMap<'a> { + /// `source_func_idx -> merged_func_idx` for every function in the closure. + pub func: &'a dyn Fn(u32) -> u32, + /// `source_type_idx -> merged_type_idx` for every type the closure uses. + /// + /// Fallible: a body can reference a type index the merge never interned + /// (e.g. a function-typed block over an unused signature, or an out-of-range + /// index in an adversarial body), which must surface as a [`LinkError`] + /// rather than panic. 
+ pub ty: &'a dyn Fn(u32) -> Result, +} + +/// Re-encodes one function body under `map`, returning a `wasm-encoder` +/// [`Function`] ready to append to the output code section. +/// +/// The input `body` is the raw code-section body (locals vector followed by the +/// operator stream, no length prefix), exactly as stored in +/// [`crate::parse::LocalFunc::body`]. +/// +/// `origin` selects how the verification-only non-det/uzumaki opcodes are +/// handled: passed through verbatim for the main module's proof scaffolding, +/// rejected for an external body merged into the executable output (see +/// [`BodyOrigin`]). +pub(crate) fn reencode_body( + body: &[u8], + map: &IndexMap, + origin: BodyOrigin, +) -> Result { + let reader = BinaryReader::new(body, 0); + let func_body = FunctionBody::new(reader); + + let locals = read_locals(&func_body)?; + let mut function = Function::new(locals); + + // Collect operators with their start offsets so each operator's byte span + // can be sliced for verbatim copying. The reader starts at 0, so offsets + // are absolute into `body`; operator `i` spans `[offset_i, offset_{i+1})`, + // and the last (the body-terminating `end`) runs to `body.len()`. + let mut ops = Vec::new(); + for item in func_body + .get_operators_reader() + .map_err(|e| LinkError::Parse(e.to_string()))? + .into_iter_with_offsets() + { + let (op, offset) = item.map_err(|e| LinkError::Parse(e.to_string()))?; + ops.push((op, offset)); + } + + // Bound structured-control-flow nesting on this path too. The closure scan + // gates external bodies, but the main module's body is re-encoded here without + // passing through that scan; an over-nested main body would link and only fail + // in the downstream wasm-to-v translator (which recurses one frame per level), + // violating the invariant that anything the linker emits is translatable. + // Rejecting here at the same cap the closure scan and the translator use keeps + // the three passes in agreement. 
A `block`/`loop`/`if`/non-det op opens a + // frame; an `End` closes the innermost one. + let mut control_depth: usize = 0; + for (i, (op, offset)) in ops.iter().enumerate() { + if opens_control_frame(op) { + control_depth += 1; + if control_depth >= MAX_CONTROL_DEPTH { + return Err(LinkError::UnsupportedConstruct(format!( + "function body nests structured control flow at least {MAX_CONTROL_DEPTH} levels deep" + ))); + } + } else if matches!(op, Operator::End) { + control_depth = control_depth.saturating_sub(1); + } + let end = ops.get(i + 1).map_or(body.len(), |(_, o)| *o); + let span = &body[*offset..end]; + emit_operator(&mut function, op, span, map, origin)?; + } + + Ok(function) +} + +/// Reads the locals declarations from a body into the `(count, ValType)` form +/// `wasm-encoder::Function::new` expects. +fn read_locals(body: &FunctionBody) -> Result, LinkError> { + let mut locals_reader = body + .get_locals_reader() + .map_err(|e| LinkError::Parse(e.to_string()))?; + let count = locals_reader.get_count(); + let mut locals = Vec::with_capacity(count as usize); + for _ in 0..count { + let (n, ty) = locals_reader + .read() + .map_err(|e| LinkError::Parse(e.to_string()))?; + locals.push((n, map_val_type(ty)?)); + } + Ok(locals) +} + +/// Emits a single operator, re-encoding the index-bearing ones and copying the +/// rest verbatim from their original bytes. +/// +/// `origin` decides the treatment of the verification-only non-det/uzumaki +/// opcodes: an external body rejects them as non-executable, the main module's +/// body passes them through as proof scaffolding (see [`BodyOrigin`]). +fn emit_operator( + function: &mut Function, + op: &Operator, + span: &[u8], + map: &IndexMap, + origin: BodyOrigin, +) -> Result<(), LinkError> { + // An external body must never carry a verification-only opcode into the + // executable output. 
The external closure scan already rejects such a body + // via `check_operator`, so reaching here means a body re-encoded outside + // that scan; reject it the same way rather than emit a non-executable block. + if origin == BodyOrigin::External && is_verification_only(op) { + check_operator(op)?; + } + match op { + Operator::Call { function_index } => { + function.instruction(&Instruction::Call((map.func)(*function_index))); + } + Operator::RefFunc { function_index } => { + function.instruction(&Instruction::RefFunc((map.func)(*function_index))); + } + Operator::CallIndirect { + type_index, + table_index, + } => { + function.instruction(&Instruction::CallIndirect { + type_index: (map.ty)(*type_index)?, + table_index: *table_index, + }); + } + // The tail-call forms (`return_call` / `return_call_indirect`) have no + // arm of their own: the Rocq translator has no lowering for them, and + // Inference codegen never emits them. They fall through to the final arm, + // which rejects them via the fail-closed allow-list — closing the bypass + // that previously re-indexed and copied a tail call on the main path. + + // Block-type operators can carry a type index in their multi-value + // form. Inference codegen only emits the empty and value block types, + // but a Tier-A/B external body could use a function block type, so + // re-encode those defensively rather than copy a now-stale index. + Operator::Block { blockty } + | Operator::Loop { blockty } + | Operator::If { blockty } => { + emit_block(function, op, *blockty, map)?; + } + // The Inference non-det block operators carry the identical `blockty` + // payload, so their function block-type index must be remapped exactly + // like `Block`/`Loop`/`If`. `wasm-encoder` models no custom opcode, so + // they are re-emitted as raw bytes (prefix + sub-opcode + re-encoded + // block type) rather than via an `Instruction`. 
+ Operator::Forall { blockty } => { + emit_nondet_block(function, FORALL_SUBOPCODE, *blockty, map)?; + } + Operator::Exists { blockty } => { + emit_nondet_block(function, EXISTS_SUBOPCODE, *blockty, map)?; + } + Operator::Assume { blockty } => { + emit_nondet_block(function, ASSUME_SUBOPCODE, *blockty, map)?; + } + Operator::Unique { blockty } => { + emit_nondet_block(function, UNIQUE_SUBOPCODE, *blockty, map)?; + } + // The main module's verification-only opcodes (the uzumaki rvalues, and + // any non-det block reached here) are proof scaffolding with no + // executable meaning: they are copied through verbatim and must bypass + // the fail-closed allow-list, which rejects them for the executable + // merge. (An external body's verification-only opcodes were already + // rejected at the top of this function.) + _ if origin == BodyOrigin::Main && is_verification_only(op) => { + function.raw(span.iter().copied()); + } + _ => { + // Every other operator carries no index that the merge changes + // (locals, constants, arithmetic, control flow targets, and + // memargs over the single shared memory all stay valid), so it is + // copied verbatim — but only after the fail-closed allow-list + // confirms the merge models it. An atomic, SIMD, exception-handling, + // typed-reference, or multi-memory operator is rejected here rather + // than copied into a structurally-invalid output, even for a body + // (e.g. the main module's) the closure scanner never walked. + check_operator(op)?; + function.raw(span.iter().copied()); + } + } + Ok(()) +} + +fn emit_block( + function: &mut Function, + op: &Operator, + blockty: inf_wasmparser::BlockType, + map: &IndexMap, +) -> Result<(), LinkError> { + let encoded = map_block_type(blockty, map)?; + let instr = match op { + Operator::Block { .. } => Instruction::Block(encoded), + Operator::Loop { .. } => Instruction::Loop(encoded), + Operator::If { .. 
} => Instruction::If(encoded), + _ => unreachable!("emit_block called with non-block operator"), + }; + function.instruction(&instr); + Ok(()) +} + +/// Re-emits an Inference non-det block operator (`forall`/`exists`/`assume`/ +/// `unique`) with its block-type index remapped into the merged type space. +/// +/// `wasm-encoder` has no `Instruction` for the `0xfc`-prefixed custom opcodes, +/// so the operator is written as raw bytes: the `0xfc` prefix, the `sub_opcode`, +/// then the canonical encoding of the remapped block type. The block-type remap +/// is the same fail-closed [`map_block_type`] used by `Block`/`Loop`/`If`, so a +/// function block type whose index the merge never interned (or a reference-typed +/// result) surfaces as a clean [`LinkError`] rather than a verbatim-copied stale +/// index. +fn emit_nondet_block( + function: &mut Function, + sub_opcode: u8, + blockty: inf_wasmparser::BlockType, + map: &IndexMap, +) -> Result<(), LinkError> { + let encoded = map_block_type(blockty, map)?; + let mut bytes = vec![NONDET_OPCODE_PREFIX, sub_opcode]; + encoded.encode(&mut bytes); + function.raw(bytes); + Ok(()) +} + +fn map_block_type( + blockty: inf_wasmparser::BlockType, + map: &IndexMap, +) -> Result { + Ok(match blockty { + inf_wasmparser::BlockType::Empty => wasm_encoder::BlockType::Empty, + // A value block type maps to a single result. A reference-typed result + // is an unsupported construct (surfaced by `map_val_type`), not a silent + // fallback to `Empty` — eliding a block's result would corrupt the body. + inf_wasmparser::BlockType::Type(ty) => { + wasm_encoder::BlockType::Result(map_val_type(ty)?) + } + inf_wasmparser::BlockType::FuncType(type_idx) => { + wasm_encoder::BlockType::FunctionType((map.ty)(type_idx)?) + } + }) +} + +/// Maps an `inf-wasmparser` value type to the `wasm-encoder` equivalent. 
+/// +/// Rejects floating-point value types: the Inference language has no `f32`/`f64` +/// types, so a float local or float block result cannot appear in a body the +/// merge models. The feature gate rejects a float-using external before its body +/// is re-encoded, but the main-module re-encode path bypasses that gate, so this +/// is the float backstop on the value-type axis (the operator-stream backstop is +/// [`crate::safety::is_float`]). `v128` is rejected for the same reason: the +/// language has no SIMD types and every SIMD operator is rejected, so the type +/// axis must stay consistent. Reference types are likewise unsupported; only the +/// integer value types map through. +fn map_val_type(ty: ValType) -> Result { + Ok(match ty { + ValType::I32 => wasm_encoder::ValType::I32, + ValType::I64 => wasm_encoder::ValType::I64, + ValType::F32 | ValType::F64 => { + return Err(LinkError::UnsupportedConstruct( + "floating-point value type (f32/f64) in merged function body: \ + the Inference language has no f32/f64 types" + .into(), + )); + } + ValType::V128 => { + return Err(LinkError::UnsupportedConstruct( + "v128 value type in merged function body: \ + the Inference language has no SIMD types" + .into(), + )); + } + ValType::Ref(_) => { + return Err(LinkError::UnsupportedConstruct( + "reference-typed value in merged function body".into(), + )); + } + }) +} + +#[cfg(test)] +mod tests { + //! Direct unit tests for the body re-encoder. + //! + //! `reencode_body` handles index-bearing operators that the *public* `link` + //! API never reaches — a body using `call_indirect`, `ref.func`, or a + //! function-typed block belongs to a module the tier classifier rejects + //! before any body is re-encoded. These tests drive the re-encoder directly + //! with synthetic index maps so those defensive arms are exercised and their + //! remapping verified. 
+ + use super::*; + use inf_wasmparser::{Parser, Payload}; + + /// Extracts the raw body bytes (locals vector + operator stream, no length + /// prefix) of the function at `func_idx` from a complete module's bytes. + fn body_bytes(module: &[u8], func_idx: usize) -> Vec { + let mut idx = 0; + for payload in Parser::new(0).parse_all(module) { + if let Payload::CodeSectionEntry(body) = payload.unwrap() { + if idx == func_idx { + return body.as_bytes().to_vec(); + } + idx += 1; + } + } + panic!("no body at index {func_idx}"); + } + + /// The operators of the function at `func_idx` of a re-encoded module. + fn operators(module: &[u8], func_idx: usize) -> Vec> { + let mut idx = 0; + for payload in Parser::new(0).parse_all(module) { + if let Payload::CodeSectionEntry(body) = payload.unwrap() { + if idx == func_idx { + return body + .get_operators_reader() + .unwrap() + .into_iter() + .map(|op| op.unwrap()) + .collect(); + } + idx += 1; + } + } + panic!("no body at index {func_idx}"); + } + + /// Wraps a re-encoded `Function` into a one-function module so it can be + /// parsed back and inspected. The single type is `() -> ()`; the test bodies + /// here are validated structurally (operator stream), not type-checked. + fn wrap(function: &Function) -> Vec { + let mut module = wasm_encoder::Module::new(); + let mut types = wasm_encoder::TypeSection::new(); + types.ty().function([], []); + module.section(&types); + let mut funcs = wasm_encoder::FunctionSection::new(); + funcs.function(0); + module.section(&funcs); + let mut code = wasm_encoder::CodeSection::new(); + code.function(function); + module.section(&code); + module.finish() + } + + /// An index map that adds 10 to every function index and 100 to every type + /// index, so a remap is unmistakable in the output. 
+ fn shifting_map() -> ( + impl Fn(u32) -> u32, + impl Fn(u32) -> Result, + ) { + (|f: u32| f + 10, |t: u32| Ok(t + 100)) + } + + #[test] + fn reencodes_call_indirect_type_index() { + // call_indirect's *type* index must be remapped; its table index stays. + let module = wat::parse_str( + r#" + (module + (type (;0;) (func)) + (table (;0;) 1 funcref) + (func (;0;) (type 0) + i32.const 0 + call_indirect (type 0)) + (export "f" (func 0))) + "#, + ) + .unwrap(); + let body = body_bytes(&module, 0); + + let (func, ty) = shifting_map(); + let map = IndexMap { func: &func, ty: &ty }; + let out = reencode_body(&body, &map, BodyOrigin::External).expect("re-encode call_indirect"); + let wrapped = wrap(&out); + + let has_remapped = operators(&wrapped, 0).into_iter().any(|op| { + matches!(op, Operator::CallIndirect { type_index, .. } if type_index == 100) + }); + assert!(has_remapped, "call_indirect type index must be remapped to 100"); + } + + #[test] + fn reencodes_ref_func_function_index() { + let module = wat::parse_str( + r#" + (module + (type (;0;) (func)) + (func (;0;) (type 0) + ref.func 0 + drop) + (export "f" (func 0))) + "#, + ) + .unwrap(); + let body = body_bytes(&module, 0); + + let (func, ty) = shifting_map(); + let map = IndexMap { func: &func, ty: &ty }; + let out = reencode_body(&body, &map, BodyOrigin::External).expect("re-encode ref.func"); + let wrapped = wrap(&out); + + let has_remapped = operators(&wrapped, 0) + .into_iter() + .any(|op| matches!(op, Operator::RefFunc { function_index } if function_index == 10)); + assert!(has_remapped, "ref.func function index must be remapped to 10"); + } + + #[test] + fn tail_call_indirect_is_rejected_not_reencoded() { + // `return_call_indirect` is a tail call: the Rocq translator has no + // lowering for it, and Inference codegen never emits it. The re-encoder + // must not have a dedicated arm that re-indexes and copies it; it falls + // through to the fail-closed allow-list and is rejected. 
This closes the + // main-module bypass that previously copied a tail call verbatim. + let module = wat::parse_str( + r#" + (module + (type (;0;) (func)) + (table (;0;) 1 funcref) + (func (;0;) (type 0) + i32.const 0 + return_call_indirect (type 0)) + (export "f" (func 0))) + "#, + ); + // return_call_indirect needs the tail-call feature in `wat`; skip if the + // fixture cannot be assembled in this build. + let Ok(module) = module else { return }; + let body = body_bytes(&module, 0); + + let (func, ty) = shifting_map(); + let map = IndexMap { func: &func, ty: &ty }; + let err = reencode_body(&body, &map, BodyOrigin::External) + .expect_err("return_call_indirect must be rejected"); + assert!( + matches!(err, LinkError::UnsupportedConstruct(_)), + "expected UnsupportedConstruct, got {err:?}" + ); + } + + #[test] + fn tail_call_is_rejected_not_reencoded() { + // `return_call` likewise has no re-encoder arm: it falls through to the + // allow-list and is rejected, on both the external and the main re-encode + // path. This is the direct-call counterpart to the bypass closure above. + let module = wat::parse_str( + r#" + (module + (type (;0;) (func (param i32) (result i32))) + (func (;0;) (type 0) (param i32) (result i32) + local.get 0 + return_call 1) + (func (;1;) (type 0) (param i32) (result i32) + local.get 0) + (export "f" (func 0))) + "#, + ); + let Ok(module) = module else { return }; + let body = body_bytes(&module, 0); + + let (func, ty) = shifting_map(); + let map = IndexMap { func: &func, ty: &ty }; + for origin in [BodyOrigin::External, BodyOrigin::Main] { + let err = reencode_body(&body, &map, origin).expect_err("return_call must be rejected"); + assert!( + matches!(err, LinkError::UnsupportedConstruct(_)), + "{origin:?}: expected UnsupportedConstruct, got {err:?}" + ); + } + } + + #[test] + fn reencodes_function_typed_block() { + // A block whose type is a function type (multi-value form) must have its + // type index remapped, not copied stale. 
+ let module = wat::parse_str( + r#" + (module + (type (;0;) (func)) + (type (;1;) (func (param i32) (result i32))) + (func (;0;) (type 0) + i32.const 7 + (block (type 1) (param i32) (result i32)) + drop) + (export "f" (func 0))) + "#, + ); + let Ok(module) = module else { return }; + let body = body_bytes(&module, 0); + + let (func, ty) = shifting_map(); + let map = IndexMap { func: &func, ty: &ty }; + let out = reencode_body(&body, &map, BodyOrigin::External) + .expect("re-encode function-typed block"); + let wrapped = wrap(&out); + + let has_remapped = operators(&wrapped, 0).into_iter().any(|op| { + matches!( + op, + Operator::Block { + blockty: inf_wasmparser::BlockType::FuncType(t) + } if t == 101 + ) + }); + assert!(has_remapped, "function-typed block index must be remapped to 101"); + } + + #[test] + fn preserves_empty_and_value_block_types() { + // The non-index block forms (empty + value result) must round-trip + // unchanged through the re-encoder. + let module = wat::parse_str( + r#" + (module + (type (;0;) (func)) + (func (;0;) (type 0) + (block) + (block (result i32) i32.const 1) drop) + (export "f" (func 0))) + "#, + ) + .unwrap(); + let body = body_bytes(&module, 0); + + let (func, ty) = shifting_map(); + let map = IndexMap { func: &func, ty: &ty }; + let out = reencode_body(&body, &map, BodyOrigin::External).expect("re-encode plain blocks"); + let wrapped = wrap(&out); + + let ops = operators(&wrapped, 0); + assert!( + ops.iter() + .any(|op| matches!(op, Operator::Block { blockty: inf_wasmparser::BlockType::Empty })), + "an empty block must round-trip" + ); + assert!( + ops.iter().any(|op| matches!( + op, + Operator::Block { blockty: inf_wasmparser::BlockType::Type(ValType::I32) } + )), + "an i32-result block must round-trip" + ); + } + + #[test] + fn reference_typed_local_is_unsupported() { + // A body declaring a `funcref` local cannot be re-encoded: the static + // merge models no reference types. `read_locals` must surface the error. 
+ let module = wat::parse_str( + r#" + (module + (type (;0;) (func)) + (func (;0;) (type 0) + (local funcref)) + (export "f" (func 0))) + "#, + ) + .unwrap(); + let body = body_bytes(&module, 0); + + let (func, ty) = shifting_map(); + let map = IndexMap { func: &func, ty: &ty }; + let err = reencode_body(&body, &map, BodyOrigin::External) + .expect_err("ref-typed local must be rejected"); + assert!( + matches!(err, LinkError::UnsupportedConstruct(_)), + "expected UnsupportedConstruct, got {err:?}" + ); + } + + #[test] + fn unmapped_block_type_index_surfaces_a_clean_error() { + // A function-typed block whose type index has no mapping must propagate + // the `ty` closure's error through re-encoding, not panic. This models + // the merge feeding a body whose block type was never interned. + let module = wat::parse_str( + r#" + (module + (type (;0;) (func)) + (type (;1;) (func (param i32) (result i32))) + (func (;0;) (type 0) + i32.const 7 + (block (type 1) (param i32) (result i32)) + drop) + (export "f" (func 0))) + "#, + ); + let Ok(module) = module else { return }; + let body = body_bytes(&module, 0); + + let func = |f: u32| f; + let ty = |idx: u32| { + Err::(LinkError::UnsupportedConstruct(format!( + "unmapped type {idx}" + ))) + }; + let map = IndexMap { func: &func, ty: &ty }; + let err = reencode_body(&body, &map, BodyOrigin::External) + .expect_err("unmapped block type must error"); + assert!( + matches!(err, LinkError::UnsupportedConstruct(_)), + "expected UnsupportedConstruct, got {err:?}" + ); + } + + #[test] + fn reencodes_supported_value_type_locals() { + // Locals of every *supported* value type (the integer types) must map onto + // the encoder equivalents, covering those arms of `map_val_type`. The float + // and `v128` locals are exercised separately below: they are rejected, + // since the Inference language has no `f32`/`f64` or SIMD types. 
+ let module = wat::parse_str( + r#" + (module + (type (;0;) (func)) + (func (;0;) (type 0) + (local i32 i64)) + (export "f" (func 0))) + "#, + ) + .unwrap(); + let body = body_bytes(&module, 0); + + let (func, ty) = shifting_map(); + let map = IndexMap { func: &func, ty: &ty }; + let out = reencode_body(&body, &map, BodyOrigin::External) + .expect("re-encode supported value-type locals"); + let wrapped = wrap(&out); + + let locals: Vec<_> = { + let mut idx = 0; + let mut found = Vec::new(); + for payload in Parser::new(0).parse_all(&wrapped) { + if let Payload::CodeSectionEntry(b) = payload.unwrap() { + if idx == 0 { + for e in b.get_locals_reader().unwrap() { + found.push(e.unwrap()); + } + } + idx += 1; + } + } + found + }; + let types: Vec = locals.iter().map(|(_, t)| *t).collect(); + assert_eq!( + types, + vec![ValType::I32, ValType::I64], + "every supported value-type local must survive re-encoding" + ); + } + + #[test] + fn v128_local_is_rejected() { + // A `v128` local cannot be re-encoded: the Inference language has no SIMD + // types and every SIMD operator is rejected, so the value-type chokepoint + // must reject the SIMD type too. This is the value-type backstop on the + // main-module path that bypasses the feature gate. + let module = wat::parse_str( + r#" + (module + (type (;0;) (func)) + (func (;0;) (type 0) + (local v128)) + (export "f" (func 0))) + "#, + ) + .unwrap(); + let body = body_bytes(&module, 0); + + let (func, ty) = shifting_map(); + let map = IndexMap { func: &func, ty: &ty }; + let err = reencode_body(&body, &map, BodyOrigin::External) + .expect_err("v128 local must be rejected"); + assert!( + matches!(&err, LinkError::UnsupportedConstruct(msg) if msg.contains("v128")), + "expected a v128 UnsupportedConstruct, got {err:?}" + ); + } + + #[test] + fn float_local_is_rejected() { + // An `f32` or `f64` local cannot be re-encoded: the Inference language has + // no `f32`/`f64` types, so the value-type chokepoint rejects it. 
/// Hand-encodes a single-function body whose only operator is one of the
/// Inference non-det blocks (`forall`/`exists`/`assume`/`unique`) carrying a
/// `FuncType(type_idx)` block type. The `wat` crate cannot assemble the
/// custom `0xfc`-prefixed opcodes, so the body is built byte-by-byte: an
/// empty locals vector, the non-det opcode with a single-byte positive `s33`
/// type index, the block-closing `end`, and the function-closing `end`.
///
/// # Panics
///
/// Panics if `type_idx` is `0x40` or larger, since such an index no longer
/// fits the single positive `s33` byte this helper writes.
fn nondet_block_body(sub_opcode: u8, type_idx: u8) -> Vec<u8> {
    assert!(type_idx < 0x40, "type index must be a single positive s33 byte");
    vec![
        0x00, // zero locals
        0xfc, // prefix byte of the non-det opcodes
        sub_opcode,
        type_idx, // s33-encoded function block-type index
        0x0b, // end (closes the non-det block)
        0x0b, // end (closes the function)
    ]
}
+ for &(sub_opcode, name) in UZUMAKI_OPS { + let body = uzumaki_body(sub_opcode); + let (func, ty) = shifting_map(); + let map = IndexMap { func: &func, ty: &ty }; + let err = reencode_body(&body, &map, BodyOrigin::External) + .err() + .unwrap_or_else(|| panic!("external {name} must be rejected")); + assert!( + matches!(err, LinkError::UnsupportedConstruct(_)), + "{name}: expected UnsupportedConstruct, got {err:?}" + ); + } + } + + #[test] + fn main_nondet_block_passes_through_as_proof_scaffolding() { + // The main module in proof mode legitimately carries non-det blocks as + // Rocq scaffolding. They must pass through the main re-encode path: the + // empty (codegen) form round-trips unchanged, and a function-typed form + // has only its block-type index remapped — never rejected. + for &(sub_opcode, name) in NONDET_OPS { + // Empty form round-trips unchanged. + let empty_body = vec![0x00, 0xfc, sub_opcode, 0x40, 0x0b, 0x0b]; + let (func, ty) = shifting_map(); + let map = IndexMap { func: &func, ty: &ty }; + let out = reencode_body(&empty_body, &map, BodyOrigin::Main) + .unwrap_or_else(|e| panic!("main empty {name} block: {e:?}")); + let wrapped = wrap(&out); + let empty = operators(&wrapped, 0).into_iter().any(|op| { + let blockty = match op { + Operator::Forall { blockty } + | Operator::Exists { blockty } + | Operator::Assume { blockty } + | Operator::Unique { blockty } => Some(blockty), + _ => None, + }; + matches!(blockty, Some(inf_wasmparser::BlockType::Empty)) + }); + assert!(empty, "main empty {name} block must round-trip unchanged"); + + // Function-typed form has its block-type index remapped (+100). 
+ let functype_body = nondet_block_body(sub_opcode, 1); + let (func, ty) = shifting_map(); + let map = IndexMap { func: &func, ty: &ty }; + let out = reencode_body(&functype_body, &map, BodyOrigin::Main) + .unwrap_or_else(|e| panic!("main function-typed {name} block: {e:?}")); + let wrapped = wrap(&out); + let remapped = operators(&wrapped, 0).into_iter().any(|op| { + let blockty = match op { + Operator::Forall { blockty } + | Operator::Exists { blockty } + | Operator::Assume { blockty } + | Operator::Unique { blockty } => Some(blockty), + _ => None, + }; + matches!(blockty, Some(inf_wasmparser::BlockType::FuncType(t)) if t == 101) + }); + assert!( + remapped, + "main function-typed {name} block index must remap to 101" + ); + } + } + + #[test] + fn main_uzumaki_passes_through_verbatim() { + // The main module's uzumaki rvalues are proof scaffolding: they must be + // copied through the main re-encode path verbatim, never rejected. + for &(sub_opcode, name) in UZUMAKI_OPS { + let body = uzumaki_body(sub_opcode); + let (func, ty) = shifting_map(); + let map = IndexMap { func: &func, ty: &ty }; + let out = reencode_body(&body, &map, BodyOrigin::Main) + .unwrap_or_else(|e| panic!("main {name}: {e:?}")); + let wrapped = wrap(&out); + let survives = operators(&wrapped, 0) + .into_iter() + .any(|op| matches!(op, Operator::I32Uzumaki { .. } | Operator::I64Uzumaki { .. })); + assert!(survives, "main {name} must survive re-encoding verbatim"); + } + } +} + diff --git a/core/wasm-linker/src/safety.rs b/core/wasm-linker/src/safety.rs new file mode 100644 index 00000000..d3906557 --- /dev/null +++ b/core/wasm-linker/src/safety.rs @@ -0,0 +1,934 @@ +//! Fail-closed operator allow-list for the static merge. +//! +//! The merge copies external function bodies verbatim (re-indexing only the +//! handful of index-bearing operators). That is sound only for the small, +//! well-understood subset of WebAssembly the merge actually models: the integer +//! 
MVP instruction set plus the bulk-memory `memory.copy`/`memory.fill` forms +//! over the single shared memory. +//! +//! ## No floating point +//! +//! The Inference language has no `f32`/`f64` types, and the Rocq translator +//! (`wasm-to-v`) models no float instruction. The feature gate +//! ([`crate::SUPPORTED_WASM_FEATURES`]) already rejects an external carrying any +//! float type or instruction before its body reaches this allow-list, but the +//! main-module re-encode path does not pass through that gate, so this allow-list +//! is also the float backstop: every float operator (loads/stores, constants, +//! arithmetic, comparisons, conversions, reinterprets) is rejected here with a +//! "floating-point" diagnostic. The merge thus never copies a float instruction, +//! from either module role. +//! +//! Every *other* operator family — atomics, SIMD, exception handling, typed +//! function references, GC, stack switching, tail calls, sign-extension, +//! saturating float-to-int, and segment-indexed table initialization — carries +//! semantics the merge or the Rocq translator cannot satisfy: a shared/atomic +//! memory it does not reconcile, a tag section it drops, a type index it never +//! interns, a reference type it cannot encode, or a conversion the translator +//! has no lowering for. Copying such an operator verbatim produces a +//! structurally-invalid module, an untranslatable proof artifact, or a silent +//! miscompile. +//! +//! ## Verification-only constructs are not executable +//! +//! The Inference non-deterministic blocks (`forall`/`exists`/`assume`/`unique`) +//! and the uzumaki rvalues (`i32.uzumaki`/`i64.uzumaki`) are **proof-only**: +//! they have meaning solely in the Rocq lowering and no executable runtime +//! semantics. A function that gets *merged* into the output is, by construction, +//! part of an executable binary — so a verification-only opcode inside such a +//! body would make the output non-executable (a miscompile). 
This allow-list +//! therefore **rejects** every non-det/uzumaki opcode: an external whose +//! merged-closure body carries one is surfaced as +//! [`LinkError::UnsupportedConstruct`] rather than copied verbatim. (The main +//! module in proof mode legitimately carries these opcodes as proof scaffolding; +//! it is rebuilt through a separate verbatim path that never consults this +//! allow-list — see [`crate::rewrite`].) +//! +//! This module is the single source of truth for what may cross the merge. It +//! is **fail-closed**: an operator is accepted only if it is explicitly on the +//! safe list, so a future opcode family added to the parser cannot fall through +//! a wildcard arm and be copied silently. Both the closure effect scanner +//! ([`crate::closure`]) and the body re-encoder ([`crate::rewrite`]) gate on +//! [`check_operator`], so an unmergeable construct is rejected the first time +//! it is seen, before any output index is committed. + +use inf_wasmparser::{MemArg, Operator}; + +use crate::LinkError; + +/// Maximum structured-control-flow nesting depth a mergeable external body may +/// reach. +/// +/// The merge copies bodies verbatim, but the downstream wasm-to-v translator +/// builds and renders an expression tree by self-recursion (one frame per +/// nesting level). A body of thousands of nested blocks overflows the +/// translator's stack — an unrecoverable `abort()` on the `-v` proof path. +/// Rejecting an over-nested body here, during the closure scan that backs the +/// `link`/`-o` path, turns that DoS into a clean [`LinkError`] *before* the +/// body is committed to the merged module, so neither the `-o` nor the `-v` +/// path can reach the translator with a body it cannot render. The bound +/// matches the translator's own cap so the two passes agree on what is +/// admissible, and sits far above any nesting a real Inference function emits. 
+pub(crate) const MAX_CONTROL_DEPTH: usize = 256; + +/// Whether `op` opens a structured-control-flow region (a matching `End` +/// closes it). Used to bound nesting depth during the closure scan. +pub(crate) fn opens_control_frame(op: &Operator) -> bool { + use Operator::*; + matches!( + op, + Block { .. } + | Loop { .. } + | If { .. } + | Forall { .. } + | Exists { .. } + | Assume { .. } + | Unique { .. } + ) +} + +/// Whether `op` is a verification-only construct: an Inference +/// non-deterministic block (`forall`/`exists`/`assume`/`unique`) or an uzumaki +/// rvalue (`i32.uzumaki`/`i64.uzumaki`). +/// +/// These opcodes have meaning only in the Rocq lowering and no executable +/// runtime semantics, so they must never appear inside an executable function +/// the merge copies into the output. [`check_operator`] rejects them on the +/// strength of this predicate; the main-module verbatim re-encode path +/// (`crate::rewrite`) uses it to *recognise and pass through* the same opcodes, +/// which are legitimate proof scaffolding there. +pub(crate) fn is_verification_only(op: &Operator) -> bool { + use Operator::*; + matches!( + op, + Forall { .. } + | Exists { .. } + | Assume { .. } + | Unique { .. } + | I32Uzumaki { .. } + | I64Uzumaki { .. } + ) +} + +/// What an operator touches, for tier classification. Computed as a side effect +/// of the safety check so the closure scanner and the allow-list never disagree +/// about an operator's category. +#[derive(Debug, Default, Clone, Copy)] +pub(crate) struct OpEffect { + /// The operator accesses linear memory by address (load/store/size/grow/ + /// copy/fill). Drives Tier-A vs Tier-B classification. + pub uses_memory: bool, + /// The operator grows linear memory (`memory.grow`). Recorded separately + /// from `uses_memory` so the merge can reconcile (or reject) growth against + /// the reconciled output memory's maximum. + pub uses_memory_grow: bool, + /// The operator reads or writes a global. 
+ pub uses_globals: bool, + /// The operator refers to a data segment (`memory.init` / `data.drop`). + pub uses_data_segments: bool, + /// The operator touches the table / element space (`call_indirect`, + /// `table.*`, `ref.func`, `elem.drop`, `memory.init` element forms). + pub uses_tables: bool, +} + +/// Verifies that `op` is one the static merge can soundly copy, returning the +/// effects it carries for tier classification. +/// +/// # Contract: every allow-listed operator must be translatable +/// +/// An operator admitted here can end up in the linker's output, which the +/// downstream `wasm-to-v` translator must lower to Rocq without panicking. The +/// two instruction sets are kept in lockstep by the integration test +/// `tests/v_alignment.rs`: it links a fixture exercising each allow-listed family +/// and asserts the linked output translates. Any operator family newly added to +/// this allow-list (or to the feature gate in [`crate::SUPPORTED_WASM_FEATURES`]) +/// therefore requires a corresponding corpus entry in `tests/v_alignment.rs`, +/// confirming the translator has a lowering for it. Admitting a family the +/// translator hits `todo!()` on yields a clean link followed by an unrecoverable +/// abort on the `-v` proof path. +/// +/// # Errors +/// +/// Returns [`LinkError::UnsupportedConstruct`] for any operator outside the +/// proven-safe set: any floating-point instruction (the Inference language has +/// no `f32`/`f64` types, see [`is_float`]), atomics, SIMD, exception handling, +/// typed references, GC, stack switching, tail calls, sign-extension, saturating +/// float-to-int, segment-indexed table initialization, multi-memory access (a +/// non-zero memarg memory index), the verification-only non-det/uzumaki opcodes +/// (which have no executable semantics, see [`is_verification_only`]), and any +/// other operator family the merge does not model. 
+pub(crate) fn check_operator(op: &Operator) -> Result { + use Operator::*; + + // Verification-only constructs (non-det blocks and uzumaki) carry no + // executable semantics: they exist solely for the Rocq lowering. A merged + // function is part of an executable binary, so copying one of these opcodes + // into it would yield a non-executable output (a miscompile). Reject before + // any effect classification, so neither the closure scan nor the re-encoder + // can admit one. The main module's proof scaffolding is rebuilt through a + // separate verbatim path that never reaches this allow-list. + if is_verification_only(op) { + return Err(LinkError::UnsupportedConstruct(format!( + "verification-only construct {} has no executable semantics and cannot be merged into an executable binary", + verification_only_family(op) + ))); + } + + // Reject every floating-point instruction. The Inference language has no + // `f32`/`f64` types, so its codegen never emits one, and the Rocq translator + // models no float operator: a merged float instruction would be either an + // untranslatable proof artifact or a miscompile. The feature gate already + // rejects a float-using external before its body reaches here, but the + // main-module re-encode path bypasses that gate, so this is the float + // backstop on the executable merge path. Reject right after the + // verification-only check, before any effect classification. + if is_float(op) { + return Err(LinkError::UnsupportedConstruct(format!( + "floating-point instruction `{}` is not supported by the static merge: the Inference language has no f32/f64 types", + float_mnemonic(op) + ))); + } + + // Reject any memory access that names a memory other than the single shared + // memory 0. This closes the multi-memory miscompile (H14) uniformly for + // every memarg-bearing operator, including ones added to the parser later. 
+ let reject_nonzero_memory = |memarg: &MemArg| -> Result<(), LinkError> { + if memarg.memory != 0 { + return Err(LinkError::UnsupportedConstruct(format!( + "memory access targets memory {} (multi-memory is not supported by the static merge)", + memarg.memory + ))); + } + Ok(()) + }; + + let effect = match op { + // -- Structured control flow (block types handled by the re-encoder) -- + Unreachable | Nop | Block { .. } | Loop { .. } | If { .. } | Else | End | Br { .. } + | BrIf { .. } | BrTable { .. } | Return => OpEffect::default(), + + // The Inference non-deterministic block extensions + // (`forall`/`exists`/`assume`/`unique`) are verification-only and are + // rejected above by `is_verification_only`; they never reach this match. + + // -- Direct calls (function index re-encoded). The tail-call form + // (`return_call`) is rejected as an unmodeled family below: the Rocq + // translator has no lowering for it, and Inference codegen never emits + // it. -- + Call { .. } => OpEffect::default(), + + // -- Indirect calls touch the table/type space. The tail-call form + // (`return_call_indirect`) is rejected as an unmodeled family below + // for the same reason as `return_call`. -- + CallIndirect { .. } => OpEffect { + uses_tables: true, + ..OpEffect::default() + }, + + // -- Parametric -- + Drop | Select => OpEffect::default(), + + // -- Locals -- + LocalGet { .. } | LocalSet { .. } | LocalTee { .. } => OpEffect::default(), + + // -- Globals -- + GlobalGet { .. } | GlobalSet { .. } => OpEffect { + uses_globals: true, + ..OpEffect::default() + }, + + // -- Integer memory load/store over the single shared memory. + // The float forms (`f32.load`/`f64.store`/…) are rejected above by + // `is_float`; they never reach this match. 
-- + I32Load { memarg } | I64Load { memarg } + | I32Load8S { memarg } | I32Load8U { memarg } | I32Load16S { memarg } + | I32Load16U { memarg } | I64Load8S { memarg } | I64Load8U { memarg } + | I64Load16S { memarg } | I64Load16U { memarg } | I64Load32S { memarg } + | I64Load32U { memarg } | I32Store { memarg } | I64Store { memarg } + | I32Store8 { memarg } + | I32Store16 { memarg } | I64Store8 { memarg } | I64Store16 { memarg } + | I64Store32 { memarg } => { + reject_nonzero_memory(memarg)?; + OpEffect { + uses_memory: true, + ..OpEffect::default() + } + } + MemorySize { mem } => { + if *mem != 0 { + return Err(LinkError::UnsupportedConstruct(format!( + "memory access targets memory {mem} (multi-memory is not supported by the static merge)" + ))); + } + OpEffect { + uses_memory: true, + ..OpEffect::default() + } + } + MemoryGrow { mem } => { + if *mem != 0 { + return Err(LinkError::UnsupportedConstruct(format!( + "memory access targets memory {mem} (multi-memory is not supported by the static merge)" + ))); + } + OpEffect { + uses_memory: true, + uses_memory_grow: true, + ..OpEffect::default() + } + } + + // -- Bulk memory over the single shared memory -- + MemoryFill { mem } => { + if *mem != 0 { + return Err(LinkError::UnsupportedConstruct(format!( + "memory.fill targets memory {mem} (multi-memory is not supported by the static merge)" + ))); + } + OpEffect { + uses_memory: true, + ..OpEffect::default() + } + } + MemoryCopy { dst_mem, src_mem } => { + if *dst_mem != 0 || *src_mem != 0 { + return Err(LinkError::UnsupportedConstruct(format!( + "memory.copy crosses memories {src_mem} -> {dst_mem} (multi-memory is not supported by the static merge)" + ))); + } + OpEffect { + uses_memory: true, + ..OpEffect::default() + } + } + // Segment-indexed bulk-memory forms carry their own static data / + // elements, which the merge cannot relocate: surface them as Tier-C + // effects (data / table use) rather than copy them. + MemoryInit { mem, .. 
} => { + if *mem != 0 { + return Err(LinkError::UnsupportedConstruct(format!( + "memory.init targets memory {mem} (multi-memory is not supported by the static merge)" + ))); + } + OpEffect { + uses_memory: true, + uses_data_segments: true, + ..OpEffect::default() + } + } + DataDrop { .. } => OpEffect { + uses_data_segments: true, + ..OpEffect::default() + }, + // The segment-indexed table forms (`table.init`/`elem.drop`/`table.copy`) + // carry their own element segments the merge cannot relocate, and the + // Rocq translator has no lowering for them; they are rejected as an + // unmodeled family below. The non-segment table accessors are modeled. + TableGet { .. } | TableSet { .. } | TableGrow { .. } | TableSize { .. } + | TableFill { .. } => OpEffect { + uses_tables: true, + ..OpEffect::default() + }, + RefFunc { .. } => OpEffect { + uses_tables: true, + ..OpEffect::default() + }, + + // -- Integer constants. The float constants (`f32.const`/`f64.const`) + // are rejected above by `is_float`; they never reach this match. -- + I32Const { .. } | I64Const { .. } => OpEffect::default(), + + // The Inference uzumaki rvalues (`i32.uzumaki`/`i64.uzumaki`) are + // verification-only and are rejected above by `is_verification_only`; + // they never reach this match. + + // -- Numeric (comparisons, arithmetic, conversions) -- + _ if is_numeric(op) => OpEffect::default(), + + // -- Everything else is fail-closed -- + other => { + return Err(LinkError::UnsupportedConstruct(format!( + "operator {} is not supported by the static merge", + operator_family(other) + ))); + } + }; + + Ok(effect) +} + +/// Whether `op` is a pure integer numeric operator: an integer comparison, +/// arithmetic, bitwise, or width conversion instruction. These carry no index +/// and no effect, so they are always safe to copy verbatim. +/// +/// Float numeric operators are deliberately excluded — they are rejected up +/// front by [`is_float`] (the Inference language has no `f32`/`f64` types). 
So +/// are sign-extension (`i32.extend8_s`, …) and saturating float-to-int +/// (`i32.trunc_sat_f32_s`, …): the Rocq translator has no lowering for either, +/// and Inference codegen emits neither, so they reject as unmodeled families +/// rather than copy into a body the `-v` proof path cannot render. Only the +/// three integer width conversions (`i32.wrap_i64`, `i64.extend_i32_s/u`) remain +/// from the conversion block. +fn is_numeric(op: &Operator) -> bool { + use Operator::*; + matches!( + op, + // i32 comparisons + I32Eqz | I32Eq | I32Ne | I32LtS | I32LtU | I32GtS | I32GtU | I32LeS | I32LeU | I32GeS + | I32GeU + // i64 comparisons + | I64Eqz | I64Eq | I64Ne | I64LtS | I64LtU | I64GtS | I64GtU | I64LeS | I64LeU + | I64GeS | I64GeU + // i32 arithmetic / bitwise + | I32Clz | I32Ctz | I32Popcnt | I32Add | I32Sub | I32Mul | I32DivS | I32DivU + | I32RemS | I32RemU | I32And | I32Or | I32Xor | I32Shl | I32ShrS | I32ShrU | I32Rotl + | I32Rotr + // i64 arithmetic / bitwise + | I64Clz | I64Ctz | I64Popcnt | I64Add | I64Sub | I64Mul | I64DivS | I64DivU + | I64RemS | I64RemU | I64And | I64Or | I64Xor | I64Shl | I64ShrS | I64ShrU | I64Rotl + | I64Rotr + // integer width conversions + | I32WrapI64 | I64ExtendI32S | I64ExtendI32U + ) +} + +/// Whether `op` is a floating-point instruction: a float comparison, +/// arithmetic, conversion, reinterpret, load/store, or constant. +/// +/// The Inference language has no `f32`/`f64` types, so its codegen never emits +/// one, and the Rocq translator models none of them. [`check_operator`] rejects +/// every such operator with a "floating-point" diagnostic, the executable-merge +/// backstop to the feature gate (which rejects a float-using external before its +/// body reaches the allow-list, but which the main-module re-encode path does +/// not traverse). 
+fn is_float(op: &Operator) -> bool { + use Operator::*; + matches!( + op, + // float comparisons + F32Eq | F32Ne | F32Lt | F32Gt | F32Le | F32Ge + | F64Eq | F64Ne | F64Lt | F64Gt | F64Le | F64Ge + // f32 arithmetic + | F32Abs | F32Neg | F32Ceil | F32Floor | F32Trunc | F32Nearest | F32Sqrt | F32Add + | F32Sub | F32Mul | F32Div | F32Min | F32Max | F32Copysign + // f64 arithmetic + | F64Abs | F64Neg | F64Ceil | F64Floor | F64Trunc | F64Nearest | F64Sqrt | F64Add + | F64Sub | F64Mul | F64Div | F64Min | F64Max | F64Copysign + // float-involving conversions + | I32TruncF32S | I32TruncF32U | I32TruncF64S | I32TruncF64U + | I64TruncF32S | I64TruncF32U | I64TruncF64S | I64TruncF64U + | F32ConvertI32S | F32ConvertI32U | F32ConvertI64S | F32ConvertI64U + | F32DemoteF64 | F64ConvertI32S | F64ConvertI32U | F64ConvertI64S | F64ConvertI64U + | F64PromoteF32 + // reinterprets between float and integer + | I32ReinterpretF32 | I64ReinterpretF64 | F32ReinterpretI32 | F64ReinterpretI64 + // saturating float-to-int conversions + | I32TruncSatF32S | I32TruncSatF32U | I32TruncSatF64S | I32TruncSatF64U + | I64TruncSatF32S | I64TruncSatF32U | I64TruncSatF64S | I64TruncSatF64U + // float loads / stores + | F32Load { .. } | F64Load { .. } | F32Store { .. } | F64Store { .. } + // float constants + | F32Const { .. } | F64Const { .. } + ) +} + +/// A human-readable mnemonic for a floating-point operator, for the rejection +/// diagnostic. Only the float opcodes [`is_float`] recognises reach this +/// function. 
+fn float_mnemonic(op: &Operator) -> &'static str { + use Operator::*; + match op { + F32Eq => "f32.eq", + F32Ne => "f32.ne", + F32Lt => "f32.lt", + F32Gt => "f32.gt", + F32Le => "f32.le", + F32Ge => "f32.ge", + F64Eq => "f64.eq", + F64Ne => "f64.ne", + F64Lt => "f64.lt", + F64Gt => "f64.gt", + F64Le => "f64.le", + F64Ge => "f64.ge", + F32Abs => "f32.abs", + F32Neg => "f32.neg", + F32Ceil => "f32.ceil", + F32Floor => "f32.floor", + F32Trunc => "f32.trunc", + F32Nearest => "f32.nearest", + F32Sqrt => "f32.sqrt", + F32Add => "f32.add", + F32Sub => "f32.sub", + F32Mul => "f32.mul", + F32Div => "f32.div", + F32Min => "f32.min", + F32Max => "f32.max", + F32Copysign => "f32.copysign", + F64Abs => "f64.abs", + F64Neg => "f64.neg", + F64Ceil => "f64.ceil", + F64Floor => "f64.floor", + F64Trunc => "f64.trunc", + F64Nearest => "f64.nearest", + F64Sqrt => "f64.sqrt", + F64Add => "f64.add", + F64Sub => "f64.sub", + F64Mul => "f64.mul", + F64Div => "f64.div", + F64Min => "f64.min", + F64Max => "f64.max", + F64Copysign => "f64.copysign", + I32TruncF32S => "i32.trunc_f32_s", + I32TruncF32U => "i32.trunc_f32_u", + I32TruncF64S => "i32.trunc_f64_s", + I32TruncF64U => "i32.trunc_f64_u", + I64TruncF32S => "i64.trunc_f32_s", + I64TruncF32U => "i64.trunc_f32_u", + I64TruncF64S => "i64.trunc_f64_s", + I64TruncF64U => "i64.trunc_f64_u", + F32ConvertI32S => "f32.convert_i32_s", + F32ConvertI32U => "f32.convert_i32_u", + F32ConvertI64S => "f32.convert_i64_s", + F32ConvertI64U => "f32.convert_i64_u", + F32DemoteF64 => "f32.demote_f64", + F64ConvertI32S => "f64.convert_i32_s", + F64ConvertI32U => "f64.convert_i32_u", + F64ConvertI64S => "f64.convert_i64_s", + F64ConvertI64U => "f64.convert_i64_u", + F64PromoteF32 => "f64.promote_f32", + I32ReinterpretF32 => "i32.reinterpret_f32", + I64ReinterpretF64 => "i64.reinterpret_f64", + F32ReinterpretI32 => "f32.reinterpret_i32", + F64ReinterpretI64 => "f64.reinterpret_i64", + I32TruncSatF32S => "i32.trunc_sat_f32_s", + I32TruncSatF32U => 
"i32.trunc_sat_f32_u", + I32TruncSatF64S => "i32.trunc_sat_f64_s", + I32TruncSatF64U => "i32.trunc_sat_f64_u", + I64TruncSatF32S => "i64.trunc_sat_f32_s", + I64TruncSatF32U => "i64.trunc_sat_f32_u", + I64TruncSatF64S => "i64.trunc_sat_f64_s", + I64TruncSatF64U => "i64.trunc_sat_f64_u", + F32Load { .. } => "f32.load", + F64Load { .. } => "f64.load", + F32Store { .. } => "f32.store", + F64Store { .. } => "f64.store", + F32Const { .. } => "f32.const", + F64Const { .. } => "f64.const", + _ => "a floating-point instruction", + } +} + +/// A human-readable label for a verification-only operator, for the rejection +/// diagnostic. Only the non-det/uzumaki opcodes [`is_verification_only`] +/// recognises reach this function. +fn verification_only_family(op: &Operator) -> &'static str { + use Operator::*; + match op { + Forall { .. } => "non-deterministic block `forall`", + Exists { .. } => "non-deterministic block `exists`", + Assume { .. } => "non-deterministic block `assume`", + Unique { .. } => "non-deterministic block `unique`", + I32Uzumaki { .. } => "uzumaki rvalue `i32.uzumaki`", + I64Uzumaki { .. } => "uzumaki rvalue `i64.uzumaki`", + _ => "a verification-only construct", + } +} + +/// A human-readable family label for an unsupported operator, for diagnostics. +/// Keeps the error message stable and meaningful without printing the full +/// (often large) operator debug form. +fn operator_family(op: &Operator) -> &'static str { + use Operator::*; + match op { + // Tail calls. The Rocq translator has no lowering for them, and + // Inference codegen never emits them (the sret-forwarding path lowers to + // a plain `call`); an external using either is the only source. + ReturnCall { .. } => "tail calls (return_call)", + ReturnCallIndirect { .. } => "tail calls (return_call_indirect)", + // Sign-extension proposal. Not modeled by the Rocq translator; Inference + // codegen narrows sub-i32 values with shifts/masks instead. 
+ I32Extend8S | I32Extend16S | I64Extend8S | I64Extend16S | I64Extend32S => { + "sign-extension (not supported by the Rocq translator)" + } + // Segment-indexed table initialization. Carries element segments the + // merge cannot relocate, and the Rocq translator has no lowering for it. + TableInit { .. } | ElemDrop { .. } | TableCopy { .. } => { + "segment-indexed table initialization (table.init / elem.drop / table.copy)" + } + // Exception handling (and a defined tag section it implies). + TryTable { .. } | Throw { .. } | ThrowRef => "exception handling (throw / try_table)", + Try { .. } | Catch { .. } | Rethrow { .. } | Delegate { .. } | CatchAll => { + "legacy exception handling" + } + // Typed function references. + CallRef { .. } | ReturnCallRef { .. } | RefAsNonNull | BrOnNull { .. } + | BrOnNonNull { .. } => "typed function references (call_ref / ref.as_non_null)", + RefNull { .. } | RefIsNull | TypedSelect { .. } => "reference types (ref.null / select t)", + // Atomics (0xFE threads family). + AtomicFence | MemoryAtomicNotify { .. } | MemoryAtomicWait32 { .. } + | MemoryAtomicWait64 { .. } => "atomic memory operations", + // SIMD (0xFD family). V128Const carries no memarg but is still SIMD. + V128Const { .. } => "SIMD (v128)", + _ => "an unmodeled WASM construct", + } +} + +#[cfg(test)] +mod tests { + //! Direct unit tests for the fail-closed operator allow-list. + //! + //! Each test assembles a one-function module whose body contains the + //! operator under test, extracts that operator from the code section, and + //! checks it against [`check_operator`]. The proven-safe operators must + //! return their expected effect; every unmodeled family must reject with + //! [`LinkError::UnsupportedConstruct`]. + + use super::*; + use inf_wasmparser::{BinaryReader, FunctionBody, Parser, Payload}; + + /// Returns the operators of the first function body in a WAT module. 
+ fn ops(wat: &str) -> Vec> { + let bytes = wat::parse_str(wat).expect("valid WAT"); + // Leak the bytes so the borrowed operators can outlive this helper; the + // test process is short-lived and this keeps the call sites terse. + let bytes: &'static [u8] = Box::leak(bytes.into_boxed_slice()); + for payload in Parser::new(0).parse_all(bytes) { + if let Payload::CodeSectionEntry(body) = payload.expect("payload") { + let func_body = FunctionBody::new(BinaryReader::new(body.as_bytes(), 0)); + return func_body + .get_operators_reader() + .expect("operators") + .into_iter() + .map(|op| op.expect("operator")) + .collect(); + } + } + panic!("no code section"); + } + + /// Whether any operator of the body is rejected by the allow-list. + fn body_is_rejected(wat: &str) -> bool { + ops(wat).iter().any(|op| check_operator(op).is_err()) + } + + #[test] + fn mvp_arithmetic_is_accepted_with_no_effect() { + for op in ops( + r#"(module (func (param i32 i32) (result i32) + local.get 0 local.get 1 i32.add) (export "f" (func 0)))"#, + ) { + let effect = check_operator(&op).expect("mvp op accepted"); + assert!(!effect.uses_memory && !effect.uses_globals && !effect.uses_tables); + } + } + + #[test] + fn memory_load_marks_memory_use() { + let any_memory = ops( + r#"(module (memory 1) (func (param i32) (result i32) + local.get 0 i32.load) (export "f" (func 0)))"#, + ) + .iter() + .any(|op| check_operator(op).is_ok_and(|e| e.uses_memory)); + assert!(any_memory, "i32.load must mark memory use"); + } + + #[test] + fn global_access_marks_global_use() { + let any_global = ops( + r#"(module (global i32 (i32.const 0)) (func (result i32) + global.get 0) (export "f" (func 0)))"#, + ) + .iter() + .any(|op| check_operator(op).is_ok_and(|e| e.uses_globals)); + assert!(any_global, "global.get must mark global use"); + } + + #[test] + fn nonzero_memarg_memory_index_is_rejected() { + // A store naming memory 1 must reject even though `i32.store` over + // memory 0 is accepted, closing the multi-memory 
hole uniformly. + assert!(body_is_rejected( + r#"(module (memory 1) (memory 1) (func (param i32 i32) + local.get 0 local.get 1 i32.store 1) (export "f" (func 0)))"#, + )); + } + + #[test] + fn atomic_op_is_rejected() { + assert!(body_is_rejected( + r#"(module (memory 1 1 shared) (func (param i32 i32) (result i32) + local.get 0 local.get 1 i32.atomic.rmw.add) (export "f" (func 0)))"#, + )); + } + + #[test] + fn simd_op_is_rejected() { + assert!(body_is_rejected( + r#"(module (memory 1) (func (param i32) (result i32) + local.get 0 v128.load drop i32.const 0) (export "f" (func 0)))"#, + )); + } + + #[test] + fn exception_handling_is_rejected() { + assert!(body_is_rejected( + r#"(module (type (func)) (tag (type 0)) (func (param i32) (result i32) + throw 0) (export "f" (func 0)))"#, + )); + } + + #[test] + fn typed_reference_is_rejected() { + assert!(body_is_rejected( + r#"(module (func (param i32) (result i32) + ref.null func drop local.get 0) (export "f" (func 0)))"#, + )); + } + + #[test] + fn indirect_call_marks_table_use() { + let any_table = ops( + r#"(module (type (func)) (table 1 funcref) (func + i32.const 0 call_indirect (type 0)) (export "f" (func 0)))"#, + ) + .iter() + .any(|op| check_operator(op).is_ok_and(|e| e.uses_tables)); + assert!(any_table, "call_indirect must mark table use"); + } + + /// Wraps a raw code-section body (locals vector + operator stream, no length + /// prefix) into a one-function module and returns its operators. `wat` + /// cannot assemble the custom `0xfc`-prefixed Inference opcodes, so the + /// bodies that exercise them are built byte-by-byte. 
+ fn body_ops(body: &[u8]) -> Vec> { + use wasm_encoder::{CodeSection, Function, Module, TypeSection}; + let mut module = Module::new(); + let mut types = TypeSection::new(); + types.ty().function([], []); + module.section(&types); + let mut funcs = wasm_encoder::FunctionSection::new(); + funcs.function(0); + module.section(&funcs); + let mut code = CodeSection::new(); + let mut f = Function::new([]); + f.raw(body.iter().copied()); + code.function(&f); + module.section(&code); + let bytes: &'static [u8] = Box::leak(module.finish().into_boxed_slice()); + for payload in Parser::new(0).parse_all(bytes) { + if let Payload::CodeSectionEntry(fb) = payload.expect("payload") { + let func_body = FunctionBody::new(BinaryReader::new(fb.as_bytes(), 0)); + return func_body + .get_operators_reader() + .expect("operators") + .into_iter() + .map(|op| op.expect("operator")) + .collect(); + } + } + panic!("no code section"); + } + + #[test] + fn nondet_blocks_are_verification_only_and_rejected() { + // H-2 (corrected): each non-det block is verification-only and has no + // executable semantics, so the merge allow-list must reject it. + for sub_opcode in [0x3a, 0x3b, 0x3c, 0x3d] { + // ` (empty) end; end` over a one-byte locals vector. + let body = [0x00, 0xfc, sub_opcode, 0x40, 0x0b, 0x0b]; + for op in body_ops(&body) { + if matches!( + op, + Operator::Forall { .. } + | Operator::Exists { .. } + | Operator::Assume { .. } + | Operator::Unique { .. } + ) { + assert!(is_verification_only(&op), "non-det op must be classified verification-only"); + let err = check_operator(&op).expect_err("non-det op must be rejected"); + assert!( + matches!(err, LinkError::UnsupportedConstruct(_)), + "expected UnsupportedConstruct, got {err:?}" + ); + } + } + } + } + + #[test] + fn uzumaki_rvalues_are_verification_only_and_rejected() { + // H-2 (corrected): each uzumaki rvalue is verification-only and has no + // executable semantics, so the merge allow-list must reject it. 
+ for sub_opcode in [0x31, 0x32] { + // ` drop; end` over a one-byte locals vector. + let body = [0x00, 0xfc, sub_opcode, 0x1a, 0x0b]; + for op in body_ops(&body) { + if matches!(op, Operator::I32Uzumaki { .. } | Operator::I64Uzumaki { .. }) { + assert!(is_verification_only(&op), "uzumaki must be classified verification-only"); + let err = check_operator(&op).expect_err("uzumaki must be rejected"); + assert!( + matches!(err, LinkError::UnsupportedConstruct(_)), + "expected UnsupportedConstruct, got {err:?}" + ); + } + } + } + } + + #[test] + fn plain_ops_are_not_verification_only() { + // A guard against the predicate over-matching: ordinary executable ops + // (arithmetic, calls, constants) must never be flagged verification-only. + for op in ops( + r#"(module (func (param i32 i32) (result i32) + local.get 0 local.get 1 i32.add) (export "f" (func 0)))"#, + ) { + assert!( + !is_verification_only(&op), + "{op:?} must not be classified verification-only" + ); + } + } + + /// Asserts that some operator of the body rejects via [`check_operator`] with + /// a message containing every fragment in `needles`. The body is assembled + /// from WAT (which does not validate features, so float and post-1.0 opcodes + /// assemble), letting the allow-list be exercised directly. 
+ fn assert_body_rejects_with(wat: &str, needles: &[&str]) { + let rejection = ops(wat).iter().find_map(|op| match check_operator(op) { + Err(LinkError::UnsupportedConstruct(msg)) => Some(msg), + _ => None, + }); + let msg = rejection + .unwrap_or_else(|| panic!("no operator of `{wat}` rejected via check_operator")); + for needle in needles { + assert!( + msg.contains(needle), + "rejection message {msg:?} must contain {needle:?}" + ); + } + } + + #[test] + fn float_arithmetic_is_rejected_with_mnemonic() { + // Float arithmetic carries no executable meaning for Inference (no + // `f32`/`f64` types) and no Rocq lowering: the allow-list rejects it with + // a "floating-point" diagnostic naming the exact mnemonic. + assert_body_rejects_with( + r#"(module (func (param f32 f32) (result f32) + local.get 0 local.get 1 f32.add) (export "f" (func 0)))"#, + &["floating-point", "f32.add"], + ); + assert_body_rejects_with( + r#"(module (func (param f64) (result f64) + local.get 0 f64.sqrt) (export "f" (func 0)))"#, + &["floating-point", "f64.sqrt"], + ); + } + + #[test] + fn float_load_store_is_rejected_with_mnemonic() { + assert_body_rejects_with( + r#"(module (memory 1) (func (param i32) (result f32) + local.get 0 f32.load) (export "f" (func 0)))"#, + &["floating-point", "f32.load"], + ); + assert_body_rejects_with( + r#"(module (memory 1) (func (param i32 f64) + local.get 0 local.get 1 f64.store) (export "f" (func 0)))"#, + &["floating-point", "f64.store"], + ); + } + + #[test] + fn float_const_is_rejected_with_mnemonic() { + assert_body_rejects_with( + r#"(module (func (result f32) f32.const 1) (export "f" (func 0)))"#, + &["floating-point", "f32.const"], + ); + } + + #[test] + fn float_conversion_is_rejected_with_mnemonic() { + assert_body_rejects_with( + r#"(module (func (param f32) (result i32) + local.get 0 i32.trunc_f32_s) (export "f" (func 0)))"#, + &["floating-point", "i32.trunc_f32_s"], + ); + assert_body_rejects_with( + r#"(module (func (param i64) (result f64) + 
local.get 0 f64.convert_i64_u) (export "f" (func 0)))"#, + &["floating-point", "f64.convert_i64_u"], + ); + } + + #[test] + fn float_reinterpret_is_rejected_with_mnemonic() { + assert_body_rejects_with( + r#"(module (func (param f32) (result i32) + local.get 0 i32.reinterpret_f32) (export "f" (func 0)))"#, + &["floating-point", "i32.reinterpret_f32"], + ); + } + + #[test] + fn saturating_truncation_is_rejected_with_mnemonic() { + // A saturating float-to-int conversion is a float op as far as the + // allow-list is concerned: the Rocq translator has no lowering for it. + assert_body_rejects_with( + r#"(module (func (param f32) (result i32) + local.get 0 i32.trunc_sat_f32_s) (export "f" (func 0)))"#, + &["floating-point", "i32.trunc_sat_f32_s"], + ); + } + + #[test] + fn sign_extension_op_is_rejected() { + // Sign-extension is an integer op, so it is not flagged by `is_float`; it + // reaches the fail-closed wildcard and rejects as the sign-extension + // family, named by `operator_family`. + assert_body_rejects_with( + r#"(module (func (param i32) (result i32) + local.get 0 i32.extend8_s) (export "f" (func 0)))"#, + &["sign-extension"], + ); + } + + #[test] + fn tail_call_op_is_rejected() { + // `return_call` is an integer-typed control op, rejected as the tail-call + // family by the fail-closed wildcard. + assert_body_rejects_with( + r#"(module + (func (param i32) (result i32) local.get 0 return_call 1) + (func (param i32) (result i32) local.get 0) + (export "f" (func 0)))"#, + &["tail call", "return_call"], + ); + } + + #[test] + fn segment_indexed_table_init_is_rejected() { + // `table.init` carries an element segment the merge cannot relocate and + // the Rocq translator cannot lower; it rejects as the segment-indexed + // table-initialization family. 
+ assert_body_rejects_with( + r#"(module (table 1 funcref) (elem func 0) + (func i32.const 0 i32.const 0 i32.const 0 table.init 0) + (export "f" (func 0)))"#, + &["table.init"], + ); + } + + #[test] + fn integer_width_conversions_are_accepted() { + // The three integer width conversions survive the allow-list (they are + // the only conversions Inference codegen emits and the translator models). + for op in ops( + r#"(module (func (param i64) (result i64) + local.get 0 i32.wrap_i64 i64.extend_i32_s) (export "f" (func 0)))"#, + ) { + check_operator(&op).expect("integer width conversion must be accepted"); + } + } +} diff --git a/core/wasm-linker/src/spec_funcs.rs b/core/wasm-linker/src/spec_funcs.rs new file mode 100644 index 00000000..c9309351 --- /dev/null +++ b/core/wasm-linker/src/spec_funcs.rs @@ -0,0 +1,217 @@ +//! Codec for the `inference.spec_funcs` custom section the merge carries +//! through. +//! +//! Codegen emits this section into the main module to record, per spec, the +//! WASM function indices the Rocq translator must turn into proof obligations. +//! The merge removes imports and shifts function indices, so the embedded +//! indices are stale post-link unless rewritten. This module decodes the +//! payload to `(spec_name, [func_idx])` pairs, the merge remaps each index +//! through `Plan::map_main_func`, and [`encode`] re-emits the canonical bytes. +//! +//! ## Payload format (LEB128 u32 throughout) +//! +//! ```text +//! version -- format version (must equal `VERSION`) +//! count -- number of (spec_name, indices) pairs +//! repeat `count` times: +//! name_len name_bytes(utf-8) +//! idx_count repeat `idx_count` times: func_idx +//! ``` +//! +//! The format mirrors `inference_wasm_codegen::spec_section`; the linker keeps a +//! self-contained copy rather than depend on the codegen crate. The decoder is +//! fully bounds-checked: a malformed external `.wasm` (or a corrupt main module) +//! 
must surface a clean [`LinkError`], never a panic or an unbounded allocation. + +use inf_wasmparser::BinaryReader; + +use crate::LinkError; + +/// The custom-section name carrying per-spec function indices. Kept in +/// lock-step with `inference_wasm_codegen`'s emitter and the `wasm-to-v` +/// decoder; the linker keeps its own copy to avoid depending on the codegen +/// crate. +pub(crate) const SECTION_NAME: &str = "inference.spec_funcs"; + +/// Wire-format version. Kept in lock-step with the codegen emitter. +const VERSION: u32 = 1; + +/// Defensive upper bound on a single spec name's length, matching the decoder +/// in `wasm-to-v`. A hand-crafted payload could advertise a far longer name; +/// cap it so the per-name allocation stays bounded. +const MAX_SPEC_NAME_LEN: usize = 255; + +/// Decodes the `inference.spec_funcs` payload into `(spec_name, [func_idx])` +/// pairs, preserving the encoded order so a round-trip is byte-stable. +/// +/// # Errors +/// +/// Returns [`LinkError::Parse`] on any malformed input: an unrecognised +/// version, a truncated LEB128, invalid UTF-8 in a spec name, an +/// over-advertised pair/index count, or a name exceeding [`MAX_SPEC_NAME_LEN`]. +pub(crate) fn decode(data: &[u8]) -> Result)>, LinkError> { + let mut reader = BinaryReader::new(data, 0); + + let version = reader + .read_var_u32() + .map_err(|e| LinkError::Parse(format!("spec_funcs section: truncated version: {e}")))?; + if version != VERSION { + return Err(LinkError::Parse(format!( + "spec_funcs section: unsupported version {version} (expected {VERSION})" + ))); + } + + let count = reader + .read_var_u32() + .map_err(|e| LinkError::Parse(format!("spec_funcs section: truncated count: {e}")))?; + // Each pair consumes at least two payload bytes (a name-length LEB128 and an + // indices-count LEB128), so a count exceeding half the remaining bytes is a + // malformed advertisement; reject before allocating. 
+ if count as usize > reader.bytes_remaining() / 2 { + return Err(LinkError::Parse( + "spec_funcs section: declared pair count exceeds remaining payload".into(), + )); + } + + let mut out: Vec<(String, Vec)> = Vec::with_capacity(count as usize); + for _ in 0..count { + // `read_string` returns a borrowed `&str` into the payload (no + // allocation). Enforce the length cap on that borrow *before* copying it + // into an owned `String`, so a hand-crafted payload advertising a large + // in-bounds name cannot force a large transient allocation ahead of the + // cap — keeping the decoder's "bounded allocation" guarantee intact. + let name = reader.read_string().map_err(|e| { + LinkError::Parse(format!("spec_funcs section: invalid spec name: {e}")) + })?; + if name.len() > MAX_SPEC_NAME_LEN { + return Err(LinkError::Parse(format!( + "spec_funcs section: spec name length {} exceeds cap {MAX_SPEC_NAME_LEN}", + name.len() + ))); + } + let name = name.to_string(); + + let idx_count = reader.read_var_u32().map_err(|e| { + LinkError::Parse(format!("spec_funcs section: truncated indices count: {e}")) + })?; + // Each index consumes at least one payload byte, so `idx_count` cannot + // legitimately exceed the remaining payload. + if idx_count as usize > reader.bytes_remaining() { + return Err(LinkError::Parse( + "spec_funcs section: declared index count exceeds remaining payload".into(), + )); + } + + let mut indices = Vec::with_capacity(idx_count as usize); + for _ in 0..idx_count { + let idx = reader.read_var_u32().map_err(|e| { + LinkError::Parse(format!("spec_funcs section: truncated index: {e}")) + })?; + indices.push(idx); + } + out.push((name, indices)); + } + + // Every declared entry has been consumed; any remaining bytes are trailing + // garbage the count does not cover. A corrupt or version-skewed section would + // re-encode without them, silently dropping data — reject it, matching the + // fail-closed posture of the truncation checks above. 
+ if reader.bytes_remaining() != 0 { + return Err(LinkError::Parse(format!( + "spec_funcs section: {} trailing byte(s) after {count} declared entries", + reader.bytes_remaining() + ))); + } + + Ok(out) +} + +/// Encodes `(spec_name, [func_idx])` pairs into the canonical payload bytes. +/// +/// The encoded order matches the input order; the merge preserves the decoded +/// order, which is the encoder's sorted-by-name order, so a decode/remap/encode +/// round-trip stays byte-stable. +pub(crate) fn encode(pairs: &[(String, Vec)]) -> Vec { + use wasm_encoder::Encode; + + let mut payload = Vec::new(); + VERSION.encode(&mut payload); + let count = u32::try_from(pairs.len()).expect("more than u32::MAX specs"); + count.encode(&mut payload); + + for (name, indices) in pairs { + let name_bytes = name.as_bytes(); + let name_len = u32::try_from(name_bytes.len()).expect("spec name longer than u32::MAX"); + let idx_count = u32::try_from(indices.len()).expect("more than u32::MAX indices per spec"); + + name_len.encode(&mut payload); + payload.extend_from_slice(name_bytes); + idx_count.encode(&mut payload); + for idx in indices { + idx.encode(&mut payload); + } + } + + payload +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn round_trips_a_two_spec_payload() { + let pairs = vec![ + ("A".to_string(), vec![2, 3]), + ("B".to_string(), vec![5]), + ]; + let bytes = encode(&pairs); + // version=1, count=2, len=1 'A', idxc=2, 2,3, len=1 'B', idxc=1, 5 + assert_eq!(bytes, vec![1, 2, 1, b'A', 2, 2, 3, 1, b'B', 1, 5]); + assert_eq!(decode(&bytes).unwrap(), pairs); + } + + #[test] + fn empty_payload_round_trips() { + let pairs: Vec<(String, Vec)> = Vec::new(); + let bytes = encode(&pairs); + assert_eq!(bytes, vec![1, 0]); + assert_eq!(decode(&bytes).unwrap(), pairs); + } + + #[test] + fn rejects_an_unsupported_version() { + // version=2, count=0 + let err = decode(&[2, 0]).unwrap_err(); + assert!(matches!(err, LinkError::Parse(_)), "got {err:?}"); + } + + #[test] + fn 
rejects_an_over_advertised_pair_count() { + // version=1, count=255 in a 3-byte payload. + let err = decode(&[1, 255, 1]).unwrap_err(); + assert!(matches!(err, LinkError::Parse(_)), "got {err:?}"); + } + + #[test] + fn rejects_a_truncated_index() { + // version=1, count=1, name_len=1 'S', idx_count=1, + let err = decode(&[1, 1, 1, b'S', 1]).unwrap_err(); + assert!(matches!(err, LinkError::Parse(_)), "got {err:?}"); + } + + #[test] + fn rejects_trailing_bytes_after_the_declared_entries() { + // A well-formed payload followed by extra bytes the count does not cover. + // Silently dropping the trailing bytes (the prior behavior) would mask a + // corrupt or version-skewed section; the decoder must fail closed, matching + // the other truncation checks in this codec. + let mut bytes = encode(&[("S".to_string(), vec![0])]); + bytes.extend_from_slice(&[0xff, 0xff]); + let err = decode(&bytes).unwrap_err(); + assert!( + matches!(&err, LinkError::Parse(msg) if msg.contains("trailing")), + "expected a Parse error naming the trailing bytes, got {err:?}" + ); + } +} diff --git a/core/wasm-linker/src/tier.rs b/core/wasm-linker/src/tier.rs new file mode 100644 index 00000000..c577c34e --- /dev/null +++ b/core/wasm-linker/src/tier.rs @@ -0,0 +1,97 @@ +//! Memory-merge feasibility tiers. +//! +//! Whether an external function can be merged into the single shared linear +//! memory depends on what its transitive closure touches: +//! +//! - **Tier A** — pure: no memory, no globals, no data/element segments, no +//! tables. Merge is a copy + re-index. +//! - **Tier B** — memory via caller-passed pointers only: the closure +//! loads/stores through addresses the caller supplies, but defines no static +//! data of its own, no mutable globals, and no table/element entries. The one +//! shared memory is enough; no address relocation is needed. Admission to +//! Tier B requires *proof* (via [`crate::provenance`]) that every memory +//! 
address derives from a function parameter — a closure that fabricates an
//!   address from a constant or its own state would alias the host program's
//!   memory and is rejected as Tier C instead.
//! - **Tier C** — own static data, globals, or table/element entries: merging
//!   would require relocating data and rewriting absolute addresses, which
//!   needs relocation metadata the static merge does not consume. Rejected with
//!   a clear error.

use crate::closure::{Closure, ClosureEffects};
use crate::parse::ParsedModule;
use crate::provenance;
use crate::LinkError;

/// The feasibility tier of a merge candidate.
///
/// Tier C has no variant: it is reported as an error by [`classify`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum Tier {
    /// Pure function: no memory, globals, data, or tables.
    A,
    /// Memory through caller-passed pointers only.
    B,
}

/// Classifies a closure against its source module, returning the tier or a
/// [`LinkError::RequiresRelocatableBuild`] for Tier-C inputs.
///
/// A module is Tier C when it carries any of the relocation-sensitive
/// constructs: its own data or element segments, defined globals (a baked-in
/// constant or mutable state), or table definitions / indirect-call use. These
/// imply absolute addresses or per-module state that a position-naive static
/// merge cannot reconcile across two modules sharing one memory.
///
/// A memory-touching closure is admitted to Tier B **only** when the
/// address-provenance analysis ([`provenance::verify_param_addressing`]) proves
/// every memory access — in the closure `root` and in every function it
/// transitively calls — addresses memory through a value derived from the
/// **root export's** parameters, on every reachable control-flow path. The
/// analysis is interprocedural: the root's parameters are the trusted caller
/// pointers, and an inner function's parameter is trusted only when *every*
/// reachable call site passes it a param-derived argument (a sound greatest
/// fixpoint over the call graph that handles self- and mutual recursion). A
/// closure that fabricates a memory address from a constant, a module-internal
/// source, a parameter-cancelling computation (`param - param`, `param & 0`, …),
/// a value laundered across a `call` boundary that the call site does not
/// justify, or an indirect/table-dispatched call result would silently alias the
/// host program's own linear memory, so it is rejected as Tier C rather than
/// merged.
///
/// # Errors
///
/// Returns [`LinkError::RequiresRelocatableBuild`] (naming `field` and every
/// collected reason) when the module is Tier C, and propagates whatever error
/// [`provenance::verify_param_addressing`] reports for a memory-using closure.
pub(crate) fn classify(
    module: &ParsedModule,
    closure: &Closure,
    root: u32,
    field: &str,
) -> Result<Tier, LinkError> {
    let reasons = tier_c_reasons(module, &closure.effects);
    if !reasons.is_empty() {
        return Err(LinkError::RequiresRelocatableBuild {
            field: field.to_string(),
            reasons,
        });
    }

    // No memory access at all: nothing to prove, the closure is pure.
    if !closure.effects.uses_memory {
        return Ok(Tier::A);
    }

    // Memory is touched: Tier B is granted only once provenance is verified.
    provenance::verify_param_addressing(module, &closure.local_func_indices, root, field)?;
    Ok(Tier::B)
}

/// Collects every reason the module fails Tier-A/B feasibility. Empty means the
/// module is mergeable.
///
/// Reasons are reported in a fixed order: data segments, globals, tables.
fn tier_c_reasons(module: &ParsedModule, effects: &ClosureEffects) -> Vec<String> {
    let checks = [
        (
            module.data_count > 0 || effects.uses_data_segments,
            "defines or initializes its own static data segments",
        ),
        (
            !module.globals.is_empty() || effects.uses_globals,
            "defines or accesses module globals",
        ),
        (
            !module.tables.is_empty() || module.element_count > 0 || effects.uses_tables,
            "uses a table / element segment (indirect calls)",
        ),
    ];

    checks
        .into_iter()
        .filter(|(violated, _)| *violated)
        .map(|(_, reason)| reason.to_string())
        .collect()
}
diff --git a/core/wasm-linker/tests/fuzz_seeds.rs b/core/wasm-linker/tests/fuzz_seeds.rs
new file mode 100644
index 00000000..37dfa0de
--- /dev/null
+++ b/core/wasm-linker/tests/fuzz_seeds.rs
@@ -0,0 +1,420 @@
//! Seed corpus for the `cargo-fuzz` `link` target, plus the deterministic guard
//! that keeps it honest under stable `cargo test`.
//!
//! The libFuzzer harness in `core/wasm-linker/fuzz/fuzz_targets/link.rs` cannot
//! run on the default toolchain (`cargo-fuzz` + nightly are detached from the
//! workspace). Its seed corpus, however, *is* committed — at
//! `core/wasm-linker/fuzz/seeds/link/` — so a developer running
//! `cargo +nightly fuzz run link core/wasm-linker/fuzz/seeds/link` starts from
//! the round-2 audit reproductions rather than from zero coverage.
//!
//! Two tests keep the corpus trustworthy on every `cargo test`:
//!
//! - [`committed_fuzz_seeds_reach_link_cleanly`] replays each committed seed
//!   through the fuzz target's exact wire-format `split` and module-name
//!   rotation, asserting it neither panics nor produces a silently-invalid `Ok`
//!   — the same invariant the fuzzer enforces. It also checks each round-2 seed
//!   reaches its intended rejection (so a seed that silently stops exercising the
//!   guard it was built for is caught).
//! - [`regenerate_fuzz_seeds`] (`#[ignore]`d) rebuilds the corpus from source, so
//!
the binary blobs are reproducible rather than opaque. Run it with:
//!   `cargo test -p inference-wasm-linker --test fuzz_seeds regenerate -- --ignored`.

use inference_wasm_linker::link as raw_link;
use std::fs;
use std::path::{Path, PathBuf};

/// The committed seed-corpus directory for the `link` fuzz target.
fn seed_dir() -> PathBuf {
    PathBuf::from(env!("CARGO_MANIFEST_DIR"))
        .join("fuzz")
        .join("seeds")
        .join("link")
}

/// The logical module names the `link` fuzz target rotates externals through,
/// kept byte-identical to `fuzz_targets/link.rs` so the replay binds imports the
/// same way the fuzzer does. Seed mains therefore import from the empty module
/// `""` — the name the *first* external is always assigned — so the import binds
/// and the seed reaches the real closure / provenance / merge logic rather than
/// stalling on an unsatisfied import.
const MODULE_NAMES: [&str; 4] = ["", "mathlib", "crypto::sha256", "a"];

/// The fuzz target's wire-format split, mirrored here so the replay is faithful:
/// `[count:u8][ (len:u16le, bytes) * (count % 5) ][ main bytes ]`.
///
/// Returns `(main, externals)`. The decode is deliberately lenient: a
/// missing length prefix ends the external list early and an over-long length
/// is clamped to the bytes actually present.
fn split(data: &[u8]) -> (Vec<u8>, Vec<Vec<u8>>) {
    let Some((&count_byte, mut cursor)) = data.split_first() else {
        return (Vec::new(), Vec::new());
    };
    let count = usize::from(count_byte % 5);
    let mut externals = Vec::with_capacity(count);
    for _ in 0..count {
        // Fewer than two bytes left: no length prefix, stop reading externals.
        let [lo, hi, rest @ ..] = cursor else {
            break;
        };
        let declared = usize::from(u16::from_le_bytes([*lo, *hi]));
        let (payload, tail) = rest.split_at(declared.min(rest.len()));
        externals.push(payload.to_vec());
        cursor = tail;
    }
    // Whatever follows the last external is the main module.
    (cursor.to_vec(), externals)
}

/// Encodes one `(main, externals)` case in the fuzz target's wire format.
///
/// # Panics
///
/// Panics when the case cannot survive a [`split`] round-trip: five or more
/// externals (the decoder reads `count % 5`) or an external longer than
/// `u16::MAX` bytes (the length prefix is a `u16`). Truncating either value
/// silently would commit a seed that decodes to a different case.
fn encode(main: &[u8], externals: &[Vec<u8>]) -> Vec<u8> {
    let count = u8::try_from(externals.len())
        .ok()
        .filter(|&n| n < 5)
        .unwrap_or_else(|| {
            panic!(
                "a seed carries at most 4 externals, got {}",
                externals.len()
            )
        });

    let mut out = vec![count];
    for ext in externals {
        let len = u16::try_from(ext.len()).unwrap_or_else(|_| {
            panic!(
                "external of {} bytes exceeds the u16 length prefix",
                ext.len()
            )
        });
        out.extend_from_slice(&len.to_le_bytes());
        out.extend_from_slice(ext);
    }
    out.extend_from_slice(main);
    out
}

/// Assembles a `.wasm` from WAT, panicking with the source on error.
fn wasm(src: &str) -> Vec<u8> {
    wat::parse_str(src).unwrap_or_else(|e| panic!("invalid seed WAT: {e}\n{src}"))
}

/// Links a decoded seed exactly as the fuzz target would: external `i` is tagged
/// with `MODULE_NAMES[i % 4]`.
fn link_like_fuzzer(
    main: &[u8],
    externals: &[Vec<u8>],
) -> Result<Vec<u8>, inference_wasm_linker::LinkError> {
    // Cycling the name table pairs external `i` with `MODULE_NAMES[i % len]`.
    let pairs: Vec<(&str, &[u8])> = MODULE_NAMES
        .iter()
        .cycle()
        .zip(externals)
        .map(|(name, bytes)| (*name, bytes.as_slice()))
        .collect();
    raw_link(main, &pairs)
}

/// A named seed: its file name, the bytes, and the substring its rejection must
/// contain (or `None` for the positive control that must merge into a valid
/// module). The substring pins each round-2 seed to the *specific* guard it was
/// built to exercise, so a refactor that lets a laundering seed slip to a
/// different (or absent) rejection is caught here, not only in the fuzzer.
struct Seed {
    name: &'static str,
    bytes: Vec<u8>,
    /// `Some(needle)` ⇒ must be rejected with a message containing `needle`;
    /// `None` ⇒ must merge into a valid module.
    rejection_needle: Option<&'static str>,
}

/// The full seed corpus, built from source. Each round-2 reproduction mirrors
/// the dedicated regression test's fixture, and imports from the empty module so
/// the fuzzer's first-external binding satisfies it.
+fn seeds() -> Vec { + let mem_main = |ity: &str, field: &str, body: &str| { + wasm(&format!( + "(module {ity} (import \"\" \"{field}\" (func (;0;) (type 0))) \ + (memory (;0;) 1 1) {body} \ + (export \"memory\" (memory 0)) (export \"run\" (func 1)))" + )) + }; + let mem_lib = |ty: &str, field: &str, body: &str| { + wasm(&format!( + "(module {ty} (memory (;0;) 1) \ + (func (;0;) (type 0) {body}) (export \"{field}\" (func 0)))" + )) + }; + + let main_sum = wasm( + "(module (type (;0;) (func (param i32 i32) (result i32))) \ + (import \"\" \"sum\" (func (;0;) (type 0))) \ + (func (;1;) (type 0) (param i32 i32) (result i32) local.get 0 local.get 1 call 0) \ + (export \"compute\" (func 1)))", + ); + let pure_lib = wasm( + "(module (type (;0;) (func (param i32 i32) (result i32))) \ + (func (;0;) (type 0) (param i32 i32) (result i32) local.get 0 local.get 1 i32.add) \ + (export \"sum\" (func 0)))", + ); + + let mut deep_body = String::new(); + for _ in 0..5_000 { + deep_body.push_str("block "); + } + for _ in 0..5_000 { + deep_body.push_str("end "); + } + let deep_lib = wasm(&format!( + "(module (type (;0;) (func (param i32 i32) (result i32))) \ + (func (;0;) (type 0) (param i32 i32) (result i32) {deep_body} \ + local.get 0 local.get 1 i32.add) \ + (export \"sum\" (func 0)))" + )); + + let m2_main = wasm( + "(module (type (;0;) (func (param i32 i32) (result i32))) \ + (import \"\" \"sum\" (func (;0;) (type 0))) \ + (memory (;0;) 1 1) (data (;0;) (i32.const 0) \"\\2a\\00\\00\\00\") \ + (func (;1;) (type 0) (param i32 i32) (result i32) local.get 0 local.get 1 call 0) \ + (export \"compute\" (func 1)))", + ); + + let mk = |name, main: &[u8], ext: Vec, needle| Seed { + name, + bytes: encode(main, &[ext]), + rejection_needle: needle, + }; + + vec![ + // C-1: a constant address laundered through a control-flow join into an + // address-feeding local. 
+ mk( + "c1_control_flow_join", + &mem_main( + "(type (;0;) (func (param i32 i32) (result i32)))", + "peek", + "(func (;1;) (type 0) (param i32 i32) (result i32) local.get 0 local.get 1 call 0)", + ), + mem_lib( + "(type (;0;) (func (param i32 i32) (result i32)))", + "peek", + "(param i32 i32) (result i32) (local i32) \ + i32.const 1024 local.set 2 \ + (block local.get 1 (if (then local.get 0 local.set 2))) \ + local.get 2 i32.load", + ), + Some("relocatable build"), + ), + // C-2: param-nulling arithmetic — `(addr - addr) + base` is a fixed host + // address. + mk( + "c2_param_nulling_arith", + &mem_main( + "(type (;0;) (func (param i32 i32)))", + "poke", + "(func (;1;) (type 0) (param i32 i32) local.get 0 local.get 1 call 0)", + ), + mem_lib( + "(type (;0;) (func (param i32 i32)))", + "poke", + "(param i32 i32) \ + local.get 0 local.get 0 i32.sub i32.const 65536 i32.add \ + local.get 1 i32.store", + ), + Some("relocatable build"), + ), + // C-2b: add-side algebraic cancellation — `(C - p) + p == C` re-derives + // the fixed host address the `sub` rule demoted. The `add` rule must not + // re-promote `Param + NotParam` to Param. + mk( + "c2b_add_side_cancellation", + &mem_main( + "(type (;0;) (func (param i32 i32)))", + "poke", + "(func (;1;) (type 0) (param i32 i32) local.get 0 local.get 1 call 0)", + ), + mem_lib( + "(type (;0;) (func (param i32 i32)))", + "poke", + "(param i32 i32) \ + i32.const 65536 local.get 0 i32.sub local.get 0 i32.add \ + local.get 1 i32.store", + ), + Some("relocatable build"), + ), + // C-3: a constant address laundered across a `call` boundary. 
+ mk( + "c3_call_laundered", + &mem_main( + "(type (;0;) (func (param i32 i32) (result i32)))", + "peek", + "(func (;1;) (type 0) (param i32 i32) (result i32) local.get 0 local.get 1 call 0)", + ), + wasm( + "(module (type (;0;) (func (param i32 i32) (result i32))) \ + (type (;1;) (func (param i32) (result i32))) \ + (memory (;0;) 1) \ + (func (;0;) (type 0) (param i32 i32) (result i32) i32.const 1024 call 1) \ + (func (;1;) (type 1) (param i32) (result i32) local.get 0 i32.load) \ + (export \"peek\" (func 0)))", + ), + Some("relocatable build"), + ), + // C-4: a memory64 external folded onto a memoryless main. + mk( + "c4_memory64", + &wasm( + "(module (type (;0;) (func (param i64) (result i64))) \ + (import \"\" \"load_at\" (func (;0;) (type 0))) \ + (func (;1;) (type 0) (param i64) (result i64) local.get 0 call 0) \ + (export \"run\" (func 1)))", + ), + wasm( + "(module (type (;0;) (func (param i64) (result i64))) \ + (memory (;0;) i64 1) \ + (func (;0;) (type 0) (param i64) (result i64) local.get 0 i64.load) \ + (export \"load_at\" (func 0)))", + ), + Some("memory64"), + ), + // H-3: a deeply-nested external body the merge must reject before it can + // abort the wasm-to-v translator. + mk("h3_deep_nesting", &main_sum, deep_lib, Some("nests structured control flow")), + // M-1: an over-declared locals count, rejected by the pre-validation gate + // before any per-local allocation. + mk("m1_over_declared_locals", &main_sum, over_declared_locals_external(u32::MAX), Some("parse")), + // M-2: a main module carrying an active data segment. + mk("m2_main_data_segment", &m2_main, pure_lib.clone(), Some("data segment")), + // Positive control: a genuinely-pure external that must merge into a + // valid module, so the corpus is never vacuously all-rejection. 
+ mk("pure_control_merges", &main_sum, pure_lib, None), + ] +} + +#[test] +fn committed_fuzz_seeds_reach_link_cleanly() { + let dir = seed_dir(); + assert!( + dir.is_dir(), + "the committed fuzz seed corpus is missing at {}; regenerate it with \ + `cargo test -p inference-wasm-linker --test fuzz_seeds regenerate -- --ignored`", + dir.display() + ); + + for seed in seeds() { + let path = dir.join(seed.name); + let committed = fs::read(&path).unwrap_or_else(|e| { + panic!( + "missing seed `{}` ({e}); regenerate with \ + `cargo test -p inference-wasm-linker --test fuzz_seeds regenerate -- --ignored`", + seed.name + ) + }); + + // The committed bytes must match what the generator produces, so the + // corpus stays reproducible (a reviewer can rebuild it from source). + assert_eq!( + committed, seed.bytes, + "committed seed `{}` is stale; regenerate it with \ + `cargo test -p inference-wasm-linker --test fuzz_seeds regenerate -- --ignored`", + seed.name + ); + + // Replay through the fuzzer's exact decode + module rotation, wrapped in + // catch_unwind so a reintroduced panic names the offending seed. 
+ let (main, externals) = split(&committed); + let outcome = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| { + link_like_fuzzer(&main, &externals) + })); + let result = outcome.unwrap_or_else(|_| { + panic!("seed `{}`: link panicked — it must return an Err", seed.name) + }); + + match (result, seed.rejection_needle) { + (Ok(merged), None) => { + inf_wasmparser::validate(&merged).unwrap_or_else(|e| { + panic!("seed `{}`: merged module fails validation: {e}", seed.name) + }); + } + (Ok(_), Some(needle)) => panic!( + "seed `{}`: a soundness reproduction merged instead of being rejected \ + (expected a `{needle}` rejection) — a silent miscompile", + seed.name + ), + (Err(e), Some(needle)) => { + let msg = e.to_string(); + assert!( + msg.contains(needle), + "seed `{}`: rejected, but not for its intended reason; \ + expected a message containing `{needle}`, got `{msg}`", + seed.name + ); + } + (Err(e), None) => panic!( + "seed `{}`: the positive control must merge, got a rejection: {e}", + seed.name + ), + } + } +} + +#[test] +#[ignore = "writes the committed fuzz seed corpus; run with --ignored to regenerate"] +fn regenerate_fuzz_seeds() { + let dir = seed_dir(); + fs::create_dir_all(&dir).expect("create seed dir"); + for seed in seeds() { + write_seed(&dir, seed.name, &seed.bytes); + } +} + +/// Writes one seed file, creating it byte-for-byte from the generator output. +fn write_seed(dir: &Path, name: &str, bytes: &[u8]) { + fs::write(dir.join(name), bytes) + .unwrap_or_else(|e| panic!("failed to write seed `{name}`: {e}")); +} + +// -- M-1 fixture (hand-assembled invalid module) ----------------------------- +// +// A real assembler cannot emit an over-declared locals count — `wat` computes +// the locals header from the declared types — so the M-1 reproduction is written +// byte-by-byte, mirroring `over_declared_locals_external` in `link.rs`. 
+ +fn push_uleb(out: &mut Vec, mut value: u32) { + loop { + let mut byte = (value & 0x7f) as u8; + value >>= 7; + if value != 0 { + byte |= 0x80; + } + out.push(byte); + if value == 0 { + break; + } + } +} + +fn framed_section(id: u8, section_bytes: &[u8]) -> Vec { + let mut out = vec![id]; + push_uleb(&mut out, section_bytes.len() as u32); + out.extend_from_slice(section_bytes); + out +} + +/// A memory-using external exporting `sum:(i32,i32)->i32` whose single function +/// over-declares its locals count as `locals_count`. With `u32::MAX` this is the +/// M-1 reproduction: the value a 6-byte locals group can set, which the universal +/// pre-validation gate must reject before provenance sizes a per-local `vec!`. +fn over_declared_locals_external(locals_count: u32) -> Vec { + let type_section = framed_section(0x01, &[0x01, 0x60, 0x02, 0x7f, 0x7f, 0x01, 0x7f]); + let function_section = framed_section(0x03, &[0x01, 0x00]); + let memory_section = framed_section(0x05, &[0x01, 0x00, 0x01]); + + let mut export_payload = vec![0x01]; + push_uleb(&mut export_payload, 3); + export_payload.extend_from_slice(b"sum"); + export_payload.push(0x00); + export_payload.push(0x00); + let export_section = framed_section(0x07, &export_payload); + + let mut body = Vec::new(); + body.push(0x01); + push_uleb(&mut body, locals_count); + body.push(0x7f); + body.extend_from_slice(&[0x41, 0x00]); + body.extend_from_slice(&[0x28, 0x02, 0x00]); + body.push(0x1a); + body.extend_from_slice(&[0x41, 0x00]); + body.push(0x0b); + + let mut code_payload = vec![0x01]; + push_uleb(&mut code_payload, body.len() as u32); + code_payload.extend_from_slice(&body); + let code_section = framed_section(0x0a, &code_payload); + + let mut module = Vec::new(); + module.extend_from_slice(b"\0asm"); + module.extend_from_slice(&[0x01, 0x00, 0x00, 0x00]); + module.extend_from_slice(&type_section); + module.extend_from_slice(&function_section); + module.extend_from_slice(&memory_section); + 
module.extend_from_slice(&export_section); + module.extend_from_slice(&code_section); + module +} diff --git a/core/wasm-linker/tests/link.rs b/core/wasm-linker/tests/link.rs new file mode 100644 index 00000000..deff5593 --- /dev/null +++ b/core/wasm-linker/tests/link.rs @@ -0,0 +1,4848 @@ +//! Integration tests for the static-merge linker. +//! +//! Each test builds its `.wasm` fixtures from inline WAT (via the `wat` crate), +//! links them, and asserts on the unified module: structural validity (through +//! `inf-wasmparser`'s validator), absence of cross-module imports, the merged +//! function bodies, and the precise rejection for Tier-C inputs. + +use inf_wasmparser::{ExternalKind, Operator, Parser, Payload, TypeRef}; +use inference_wasm_linker::{link as raw_link, LinkError}; + +/// Assembles a `.wasm` binary from WAT source, panicking with the WAT on error. +fn wasm(wat: &str) -> Vec { + wat::parse_str(wat).unwrap_or_else(|e| panic!("invalid WAT fixture: {e}\n{wat}")) +} + +/// Links `main` against the given externals, tagging each external with the +/// single logical module `main` imports from. +/// +/// The public `link` API takes `(logical_module, bytes)` pairs so it can match +/// an import's recorded `(module, field)` against the right external. Every +/// single-module fixture in this file imports all of its externs from one +/// logical module, so this helper derives that module from `main`'s import +/// section and pairs all externals with it — keeping each test's call site as +/// `link(&main, &[&lib])`. Tests that need *distinct* logical modules per +/// external (multi-module satisfaction, same-field disambiguation) call +/// [`raw_link`] directly with explicit pairs. 
+fn link(main: &[u8], libs: &[&[u8]]) -> Result, LinkError> { + let modules: std::collections::BTreeSet = function_imports(main) + .into_iter() + .map(|(module, _)| module) + .collect(); + // A no-import main links any externals away to nothing; the module label is + // irrelevant there. With imports, every fixture here uses a single module. + let module = modules.into_iter().next().unwrap_or_default(); + let pairs: Vec<(&str, &[u8])> = libs.iter().map(|b| (module.as_str(), *b)).collect(); + raw_link(main, &pairs) +} + +/// Validates `bytes` as a complete WASM module. +fn assert_valid(bytes: &[u8]) { + inf_wasmparser::validate(bytes) + .unwrap_or_else(|e| panic!("linked module failed validation: {e}")); +} + +/// The `(module, field)` pairs of every function import in `bytes`. +fn function_imports(bytes: &[u8]) -> Vec<(String, String)> { + let mut imports = Vec::new(); + for payload in Parser::new(0).parse_all(bytes) { + if let Payload::ImportSection(reader) = payload.unwrap() { + for import in reader { + let import = import.unwrap(); + if matches!(import.ty, TypeRef::Func(_)) { + imports.push((import.module.to_string(), import.name.to_string())); + } + } + } + } + imports +} + +/// Number of function bodies in the code section. +fn code_body_count(bytes: &[u8]) -> usize { + let mut count = 0; + for payload in Parser::new(0).parse_all(bytes) { + if let Payload::CodeSectionEntry(_) = payload.unwrap() { + count += 1; + } + } + count +} + +/// The exported-function names of `bytes`. +fn exported_functions(bytes: &[u8]) -> Vec { + let mut names = Vec::new(); + for payload in Parser::new(0).parse_all(bytes) { + if let Payload::ExportSection(reader) = payload.unwrap() { + for export in reader { + let export = export.unwrap(); + if export.kind == ExternalKind::Func { + names.push(export.name.to_string()); + } + } + } + } + names +} + +/// The `call` target indices in the body of the function at `func_idx`. 
+fn body_call_targets(bytes: &[u8], func_idx: usize) -> Vec { + let mut calls_per_body: Vec> = Vec::new(); + for payload in Parser::new(0).parse_all(bytes) { + if let Payload::CodeSectionEntry(body) = payload.unwrap() { + let mut calls = Vec::new(); + for op in body.get_operators_reader().unwrap() { + if let Operator::Call { function_index } = op.unwrap() { + calls.push(function_index); + } + } + calls_per_body.push(calls); + } + } + calls_per_body[func_idx].clone() +} + +/// The `(function index, name)` pairs recorded in the module's `name` custom +/// section, or an empty vector if no name section is present. +fn function_names(bytes: &[u8]) -> Vec<(u32, String)> { + let mut names = Vec::new(); + for payload in Parser::new(0).parse_all(bytes) { + if let Payload::CustomSection(custom) = payload.unwrap() + && let inf_wasmparser::KnownCustom::Name(reader) = custom.as_known() + { + for sub in reader { + if let inf_wasmparser::Name::Function(map) = sub.unwrap() { + for naming in map { + let naming = naming.unwrap(); + names.push((naming.index, naming.name.to_string())); + } + } + } + } + } + names +} + +/// The raw payload of the custom section named `name`, if present. +fn custom_section_data(bytes: &[u8], name: &str) -> Option> { + for payload in Parser::new(0).parse_all(bytes) { + if let Payload::CustomSection(custom) = payload.unwrap() + && custom.name() == name + { + return Some(custom.data().to_vec()); + } + } + None +} + +/// Decodes the `inference.spec_funcs` payload into `(spec_name, [idx])` pairs, +/// mirroring the encoder's format, for asserting post-link index rewriting. 
+fn decode_spec_funcs(data: &[u8]) -> Vec<(String, Vec)> { + let mut reader = inf_wasmparser::BinaryReader::new(data, 0); + let version = reader.read_var_u32().unwrap(); + assert_eq!(version, 1, "spec_funcs version"); + let count = reader.read_var_u32().unwrap(); + let mut out = Vec::new(); + for _ in 0..count { + let name = reader.read_string().unwrap().to_string(); + let idx_count = reader.read_var_u32().unwrap(); + let mut indices = Vec::new(); + for _ in 0..idx_count { + indices.push(reader.read_var_u32().unwrap()); + } + out.push((name, indices)); + } + out +} + +/// Whether the body of the function at `func_idx` contains an `i32.add`. +fn body_has_i32_add(bytes: &[u8], func_idx: usize) -> bool { + let mut idx = 0; + for payload in Parser::new(0).parse_all(bytes) { + if let Payload::CodeSectionEntry(body) = payload.unwrap() { + if idx == func_idx { + return body + .get_operators_reader() + .unwrap() + .into_iter() + .any(|op| matches!(op.unwrap(), Operator::I32Add)); + } + idx += 1; + } + } + false +} + +/// Whether the body of the function at `func_idx` contains an `i32.store`. +fn body_has_i32_store(bytes: &[u8], func_idx: usize) -> bool { + let mut idx = 0; + for payload in Parser::new(0).parse_all(bytes) { + if let Payload::CodeSectionEntry(body) = payload.unwrap() { + if idx == func_idx { + return body + .get_operators_reader() + .unwrap() + .into_iter() + .any(|op| matches!(op.unwrap(), Operator::I32Store { .. })); + } + idx += 1; + } + } + false +} + +/// The `(initial, maximum)` page limits of the module's single linear memory, or +/// `None` if it declares no memory. 
+fn memory_limits(bytes: &[u8]) -> Option<(u64, Option)> { + for payload in Parser::new(0).parse_all(bytes) { + if let Payload::MemorySection(reader) = payload.unwrap() { + let mem = reader.into_iter().next()?.unwrap(); + return Some((mem.initial, mem.maximum)); + } + } + None +} + +// -- Tier A: pure functions -------------------------------------------------- + +/// A main module that imports two pure functions, `sum` and `sub`, and calls +/// each from a local `compute` function it exports. +fn main_with_sum_and_sub() -> Vec { + wasm( + r#" + (module + (type (;0;) (func (param i32 i32) (result i32))) + (import "mathlib" "sum" (func (;0;) (type 0))) + (import "mathlib" "sub" (func (;1;) (type 0))) + (func (;2;) (type 0) (param i32 i32) (result i32) + local.get 0 + local.get 1 + call 0 + local.get 0 + local.get 1 + call 1 + i32.sub) + (export "compute" (func 2))) + "#, + ) +} + +/// An external module exporting pure `sum` and `sub`. +fn mathlib_pure() -> Vec { + wasm( + r#" + (module + (type (;0;) (func (param i32 i32) (result i32))) + (func (;0;) (type 0) (param i32 i32) (result i32) + local.get 0 + local.get 1 + i32.add) + (func (;1;) (type 0) (param i32 i32) (result i32) + local.get 0 + local.get 1 + i32.sub) + (export "sum" (func 0)) + (export "sub" (func 1))) + "#, + ) +} + +#[test] +fn tier_a_merges_pure_functions() { + let main = main_with_sum_and_sub(); + let lib = mathlib_pure(); + + let linked = link(&main, &[&lib]).expect("link should succeed"); + assert_valid(&linked); + + // No cross-module imports remain. + assert!( + function_imports(&linked).is_empty(), + "expected no function imports, found {:?}", + function_imports(&linked) + ); + + // Three bodies: the main `compute`, plus the two merged `sum`/`sub`. + assert_eq!(code_body_count(&linked), 3); + + // The main module's export survives. 
+ assert_eq!(exported_functions(&linked), vec!["compute".to_string()]); +} + +#[test] +fn tier_a_main_calls_point_at_merged_bodies() { + let main = main_with_sum_and_sub(); + let lib = mathlib_pure(); + let linked = link(&main, &[&lib]).expect("link should succeed"); + + // After the merge, `compute` is local function 0; the merged `sum` and + // `sub` are functions 1 and 2. The two `call` operators in `compute` must + // now target 1 and 2 (originally imports 0 and 1). + assert_eq!(body_call_targets(&linked, 0), vec![1, 2]); +} + +// -- Name section ------------------------------------------------------------ + +#[test] +fn merged_functions_are_named_after_satisfied_import_fields() { + // Neither fixture carries a `name` section. The two merged closure roots + // must still be named — after the import fields they satisfy, prefixed with + // their logical module (`mathlib`) — so the Rocq translator emits + // `Definition mathlib_sum` / `Definition mathlib_sub` rather than opaque + // `func_` placeholders. + let main = main_with_sum_and_sub(); + let lib = mathlib_pure(); + let linked = link(&main, &[&lib]).expect("link should succeed"); + assert_valid(&linked); + + // Output indices: compute=0, merged sum=1, merged sub=2. + let names = function_names(&linked); + assert!( + names.contains(&(1, "mathlib.sum".to_string())), + "merged sum must be named after its module-prefixed import field, got {names:?}" + ); + assert!( + names.contains(&(2, "mathlib.sub".to_string())), + "merged sub must be named after its module-prefixed import field, got {names:?}" + ); +} + +#[test] +fn main_function_names_survive_the_merge() { + // The main module names its local `compute` in a `name` section; that name + // must follow the function onto its import-free output index (0). 
+ let main = wasm( + r#" + (module + (type (;0;) (func (param i32 i32) (result i32))) + (import "mathlib" "sum" (func (;0;) (type 0))) + (func $compute (;1;) (type 0) (param i32 i32) (result i32) + local.get 0 + local.get 1 + call 0) + (export "compute" (func 1))) + "#, + ); + let lib = mathlib_pure(); + let linked = link(&main, &[&lib]).expect("link should succeed"); + assert_valid(&linked); + + let names = function_names(&linked); + assert!( + names.contains(&(0, "compute".to_string())), + "main `compute` name must survive at output index 0, got {names:?}" + ); + assert!( + names.contains(&(1, "mathlib.sum".to_string())), + "merged `sum` must be named with its module prefix, got {names:?}" + ); +} + +// -- Type dedup -------------------------------------------------------------- + +#[test] +fn shared_signatures_dedup_into_one_type() { + // `sum` and `sub` share `(i32,i32)->i32`, which also matches `compute`'s + // type and the import type. The output type section must collapse them. + let main = main_with_sum_and_sub(); + let lib = mathlib_pure(); + let linked = link(&main, &[&lib]).expect("link should succeed"); + assert_valid(&linked); + + let mut type_count = 0; + for payload in Parser::new(0).parse_all(&linked) { + if let Payload::TypeSection(reader) = payload.unwrap() { + type_count = reader.count(); + } + } + assert_eq!(type_count, 1, "all functions share one (i32,i32)->i32 type"); +} + +// -- Transitive closure ------------------------------------------------------ + +#[test] +fn transitive_closure_pulls_in_called_internals() { + // `sum` is exported but delegates to a non-exported internal `add_impl`. + // The closure must drag `add_impl` into the merge and re-index `sum`'s call + // to it. 
+ let main = wasm( + r#" + (module + (type (;0;) (func (param i32 i32) (result i32))) + (import "mathlib" "sum" (func (;0;) (type 0))) + (func (;1;) (type 0) (param i32 i32) (result i32) + local.get 0 + local.get 1 + call 0) + (export "compute" (func 1))) + "#, + ); + let lib = wasm( + r#" + (module + (type (;0;) (func (param i32 i32) (result i32))) + (func (;0;) (type 0) (param i32 i32) (result i32) + local.get 0 + local.get 1 + call 1) + (func (;1;) (type 0) (param i32 i32) (result i32) + local.get 0 + local.get 1 + i32.add) + (export "sum" (func 0))) + "#, + ); + + let linked = link(&main, &[&lib]).expect("link should succeed"); + assert_valid(&linked); + assert!(function_imports(&linked).is_empty()); + + // compute(0) + merged sum(1) + merged add_impl(2) == 3 bodies. + assert_eq!(code_body_count(&linked), 3); + + // compute's call (originally import 0) now targets merged `sum` at 1. + assert_eq!(body_call_targets(&linked, 0), vec![1]); + + // merged `sum` (body 1) must now call merged `add_impl` at 2, not its + // original index 1. + assert_eq!(body_call_targets(&linked, 1), vec![2]); +} + +#[test] +fn closure_does_not_pull_unreferenced_functions() { + // The library exports `sum` and also defines an unrelated `unused` function + // that nothing in `sum`'s closure calls. `unused` must not be merged. 
+ let main = wasm( + r#" + (module + (type (;0;) (func (param i32 i32) (result i32))) + (import "mathlib" "sum" (func (;0;) (type 0))) + (func (;1;) (type 0) (param i32 i32) (result i32) + local.get 0 + local.get 1 + call 0) + (export "compute" (func 1))) + "#, + ); + let lib = wasm( + r#" + (module + (type (;0;) (func (param i32 i32) (result i32))) + (func (;0;) (type 0) (param i32 i32) (result i32) + local.get 0 + local.get 1 + i32.add) + (func (;1;) (type 0) (param i32 i32) (result i32) + local.get 0 + local.get 1 + i32.mul) + (export "sum" (func 0)) + (export "unused" (func 1))) + "#, + ); + + let linked = link(&main, &[&lib]).expect("link should succeed"); + assert_valid(&linked); + // compute + merged sum only; `unused` (i32.mul) is dropped. + assert_eq!(code_body_count(&linked), 2); +} + +// -- Tier B: caller-passed pointers ------------------------------------------ + +#[test] +fn tier_b_merges_function_over_caller_memory() { + // `store_at` writes a value to a caller-supplied address. It touches memory + // but defines no data of its own — Tier B. The main module owns the shared + // memory. + let main = wasm( + r#" + (module + (type (;0;) (func (param i32 i32))) + (import "memlib" "store_at" (func (;0;) (type 0))) + (memory (;0;) 1 1) + (func (;1;) (type 0) (param i32 i32) + local.get 0 + local.get 1 + call 0) + (export "memory" (memory 0)) + (export "run" (func 1))) + "#, + ); + let lib = wasm( + r#" + (module + (type (;0;) (func (param i32 i32))) + (memory (;0;) 1) + (func (;0;) (type 0) (param i32 i32) + local.get 0 + local.get 1 + i32.store) + (export "store_at" (func 0))) + "#, + ); + + let linked = link(&main, &[&lib]).expect("Tier B should merge"); + assert_valid(&linked); + assert!(function_imports(&linked).is_empty()); + assert_eq!(code_body_count(&linked), 2); + + // The shared memory export survives. 
+ let mut has_memory_export = false; + for payload in Parser::new(0).parse_all(&linked) { + if let Payload::ExportSection(reader) = payload.unwrap() { + for export in reader { + if export.unwrap().kind == ExternalKind::Memory { + has_memory_export = true; + } + } + } + } + assert!(has_memory_export, "shared memory export must survive"); + + // The merged `store_at` body keeps its `i32.store`. + assert!( + body_has_i32_store(&linked, 1), + "merged Tier-B body must retain its memory store" + ); +} + +// -- Tier C: rejected -------------------------------------------------------- + +#[test] +fn tier_c_data_segment_requires_relocatable_build() { + // `lookup` reads from a baked-in data segment via `memory.init`. That is + // own static data — Tier C — and must be rejected. + let main = wasm( + r#" + (module + (type (;0;) (func (result i32))) + (import "tablelib" "lookup" (func (;0;) (type 0))) + (memory (;0;) 1 1) + (func (;1;) (type 0) (result i32) + call 0) + (export "run" (func 1))) + "#, + ); + let lib = wasm( + r#" + (module + (type (;0;) (func (result i32))) + (memory (;0;) 1) + (data (;0;) (i32.const 0) "\2a\00\00\00") + (func (;0;) (type 0) (result i32) + i32.const 0 + i32.const 0 + i32.const 4 + memory.init 0 + i32.const 0 + i32.load) + (export "lookup" (func 0))) + "#, + ); + + let err = link(&main, &[&lib]).expect_err("Tier C must be rejected"); + match err { + LinkError::RequiresRelocatableBuild { field, reasons } => { + assert_eq!(field, "lookup"); + assert!( + reasons.iter().any(|r| r.contains("data")), + "reason should mention static data: {reasons:?}" + ); + } + other => panic!("expected RequiresRelocatableBuild, got {other:?}"), + } +} + +#[test] +fn tier_c_global_requires_relocatable_build() { + // `counter` reads a module-defined global — per-module mutable state that + // cannot be merged into a shared module without relocation. 
+ let main = wasm( + r#" + (module + (type (;0;) (func (result i32))) + (import "statelib" "counter" (func (;0;) (type 0))) + (func (;1;) (type 0) (result i32) + call 0) + (export "run" (func 1))) + "#, + ); + let lib = wasm( + r#" + (module + (type (;0;) (func (result i32))) + (global (;0;) (mut i32) (i32.const 7)) + (func (;0;) (type 0) (result i32) + global.get 0) + (export "counter" (func 0))) + "#, + ); + + let err = link(&main, &[&lib]).expect_err("Tier C global must be rejected"); + match err { + LinkError::RequiresRelocatableBuild { field, reasons } => { + assert_eq!(field, "counter"); + assert!( + reasons.iter().any(|r| r.contains("global")), + "reason should mention globals: {reasons:?}" + ); + } + other => panic!("expected RequiresRelocatableBuild, got {other:?}"), + } +} + +#[test] +fn tier_c_indirect_call_requires_relocatable_build() { + // An external function that performs an indirect call needs the table / + // element space, which the static merge does not relocate. + let main = wasm( + r#" + (module + (type (;0;) (func (result i32))) + (import "dispatch" "run" (func (;0;) (type 0))) + (func (;1;) (type 0) (result i32) + call 0) + (export "go" (func 1))) + "#, + ); + let lib = wasm( + r#" + (module + (type (;0;) (func (result i32))) + (table (;0;) 1 funcref) + (func (;0;) (type 0) (result i32) + i32.const 0 + call_indirect (type 0)) + (export "run" (func 0))) + "#, + ); + + let err = link(&main, &[&lib]).expect_err("Tier C indirect call must be rejected"); + assert!( + matches!(err, LinkError::RequiresRelocatableBuild { .. }), + "expected RequiresRelocatableBuild, got {err:?}" + ); +} + +#[test] +fn tier_c_subtraction_fabricated_absolute_address_requires_relocatable_build() { + // An external that computes `p - (p - C)` fabricates the fixed absolute + // address `C` from its caller pointer `p`: `(p * 1)` is the caller pointer + // by value but classified not-provably-param, so the subtraction cancels to + // a caller-independent constant. 
Storing through it would write host memory + // the caller never authorised. The provenance analysis must classify the + // closure Tier C — `Param - NotParam` may not preserve param-derivation — + // and the whole link must reject rather than admit the write as Tier B. + let main = wasm( + r#" + (module + (type (;0;) (func (param i32 i32))) + (import "memlib" "store_at" (func (;0;) (type 0))) + (memory (;0;) 1 1) + (func (;1;) (type 0) (param i32 i32) + local.get 0 + local.get 1 + call 0) + (export "memory" (memory 0)) + (export "run" (func 1))) + "#, + ); + let lib = wasm( + r#" + (module + (type (;0;) (func (param i32 i32))) + (memory (;0;) 1) + (func (;0;) (type 0) (param i32 i32) + local.get 0 + local.get 0 i32.const 1 i32.mul i32.const 4096 i32.sub + i32.sub + local.get 1 + i32.store) + (export "store_at" (func 0))) + "#, + ); + + let err = link(&main, &[&lib]) + .expect_err("a fabricated absolute store address must be rejected"); + assert!( + matches!(err, LinkError::RequiresRelocatableBuild { .. }), + "expected RequiresRelocatableBuild, got {err:?}" + ); +} + +// -- Multiple externals / unsatisfied --------------------------------------- + +#[test] +fn imports_satisfied_across_multiple_externals() { + // `sum` comes from one library, `sub` from another. Both imports must be + // satisfied and removed. 
+ let main = main_with_sum_and_sub(); + let sum_lib = wasm( + r#" + (module + (type (;0;) (func (param i32 i32) (result i32))) + (func (;0;) (type 0) (param i32 i32) (result i32) + local.get 0 + local.get 1 + i32.add) + (export "sum" (func 0))) + "#, + ); + let sub_lib = wasm( + r#" + (module + (type (;0;) (func (param i32 i32) (result i32))) + (func (;0;) (type 0) (param i32 i32) (result i32) + local.get 0 + local.get 1 + i32.sub) + (export "sub" (func 0))) + "#, + ); + + let linked = link(&main, &[&sum_lib, &sub_lib]).expect("both imports satisfiable"); + assert_valid(&linked); + assert!(function_imports(&linked).is_empty()); + assert_eq!(code_body_count(&linked), 3); +} + +#[test] +fn unsatisfied_import_is_an_error() { + let main = main_with_sum_and_sub(); + // Only `sum` is provided; `sub` has no body to merge. + let lib = wasm( + r#" + (module + (type (;0;) (func (param i32 i32) (result i32))) + (func (;0;) (type 0) (param i32 i32) (result i32) + local.get 0 + local.get 1 + i32.add) + (export "sum" (func 0))) + "#, + ); + + let err = link(&main, &[&lib]).expect_err("missing `sub` must fail"); + match err { + LinkError::UnsatisfiedImport { field } => assert_eq!(field, "sub"), + other => panic!("expected UnsatisfiedImport, got {other:?}"), + } +} + +#[test] +fn no_imports_passes_through_unchanged() { + // A self-contained main module with no imports must link to a still-valid + // module with the same single body. 
+ let main = wasm( + r#" + (module + (type (;0;) (func (param i32 i32) (result i32))) + (func (;0;) (type 0) (param i32 i32) (result i32) + local.get 0 + local.get 1 + i32.add) + (export "add" (func 0))) + "#, + ); + + let linked = link(&main, &[]).expect("no-import link should succeed"); + assert_valid(&linked); + assert!(function_imports(&linked).is_empty()); + assert_eq!(code_body_count(&linked), 1); + assert_eq!(exported_functions(&linked), vec!["add".to_string()]); +} + +#[test] +fn transitive_host_import_is_rejected() { + // The library's `sum` calls one of its own host imports. There is no body + // to merge for that import, so the link must fail clearly. + let main = wasm( + r#" + (module + (type (;0;) (func (param i32 i32) (result i32))) + (import "mathlib" "sum" (func (;0;) (type 0))) + (func (;1;) (type 0) (param i32 i32) (result i32) + local.get 0 + local.get 1 + call 0) + (export "compute" (func 1))) + "#, + ); + let lib = wasm( + r#" + (module + (type (;0;) (func (param i32 i32) (result i32))) + (import "host" "log" (func (;0;) (type 0))) + (func (;1;) (type 0) (param i32 i32) (result i32) + local.get 0 + local.get 1 + call 0) + (export "sum" (func 1))) + "#, + ); + + let err = link(&main, &[&lib]).expect_err("transitive host import must fail"); + assert!( + matches!(err, LinkError::TransitiveHostImport { .. }), + "expected TransitiveHostImport, got {err:?}" + ); +} + +// -- Body re-encoding: locals, mixed value types, value-typed blocks --------- + +/// The declared `(count, ValType)` locals of the body at `func_idx`, rendered as +/// a printable string so a mismatch shows the actual locals. 
+fn body_locals(bytes: &[u8], func_idx: usize) -> Vec<(u32, String)> { + let mut idx = 0; + for payload in Parser::new(0).parse_all(bytes) { + if let Payload::CodeSectionEntry(body) = payload.unwrap() { + if idx == func_idx { + let mut out = Vec::new(); + for entry in body.get_locals_reader().unwrap() { + let (count, ty) = entry.unwrap(); + out.push((count, format!("{ty:?}"))); + } + return out; + } + idx += 1; + } + } + Vec::new() +} + +/// The `(params, results)` value-type lists of every type-section entry, +/// rendered as printable strings. +fn type_signatures(bytes: &[u8]) -> Vec<(Vec, Vec)> { + let mut sigs = Vec::new(); + for payload in Parser::new(0).parse_all(bytes) { + if let Payload::TypeSection(reader) = payload.unwrap() { + for group in reader { + let group = group.unwrap(); + for sub in group.types() { + if let inf_wasmparser::CompositeInnerType::Func(ft) = + &sub.composite_type.inner + { + let params = ft.params().iter().map(|t| format!("{t:?}")).collect(); + let results = ft.results().iter().map(|t| format!("{t:?}")).collect(); + sigs.push((params, results)); + } + } + } + } + } + sigs +} + +/// Builds an external module exporting `sum:(i32,i32)->i32` whose body opens a +/// function-typed `block` referencing type index 9, which the module's single +/// type entry does not define. `wat` cannot assemble an out-of-range numeric +/// type index, so this is emitted directly with `wasm-encoder`. The merge's +/// total-remap scan must reject it as a clean error. 
+fn lib_with_out_of_range_block_type() -> Vec { + use wasm_encoder::{ + BlockType, CodeSection, ExportKind, ExportSection, Function, FunctionSection, Instruction, + Module, TypeSection, ValType, + }; + + let mut module = Module::new(); + + let mut types = TypeSection::new(); + types + .ty() + .function([ValType::I32, ValType::I32], [ValType::I32]); + module.section(&types); + + let mut funcs = FunctionSection::new(); + funcs.function(0); + module.section(&funcs); + + let mut exports = ExportSection::new(); + exports.export("sum", ExportKind::Func, 0); + module.section(&exports); + + let mut code = CodeSection::new(); + let mut f = Function::new([]); + // A function-typed block over a type index the module never defines. + f.instruction(&Instruction::Block(BlockType::FunctionType(9))); + f.instruction(&Instruction::End); + f.instruction(&Instruction::LocalGet(0)); + f.instruction(&Instruction::LocalGet(1)); + f.instruction(&Instruction::I32Add); + f.instruction(&Instruction::End); + code.function(&f); + module.section(&code); + + module.finish() +} + +/// Builds an external module exporting `entry:(funcref)->()` whose body is a +/// trivial `local.get 0; drop` — no reference-producing operator. The ref type +/// lives only in the signature, so this exercises the merge's signature-intern +/// rejection of reference types in isolation (the operator allow-list never +/// sees a reference op here). `wat` cannot easily express a body that drops a +/// funcref parameter without a reference operator the gate would catch. 
+fn lib_exporting_funcref_param_entry() -> Vec { + use wasm_encoder::{ + CodeSection, ExportKind, ExportSection, Function, FunctionSection, Instruction, Module, + RefType, TypeSection, ValType, + }; + + let mut module = Module::new(); + + let mut types = TypeSection::new(); + types + .ty() + .function([ValType::Ref(RefType::FUNCREF)], []); + module.section(&types); + + let mut funcs = FunctionSection::new(); + funcs.function(0); + module.section(&funcs); + + let mut exports = ExportSection::new(); + exports.export("entry", ExportKind::Func, 0); + module.section(&exports); + + let mut code = CodeSection::new(); + let mut f = Function::new([]); + f.instruction(&Instruction::LocalGet(0)); + f.instruction(&Instruction::Drop); + f.instruction(&Instruction::End); + code.function(&f); + module.section(&code); + + module.finish() +} + +#[test] +fn merged_body_with_locals_and_value_block_survives_reencode() { + // The merged external `classify` declares locals (i64 and i32) and uses an + // `if (result i32)` value-typed block. Re-encoding the body must preserve the + // locals vector and re-emit the value block type — exercising the locals and + // block-type paths the pure-arithmetic fixtures never touch. The locals are + // integer types only: the Inference language has no `f32`/`f64` types, so a + // float local is rejected by the value-type chokepoint rather than merged. 
+ let main = wasm( + r#" + (module + (type (;0;) (func (param i32) (result i32))) + (import "logic" "classify" (func (;0;) (type 0))) + (func (;1;) (type 0) (param i32) (result i32) + local.get 0 + call 0) + (export "run" (func 1))) + "#, + ); + let lib = wasm( + r#" + (module + (type (;0;) (func (param i32) (result i32))) + (func (;0;) (type 0) (param i32) (result i32) + (local i64 i32) + local.get 0 + i32.const 0 + i32.gt_s + (if (result i32) + (then i32.const 1) + (else i32.const 0))) + (export "classify" (func 0))) + "#, + ); + + let linked = link(&main, &[&lib]).expect("merge with locals + value block"); + assert_valid(&linked); + assert!(function_imports(&linked).is_empty()); + + // The merged body is output function 1; its locals must survive. + let locals = body_locals(&linked, 1); + assert_eq!( + locals, + vec![(1, "I64".to_string()), (1, "I32".to_string())], + "merged body locals must be preserved through re-encoding, got {locals:?}" + ); + + // The value-typed `if` block re-encodes to an i32-result block; the body + // must still validate and produce its i32 result (asserted by assert_valid). + let calls = body_call_targets(&linked, 0); + assert_eq!(calls, vec![1], "run's call now targets the merged body at 1"); +} + +#[test] +fn merged_closure_with_mixed_value_types_dedups_and_reencodes() { + // The library's `mix` takes (i64, i32) and returns i64, and delegates to an + // internal `helper` of the same signature. Merging exercises the non-i32 + // arms of every value-type mapping (type-section emission, sig dedup key, + // and external-body type remap) plus a transitive closure re-index. The + // signature mixes i64 and i32 — distinct integer value types — since the + // Inference language has no `f32`/`f64` types and a float signature would be + // rejected by the value-type chokepoint rather than deduped and merged. 
+ let main = wasm( + r#" + (module + (type (;0;) (func (param i64 i32) (result i64))) + (import "ints" "mix" (func (;0;) (type 0))) + (func (;1;) (type 0) (param i64 i32) (result i64) + local.get 0 + local.get 1 + call 0) + (export "run" (func 1))) + "#, + ); + let lib = wasm( + r#" + (module + (type (;0;) (func (param i64 i32) (result i64))) + (func (;0;) (type 0) (param i64 i32) (result i64) + local.get 0 + local.get 1 + call 1) + (func (;1;) (type 0) (param i64 i32) (result i64) + local.get 0 + i64.const 1 + i64.add) + (export "mix" (func 0))) + "#, + ); + + let linked = link(&main, &[&lib]).expect("merge mixed value types"); + assert_valid(&linked); + assert!(function_imports(&linked).is_empty()); + // run + merged mix + merged helper. + assert_eq!(code_body_count(&linked), 3); + + // All four functions share one (i64,i32)->i64 type; it must dedup to one. + let sigs = type_signatures(&linked); + assert_eq!( + sigs, + vec![( + vec!["I64".to_string(), "I32".to_string()], + vec!["I64".to_string()] + )], + "the single (i64,i32)->i64 signature must dedup to one type, got {sigs:?}" + ); + + // mix (output 1) re-indexes its internal call to helper (output 2). + assert_eq!(body_call_targets(&linked, 1), vec![2]); +} + +#[test] +fn tail_call_external_is_rejected_at_the_feature_gate() { + // An external whose body uses `return_call` (the tail-call proposal). The + // tail-call proposal is outside the supported WASM 1.0 subset + // (`SUPPORTED_WASM_FEATURES`), and Inference codegen never emits `return_call` + // (the sret-forwarding path lowers to a plain `call`), so the only such body + // is a third-party external. The link gate's feature pass must reject it up + // front with a `UnsupportedWasmFeature` whose message names tail calls — + // rather than admitting it through the closure scanner's tail-call-as-call + // handling and re-indexing the target. 
+ let main = wasm( + r#" + (module + (type (;0;) (func (param i32) (result i32))) + (import "tail" "entry" (func (;0;) (type 0))) + (func (;1;) (type 0) (param i32) (result i32) + local.get 0 + call 0) + (export "run" (func 1))) + "#, + ); + let lib = wasm( + r#" + (module + (type (;0;) (func (param i32) (result i32))) + (func (;0;) (type 0) (param i32) (result i32) + local.get 0 + return_call 1) + (func (;1;) (type 0) (param i32) (result i32) + local.get 0 + i32.const 1 + i32.add) + (export "entry" (func 0))) + "#, + ); + + let err = assert_clean_rejection(&main, &lib, "tail call"); + assert!( + matches!(&err, LinkError::UnsupportedWasmFeature { details, .. } if details.contains("tail call")), + "expected an UnsupportedWasmFeature naming tail calls, got {err:?}" + ); +} + +// -- Diamond closure: one inner callee shared by two roots ------------------- + +#[test] +fn diamond_closure_merges_shared_internal_once() { + // Two exported roots `a` and `b` both call the same internal `shared`. The + // merge must copy `shared` exactly once (the `merged_index` dedup), giving + // four bodies total: main `run`, merged `a`, merged `b`, merged `shared`. 
+ let main = wasm( + r#" + (module + (type (;0;) (func (param i32) (result i32))) + (import "lib" "a" (func (;0;) (type 0))) + (import "lib" "b" (func (;1;) (type 0))) + (func (;2;) (type 0) (param i32) (result i32) + local.get 0 + call 0 + call 1) + (export "run" (func 2))) + "#, + ); + let lib = wasm( + r#" + (module + (type (;0;) (func (param i32) (result i32))) + (func (;0;) (type 0) (param i32) (result i32) + local.get 0 + call 2) + (func (;1;) (type 0) (param i32) (result i32) + local.get 0 + call 2) + (func (;2;) (type 0) (param i32) (result i32) + local.get 0 + i32.const 2 + i32.mul) + (export "a" (func 0)) + (export "b" (func 1))) + "#, + ); + + let linked = link(&main, &[&lib]).expect("diamond closure merges"); + assert_valid(&linked); + assert!(function_imports(&linked).is_empty()); + // run + a + b + shared (exactly one shared copy). + assert_eq!( + code_body_count(&linked), + 4, + "the shared internal must be merged exactly once" + ); + + // Merge order: import `a` closes over `{a, shared}` first (a -> output 1, + // shared -> output 2), then import `b` adds only itself (b -> output 3) + // because `shared` is already merged. Both merged roots therefore call the + // single merged `shared` at output 2. + assert_eq!( + body_call_targets(&linked, 1), + vec![2], + "merged `a` must call the single shared body at output 2" + ); + assert_eq!( + body_call_targets(&linked, 3), + vec![2], + "merged `b` must call the same shared body at output 2, proving one copy" + ); +} + +// -- Main module globals survive the merge ----------------------------------- + +/// The `(mutable, init)` of every global, where `init` is the rendered first +/// operator of its constant initializer. 
+fn module_globals(bytes: &[u8]) -> Vec<(bool, String)> { + let mut globals = Vec::new(); + for payload in Parser::new(0).parse_all(bytes) { + if let Payload::GlobalSection(reader) = payload.unwrap() { + for g in reader { + let g = g.unwrap(); + let first = g + .init_expr + .get_operators_reader() + .into_iter() + .next() + .map(|op| format!("{:?}", op.unwrap())) + .unwrap_or_default(); + globals.push((g.ty.mutable, first)); + } + } + } + globals +} + +#[test] +fn main_globals_and_global_export_survive_the_merge() { + // The main module owns its own globals (an i32 and an i64) — Tier-C state on + // an *external* module, but perfectly fine on the main module, which keeps + // its memory and globals. The merge must re-emit the global section, both + // constant initializers, and a `Global`-kind export unchanged. + let main = wasm( + r#" + (module + (type (;0;) (func (param i32) (result i32))) + (import "lib" "inc" (func (;0;) (type 0))) + (global (;0;) (mut i32) (i32.const 11)) + (global (;1;) i64 (i64.const 64)) + (func (;1;) (type 0) (param i32) (result i32) + local.get 0 + call 0) + (export "run" (func 1)) + (export "state" (global 0)) + (export "limit" (global 1))) + "#, + ); + let lib = wasm( + r#" + (module + (type (;0;) (func (param i32) (result i32))) + (func (;0;) (type 0) (param i32) (result i32) + local.get 0 + i32.const 1 + i32.add) + (export "inc" (func 0))) + "#, + ); + + let linked = link(&main, &[&lib]).expect("main globals must survive"); + assert_valid(&linked); + assert!(function_imports(&linked).is_empty()); + + // Both globals re-emitted with their mutability and constant initializers. + let globals = module_globals(&linked); + assert_eq!( + globals, + vec![ + (true, "I32Const { value: 11 }".to_string()), + (false, "I64Const { value: 64 }".to_string()), + ], + "both main globals (i32 + i64) must survive with their initializers, got {globals:?}" + ); + + // The `state`/`limit` global exports survive at their original indices. 
#[test]
fn external_importing_its_environment_is_unsupported() {
    // The would-be provider of `sum` imports a *memory* from its own host
    // environment. The static merge has no way to rebuild that environment, so
    // the link has to fail with an unsupported-construct error (a different
    // outcome from a transitive host *function* import).
    let lib = wasm(
        r#"
        (module
            (type (;0;) (func (param i32 i32) (result i32)))
            (import "env" "memory" (memory (;0;) 1))
            (func (;0;) (type 0) (param i32 i32) (result i32)
                local.get 0
                local.get 1
                i32.add)
            (func (;1;) (type 0) (param i32 i32) (result i32)
                local.get 0
                local.get 1
                i32.sub)
            (export "sum" (func 0))
            (export "sub" (func 1)))
        "#,
    );
    let main = main_with_sum_and_sub();

    let err = link(&main, &[&lib]).expect_err("environment import must be rejected");
    if let LinkError::UnsupportedConstruct(msg) = &err {
        assert!(
            msg.contains("imports its environment"),
            "message should explain the environment import: {msg}"
        );
    } else {
        panic!("expected UnsupportedConstruct, got {err:?}");
    }
}
+ let err = raw_link(b"not a wasm module", &[]).expect_err("garbage must not parse"); + assert!(matches!(err, LinkError::Parse(_)), "expected Parse, got {err:?}"); +} + +#[test] +fn invalid_external_bytes_are_a_parse_error() { + let main = main_with_sum_and_sub(); + let err = raw_link(&main, &[("mathlib", b"\0asm broken")]) + .expect_err("garbage external must not parse"); + assert!(matches!(err, LinkError::Parse(_)), "expected Parse, got {err:?}"); +} + +// -- Adversarial / malformed external bodies (robustness audit issues) ------- +// +// These externals signature-match the import (so the signature-only +// `validate_extern` upstream would accept them) but carry bodies the merge +// cannot soundly emit. Each must yield a clean `LinkError`, never a panic. + +/// A main module importing a pure `sum:(i32,i32)->i32` and calling it. +fn main_importing_sum() -> Vec { + wasm( + r#" + (module + (type (;0;) (func (param i32 i32) (result i32))) + (import "mathlib" "sum" (func (;0;) (type 0))) + (func (;1;) (type 0) (param i32 i32) (result i32) + local.get 0 + local.get 1 + call 0) + (export "compute" (func 1))) + "#, + ) +} + +#[test] +fn out_of_range_call_index_is_a_clean_error() { + // H1: an external whose `sum` body calls function index 99, far past the one + // function the module declares. The closure walk must surface a clean + // `LinkError`, not index `local_funcs` out of bounds and panic. 
+ let main = main_importing_sum(); + let lib = wasm( + r#" + (module + (type (;0;) (func (param i32 i32) (result i32))) + (func (;0;) (type 0) (param i32 i32) (result i32) + local.get 0 + local.get 1 + i32.add + call 99 + drop + local.get 0) + (export "sum" (func 0))) + "#, + ); + + let err = link(&main, &[&lib]).expect_err("out-of-range call must fail, not panic"); + assert!( + matches!(err, LinkError::Parse(_)), + "expected a Parse error for the out-of-range call, got {err:?}" + ); +} + +#[test] +fn function_typed_block_external_is_rejected_at_the_feature_gate() { + // H2 / feature gate: a Tier-A pure external whose body contains a + // function-typed block `(block (type 1) (param i32) (result i32))` referencing + // a *defined* signature. A block that references a type index (so it can take + // params or yield multiple results) is a multi-value construct, outside the + // supported WASM 1.0 subset, so the gate's feature pass rejects the module up + // front, naming multi-value. + // + // The merge's total-type-remap mechanism (interning a block's referenced + // signature via `scan_body_type_indices`, the H2 fix that avoided an unmapped- + // index panic) remains in `merge.rs` as defense-in-depth behind this gate: it + // is no longer reachable through the public `link` API because the only Tier-A/B + // bodies that reference a foreign type index are these multi-value blocks, which + // the gate now fronts. + let main = main_importing_sum(); + let lib = wasm( + r#" + (module + (type (;0;) (func (param i32 i32) (result i32))) + (type (;1;) (func (param i32) (result i32))) + (func (;0;) (type 0) (param i32 i32) (result i32) + local.get 0 + (block (type 1) (param i32) (result i32)) + local.get 1 + i32.add) + (export "sum" (func 0))) + "#, + ); + + let err = assert_clean_rejection(&main, &lib, "function-typed block"); + assert!( + matches!(&err, LinkError::UnsupportedWasmFeature { details, .. 
#[test]
fn function_typed_block_over_an_out_of_range_type_is_a_clean_error() {
    // H2 (out-of-range variant): a function-typed block whose type index names
    // no type in the source module. The total-remap scan must surface this as a
    // clean parse error, not a silent map or a panic. `wat` cannot assemble an
    // out-of-range numeric type index, so the body is hand-encoded: a `block`
    // (0x02) with a function block type index 9 (LEB 0x09) that the 1-entry type
    // section does not define, then `end`/`end`.
    let main = main_importing_sum();
    // No WAT fixture is assembled here: the WAT form cannot express the
    // out-of-range index, so the external is built directly from bytes.
    let lib = lib_with_out_of_range_block_type();

    let err =
        link(&main, &[&lib]).expect_err("out-of-range block type index must be a clean error");
    assert!(
        matches!(err, LinkError::Parse(_) | LinkError::UnsupportedConstruct(_)),
        "expected a clean Parse/UnsupportedConstruct for the out-of-range type, got {err:?}"
    );
}
+ let main = main_importing_sum(); + let lib = wasm( + r#" + (module + (type (;0;) (func (param i32 i32) (result i32))) + (func (;0;) (type 0) (param i32 i32) (result i32) + (local funcref) + local.get 0 + local.get 1 + i32.add) + (export "sum" (func 0))) + "#, + ); + + let err = link(&main, &[&lib]).expect_err("ref-typed local must fail, not panic"); + assert!( + matches!(&err, LinkError::UnsupportedWasmFeature { details, .. } if details.contains("reference types")), + "expected an UnsupportedWasmFeature naming reference types, got {err:?}" + ); +} + +#[test] +fn wide_external_links_quickly_and_correctly() { + // H21: a wide external module of many trivial functions, only function 0 + // exported, must parse in linear time. The old `iter_mut().find(is_empty)` + // body assignment was O(N^2) and stalled the build for tens of seconds on a + // few-MiB module. This builds a moderately wide module and asserts the link + // both succeeds and produces a valid module well within a generous bound. + const N: usize = 20_000; + let main = main_importing_sum(); + + let mut wat = String::from( + "(module (type (;0;) (func (param i32 i32) (result i32)))\n\ + (type (;1;) (func))\n", + ); + // Function 0 is the exported `sum`; the rest are trivial padding bodies that + // widen the module without entering the closure. + wat.push_str( + "(func (;0;) (type 0) (param i32 i32) (result i32) local.get 0 local.get 1 i32.add)\n", + ); + for i in 1..N { + wat.push_str(&format!("(func (;{i};) (type 1))\n")); + } + wat.push_str("(export \"sum\" (func 0)))"); + let lib = wasm(&wat); + + let start = std::time::Instant::now(); + let linked = link(&main, &[&lib]).expect("wide external must link"); + let elapsed = start.elapsed(); + + assert_valid(&linked); + // Only `sum`'s closure (itself) is merged; the padding functions are dropped. 
#[test]
fn external_with_start_section_is_rejected() {
    // H22: an external declaring a start function. Its initialization closure is
    // never folded into the merge, so silently dropping it would lose the
    // side-effects (and bypass the transitive-host-import gate). The link must
    // reject it cleanly.
    let main = main_importing_sum();
    let lib = wasm(
        r#"
        (module
            (type (;0;) (func (param i32 i32) (result i32)))
            (type (;1;) (func))
            (func (;0;) (type 0) (param i32 i32) (result i32)
                local.get 0
                local.get 1
                i32.add)
            (func (;1;) (type 1))
            (start 1)
            (export "sum" (func 0)))
        "#,
    );

    let err = link(&main, &[&lib]).expect_err("start section must be rejected");
    // Match by reference so the error stays available for the failure message;
    // a mismatch then reports what was actually returned.
    assert!(
        matches!(&err, LinkError::UnsupportedConstruct(msg) if msg.contains("start function")),
        "expected an UnsupportedConstruct mentioning the start function, got {err:?}"
    );
}
#[test]
fn main_importing_a_non_function_is_rejected() {
    // The main module imports a global from its environment. Because `emit`
    // writes no import section, that global would disappear without a
    // diagnostic and a body's `global.get 0` would rebind to the first
    // *defined* global: a wrong value inside a valid-but-wrong output. Only
    // function imports are modelled by the merge, so the non-function import
    // has to be rejected up front.
    let main_wat = r#"
        (module
            (type (;0;) (func (result i32)))
            (import "env" "g" (global (;0;) i32))
            (global (;1;) i32 (i32.const 42))
            (func (;0;) (type 0) (result i32)
                global.get 0)
            (export "main" (func 0)))
        "#;
    let main = wasm(main_wat);

    let err = link(&main, &[]).expect_err("main non-function import must be rejected");
    let names_non_function = match &err {
        LinkError::UnsupportedConstruct(msg) => msg.contains("non-function"),
        _ => false,
    };
    assert!(
        names_non_function,
        "expected an UnsupportedConstruct mentioning a non-function import, got {err:?}"
    );
}
#[test]
fn main_with_table_section_is_rejected() {
    // The main module declares a table and reaches it through `call_indirect`.
    // `emit` writes no `TableSection`, so the table would be dropped silently
    // and the surviving `call_indirect` would only fail *after* the merge as
    // `InvalidMergedModule("unknown table 0")` -- blaming the linker's output
    // instead of naming the unsupported main-side construct. The table section
    // must therefore be rejected up front with a clear diagnostic.
    let main_wat = r#"
        (module
            (type (;0;) (func (result i32)))
            (table (;0;) 1 funcref)
            (func (;0;) (type 0) (result i32)
                i32.const 0
                call_indirect (type 0))
            (export "main" (func 0)))
        "#;
    let main = wasm(main_wat);

    let err = link(&main, &[]).expect_err("main table section must be rejected");
    let names_table = match &err {
        LinkError::UnsupportedConstruct(msg) => msg.contains("table"),
        _ => false,
    };
    assert!(
        names_table,
        "expected an UnsupportedConstruct mentioning the table section, got {err:?}"
    );
}
#[test]
fn main_with_v128_local_is_rejected() {
    // The main module's body declares a `v128` local. Inference has no SIMD
    // types and every SIMD operator is rejected, so the value-type axis has to
    // agree: left alone, the `v128` local would travel through the main-module
    // re-encode path (which bypasses the feature gate) into the output. It must
    // be rejected on the value-type axis.
    let main_wat = r#"
        (module
            (type (;0;) (func (result i32)))
            (func (;0;) (type 0) (result i32)
                (local v128)
                i32.const 0)
            (export "main" (func 0)))
        "#;
    let main = wasm(main_wat);

    let err = link(&main, &[]).expect_err("main v128 local must be rejected");
    let names_v128 = match &err {
        LinkError::UnsupportedConstruct(msg) => msg.contains("v128"),
        _ => false,
    };
    assert!(
        names_v128,
        "expected an UnsupportedConstruct mentioning v128, got {err:?}"
    );
}
+ let main = wasm( + r#" + (module + (type (;0;) (func (result i32))) + (type (;1;) (func (param v128))) + (func (;0;) (type 0) (result i32) + i32.const 0) + (export "main" (func 0))) + "#, + ); + + let err = link(&main, &[]).expect_err("main v128 type entry must be rejected"); + assert!( + matches!(&err, LinkError::UnsupportedConstruct(msg) if msg.contains("v128")), + "expected an UnsupportedConstruct mentioning v128, got {err:?}" + ); +} + +#[test] +fn atomic_op_into_memoryless_main_is_rejected_not_silently() { + // H26 / feature gate: a shared-memory atomic external linked into a + // memoryless main. The atomic op and its shared memory belong to the threads + // proposal, outside the supported WASM 1.0 subset, so the link gate's feature + // pass rejects the external up front with a clean `UnsupportedWasmFeature`. + // + // The deeper backstops this case once exercised — the closure scanner's + // allow-list rejecting the atomic op, and the post-merge `InvalidMergedModule` + // gate catching a body copied into a memoryless module — remain present and + // are covered directly (the allow-list in `safety.rs`, the post-merge gate by + // the corpus sweep). Any of those clean rejections is acceptable here; a + // silent `Ok` (invalid artifact) or a panic is not. + let main = main_importing_sum(); + let lib = wasm( + r#" + (module + (type (;0;) (func (param i32 i32) (result i32))) + (memory (;0;) 1 1 shared) + (func (;0;) (type 0) (param i32 i32) (result i32) + local.get 0 + local.get 1 + i32.atomic.rmw.add + local.get 0 + i32.add) + (export "sum" (func 0))) + "#, + ); + + let result = link(&main, &[&lib]); + match result { + Err(LinkError::UnsupportedWasmFeature { .. }) + | Err(LinkError::InvalidMergedModule(_)) + | Err(LinkError::RequiresRelocatableBuild { .. 
#[test]
fn ambiguous_import_across_two_externals_is_rejected() {
    // Defensive: both externals export a signature-matching `sum`. A
    // field-keyed binding has no sound way to pick one, so the merge must
    // refuse instead of quietly taking the first (sort-order-dependent) match.
    let main = main_importing_sum();
    let lib_a = wasm(
        r#"
        (module
            (type (;0;) (func (param i32 i32) (result i32)))
            (func (;0;) (type 0) (param i32 i32) (result i32)
                local.get 0
                local.get 1
                i32.add)
            (export "sum" (func 0)))
        "#,
    );
    let lib_b = wasm(
        r#"
        (module
            (type (;0;) (func (param i32 i32) (result i32)))
            (func (;0;) (type 0) (param i32 i32) (result i32)
                local.get 0
                local.get 1
                i32.sub)
            (export "sum" (func 0)))
        "#,
    );

    // Pull the offending import out of the error first, then check its parts.
    let outcome = link(&main, &[&lib_a, &lib_b]);
    let (module, field) = match outcome.expect_err("ambiguous import must be rejected") {
        LinkError::AmbiguousImport { module, field } => (module, field),
        other => panic!("expected AmbiguousImport, got {other:?}"),
    };
    assert_eq!(field, "sum");
    assert_eq!(module, "mathlib");
}
#[test]
fn same_field_two_modules_binds_the_named_module_not_the_first() {
    // Decoy library: exports `sum`, but its body subtracts.
    let decoy = wasm(
        r#"
        (module
            (type (;0;) (func (param i32 i32) (result i32)))
            (func (;0;) (type 0) (param i32 i32) (result i32)
                local.get 0
                local.get 1
                i32.sub)
            (export "sum" (func 0)))
        "#,
    );
    // Wanted library: exports `sum` with the `i32.add` body the test looks for.
    let wanted = wasm(
        r#"
        (module
            (type (;0;) (func (param i32 i32) (result i32)))
            (func (;0;) (type 0) (param i32 i32) (result i32)
                local.get 0
                local.get 1
                i32.add)
            (export "sum" (func 0)))
        "#,
    );
    // The main module names `bbb` as the provider of `sum`.
    let main = wasm(
        r#"
        (module
            (type (;0;) (func (param i32 i32) (result i32)))
            (import "bbb" "sum" (func (;0;) (type 0)))
            (func (;1;) (type 0) (param i32 i32) (result i32)
                local.get 0
                local.get 1
                call 0)
            (export "compute" (func 1)))
        "#,
    );

    // `aaa` sorts ahead of `bbb`, which is what the field-keyed merge would
    // have chosen; listing it first tilts the slice order toward the decoy too.
    let linked = raw_link(&main, &[("aaa", &decoy), ("bbb", &wanted)])
        .expect("the bbb-bound `sum` must satisfy the import");
    assert_valid(&linked);
    assert!(function_imports(&linked).is_empty());
    // The merged `sum` lands at output function 1 and must be bbb's body.
    let forward_is_add = body_has_i32_add(&linked, 1);
    assert!(
        forward_is_add,
        "the merged `sum` must be bbb's `i32.add`, not aaa's `i32.sub`"
    );

    // Swapping the slice order must leave the merged body unchanged: binding
    // follows the logical module, not the position.
    let reversed = raw_link(&main, &[("bbb", &wanted), ("aaa", &decoy)])
        .expect("order must not matter");
    let reversed_is_add = body_has_i32_add(&reversed, 1);
    assert!(
        reversed_is_add,
        "filename/slice order must not decide the merged body"
    );
}
The module-prefixed naming makes the +/// two merged roots' name-section entries distinct by construction (`alib.sum`, +/// `blib.sum`), so neither collides nor forces wasm-to-v's index-suffix +/// disambiguation — even though their import field is identical. +#[test] +fn same_field_two_modules_get_distinct_prefixed_names() { + let main = wasm( + r#" + (module + (type (;0;) (func (param i32 i32) (result i32))) + (import "alib" "sum" (func (;0;) (type 0))) + (import "blib" "sum" (func (;1;) (type 0))) + (func (;2;) (type 0) (param i32 i32) (result i32) + local.get 0 + local.get 1 + call 0 + local.get 0 + local.get 1 + call 1 + i32.sub) + (export "compute" (func 2))) + "#, + ); + let alib = wasm( + r#" + (module + (type (;0;) (func (param i32 i32) (result i32))) + (func (;0;) (type 0) (param i32 i32) (result i32) + local.get 0 + local.get 1 + i32.add) + (export "sum" (func 0))) + "#, + ); + let blib = wasm( + r#" + (module + (type (;0;) (func (param i32 i32) (result i32))) + (func (;0;) (type 0) (param i32 i32) (result i32) + local.get 0 + local.get 1 + i32.mul) + (export "sum" (func 0))) + "#, + ); + + let linked = raw_link(&main, &[("alib", &alib), ("blib", &blib)]) + .expect("both same-field modules must satisfy their respective imports"); + assert_valid(&linked); + assert!(function_imports(&linked).is_empty()); + + // Output indices: compute=0, then the two merged `sum` roots. Both carry the + // import field `sum`, so only the module prefix keeps their name-section + // entries distinct. 
/// A logical module name that contains Inference's `::` path separator
/// (`crypto::sha256`) has to pass through the name prefix untouched and
/// deterministically (`crypto::sha256.hash`), without panicking. Per the
/// surrounding notes, the downstream Rocq translator sanitizes every
/// non-alphanumeric to `_`, so the leftover `::` is that translator's concern;
/// the linker keeps the logical name verbatim so the prefix stays traceable to
/// its source module.
#[test]
fn a_path_separated_logical_module_name_prefixes_deterministically() {
    let lib = wasm(
        r#"
        (module
            (type (;0;) (func (param i32 i32) (result i32)))
            (func (;0;) (type 0) (param i32 i32) (result i32)
                local.get 0
                local.get 1
                i32.add)
            (export "hash" (func 0)))
        "#,
    );
    let main = wasm(
        r#"
        (module
            (type (;0;) (func (param i32 i32) (result i32)))
            (import "crypto::sha256" "hash" (func (;0;) (type 0)))
            (func (;1;) (type 0) (param i32 i32) (result i32)
                local.get 0
                local.get 1
                call 0)
            (export "run" (func 1)))
        "#,
    );

    let linked = raw_link(&main, &[("crypto::sha256", &lib)])
        .expect("a `::`-separated logical module must link without panicking");
    assert_valid(&linked);

    // The merged root is expected at function index 1 under the prefixed name.
    let names = function_names(&linked);
    let expected = (1, "crypto::sha256.hash".to_string());
    assert!(
        names.contains(&expected),
        "the merged root must keep its logical module verbatim in the prefix, got {names:?}"
    );
}
#[test]
fn out_of_range_spec_funcs_index_is_a_clean_parse_error() {
    use wasm_encoder::Section as _;

    // S2: a spec index beyond the main module's function count has to be
    // rejected as a clean `LinkError::Parse` rather than silently remapped onto
    // a wrong or nonexistent function. The post-merge validator treats the
    // `inference.spec_funcs` custom section as opaque, so without the explicit
    // bound in `map_main_func` this would emit a garbage Rocq proof obligation
    // that still passes validation.
    //
    // This main has 1 import (index 0) + 1 local (index 1), which makes a
    // pre-link index of 5 out of range.
    // version=1, count=1, name_len=1 'S', idx_count=1, index=5
    let spec_payload = [1u8, 1, 1, b'S', 1, 5];
    let mut main = wasm(
        r#"
        (module
            (type (;0;) (func (param i32 i32) (result i32)))
            (import "mathlib" "sum" (func (;0;) (type 0)))
            (func (;1;) (type 0) (param i32 i32) (result i32)
                local.get 0
                local.get 1
                call 0)
            (export "compute" (func 1)))
        "#,
    );
    // Append the custom section to the assembled bytes.
    let spec_section = wasm_encoder::CustomSection {
        name: "inference.spec_funcs".into(),
        data: (&spec_payload[..]).into(),
    };
    spec_section.append_to(&mut main);

    let lib = mathlib_pure();
    let err = link(&main, &[&lib])
        .expect_err("an out-of-range spec index must be a clean rejection, never a wrong remap");
    let names_range = match &err {
        LinkError::Parse(msg) => msg.contains("out of range"),
        _ => false,
    };
    assert!(
        names_range,
        "expected a Parse error naming the out-of-range index, got {err:?}"
    );
}
#[test]
fn malformed_main_with_out_of_range_type_index_is_a_clean_error_not_a_panic() {
    // S3: arbitrary main bytes are accepted by the public `link` API. When the
    // main's FunctionSection names a function type index beyond the type
    // section, the result must be a clean `LinkError` (the entry-side structural
    // validation) and never a panic on a raw slice index in `emit` before the
    // post-merge gate runs. `wat` validates and so cannot build this; it is
    // assembled with `wasm-encoder` instead.
    use wasm_encoder::{
        CodeSection, ExportKind, ExportSection, Function, FunctionSection, Instruction, Module,
        TypeSection,
    };

    // Type section: only type index 0 exists.
    let mut type_section = TypeSection::new();
    type_section.ty().function([], []);

    // Function section: refers to type index 5, which is out of range.
    let mut function_section = FunctionSection::new();
    function_section.function(5);

    let mut export_section = ExportSection::new();
    export_section.export("run", ExportKind::Func, 0);

    // A single body consisting of just `end`.
    let mut body = Function::new([]);
    body.instruction(&Instruction::End);
    let mut code_section = CodeSection::new();
    code_section.function(&body);

    // Sections are appended in the same order as before: type, function,
    // export, code.
    let mut module = Module::new();
    module
        .section(&type_section)
        .section(&function_section)
        .section(&export_section)
        .section(&code_section);
    let main = module.finish();

    // With no externals the merge still parses, plans, and emits the main
    // module, so the emit path reaches the out-of-range type index.
    let err = raw_link(&main, &[])
        .expect_err("a main with an out-of-range type index must be a clean rejection");
    let is_parse = matches!(&err, LinkError::Parse(_));
    assert!(is_parse, "expected a clean Parse rejection, got {err:?}");
}
+fn assert_clean_rejection(main: &[u8], lib: &[u8], what: &str) -> LinkError { + match link(main, &[lib]) { + Ok(bytes) => panic!( + "{what}: merge silently produced a {}-byte module; it must be rejected", + bytes.len() + ), + Err( + e @ (LinkError::UnsupportedConstruct(_) + | LinkError::RequiresRelocatableBuild { .. } + | LinkError::InvalidMergedModule(_) + | LinkError::IncompatibleMemory { .. } + | LinkError::UnsupportedWasmFeature { .. } + | LinkError::Parse(_)), + ) => e, + Err(other) => panic!("{what}: expected a fail-closed rejection, got {other:?}"), + } +} + +#[test] +fn atomic_memory_op_is_rejected_at_the_feature_gate() { + // H17 / feature gate: an external with a shared memory and an + // `i32.atomic.rmw.add` body. Atomics belong to the threads proposal, outside + // the supported WASM 1.0 subset, so the link gate's feature pass rejects the + // module up front — before the closure scanner's allow-list (the + // defense-in-depth backstop, still tested directly in `safety.rs`) would + // reach the operator. The shared memory the atomic op requires is itself a + // threads-proposal construct, so the validator names `threads`. + let main = main_importing_sum(); + let lib = wasm( + r#" + (module + (type (;0;) (func (param i32 i32) (result i32))) + (memory (;0;) 1 1 shared) + (func (;0;) (type 0) (param i32 i32) (result i32) + local.get 0 + local.get 1 + i32.atomic.rmw.add + local.get 0 + i32.add) + (export "sum" (func 0))) + "#, + ); + let err = assert_clean_rejection(&main, &lib, "atomic rmw"); + assert!( + matches!(&err, LinkError::UnsupportedWasmFeature { details, .. } if details.contains("threads")), + "expected an UnsupportedWasmFeature naming the threads proposal, got {err:?}" + ); +} + +#[test] +fn simd_v128_memory_load_is_rejected_at_the_feature_gate() { + // H18 / feature gate: an external with a `v128.load` body. 
SIMD is outside + // the supported WASM 1.0 subset, so the link gate's feature pass rejects the + // module up front, naming SIMD — before the closure scanner's allow-list (the + // backstop, still tested directly in `safety.rs`) would reach the opcode. + let main = main_importing_sum(); + let lib = wasm( + r#" + (module + (type (;0;) (func (param i32 i32) (result i32))) + (memory (;0;) 1) + (func (;0;) (type 0) (param i32 i32) (result i32) + local.get 0 + v128.load + drop + local.get 0 + local.get 1 + i32.add) + (export "sum" (func 0))) + "#, + ); + let err = assert_clean_rejection(&main, &lib, "v128.load"); + assert!( + matches!(&err, LinkError::UnsupportedWasmFeature { details, .. } if details.contains("SIMD")), + "expected an UnsupportedWasmFeature naming SIMD, got {err:?}" + ); +} + +#[test] +fn exception_handling_throw_is_rejected_at_the_feature_gate() { + // H11 / feature gate: an external with a tag section and a `throw 0` body. + // Exception handling is outside the supported WASM 1.0 subset, so the link + // gate's feature pass rejects the module up front, naming the exceptions + // proposal — before the allow-list (the backstop, tested directly in + // `safety.rs`) would reach the EH operator. + let main = main_importing_sum(); + let lib = wasm( + r#" + (module + (type (;0;) (func (param i32 i32) (result i32))) + (type (;1;) (func)) + (tag (;0;) (type 1)) + (func (;0;) (type 0) (param i32 i32) (result i32) + throw 0) + (export "sum" (func 0))) + "#, + ); + let err = assert_clean_rejection(&main, &lib, "throw"); + assert!( + matches!(&err, LinkError::UnsupportedWasmFeature { details, .. 
} if details.contains("exceptions")), + "expected an UnsupportedWasmFeature naming exceptions, got {err:?}" + ); +} + +#[test] +fn exception_handling_try_table_is_rejected_at_the_feature_gate() { + // H11 (try_table variant) / feature gate: the structured `try_table` block is + // likewise part of the exceptions proposal, outside the supported subset, so + // the gate's feature pass rejects it naming exceptions. + let main = main_importing_sum(); + let lib = wasm( + r#" + (module + (type (;0;) (func (param i32 i32) (result i32))) + (func (;0;) (type 0) (param i32 i32) (result i32) + block + try_table + end + end + local.get 0 + local.get 1 + i32.add) + (export "sum" (func 0))) + "#, + ); + let err = assert_clean_rejection(&main, &lib, "try_table"); + assert!( + matches!(&err, LinkError::UnsupportedWasmFeature { details, .. } if details.contains("exceptions")), + "expected an UnsupportedWasmFeature naming exceptions, got {err:?}" + ); +} + +#[test] +fn call_ref_is_rejected_at_the_feature_gate() { + // H12 / feature gate: an external whose body uses `call_ref`. Typed function + // references are outside the supported WASM 1.0 subset, so the gate's feature + // pass rejects the module up front, naming reference types — before the + // allow-list (the backstop, tested directly in `safety.rs`) would reach the + // operator. + let main = main_importing_sum(); + let lib = wasm( + r#" + (module + (type (;0;) (func (param i32 i32) (result i32))) + (func (;0;) (type 0) (param i32 i32) (result i32) + local.get 0 + local.get 1 + ref.null 0 + call_ref 0) + (export "sum" (func 0))) + "#, + ); + let err = assert_clean_rejection(&main, &lib, "call_ref"); + assert!( + matches!(&err, LinkError::UnsupportedWasmFeature { details, .. 
} if details.contains("reference types")), + "expected an UnsupportedWasmFeature naming reference types, got {err:?}" + ); +} + +#[test] +fn ref_null_is_rejected_at_the_feature_gate() { + // H12/H13 / feature gate: an external whose body uses `ref.null func`. The + // reference-types proposal is outside the supported subset, so the gate's + // feature pass rejects the module naming reference types — before the + // allow-list would reach the operator. + let main = main_importing_sum(); + let lib = wasm( + r#" + (module + (type (;0;) (func (param i32 i32) (result i32))) + (func (;0;) (type 0) (param i32 i32) (result i32) + ref.null func + drop + local.get 0 + local.get 1 + i32.add) + (export "sum" (func 0))) + "#, + ); + let err = assert_clean_rejection(&main, &lib, "ref.null"); + assert!( + matches!(&err, LinkError::UnsupportedWasmFeature { details, .. } if details.contains("reference types")), + "expected an UnsupportedWasmFeature naming reference types, got {err:?}" + ); +} + +#[test] +fn multi_memory_external_is_rejected_at_the_feature_gate() { + // H14 / feature gate: an external declaring two memories whose body loads from + // memory 1. Multiple memories belong to the multi-memory proposal, outside the + // supported WASM 1.0 subset, so the gate's feature pass rejects the module up + // front — before the per-external `memory_count > 1` guard and the non-zero + // memarg allow-list (both still tested directly) would reach it. The validator + // names multiple memories. + let main = main_importing_sum(); + let lib = wasm( + r#" + (module + (type (;0;) (func (param i32 i32) (result i32))) + (memory (;0;) 1) + (memory (;1;) 1) + (func (;0;) (type 0) (param i32 i32) (result i32) + local.get 0 + i32.load 1 + drop + local.get 0 + local.get 1 + i32.add) + (export "sum" (func 0))) + "#, + ); + let err = assert_clean_rejection(&main, &lib, "multi-memory"); + assert!( + matches!(&err, LinkError::UnsupportedWasmFeature { details, .. 
} if details.contains("memor")), + "expected an UnsupportedWasmFeature mentioning memories, got {err:?}" + ); +} + +#[test] +fn nonzero_memarg_memory_index_is_rejected_cleanly() { + // H14 (single-memory, non-zero memarg): even a one-memory module whose body + // names memory 1 in a memarg must be rejected — the load-bearing fix drives + // off memarg presence so the index can never silently dangle. + let main = main_importing_sum(); + let lib = wasm( + r#" + (module + (type (;0;) (func (param i32 i32) (result i32))) + (memory (;0;) 1) + (memory (;1;) 1) + (func (;0;) (type 0) (param i32 i32) (result i32) + local.get 0 + local.get 1 + i32.store 1 + local.get 0 + local.get 1 + i32.add) + (export "sum" (func 0))) + "#, + ); + assert_clean_rejection(&main, &lib, "non-zero memarg"); +} + +#[test] +fn reference_typed_parameter_signature_is_rejected_at_the_feature_gate() { + // H23 / feature gate: a crafted external whose exported `entry` has a + // `funcref` parameter, with a body that uses no reference-producing operator + // (just `local.get`/`drop`). A `funcref` in a function signature is itself a + // reference-types construct, so the link gate's feature pass rejects the + // module up front, naming reference types — before the merge's signature + // interning is reached. + // + // The intern-time backstop (`val_type_tag`/`sig_key` rejecting a ref type so + // it can never be collapsed to `i32` and silently emitted) remains the + // defense-in-depth layer behind this gate and is covered by a direct unit + // test in `merge.rs` (`ref_typed_signature_is_rejected_at_intern_time`). 
+ let main = wasm( + r#" + (module + (type (;0;) (func (param i32) (result i32))) + (import "reflib" "entry" (func (;0;) (type 0))) + (func (;1;) (type 0) (param i32) (result i32) + local.get 0 + call 0) + (export "run" (func 1))) + "#, + ); + let lib = lib_exporting_funcref_param_entry(); + let err = assert_clean_rejection(&main, &lib, "ref-typed parameter"); + assert!( + matches!(&err, LinkError::UnsupportedWasmFeature { details, .. } if details.contains("reference types")), + "expected an UnsupportedWasmFeature naming reference types, got {err:?}" + ); +} + +#[test] +fn memory64_external_against_memory32_main_is_rejected_at_the_feature_gate() { + // H16 (partial) / feature gate: a memory64 external folded onto a memory32 + // main. The memory64 proposal is outside the supported WASM 1.0 subset, so + // the gate's feature pass rejects the external up front, naming memory64 — + // before the memory reconciler's shape guard (the backstop, still tested + // directly in `merge.rs`) would reach it. + let main = wasm( + r#" + (module + (type (;0;) (func (param i32 i32) (result i32))) + (import "mathlib" "sum" (func (;0;) (type 0))) + (memory (;0;) 1) + (func (;1;) (type 0) (param i32 i32) (result i32) + local.get 0 + local.get 1 + call 0) + (export "compute" (func 1))) + "#, + ); + let lib = wasm( + r#" + (module + (type (;0;) (func (param i32 i32) (result i32))) + (memory (;0;) i64 1) + (func (;0;) (type 0) (param i32 i32) (result i32) + local.get 0 + local.get 1 + i32.add) + (export "sum" (func 0))) + "#, + ); + let err = assert_clean_rejection(&main, &lib, "memory64 vs memory32"); + assert!( + matches!(&err, LinkError::UnsupportedWasmFeature { details, .. } if details.contains("memory64")), + "expected an UnsupportedWasmFeature naming memory64, got {err:?}" + ); +} + +#[test] +fn memory64_external_onto_a_memoryless_main_is_rejected_at_the_feature_gate() { + // C-4 / feature gate: a `memory64` external forwarded by a *memoryless* main. 
+ // The memory64 proposal is outside the supported subset, so the gate's feature + // pass rejects the external up front, naming memory64 — before the + // reconciler's `None => ext` adopt-path shape guard (the backstop, still + // tested directly in `merge.rs`) would reach it. + let main = wasm( + r#" + (module + (type (;0;) (func (param i64) (result i64))) + (import "memlib" "load_at" (func (;0;) (type 0))) + (func (;1;) (type 0) (param i64) (result i64) + local.get 0 + call 0) + (export "run" (func 1))) + "#, + ); + let lib = wasm( + r#" + (module + (type (;0;) (func (param i64) (result i64))) + (memory (;0;) i64 1) + (func (;0;) (type 0) (param i64) (result i64) + local.get 0 + i64.load) + (export "load_at" (func 0))) + "#, + ); + let err = assert_clean_rejection(&main, &lib, "memory64 onto a memoryless main"); + assert!( + matches!(&err, LinkError::UnsupportedWasmFeature { details, .. } if details.contains("memory64")), + "expected an UnsupportedWasmFeature naming memory64, got {err:?}" + ); +} + +#[test] +fn bare_shared_external_onto_a_memoryless_main_is_rejected_at_the_feature_gate() { + // L-1 / feature gate: a bare `shared` external memory whose body uses no + // atomic op (so the operator allow-list does not catch it) folded onto a + // memoryless main. A `shared` memory is a threads-proposal construct, outside + // the supported WASM 1.0 subset, so the gate's feature pass rejects the + // external up front, naming the threads requirement — before the reconciler's + // adopt-path shape guard (the backstop, still tested directly in `merge.rs`) + // would reach it. 
+ let main = wasm( + r#" + (module + (type (;0;) (func (param i32) (result i32))) + (import "memlib" "load_at" (func (;0;) (type 0))) + (func (;1;) (type 0) (param i32) (result i32) + local.get 0 + call 0) + (export "run" (func 1))) + "#, + ); + let lib = wasm( + r#" + (module + (type (;0;) (func (param i32) (result i32))) + (memory (;0;) 1 1 shared) + (func (;0;) (type 0) (param i32) (result i32) + local.get 0 + i32.load) + (export "load_at" (func 0))) + "#, + ); + let err = assert_clean_rejection(&main, &lib, "bare shared onto a memoryless main"); + assert!( + matches!(&err, LinkError::UnsupportedWasmFeature { details, .. } + if details.contains("threads") || details.contains("shared")), + "expected an UnsupportedWasmFeature naming the threads/shared requirement, got {err:?}" + ); +} + +// -- Address-provenance (C2) ------------------------------------------------- + +#[test] +fn tier_b_param_addressed_load_merges_into_a_valid_module() { + // C2 (safe case): a Tier-B external that loads through its *parameter* keeps + // the Tier-B contract — every address is caller-supplied. It must merge into + // a valid module with the load body intact. 
+ let main = wasm( + r#" + (module + (type (;0;) (func (param i32) (result i32))) + (import "memlib" "load_at" (func (;0;) (type 0))) + (memory (;0;) 1 1) + (func (;1;) (type 0) (param i32) (result i32) + local.get 0 + call 0) + (export "memory" (memory 0)) + (export "run" (func 1))) + "#, + ); + let lib = wasm( + r#" + (module + (type (;0;) (func (param i32) (result i32))) + (memory (;0;) 1) + (func (;0;) (type 0) (param i32) (result i32) + local.get 0 + i32.const 4 + i32.add + i32.load) + (export "load_at" (func 0))) + "#, + ); + + let linked = link(&main, &[&lib]).expect("param-addressed Tier B should merge"); + assert_valid(&linked); + assert!(function_imports(&linked).is_empty()); + assert_eq!(code_body_count(&linked), 2); +} + +#[test] +fn tier_b_multi_function_param_addressed_helper_merges() { + // The headline interprocedural case: a `sort(ptr, len)` export that calls an + // internal `swap(p, a, b)` helper, passing `swap` a param-derived pointer; the + // helper dereferences its pointer parameter. Because every call site supplies + // `swap`'s pointer from the root's own parameters, the sound interprocedural + // analysis proves the whole *two-function* closure is caller-addressed and + // merges it — the conservative `>1 function => reject` stopgap is gone. 
+ let main = wasm( + r#" + (module + (type (;0;) (func (param i32 i32))) + (import "sortlib" "sort" (func (;0;) (type 0))) + (memory (;0;) 1 1) + (func (;1;) (type 0) (param i32 i32) + local.get 0 + local.get 1 + call 0) + (export "memory" (memory 0)) + (export "run" (func 1))) + "#, + ); + let lib = wasm( + r#" + (module + (type (;0;) (func (param i32 i32))) + (type (;1;) (func (param i32 i32 i32))) + (memory (;0;) 1) + (func (;0;) (type 0) (param i32 i32) + local.get 0 + local.get 1 + i32.add + local.get 0 + local.get 1 + call 1) + (func (;1;) (type 1) (param i32 i32 i32) + local.get 0 + local.get 1 + i32.load + i32.store + local.get 1 + local.get 2 + i32.load + i32.store) + (export "sort" (func 0))) + "#, + ); + + let linked = link(&main, &[&lib]).expect("param-addressed helper closure should merge"); + assert_valid(&linked); + assert!(function_imports(&linked).is_empty()); + // run + merged sort + merged swap. + assert_eq!(code_body_count(&linked), 3); +} + +#[test] +fn tier_b_helper_called_with_constant_address_is_rejected() { + // The interprocedural reject case: the root feeds a constant into a helper + // that dereferences its parameter. The constant argument makes the helper's + // parameter untrusted at its only call site, so the helper's load aliases a + // fixed host address — rejected as Tier C, not silently merged. 
+ let main = wasm( + r#" + (module + (type (;0;) (func (result i32))) + (import "memlib" "peek" (func (;0;) (type 0))) + (memory (;0;) 1 1) + (func (;1;) (type 0) (result i32) + call 0) + (export "memory" (memory 0)) + (export "run" (func 1))) + "#, + ); + let lib = wasm( + r#" + (module + (type (;0;) (func (result i32))) + (type (;1;) (func (param i32) (result i32))) + (memory (;0;) 1) + (func (;0;) (type 0) (result i32) + i32.const 1024 + call 1) + (func (;1;) (type 1) (param i32) (result i32) + local.get 0 + i32.load) + (export "peek" (func 0))) + "#, + ); + + let err = link(&main, &[&lib]).expect_err("a const-fed helper deref must be rejected"); + match err { + LinkError::RequiresRelocatableBuild { field, reasons } => { + assert_eq!(field, "peek"); + assert!( + reasons.iter().any(|r| r.contains("parameter")), + "reason should mention parameter provenance: {reasons:?}" + ); + } + other => panic!("expected RequiresRelocatableBuild, got {other:?}"), + } +} + +#[test] +fn tier_b_self_recursive_param_addressed_helper_merges() { + // A self-recursive export that dereferences its parameter and recurses with a + // *param-derived* argument (`ptr + 4`). The greatest fixpoint keeps the + // parameter trusted across the back-edge, so the recursive closure merges. 
+ let main = wasm( + r#" + (module + (type (;0;) (func (param i32))) + (import "memlib" "walk" (func (;0;) (type 0))) + (memory (;0;) 1 1) + (func (;1;) (type 0) (param i32) + local.get 0 + call 0) + (export "memory" (memory 0)) + (export "run" (func 1))) + "#, + ); + let lib = wasm( + r#" + (module + (type (;0;) (func (param i32))) + (memory (;0;) 1) + (func (;0;) (type 0) (param i32) + local.get 0 + i32.load + if + local.get 0 + i32.const 4 + i32.add + call 0 + end) + (export "walk" (func 0))) + "#, + ); + + let linked = link(&main, &[&lib]).expect("self-recursive param-addressed closure should merge"); + assert_valid(&linked); + assert!(function_imports(&linked).is_empty()); + assert_eq!(code_body_count(&linked), 2); +} + +#[test] +fn tier_b_absolute_address_load_is_rejected() { + // C2 (the defect): an external that loads from a *fixed absolute address* not + // derived from any parameter would silently alias the host program's own + // memory. The address-provenance analysis must reject it as Tier C rather + // than merge it. 
+ let main = wasm( + r#" + (module + (type (;0;) (func (result i32))) + (import "memlib" "peek" (func (;0;) (type 0))) + (memory (;0;) 1 1) + (func (;1;) (type 0) (result i32) + call 0) + (export "memory" (memory 0)) + (export "run" (func 1))) + "#, + ); + let lib = wasm( + r#" + (module + (type (;0;) (func (result i32))) + (memory (;0;) 1) + (func (;0;) (type 0) (result i32) + i32.const 1024 + i32.load) + (export "peek" (func 0))) + "#, + ); + + let err = link(&main, &[&lib]).expect_err("absolute-address load must be rejected"); + match err { + LinkError::RequiresRelocatableBuild { field, reasons } => { + assert_eq!(field, "peek"); + assert!( + reasons.iter().any(|r| r.contains("parameter")), + "reason should mention parameter provenance: {reasons:?}" + ); + } + other => panic!("expected RequiresRelocatableBuild, got {other:?}"), + } +} + +#[test] +fn tier_b_store_at_absolute_address_is_rejected() { + // C2 (store variant): a store to a fixed absolute address corrupts the host + // program's memory at a baked-in offset. It must be rejected. + let main = wasm( + r#" + (module + (type (;0;) (func (param i32))) + (import "memlib" "poke" (func (;0;) (type 0))) + (memory (;0;) 1 1) + (func (;1;) (type 0) (param i32) + local.get 0 + call 0) + (export "memory" (memory 0)) + (export "run" (func 1))) + "#, + ); + let lib = wasm( + r#" + (module + (type (;0;) (func (param i32))) + (memory (;0;) 1) + (func (;0;) (type 0) (param i32) + i32.const 2048 + local.get 0 + i32.store) + (export "poke" (func 0))) + "#, + ); + + let err = link(&main, &[&lib]).expect_err("absolute-address store must be rejected"); + assert!( + matches!(err, LinkError::RequiresRelocatableBuild { .. 
}), + "expected RequiresRelocatableBuild, got {err:?}" + ); +} + +// -- Memory reconciliation (H15, H16, H24) ----------------------------------- + +#[test] +fn memoryless_main_with_param_addressed_external_synthesizes_memory() { + // H24: the main module declares no memory of its own, but the external it + // links uses memory (through a parameter) and declares its own. The merge + // must synthesize an output memory from the external's declaration so the + // merged body's `memory 0` reference is satisfied — a valid module. + let main = wasm( + r#" + (module + (type (;0;) (func (param i32 i32))) + (import "memlib" "store_at" (func (;0;) (type 0))) + (func (;1;) (type 0) (param i32 i32) + local.get 0 + local.get 1 + call 0) + (export "run" (func 1))) + "#, + ); + let lib = wasm( + r#" + (module + (type (;0;) (func (param i32 i32))) + (memory (;0;) 3) + (func (;0;) (type 0) (param i32 i32) + local.get 0 + local.get 1 + i32.store) + (export "store_at" (func 0))) + "#, + ); + + let linked = link(&main, &[&lib]).expect("memoryless main + memory external must reconcile"); + assert_valid(&linked); + let (initial, _max) = memory_limits(&linked).expect("output must declare the synthesized memory"); + assert_eq!(initial, 3, "the external's minimum must be carried into the output"); +} + +#[test] +fn external_minimum_is_reconciled_so_no_out_of_bounds() { + // H15: the external declares `(memory 10)`, a far larger minimum than the + // main module's 1-page memory. The reconciled output minimum must be the max + // of the two (10), so an access in the external's static range is in-bounds + // rather than trapping. 
+ let main = wasm( + r#" + (module + (type (;0;) (func (param i32) (result i32))) + (import "memlib" "load_at" (func (;0;) (type 0))) + (memory (;0;) 1 20) + (func (;1;) (type 0) (param i32) (result i32) + local.get 0 + call 0) + (export "memory" (memory 0)) + (export "run" (func 1))) + "#, + ); + let lib = wasm( + r#" + (module + (type (;0;) (func (param i32) (result i32))) + (memory (;0;) 10 20) + (func (;0;) (type 0) (param i32) (result i32) + local.get 0 + i32.load) + (export "load_at" (func 0))) + "#, + ); + + let linked = link(&main, &[&lib]).expect("memory minimums must reconcile"); + assert_valid(&linked); + let (initial, maximum) = memory_limits(&linked).expect("output declares a memory"); + assert_eq!(initial, 10, "reconciled minimum is the max of both module minimums"); + assert_eq!(maximum, Some(20), "reconciled maximum widens to admit both ranges"); +} + +#[test] +fn memory_grow_against_a_growable_memory_is_reconciled() { + // H15 (grow, accepted): the external grows memory, and the reconciled + // memory's maximum exceeds its minimum, so growth can succeed. The merge + // must accept it and produce a valid module. 
+ let main = wasm( + r#" + (module + (type (;0;) (func (param i32) (result i32))) + (import "memlib" "grow_by" (func (;0;) (type 0))) + (memory (;0;) 1 10) + (func (;1;) (type 0) (param i32) (result i32) + local.get 0 + call 0) + (export "memory" (memory 0)) + (export "run" (func 1))) + "#, + ); + let lib = wasm( + r#" + (module + (type (;0;) (func (param i32) (result i32))) + (memory (;0;) 1 10) + (func (;0;) (type 0) (param i32) (result i32) + local.get 0 + memory.grow) + (export "grow_by" (func 0))) + "#, + ); + + let linked = link(&main, &[&lib]).expect("growable memory must reconcile"); + assert_valid(&linked); + let (_initial, maximum) = memory_limits(&linked).expect("output declares a memory"); + assert!( + maximum.is_none_or(|m| m > 1), + "a growable memory must keep room above the minimum, got {maximum:?}" + ); +} + +#[test] +fn memory_grow_against_a_fixed_memory_is_rejected() { + // H15 (grow, rejected): the external grows memory, but every module's memory + // is pinned (min == max), so growth always fails at runtime. The merge must + // reject it with a clean diagnostic rather than emit a module whose + // `memory.grow` silently returns -1. + let main = wasm( + r#" + (module + (type (;0;) (func (param i32) (result i32))) + (import "memlib" "grow_by" (func (;0;) (type 0))) + (memory (;0;) 1 1) + (func (;1;) (type 0) (param i32) (result i32) + local.get 0 + call 0) + (export "memory" (memory 0)) + (export "run" (func 1))) + "#, + ); + let lib = wasm( + r#" + (module + (type (;0;) (func (param i32) (result i32))) + (memory (;0;) 1 1) + (func (;0;) (type 0) (param i32) (result i32) + local.get 0 + memory.grow) + (export "grow_by" (func 0))) + "#, + ); + + let err = link(&main, &[&lib]).expect_err("growth against a fixed memory must reject"); + assert!( + matches!(err, LinkError::IncompatibleMemory { .. 
}), + "expected IncompatibleMemory, got {err:?}" + ); +} + +#[test] +fn custom_page_size_mismatch_is_rejected_cleanly() { + // H16 (page-size flag): an external whose memory uses a custom page size + // changes the address-to-page mapping and cannot be folded onto the main + // module's default-page memory. The merge must reject it cleanly. + let main = wasm( + r#" + (module + (type (;0;) (func (param i32) (result i32))) + (import "memlib" "load_at" (func (;0;) (type 0))) + (memory (;0;) 1 1) + (func (;1;) (type 0) (param i32) (result i32) + local.get 0 + call 0) + (export "memory" (memory 0)) + (export "run" (func 1))) + "#, + ); + let lib = wasm( + r#" + (module + (type (;0;) (func (param i32) (result i32))) + (memory (;0;) 1 1 (pagesize 1)) + (func (;0;) (type 0) (param i32) (result i32) + local.get 0 + i32.load) + (export "load_at" (func 0))) + "#, + ); + + // The custom-page-sizes proposal is disabled in the linker's structural + // validator, so the universal external pre-validation gate (mirroring the + // driver) now rejects this external as invalid WASM before the memory-shape + // reconciler is reached. Either rejection is fail-closed and clean: a `Parse` + // naming the custom page size from the entry gate, or the reconciler's + // `IncompatibleMemory` if validation is ever relaxed for this proposal. Both + // are accepted here; what matters is that the merge never folds a + // custom-page-size memory into the default-page output. + let err = assert_clean_rejection(&main, &lib, "custom page size"); + let mentions_page_size = match &err { + LinkError::Parse(message) => message.contains("page size"), + LinkError::IncompatibleMemory { reason, .. 
} => reason.contains("page size"), + _ => false, + }; + assert!( + mentions_page_size, + "expected a clean rejection mentioning the custom page size, got {err:?}" + ); +} + +// -- Deterministic adversarial property sweep -------------------------------- +// +// This is the stable-`cargo test` analogue of the `cargo-fuzz` target in +// `core/wasm-linker/fuzz/`. `cargo-fuzz` needs a nightly toolchain that is not +// part of the default build, so the generative guard cannot run everywhere; this +// property test runs the same *invariant* over a fixed, hand-seeded corpus on +// every `cargo test`: +// +// for every (main, externals) in the corpus, `link` must either +// (a) return `Err`, or +// (b) return `Ok` with bytes that pass `inf_wasmparser::validate`. +// +// It must NEVER panic, hang, OOM, or return a silently-invalid module. The +// corpus is the union of the audit reproductions (one adversarial external per +// confirmed body-level issue) and a deterministic byte-mutation sweep of a valid +// external, which broadly exercises the parse/closure/merge/emit paths without +// any nondeterminism. Each `link` call is wrapped in `catch_unwind` so a +// regression that reintroduces a panic fails this test with the offending +// fixture named, rather than aborting the run with an opaque backtrace. + +/// The outcome a corpus probe demands of `link`. +/// +/// Every probe forbids a panic / hang / OOM and a silently-invalid `Ok`. The +/// variants additionally pin *which* clean outcome is correct, so a regression +/// that turns a soundness rejection into a (validating but wrong) merge is +/// caught — not just one that reintroduces a crash. +#[derive(Clone, Copy, PartialEq)] +enum Expect { + /// Any clean outcome is acceptable: a returned `Err`, or an `Ok` whose bytes + /// validate. Used by the malformed/mutation inputs whose resolution is shape- + /// dependent (some parse-reject, some are caught later). + CleanOutcome, + /// The probe MUST merge into a valid module. 
A returned `Err` is a + /// regression. Used by the legitimate positive controls. + Merges, + /// The probe MUST be rejected with a clean `Err`. An `Ok` — even one whose + /// bytes validate — is a silent miscompile. Used by every soundness + /// reproduction (the laundering / shape / resource cases) whose merge would + /// model the wrong machine. + Rejected, +} + +/// One labelled corpus entry: a main module, its externals, the demanded +/// outcome, and a human-readable description used in failure messages. +struct Probe { + label: &'static str, + main: Vec, + externals: Vec>, + expect: Expect, +} + +/// Assembles a `.wasm` from WAT, returning `None` for inputs `wat` itself +/// rejects. The mutation sweep deliberately produces some non-assemblable +/// fixtures via byte flips on already-assembled bytes, never via WAT, so this +/// only guards the hand-written seeds. +fn try_wasm(src: &str) -> Option> { + wat::parse_str(src).ok() +} + +/// The hand-seeded adversarial corpus: each external signature-matches the +/// import `main_importing_sum` declares (so a signature-only upstream check +/// would admit it) but carries a body or section the merge must handle without +/// panicking — either by merging into a valid module or by a clean `LinkError`. +/// These mirror the seeds the `cargo-fuzz` target is meant to start from. +fn seed_probes() -> Vec { + let main = main_importing_sum(); + let sum_lib = |body: &str| -> Option> { + try_wasm(&format!( + "(module (type (;0;) (func (param i32 i32) (result i32))) \ + (func (;0;) (type 0) (param i32 i32) (result i32) {body}) \ + (export \"sum\" (func 0)))" + )) + }; + + let mut probes = Vec::new(); + let mut push = |label: &'static str, lib: Option>| { + if let Some(lib) = lib { + probes.push(Probe { + label, + main: main.clone(), + externals: vec![lib], + // The first-audit body seeds resolve to a clean outcome whose + // exact shape (parse-reject vs. tier-reject vs. 
validate-reject) + // is an internal detail; the invariant is only "no panic, no + // silently-invalid Ok". + expect: Expect::CleanOutcome, + }); + } + }; + + // H1: out-of-range call index. + push( + "H1 out-of-range call", + sum_lib("local.get 0 local.get 1 i32.add call 99 drop local.get 0"), + ); + // H2: function-typed block over a defined, non-own type index. + push( + "H2 function-typed block", + sum_lib("local.get 0 (block (param) (result)) local.get 1 i32.add"), + ); + // H3: reference-typed local in the body. + push( + "H3 funcref local", + sum_lib("(local funcref) local.get 0 local.get 1 i32.add"), + ); + // H11: exception-handling body with a tag section. + push( + "H11 throw", + try_wasm( + "(module (type (;0;) (func (param i32 i32) (result i32))) \ + (tag (;0;)) \ + (func (;0;) (type 0) (param i32 i32) (result i32) \ + local.get 0 local.get 1 i32.add throw 0) \ + (export \"sum\" (func 0)))", + ), + ); + // H12/H13: typed-reference operators. + push("H12 ref.null", sum_lib("ref.null func drop local.get 0 local.get 1 i32.add")); + // H17: shared-memory atomic op into a memoryless main. + push( + "H17 atomic rmw", + try_wasm( + "(module (type (;0;) (func (param i32 i32) (result i32))) \ + (memory (;0;) 1 1 shared) \ + (func (;0;) (type 0) (param i32 i32) (result i32) \ + i32.const 0 local.get 0 i32.atomic.rmw.add drop local.get 1) \ + (export \"sum\" (func 0)))", + ), + ); + // H18: SIMD V128 memory load into a memoryless main. + push( + "H18 v128.load", + try_wasm( + "(module (type (;0;) (func (param i32 i32) (result i32))) \ + (memory (;0;) 1) \ + (func (;0;) (type 0) (param i32 i32) (result i32) \ + i32.const 0 v128.load drop local.get 0 local.get 1 i32.add) \ + (export \"sum\" (func 0)))", + ), + ); + // H24: Tier-B memory op merged into a memoryless main. 
+ push( + "H24 memory.size", + try_wasm( + "(module (type (;0;) (func (param i32 i32) (result i32))) \ + (memory (;0;) 1) \ + (func (;0;) (type 0) (param i32 i32) (result i32) \ + memory.size drop local.get 0 local.get 1 i32.add) \ + (export \"sum\" (func 0)))", + ), + ); + // H14: multi-memory external with a non-zero memarg memory index. + push( + "H14 multi-memory", + try_wasm( + "(module (type (;0;) (func (param i32 i32) (result i32))) \ + (memory (;0;) 1) (memory (;1;) 1) \ + (func (;0;) (type 0) (param i32 i32) (result i32) \ + local.get 0 i32.load 1 drop local.get 0 local.get 1 i32.add) \ + (export \"sum\" (func 0)))", + ), + ); + // C2: a load from a fixed absolute address, no parameter provenance. + push( + "C2 absolute-address load", + try_wasm( + "(module (type (;0;) (func (param i32 i32) (result i32))) \ + (memory (;0;) 1) \ + (func (;0;) (type 0) (param i32 i32) (result i32) \ + i32.const 1024 i32.load drop local.get 0 local.get 1 i32.add) \ + (export \"sum\" (func 0)))", + ), + ); + // Hand-encoded: an out-of-range function-typed block index `wat` cannot express. + probes.push(Probe { + label: "H2 out-of-range block type", + main: main.clone(), + externals: vec![lib_with_out_of_range_block_type()], + expect: Expect::Rejected, + }); + // A genuinely-pure external that must merge into a valid module (the positive + // control: the sweep must not become vacuously all-`Err`). + probes.push(Probe { + label: "pure control (must merge)", + main: main.clone(), + externals: vec![mathlib_pure()], + expect: Expect::Merges, + }); + // Empty / truncated externals — the parser must reject, not index past the end. 
+ probes.push(Probe { + label: "empty external", + main: main.clone(), + externals: vec![Vec::new()], + expect: Expect::Rejected, + }); + probes.push(Probe { + label: "magic-only external", + main: main.clone(), + externals: vec![b"\0asm\x01\0\0\0".to_vec()], + expect: Expect::Rejected, + }); + + probes.extend(round2_probes()); + probes +} + +/// The round-2 audit reproductions, folded into the same panic-free / +/// `Ok ⇒ valid` invariant sweep as the first-audit seeds. Each is the exact +/// laundering / shape / resource shape the dedicated regression tests assert a +/// clean outcome for; including them here additionally guarantees that *however* +/// each is resolved (clean `Err`, or a valid merge for the legitimate cases), it +/// is never a panic, hang, OOM, or silently-invalid module. +/// +/// The provenance-laundering probes (C-1/C-2/C-3) address the host's *own* +/// linear memory, so they pair against a memory-owning main that exports a +/// shared memory — the practically-reachable Tier-B shape — rather than the +/// memoryless `main_importing_sum`. +fn round2_probes() -> Vec { + let mut probes = Vec::new(); + + // A memory-owning main exporting a shared memory and importing `peek`/`poke` + // from `mathlib` (the module label the property sweep tags every external + // with), mirroring the `tier_b_*` reproductions. + let mem_main = |import_ty: &str, import_field: &str, body: &str| -> Option> { + try_wasm(&format!( + "(module {import_ty} \ + (import \"mathlib\" \"{import_field}\" (func (;0;) (type 0))) \ + (memory (;0;) 1 1) \ + {body} \ + (export \"memory\" (memory 0)) (export \"run\" (func 1)))" + )) + }; + let mem_lib = |ty: &str, field: &str, body: &str| -> Option> { + try_wasm(&format!( + "(module {ty} (memory (;0;) 1) \ + (func (;0;) (type 0) {body}) (export \"{field}\" (func 0)))" + )) + }; + + // Every round-2 reproduction is a soundness case: the merge would model the + // wrong machine, so the only correct outcome is a clean rejection. 
+ let mut push = |label: &'static str, main: Option>, lib: Option>| { + if let (Some(main), Some(lib)) = (main, lib) { + probes.push(Probe { label, main, externals: vec![lib], expect: Expect::Rejected }); + } + }; + + // C-1: a constant address laundered through a control-flow join into an + // address-feeding local. The skip path leaves the const, so the address is + // not parameter-derived on every path — a host-memory alias the provenance + // analysis must reject. + push( + "C-1 control-flow-join laundered load", + mem_main( + "(type (;0;) (func (param i32 i32) (result i32)))", + "peek", + "(func (;1;) (type 0) (param i32 i32) (result i32) \ + local.get 0 local.get 1 call 0)", + ), + mem_lib( + "(type (;0;) (func (param i32 i32) (result i32)))", + "peek", + "(param i32 i32) (result i32) (local i32) \ + i32.const 1024 local.set 2 \ + (block local.get 1 (if (then local.get 0 local.set 2))) \ + local.get 2 i32.load", + ), + ); + + // C-2: param-nulling arithmetic. `(addr - addr) == 0`, so `+ 65536` is a + // fixed host address regardless of the caller's pointer — the two-point + // lattice cannot prove the operands unequal, so `sub` must not propagate. + push( + "C-2 param-nulling arithmetic store", + mem_main( + "(type (;0;) (func (param i32 i32)))", + "poke", + "(func (;1;) (type 0) (param i32 i32) \ + local.get 0 local.get 1 call 0)", + ), + mem_lib( + "(type (;0;) (func (param i32 i32)))", + "poke", + "(param i32 i32) \ + local.get 0 local.get 0 i32.sub i32.const 65536 i32.add \ + local.get 1 i32.store", + ), + ); + + // C-2b: add-side algebraic cancellation `(C - p) + p == C`. The round-2 + // `sub` rule demotes `const - param` to NotParam, but that value is a + // *negated* parameter, not a constant; the `add` rule must not re-promote a + // `Param + NotParam` to Param, or `(C - p) + p` recovers the fixed host + // address C and aliases the host's own memory at offset 65536 for every + // caller pointer. End-to-end mirror of the headline reproduction. 
+ push( + "C-2b add-side cancellation store", + mem_main( + "(type (;0;) (func (param i32 i32)))", + "poke", + "(func (;1;) (type 0) (param i32 i32) \ + local.get 0 local.get 1 call 0)", + ), + mem_lib( + "(type (;0;) (func (param i32 i32)))", + "poke", + "(param i32 i32) \ + i32.const 65536 local.get 0 i32.sub local.get 0 i32.add \ + local.get 1 i32.store", + ), + ); + + // C-2b (commuted): p + (C - p) == C, the operand-order mirror, where the + // param is the first `add` operand and the negated-param NotParam is on top. + push( + "C-2b add-side cancellation store (commuted)", + mem_main( + "(type (;0;) (func (param i32 i32)))", + "poke", + "(func (;1;) (type 0) (param i32 i32) \ + local.get 0 local.get 1 call 0)", + ), + mem_lib( + "(type (;0;) (func (param i32 i32)))", + "poke", + "(param i32 i32) \ + local.get 0 i32.const 65536 local.get 0 i32.sub i32.add \ + local.get 1 i32.store", + ), + ); + + // C-3: a constant address laundered across a `call` boundary. `$peek` + // discards a const and calls a helper that loads through its own (untrusted) + // param; the multi-function memory closure must be rejected. + push( + "C-3 call-laundered load", + mem_main( + "(type (;0;) (func (param i32 i32) (result i32)))", + "peek", + "(func (;1;) (type 0) (param i32 i32) (result i32) \ + local.get 0 local.get 1 call 0)", + ), + try_wasm( + "(module (type (;0;) (func (param i32 i32) (result i32))) \ + (type (;1;) (func (param i32) (result i32))) \ + (memory (;0;) 1) \ + (func (;0;) (type 0) (param i32 i32) (result i32) \ + i32.const 1024 call 1) \ + (func (;1;) (type 1) (param i32) (result i32) \ + local.get 0 i32.load) \ + (export \"peek\" (func 0)))", + ), + ); + + // C-4: a `memory64` external folded onto a memoryless main, addressing its + // i64 memory directly through its i64 parameter so provenance accepts it and + // the shape guard — not provenance — is what must fire. 
The `.wasm` would be + // a 64-bit machine but the `.v` a 32-bit one, so the merge must reject the + // shape outright rather than adopt it. The main imports `load_at` from + // `mathlib` (the module label the sweep tags externals with). + push( + "C-4 memory64 external onto memoryless main", + try_wasm( + "(module (type (;0;) (func (param i64) (result i64))) \ + (import \"mathlib\" \"load_at\" (func (;0;) (type 0))) \ + (func (;1;) (type 0) (param i64) (result i64) local.get 0 call 0) \ + (export \"run\" (func 1)))", + ), + try_wasm( + "(module (type (;0;) (func (param i64) (result i64))) \ + (memory (;0;) i64 1) \ + (func (;0;) (type 0) (param i64) (result i64) local.get 0 i64.load) \ + (export \"load_at\" (func 0)))", + ), + ); + + // H-3: a deeply-nested external body the merge must reject before it can + // later abort the wasm-to-v translator's unbounded recursion. + if let Some(lib) = { + let mut body = String::new(); + for _ in 0..5_000 { + body.push_str("block "); + } + for _ in 0..5_000 { + body.push_str("end "); + } + try_wasm(&format!( + "(module (type (;0;) (func (param i32 i32) (result i32))) \ + (func (;0;) (type 0) (param i32 i32) (result i32) {body} \ + local.get 0 local.get 1 i32.add) \ + (export \"sum\" (func 0)))" + )) + } { + probes.push(Probe { + label: "H-3 deeply-nested external body", + main: main_importing_sum(), + externals: vec![lib], + expect: Expect::Rejected, + }); + } + + // M-1: an over-declared locals count (the value a 6-byte locals group can + // set). The universal pre-validation gate must reject it before provenance + // sizes a per-local `vec!` — no multi-GB allocation. + probes.push(Probe { + label: "M-1 over-declared locals", + main: main_importing_sum(), + externals: vec![over_declared_locals_external(u32::MAX)], + expect: Expect::Rejected, + }); + + // M-2: a main module carrying an active data segment. 
`emit` rebuilds the + // main without a data section, so a surviving merge would silently drop the + // initializer; the guard must reject it. + probes.push(Probe { + label: "M-2 main-side data segment", + main: wasm( + r#" + (module + (type (;0;) (func (param i32 i32) (result i32))) + (import "mathlib" "sum" (func (;0;) (type 0))) + (memory (;0;) 1 1) + (data (;0;) (i32.const 0) "\2a\00\00\00") + (func (;1;) (type 0) (param i32 i32) (result i32) + local.get 0 local.get 1 call 0) + (export "compute" (func 1))) + "#, + ), + externals: vec![mathlib_pure()], + expect: Expect::Rejected, + }); + + probes +} + +/// A deterministic single-byte and truncation sweep over an assembled valid +/// external. Flipping bytes in a real module produces a large family of +/// structurally-broken inputs (bad section lengths, dangling indices, illegal +/// opcodes) that the merge must reject cleanly. The stride keeps the test fast +/// while still covering every section boundary region. +fn mutation_probes() -> Vec { + let main = main_importing_sum(); + let base = mathlib_pure(); + let mut probes = Vec::new(); + + // A single-byte flip at every offset of the valid module. The base module is + // small (tens of bytes), so the full sweep stays fast while covering every + // section header, length prefix, type, index, and opcode byte. + for offset in 0..base.len() { + let mut bytes = base.clone(); + bytes[offset] ^= 0xFF; + probes.push(Probe { + label: "byte-flip mutation", + main: main.clone(), + externals: vec![bytes], + // A flipped byte may break a length prefix, an index, or an opcode — + // or, rarely, leave a still-valid (differently-shaped) module. Either + // a clean `Err` or a validating `Ok` is acceptable. + expect: Expect::CleanOutcome, + }); + } + + // Progressive truncations: a representative set of prefixes of the valid + // module, so a length prefix can name more bytes than remain. 
+ for cut in (1..base.len()).step_by(3) { + probes.push(Probe { + label: "truncation mutation", + main: main.clone(), + externals: vec![base[..cut].to_vec()], + expect: Expect::CleanOutcome, + }); + } + + probes +} + +#[test] +fn adversarial_corpus_never_panics_and_only_emits_valid_modules() { + let probes = seed_probes() + .into_iter() + .chain(mutation_probes()) + .collect::>(); + + // The sweep must be substantial — guard against a refactor silently emptying + // the corpus (e.g. a builder helper that starts returning nothing). + assert!( + probes.len() > 30, + "the adversarial corpus is unexpectedly small ({} probes)", + probes.len() + ); + + let mut merged_ok = 0usize; + for probe in &probes { + let pairs: Vec<(&str, &[u8])> = probe + .externals + .iter() + .map(|bytes| ("mathlib", bytes.as_slice())) + .collect(); + + // `link` is panic-free by contract; wrap it so a reintroduced panic fails + // here with the offending fixture named rather than aborting the run. + let outcome = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| { + raw_link(&probe.main, &pairs) + })); + + let result = outcome.unwrap_or_else(|_| { + panic!("`{}`: link panicked on adversarial input — it must return an Err", probe.label) + }); + + match (&result, probe.expect) { + // A merge must always be structurally valid, whatever the expectation. + (Ok(merged), _) => { + inf_wasmparser::validate(merged).unwrap_or_else(|e| { + panic!( + "`{}`: link returned Ok but the merged module fails validation: {e}", + probe.label + ) + }); + // A soundness reproduction that *merges* is a silent miscompile — + // the worst outcome — even though the bytes validate. + assert!( + probe.expect != Expect::Rejected, + "`{}`: link merged a soundness reproduction into a valid-but-wrong \ + module; it must reject it cleanly", + probe.label + ); + merged_ok += 1; + } + // A returned error is correct for `Rejected` and acceptable for + // `CleanOutcome`, but a regression for a positive control. 
+ (Err(e), Expect::Merges) => panic!( + "`{}`: a legitimate probe must merge, got a rejection: {e}", + probe.label + ), + (Err(_), _) => {} + } + } + + // At least the pure control must have merged successfully, proving the sweep + // is not vacuously rejecting everything (which would make the `Ok ⇒ valid` + // arm untested). + assert!( + merged_ok >= 1, + "no probe merged successfully; the `Ok ⇒ valid` invariant went untested" + ); +} + +// -- H-3: deeply-nested external rejected at the merge ---------------------- + +/// Builds a main importing `deep` and an external `deep` whose body nests +/// `depth` empty `block`s. Mirrors the adversarial external that, merged +/// unchecked, would later overflow the wasm-to-v translator's recursion on the +/// `-v` proof path. +fn deep_nesting_main_and_lib(depth: usize) -> (Vec, Vec) { + let main = wasm( + r#" + (module + (type (;0;) (func)) + (import "deeplib" "deep" (func (;0;) (type 0))) + (func (;1;) (type 0) call 0) + (export "run" (func 1))) + "#, + ); + + let mut body = String::new(); + for _ in 0..depth { + body.push_str("block "); + } + for _ in 0..depth { + body.push_str("end "); + } + let lib = wasm(&format!( + r#"(module (func (;0;) (export "deep") {body}))"# + )); + (main, lib) +} + +/// H-3: an external whose body nests structured control flow far past the +/// merge's cap must be rejected with a clean [`LinkError`] on the link/`-o` +/// path — never merged into the output where it would later abort the +/// wasm-to-v translator (an unrecoverable SIGABRT) on the `-v` path. 
+#[test] +fn deeply_nested_external_body_is_rejected_at_link() { + let (main, lib) = deep_nesting_main_and_lib(5_000); + let err = link(&main, &[&lib]).expect_err("a deeply-nested external must be rejected"); + match err { + LinkError::UnsupportedConstruct(msg) => { + assert!( + msg.contains("nests structured control flow"), + "the diagnostic should name the nesting-depth limit: {msg}" + ); + } + other => panic!("expected UnsupportedConstruct for deep nesting, got {other:?}"), + } +} + +/// H-3: an external nested within the cap still merges cleanly, so the guard +/// rejects only pathological depth, never a legitimately nested function. +#[test] +fn external_nested_within_the_cap_merges() { + let (main, lib) = deep_nesting_main_and_lib(16); + let linked = link(&main, &[&lib]).expect("a modestly-nested external should merge"); + assert_valid(&linked); + assert!(function_imports(&linked).is_empty()); +} + +// -- H-4: deterministic name for a nameless merged inner callee ------------- + +/// H-4: a merged external's nameless inner callee must receive a deterministic +/// name derived from its output function index, so the merged module always +/// carries a complete name section and the downstream `.v` is reproducible. +/// +/// The external (built from plain WAT) has no name section, so its inner callee +/// `func 1` starts nameless. The closure root that satisfies the import is +/// renamed to the import field, prefixed with its logical module +/// (`lib.compute`); the inner callee must be filled with `.func_` +/// (`lib.func_2`) rather than left nameless (which previously forced wasm-to-v +/// down a per-process random-UUID path). The `lib.` prefix sanitizes to `lib_` +/// in the downstream Rocq names. 
+#[test] +fn nameless_merged_inner_callee_gets_deterministic_name() { + let main = wasm( + r#" + (module + (type (;0;) (func (param i32) (result i32))) + (import "lib" "compute" (func (;0;) (type 0))) + (func (;1;) (type 0) (param i32) (result i32) + local.get 0 + call 0) + (export "run" (func 1))) + "#, + ); + // No name section: the root `compute` (func 0) calls a nameless inner + // helper (func 1). + let lib = wasm( + r#" + (module + (type (;0;) (func (param i32) (result i32))) + (func (;0;) (type 0) (param i32) (result i32) + local.get 0 + call 1) + (func (;1;) (type 0) (param i32) (result i32) + local.get 0 + i32.const 1 + i32.add) + (export "compute" (func 0))) + "#, + ); + + let linked = link(&main, &[&lib]).expect("closure with a nameless inner callee should merge"); + assert_valid(&linked); + + let names = function_names(&linked); + // The closure root is named after the import field it satisfies, prefixed + // with its logical module. + assert!( + names.iter().any(|(_, n)| n == "lib.compute"), + "the closure root should be named `lib.compute`: {names:?}" + ); + // Every merged function carries a name (a complete name section), and the + // nameless inner callee is named deterministically from its output index, + // under the same `.` namespace as the root — never left out of the + // section. + assert!( + names.iter().any(|(_, n)| n == "lib.func_2"), + "the nameless inner callee should get a deterministic `lib.func_` name: {names:?}" + ); + // No UUID-style name leaks in: a deterministic fallback name is a plain + // `.func_` whose suffix after the last `.` parses as an integer. 
+ for (_, n) in &names { + if let Some(suffix) = n.rsplit('.').next().and_then(|s| s.strip_prefix("func_")) { + assert!( + suffix.parse::().is_ok(), + "a `func_`-prefixed fallback name must be index-derived, not a UUID: {n}" + ); + } + } +} + +// -- H-2 (corrected): verification-only constructs in externals ------------- +// +// Inference's non-deterministic blocks (`forall`/`exists`/`assume`/`unique`) and +// uzumaki rvalues (`i32`/`i64.uzumaki`) are verification-only: they have meaning +// only in the Rocq lowering and no executable runtime semantics. When building +// an executable binary, an *external* whose merged-closure body carries one of +// these opcodes would yield a non-executable output (a miscompile), so the +// linker must reject it with a clean `LinkError`. (The main module in proof mode +// legitimately carries these opcodes as proof scaffolding and must pass through +// unaffected — covered by the wasm-to-v proof-path tests.) Separately, spec +// functions and the `inference.spec_funcs` custom section in an external are +// stripped: they are never in the executable closure and are not merged. + +/// The single-byte `0xfc` sub-opcode for each Inference non-det block, matching +/// the codegen and `inf-wasmparser` decoder. +const NONDET_SUBOPCODES: &[(u8, &str)] = &[ + (0x3a, "forall"), + (0x3b, "exists"), + (0x3c, "assume"), + (0x3d, "unique"), +]; + +/// The `0xfc` sub-opcode for each uzumaki rvalue. +const UZUMAKI_SUBOPCODES: &[(u8, &str)] = &[(0x31, "i32.uzumaki"), (0x32, "i64.uzumaki")]; + +/// Builds an external exporting `sum:(i32,i32)->i32` whose body opens an +/// Inference non-det block (`sub_opcode`) with an empty block type, then +/// computes the sum. `wat` cannot assemble the custom `0xfc`-prefixed opcode, so +/// the body is hand-encoded: +/// ` (empty) end; local.get 0; local.get 1; i32.add; end`. 
+/// +/// The non-det block is verification-only, so an executable merge of this +/// external must reject it rather than copy it into a non-executable output. +fn lib_with_nondet_block(sub_opcode: u8) -> Vec { + use wasm_encoder::{ + CodeSection, ExportKind, ExportSection, Function, FunctionSection, Instruction, Module, + TypeSection, ValType, + }; + + let mut module = Module::new(); + + let mut types = TypeSection::new(); + types + .ty() + .function([ValType::I32, ValType::I32], [ValType::I32]); + module.section(&types); + + let mut funcs = FunctionSection::new(); + funcs.function(0); + module.section(&funcs); + + let mut exports = ExportSection::new(); + exports.export("sum", ExportKind::Func, 0); + module.section(&exports); + + let mut code = CodeSection::new(); + let mut f = Function::new([]); + // ` (empty)` = 0xfc 0x40. The empty block has no stack + // effect, so the surrounding stack stays valid. + f.raw([0xfc, sub_opcode, 0x40]); + f.instruction(&Instruction::End); // close the non-det block + f.instruction(&Instruction::LocalGet(0)); + f.instruction(&Instruction::LocalGet(1)); + f.instruction(&Instruction::I32Add); + f.instruction(&Instruction::End); + code.function(&f); + module.section(&code); + + module.finish() +} + +/// Builds an external exporting `sum:(i32,i32)->i32` whose body contains an +/// uzumaki rvalue (`sub_opcode`), immediately dropped to keep the stack +/// balanced. The uzumaki rvalue is verification-only, so an executable merge of +/// this external must reject it. 
+fn lib_with_uzumaki(sub_opcode: u8) -> Vec { + use wasm_encoder::{ + CodeSection, ExportKind, ExportSection, Function, FunctionSection, Instruction, Module, + TypeSection, ValType, + }; + + let mut module = Module::new(); + + let mut types = TypeSection::new(); + types + .ty() + .function([ValType::I32, ValType::I32], [ValType::I32]); + module.section(&types); + + let mut funcs = FunctionSection::new(); + funcs.function(0); + module.section(&funcs); + + let mut exports = ExportSection::new(); + exports.export("sum", ExportKind::Func, 0); + module.section(&exports); + + let mut code = CodeSection::new(); + let mut f = Function::new([]); + // `` = 0xfc ; it pushes a value, dropped immediately. + f.raw([0xfc, sub_opcode]); + f.instruction(&Instruction::Drop); + f.instruction(&Instruction::LocalGet(0)); + f.instruction(&Instruction::LocalGet(1)); + f.instruction(&Instruction::I32Add); + f.instruction(&Instruction::End); + code.function(&f); + module.section(&code); + + module.finish() +} + +#[test] +fn external_nondet_block_is_rejected_as_non_executable() { + // H-2 (corrected): each non-det block in an external's merged-closure body is + // verification-only, so an executable merge must reject it with a clean + // `LinkError` rather than copy it into a non-executable output. Covers + // forall/exists/assume/unique. + let main = main_importing_sum(); + for &(sub_opcode, name) in NONDET_SUBOPCODES { + let lib = lib_with_nondet_block(sub_opcode); + let err = assert_clean_rejection(&main, &lib, name); + if let LinkError::UnsupportedConstruct(msg) = &err { + assert!( + msg.contains("verification-only"), + "{name}: expected a verification-only rejection, got {msg}" + ); + } + } +} + +#[test] +fn external_uzumaki_is_rejected_as_non_executable() { + // H-2 (corrected): each uzumaki rvalue in an external's merged-closure body + // is verification-only and has no executable semantics, so an executable + // merge must reject it. Covers i32.uzumaki and i64.uzumaki. 
+ let main = main_importing_sum(); + for &(sub_opcode, name) in UZUMAKI_SUBOPCODES { + let lib = lib_with_uzumaki(sub_opcode); + let err = assert_clean_rejection(&main, &lib, name); + if let LinkError::UnsupportedConstruct(msg) = &err { + assert!( + msg.contains("verification-only"), + "{name}: expected a verification-only rejection, got {msg}" + ); + } + } +} + +#[test] +fn external_nondet_functype_block_is_rejected_as_non_executable() { + // H-2 (corrected): the function-typed non-det form is rejected identically to + // the empty form — the construct is verification-only regardless of its block + // type, so the merge never even reaches the block-type remap. The fixture's + // `forall (type 1)` would, under the old (now-wrong) semantics, have been + // remapped and copied; it must now reject cleanly. + use wasm_encoder::{ + CodeSection, ExportKind, ExportSection, Function, FunctionSection, Instruction, Module, + TypeSection, ValType, + }; + + let lib = { + let mut module = Module::new(); + let mut types = TypeSection::new(); + types + .ty() + .function([ValType::I32, ValType::I32], [ValType::I32]); + types.ty().function([], []); + module.section(&types); + let mut funcs = FunctionSection::new(); + funcs.function(0); + module.section(&funcs); + let mut exports = ExportSection::new(); + exports.export("sum", ExportKind::Func, 0); + module.section(&exports); + let mut code = CodeSection::new(); + let mut f = Function::new([]); + // `forall (type 1)` = 0xfc 0x3a . 
+ f.raw([0xfc, 0x3a, 0x01]); + f.instruction(&Instruction::End); + f.instruction(&Instruction::LocalGet(0)); + f.instruction(&Instruction::LocalGet(1)); + f.instruction(&Instruction::I32Add); + f.instruction(&Instruction::End); + code.function(&f); + module.section(&code); + module.finish() + }; + + let main = main_importing_sum(); + assert_clean_rejection(&main, &lib, "forall function-typed block"); +} + +/// Builds an external exporting an executable `sum:(i32,i32)->i32` that ALSO +/// carries (1) a separate spec function with a non-det body, and (2) an +/// `inference.spec_funcs` custom section naming it. `sum` does not call the spec +/// function, so the spec function is outside the executable closure: merging +/// `sum` must strip both the spec function and the spec section, with no error. +fn lib_with_spec_function_and_section() -> Vec { + use wasm_encoder::{ + CodeSection, CustomSection, ExportKind, ExportSection, Function, FunctionSection, Instruction, + Module, TypeSection, ValType, + }; + + let mut module = Module::new(); + + let mut types = TypeSection::new(); + // type 0: the `sum` signature; type 1: the spec function signature `()->()`. + types + .ty() + .function([ValType::I32, ValType::I32], [ValType::I32]); + types.ty().function([], []); + module.section(&types); + + let mut funcs = FunctionSection::new(); + funcs.function(0); // func 0: sum + funcs.function(1); // func 1: spec (verification-only body) + module.section(&funcs); + + // Only `sum` is exported, so only it (and its closure) can be merged. + let mut exports = ExportSection::new(); + exports.export("sum", ExportKind::Func, 0); + module.section(&exports); + + let mut code = CodeSection::new(); + // func 0: executable sum, no verification-only opcodes. 
+ let mut sum = Function::new([]); + sum.instruction(&Instruction::LocalGet(0)); + sum.instruction(&Instruction::LocalGet(1)); + sum.instruction(&Instruction::I32Add); + sum.instruction(&Instruction::End); + code.function(&sum); + // func 1: spec body carrying a `forall` block — legal in a spec function, + // never executed, and never pulled into `sum`'s closure. + let mut spec = Function::new([]); + spec.raw([0xfc, 0x3a, 0x40]); // forall (empty) + spec.instruction(&Instruction::End); // close forall + spec.instruction(&Instruction::End); // close function + code.function(&spec); + module.section(&code); + + // An `inference.spec_funcs` section naming the spec function (index 1). + // version=1, count=1, name_len=1 'S', idx_count=1, idx=1. + let spec_section_payload = [1u8, 1, 1, b'S', 1, 1]; + module.section(&CustomSection { + name: "inference.spec_funcs".into(), + data: (&spec_section_payload[..]).into(), + }); + + module.finish() +} + +#[test] +fn external_spec_function_and_section_are_stripped_when_building_an_executable() { + // (1) An external that ALSO contains a spec function (verification-only body) + // and an `inference.spec_funcs` section must link successfully when building + // an executable: the spec function is outside the executable closure of the + // satisfied export, so it is not merged, and the spec section is stripped — + // no error on its presence. + let main = main_importing_sum(); + let lib = lib_with_spec_function_and_section(); + + let linked = link(&main, &[&lib]).expect("an external with specs must link, with specs stripped"); + assert_valid(&linked); + assert!(function_imports(&linked).is_empty()); + + // The merged output must carry NO `inference.spec_funcs` section from the + // external (the main module here has none either, so the section is absent). 
+ assert!( + custom_section_data(&linked, "inference.spec_funcs").is_none(), + "an external's spec section must not be merged into the executable output" + ); + + // No merged function body may carry a verification-only opcode: the spec + // function (with its `forall`) must have been stripped, not merged. + for payload in Parser::new(0).parse_all(&linked) { + if let Payload::CodeSectionEntry(body) = payload.unwrap() { + for op in body.get_operators_reader().unwrap() { + assert!( + !matches!( + op.unwrap(), + Operator::Forall { .. } + | Operator::Exists { .. } + | Operator::Assume { .. } + | Operator::Unique { .. } + | Operator::I32Uzumaki { .. } + | Operator::I64Uzumaki { .. } + ), + "no merged executable body may carry a verification-only opcode" + ); + } + } + } +} + +#[test] +fn external_malformed_spec_section_does_not_fail_the_link() { + // An external's spec section is stripped, so even a *malformed* one must not + // fail the link: it is irrelevant to the executable merge. Build a valid + // executable `sum` external, then append a garbage `inference.spec_funcs` + // section (a bogus version byte the main-module decoder would reject). 
+ use wasm_encoder::{ + CodeSection, CustomSection, ExportKind, ExportSection, Function, FunctionSection, Instruction, + Module, TypeSection, ValType, + }; + + let lib = { + let mut module = Module::new(); + let mut types = TypeSection::new(); + types + .ty() + .function([ValType::I32, ValType::I32], [ValType::I32]); + module.section(&types); + let mut funcs = FunctionSection::new(); + funcs.function(0); + module.section(&funcs); + let mut exports = ExportSection::new(); + exports.export("sum", ExportKind::Func, 0); + module.section(&exports); + let mut code = CodeSection::new(); + let mut f = Function::new([]); + f.instruction(&Instruction::LocalGet(0)); + f.instruction(&Instruction::LocalGet(1)); + f.instruction(&Instruction::I32Add); + f.instruction(&Instruction::End); + code.function(&f); + module.section(&code); + // A garbage spec section: version byte 0xff, which the main decoder would + // reject as an unsupported version. For an external it is simply skipped. + module.section(&CustomSection { + name: "inference.spec_funcs".into(), + data: (&[0xffu8, 0xff, 0xff][..]).into(), + }); + module.finish() + }; + + let main = main_importing_sum(); + let linked = + link(&main, &[&lib]).expect("a malformed external spec section must not fail the link"); + assert_valid(&linked); +} + +#[test] +fn malformed_main_spec_section_fails_the_link() { + // COV-4: the *main* module's `inference.spec_funcs` section IS decoded (it + // drives proof-mode translation and is re-emitted re-indexed), so a malformed + // one — here a bogus version byte 0xff — reaching `link()` must be a hard + // `LinkError::Parse`, not silently dropped. This mirrors + // `external_malformed_spec_section_does_not_fail_the_link` for the side that + // is actually decoded. 
+ let main_wat = r#" + (module + (type (;0;) (func (param i32 i32) (result i32))) + (import "mathlib" "sum" (func (;0;) (type 0))) + (func (;1;) (type 0) (param i32 i32) (result i32) + local.get 0 + local.get 1 + call 0) + (export "compute" (func 1))) + "#; + let mut main = wasm(main_wat); + use wasm_encoder::Section as _; + wasm_encoder::CustomSection { + name: "inference.spec_funcs".into(), + data: (&[0xffu8, 0xff, 0xff][..]).into(), + } + .append_to(&mut main); + + let lib = mathlib_pure(); + let err = link(&main, &[&lib]) + .expect_err("a malformed main spec section must be a hard link error"); + assert!( + matches!(&err, LinkError::Parse(_)), + "expected a Parse error for the malformed main spec section, got {err:?}" + ); +} + +/// Builds a proof-mode MAIN module that imports `sum` and whose own exported +/// body carries verification-only opcodes (a `forall` block and an +/// `i32.uzumaki`) as Rocq proof scaffolding, alongside an executable `call` to +/// the import. `wat` cannot assemble the custom opcodes, so the whole module is +/// hand-encoded. 
+fn proof_mode_main_with_nondet_and_uzumaki() -> Vec<u8> { + use wasm_encoder::{ + CodeSection, EntityType, ExportKind, ExportSection, Function, FunctionSection, ImportSection, + Instruction, Module, TypeSection, ValType, + }; + + let mut module = Module::new(); + + let mut types = TypeSection::new(); + types + .ty() + .function([ValType::I32, ValType::I32], [ValType::I32]); + module.section(&types); + + let mut imports = ImportSection::new(); + imports.import("mathlib", "sum", EntityType::Function(0)); + module.section(&imports); + + let mut funcs = FunctionSection::new(); + funcs.function(0); // the main local function (output index 0 after the import is removed) + module.section(&funcs); + + let mut exports = ExportSection::new(); + exports.export("compute", ExportKind::Func, 1); // import is 0, local is 1 + module.section(&exports); + + let mut code = CodeSection::new(); + let mut f = Function::new([]); + // Proof scaffolding: a `forall (empty)` block and an `i32.uzumaki` (dropped), + // both verification-only and legal in the main module. + f.raw([0xfc, 0x3a, 0x40]); // forall (empty) + f.instruction(&Instruction::End); // close forall + f.raw([0xfc, 0x31]); // i32.uzumaki + f.instruction(&Instruction::Drop); + // Executable tail: sum(arg0, arg1) via the (to-be-merged) import. + f.instruction(&Instruction::LocalGet(0)); + f.instruction(&Instruction::LocalGet(1)); + f.instruction(&Instruction::Call(0)); // call the imported `sum` + f.instruction(&Instruction::End); + code.function(&f); + module.section(&code); + + module.finish() +} + +#[test] +fn proof_mode_main_nondet_and_uzumaki_survive_the_merge() { + // (c): a proof-mode MAIN module carrying non-det/uzumaki opcodes that links a + // plain executable external must still compile, and its verification-only + // opcodes must survive into the linked output unaltered (they are Rocq proof + // scaffolding the merge must not strip, reject, or alter — only the MAIN + // module is exempt; externals are rejected).
+ let main = proof_mode_main_with_nondet_and_uzumaki(); + let lib = mathlib_pure(); + + let linked = link(&main, &[&lib]).expect("proof-mode main with non-det/uzumaki must link"); + assert_valid(&linked); + assert!(function_imports(&linked).is_empty()); + + let mut saw_forall = false; + let mut saw_uzumaki = false; + for payload in Parser::new(0).parse_all(&linked) { + if let Payload::CodeSectionEntry(body) = payload.unwrap() { + for op in body.get_operators_reader().unwrap() { + match op.unwrap() { + Operator::Forall { .. } => saw_forall = true, + Operator::I32Uzumaki { .. } => saw_uzumaki = true, + _ => {} + } + } + } + } + assert!( + saw_forall, + "the main module's `forall` proof scaffolding must survive the merge" + ); + assert!( + saw_uzumaki, + "the main module's `i32.uzumaki` proof scaffolding must survive the merge" + ); +} + +// -- M-1 / M-2: the public `link` API is self-defending ----------------------- +// +// `link` is an entry point in its own right; its contract previously only +// *assumed* pre-validated externals (the CLI driver validates, the library API +// did not). These tests pin the two universal backstops: a structural +// pre-validation gate over every external (M-1) and a main-side data/element +// guard (M-2). + +/// Appends `value` as a little-endian base-128 (unsigned LEB128) varint, the +/// encoding WASM uses for counts and indices. +fn push_uleb(out: &mut Vec<u8>, mut value: u32) { + loop { + let mut byte = (value & 0x7f) as u8; + value >>= 7; + if value != 0 { + byte |= 0x80; + } + out.push(byte); + if value == 0 { + break; + } + } +} + +/// Wraps `section_bytes` in a section with the given `id`, prefixing the byte +/// length WASM section framing requires.
+fn framed_section(id: u8, section_bytes: &[u8]) -> Vec<u8> { + let mut out = vec![id]; + push_uleb(&mut out, section_bytes.len() as u32); + out.extend_from_slice(section_bytes); + out +} + +/// Hand-assembles a memory-using external exporting `sum : (i32, i32) -> i32` +/// whose single function over-declares its locals count as `locals_count`. A +/// real assembler cannot emit this — `wat`/`wasm_encoder` compute the locals +/// header from the declared types — so the code section is written byte-by-byte. +/// +/// With `locals_count = u32::MAX` this is the M-1 reproduction: the value a +/// 6-byte locals group can set, which the provenance interpreter would once have +/// turned into a ~4.3 GB `vec!`. The module is deliberately *invalid* WASM (the +/// declared locals do not exist), so the universal pre-validation gate must +/// reject it before any byte reaches provenance. +fn over_declared_locals_external(locals_count: u32) -> Vec<u8> { + // Type section: one `(func (param i32 i32) (result i32))`. + let type_section = framed_section( + 0x01, + &[ + 0x01, // one type + 0x60, // func + 0x02, 0x7f, 0x7f, // two i32 params + 0x01, 0x7f, // one i32 result + ], + ); + + // Function section: one function of type 0. + let function_section = framed_section(0x03, &[0x01, 0x00]); + + // Memory section: one memory, min 1 page (so the body's `i32.load` is + // structurally placed against a real memory). + let memory_section = framed_section(0x05, &[0x01, 0x00, 0x01]); + + // Export section: `sum` -> func 0. + let mut export_payload = vec![0x01]; // one export + push_uleb(&mut export_payload, 3); // name length ("sum") + export_payload.extend_from_slice(b"sum"); + export_payload.push(0x00); // kind: func + export_payload.push(0x00); // func index 0 + let export_section = framed_section(0x07, &export_payload); + + // Code section: one body whose single locals group claims `locals_count` + // i32 locals, then loads from address 0 and returns a constant.
The + // over-declaration is the payload. + let mut body = Vec::new(); + body.push(0x01); // one locals group + push_uleb(&mut body, locals_count); // (count, i32) — the over-declaration + body.push(0x7f); // i32 + body.extend_from_slice(&[0x41, 0x00]); // i32.const 0 + body.extend_from_slice(&[0x28, 0x02, 0x00]); // i32.load (align 2, offset 0) + body.push(0x1a); // drop + body.extend_from_slice(&[0x41, 0x00]); // i32.const 0 (the i32 result) + body.push(0x0b); // end + + let mut code_payload = vec![0x01]; // one code entry + push_uleb(&mut code_payload, body.len() as u32); // body size + code_payload.extend_from_slice(&body); + let code_section = framed_section(0x0a, &code_payload); + + let mut module = Vec::new(); + module.extend_from_slice(b"\0asm"); // magic + module.extend_from_slice(&[0x01, 0x00, 0x00, 0x00]); // version 1 + module.extend_from_slice(&type_section); + module.extend_from_slice(&function_section); + module.extend_from_slice(&memory_section); + module.extend_from_slice(&export_section); + module.extend_from_slice(&code_section); + module +} + +#[test] +fn over_declared_locals_external_via_link_is_rejected_without_huge_alloc() { + // M-1 on the public library path: a tiny external whose single locals group + // claims `u32::MAX` locals, linked through the public `link` API. The + // universal pre-validation gate must reject it as a clean `LinkError` before + // the provenance interpreter would size a per-local `vec!` — no multi-GB + // allocation, no panic, no hang. 
+ let main = main_importing_sum(); + let lib = over_declared_locals_external(u32::MAX); + + let err = link(&main, &[&lib]).expect_err("an over-declared-locals external must be rejected"); + assert!( + matches!(err, LinkError::Parse(_)), + "expected a clean Parse rejection from the pre-validation gate, got {err:?}" + ); +} + +#[test] +fn main_module_with_a_data_segment_is_rejected_cleanly() { + // M-2: a main module carrying an active data segment must be rejected, not + // silently merged. `emit` rebuilds the main module without a `DataSection`, + // so a surviving merge would drop the initializer — a valid-but-wrong + // `.wasm`/`.v`. The guard rejects up front with a clean diagnostic. + let main = wasm( + r#" + (module + (type (;0;) (func (param i32 i32) (result i32))) + (import "mathlib" "sum" (func (;0;) (type 0))) + (memory (;0;) 1 1) + (data (;0;) (i32.const 0) "\2a\00\00\00") + (func (;1;) (type 0) (param i32 i32) (result i32) + local.get 0 + local.get 1 + call 0) + (export "compute" (func 1))) + "#, + ); + let lib = mathlib_pure(); + + let err = link(&main, &[&lib]).expect_err("a main-side data segment must be rejected"); + assert!( + matches!(&err, LinkError::UnsupportedConstruct(msg) if msg.contains("data segment")), + "expected an UnsupportedConstruct naming the main-side data segment, got {err:?}" + ); +} + +#[test] +fn main_module_with_an_element_segment_is_rejected_cleanly() { + // M-2 (element half): a main module carrying an element segment must be + // rejected too. `emit` omits both the main `TableSection` and any + // `ElementSection`, so a surviving merge would orphan the element's table + // reference. The guard rejects it as a clean diagnostic rather than relying + // on the post-merge validate gate to catch the orphan. 
+ let main = wasm( + r#" + (module + (type (;0;) (func (param i32 i32) (result i32))) + (import "mathlib" "sum" (func (;0;) (type 0))) + (table (;0;) 1 1 funcref) + (elem (;0;) (i32.const 0) func 1) + (func (;1;) (type 0) (param i32 i32) (result i32) + local.get 0 + local.get 1 + call 0) + (export "compute" (func 1))) + "#, + ); + let lib = mathlib_pure(); + + let err = link(&main, &[&lib]).expect_err("a main-side element segment must be rejected"); + assert!( + matches!(&err, LinkError::UnsupportedConstruct(msg) if msg.contains("element segment")), + "expected an UnsupportedConstruct naming the main-side element segment, got {err:?}" + ); +} + +// -- WASM 1.0 feature gate: supported post-MVP additions link ------------------ +// +// `SUPPORTED_WASM_FEATURES` is the integer WASM 1.0 core plus exactly one scalar +// post-MVP addition the merge models: bulk memory. An external using only this +// must pass the link gate and merge normally — the gate rejects *every* other +// post-1.0 proposal, including sign-extension and saturating float-to-int (the +// Rocq translator models neither), and all floating point (the Inference language +// has no `f32`/`f64` types). + +/// A main module importing a pure `f:(i32)->i32` from `lib` and calling it. The +/// shared shape for the feature-gate fixtures, each of which supplies a `lib` +/// exporting `f` whose body exercises one post-MVP op. +fn main_importing_f() -> Vec<u8> { + wasm( + r#" + (module + (type (;0;) (func (param i32) (result i32))) + (import "lib" "f" (func (;0;) (type 0))) + (func (;1;) (type 0) (param i32) (result i32) + local.get 0 + call 0) + (export "run" (func 1))) + "#, + ) +} + +#[test] +fn sign_extension_external_is_rejected_at_the_feature_gate() { + // The sign-extension proposal (`i32.extend8_s`) is outside the supported + // subset: the Rocq translator has no lowering for it, and Inference codegen + // narrows sub-i32 values with shifts/masks instead of emitting it.
The gate's + // feature pass rejects such an external up front with the validator's + // sign-extension diagnostic — before the closure scanner's allow-list (the + // defense-in-depth backstop, tested directly in `safety.rs`) is reached. + let main = main_importing_f(); + let lib = wasm( + r#" + (module + (type (;0;) (func (param i32) (result i32))) + (func (;0;) (type 0) (param i32) (result i32) + local.get 0 + i32.extend8_s) + (export "f" (func 0))) + "#, + ); + let err = assert_clean_rejection(&main, &lib, "sign extension"); + assert!( + matches!(&err, LinkError::UnsupportedWasmFeature { details, .. } if details.contains("sign extension")), + "expected an UnsupportedWasmFeature naming sign extension, got {err:?}" + ); +} + +#[test] +fn saturating_float_to_int_external_is_rejected_at_the_feature_gate() { + // The saturating float-to-int proposal (`i32.trunc_sat_f32_s`) is outside the + // supported subset: the Rocq translator has no lowering for it, and its + // operand is a float — and the Inference language has no `f32`/`f64` types. + // The body takes an f32 and returns an i32, so the gate rejects it on the + // float type first; the validator names floating point. + let main = wasm( + r#" + (module + (type (;0;) (func (param f32) (result i32))) + (import "lib" "f" (func (;0;) (type 0))) + (func (;1;) (type 0) (param f32) (result i32) + local.get 0 + call 0) + (export "run" (func 1))) + "#, + ); + let lib = wasm( + r#" + (module + (type (;0;) (func (param f32) (result i32))) + (func (;0;) (type 0) (param f32) (result i32) + local.get 0 + i32.trunc_sat_f32_s) + (export "f" (func 0))) + "#, + ); + let err = assert_clean_rejection(&main, &lib, "saturating float-to-int"); + assert!( + matches!(&err, LinkError::UnsupportedWasmFeature { details, .. 
} if details.contains("floating-point")), + "expected an UnsupportedWasmFeature naming floating point, got {err:?}" + ); +} + +#[test] +fn bulk_memory_copy_external_passes_the_gate_and_merges() { + // The bulk-memory proposal (`memory.copy` over the single shared memory) is in + // the supported subset. The external declares the memory and copies a region; + // the param-addressed copy is Tier B and folds onto the shared memory. + // + // The copy's dest, src, AND length are all caller-passed (`copy(dst, src, + // len)`): under the S1 extent rule a constant copy length would reject at + // Tier B (an unbounded clobber above the caller's pointer), so the realistic + // caller-owns-`(ptr, len)` shape is what keeps the bulk-memory opcode + // mergeable. The assertion remains that the bulk-memory opcode passes the + // WASM-1.0 feature gate and the body links. + let main = wasm( + r#" + (module + (type (;0;) (func (param i32 i32 i32))) + (import "lib" "f" (func (;0;) (type 0))) + (memory (;0;) 1) + (func (;1;) (type 0) (param i32 i32 i32) + local.get 0 + local.get 1 + local.get 2 + call 0) + (export "run" (func 1))) + "#, + ); + let lib = wasm( + r#" + (module + (type (;0;) (func (param i32 i32 i32))) + (memory (;0;) 1) + (func (;0;) (type 0) (param i32 i32 i32) + local.get 0 + local.get 1 + local.get 2 + memory.copy) + (export "f" (func 0))) + "#, + ); + let linked = link(&main, &[&lib]).expect("a bulk-memory external must link"); + assert_valid(&linked); + assert!(function_imports(&linked).is_empty()); + assert_eq!(code_body_count(&linked), 2); +} + +#[test] +fn plain_mvp_external_passes_the_gate_and_merges() { + // A baseline: a pure MVP external (no post-MVP op at all) passes the gate. 
+ let main = main_importing_f(); + let lib = wasm( + r#" + (module + (type (;0;) (func (param i32) (result i32))) + (func (;0;) (type 0) (param i32) (result i32) + local.get 0 + i32.const 1 + i32.add) + (export "f" (func 0))) + "#, + ); + let linked = link(&main, &[&lib]).expect("a plain MVP external must link"); + assert_valid(&linked); + assert!(function_imports(&linked).is_empty()); + assert_eq!(code_body_count(&linked), 2); +} + +// -- WASM 1.0 feature gate: post-1.0 proposals are rejected naming the feature -- +// +// Each negative case below proves a distinct post-1.0 proposal is rejected *at +// the gate* (before the merge's per-operator/per-section backstops) with a +// feature-named `UnsupportedWasmFeature`. The atomics/SIMD/reference-types/ +// exceptions/memory64/multi-memory/tail-call cases live with the rejection-helper +// tests above (each updated to the gate outcome); this section adds the +// multi-value proposal, which has no per-operator backstop of its own (a +// multi-result block is structurally valid MVP-shaped bytes the allow-list copies +// verbatim) and so relies on the gate as its sole filter. + +#[test] +fn multi_value_block_external_is_rejected_at_the_feature_gate() { + // The multi-value proposal lets a block reference a *type index* (so it can + // take params, not just produce a single inline result). The function-typed + // `block (type 1) (param i32) (result i32)` below is well-formed under the + // parser's default features but outside the supported WASM 1.0 subset, so the + // gate's feature pass rejects it naming multi-value. Unlike SIMD/atomics/etc., + // multi-value carries no distinguishing opcode the allow-list could catch — the + // gate is its only filter, which is precisely why the explicit feature contract + // matters here. 
+ let main = main_importing_f(); + let lib = wasm( + r#" + (module + (type (;0;) (func (param i32) (result i32))) + (type (;1;) (func (param i32) (result i32))) + (func (;0;) (type 0) (param i32) (result i32) + local.get 0 + (block (type 1) (param i32) (result i32))) + (export "f" (func 0))) + "#, + ); + let err = assert_clean_rejection(&main, &lib, "multi-value block"); + assert!( + matches!(&err, LinkError::UnsupportedWasmFeature { details, .. } if details.contains("multi-value")), + "expected an UnsupportedWasmFeature naming multi-value, got {err:?}" + ); +} + +#[test] +fn multi_result_function_external_is_rejected_at_the_feature_gate() { + // The multi-value proposal also lets a *function* return more than one value. + // A `(result i32 i32)` function signature is well-formed under default + // features but outside the supported subset, so the gate rejects it naming + // multi-value at the type-section level — before any body is scanned. + let main = wasm( + r#" + (module + (type (;0;) (func (param i32) (result i32 i32))) + (import "lib" "f" (func (;0;) (type 0))) + (type (;1;) (func (param i32) (result i32))) + (func (;1;) (type 1) (param i32) (result i32) + local.get 0 + call 0 + drop) + (export "run" (func 1))) + "#, + ); + let lib = wasm( + r#" + (module + (type (;0;) (func (param i32) (result i32 i32))) + (func (;0;) (type 0) (param i32) (result i32 i32) + local.get 0 + local.get 0) + (export "f" (func 0))) + "#, + ); + let err = assert_clean_rejection(&main, &lib, "multi-result function"); + assert!( + matches!(&err, LinkError::UnsupportedWasmFeature { details, .. } if details.contains("multi-value")), + "expected an UnsupportedWasmFeature naming multi-value, got {err:?}" + ); +} + +// -- Floating point: no f32/f64 anywhere, rejected at the gate ---------------- +// +// The Inference language has no `f32`/`f64` types: codegen never emits a float +// operator, value type, or constant, and the Rocq translator models none. 
The +// feature gate (`SUPPORTED_WASM_FEATURES`) drops the fork's baseline `FLOATS` +// flag, so the validator rejects, at the feature pass, any float instruction +// ("floating-point instruction disallowed") and any float value type in a +// signature, local, or global ("floating-point support is disabled"). Each case +// below proves a distinct float surface — operator, signature, local, global, +// constant — is rejected *at the gate* with a feature-named `UnsupportedWasmFeature` +// naming floating point, before the per-opcode / value-type backstops in the +// merge are reached. + +#[test] +fn float_op_external_is_rejected_at_the_feature_gate() { + // A float *operator* in the external body. The signature stays integer so the + // rejection is attributable to the operator, not the type. + let main = main_importing_f(); + let lib = wasm( + r#" + (module + (type (;0;) (func (param i32) (result i32))) + (func (;0;) (type 0) (param i32) (result i32) + f32.const 1 + f32.const 1 + f32.add + drop + local.get 0) + (export "f" (func 0))) + "#, + ); + let err = assert_clean_rejection(&main, &lib, "float operator"); + assert!( + matches!(&err, LinkError::UnsupportedWasmFeature { details, .. } if details.contains("floating-point")), + "expected an UnsupportedWasmFeature naming floating point, got {err:?}" + ); +} + +#[test] +fn float_in_signature_only_external_is_rejected_at_the_feature_gate() { + // A float appears *only* in a reachable function's signature — no float + // operator anywhere. The reachable `(param f64) (result i32)` helper sits + // behind an i32 root, so a signature-blind upstream check could admit it; the + // gate must still reject on the float value type in the signature. 
+ let main = main_importing_f(); + let lib = wasm( + r#" + (module + (type (;0;) (func (param i32) (result i32))) + (type (;1;) (func (param f64) (result i32))) + (func (;0;) (type 0) (param i32) (result i32) + local.get 0) + (func (;1;) (type 1) (param f64) (result i32) + i32.const 0) + (export "f" (func 0))) + "#, + ); + let err = assert_clean_rejection(&main, &lib, "float in signature"); + assert!( + matches!(&err, LinkError::UnsupportedWasmFeature { details, .. } if details.contains("floating-point")), + "expected an UnsupportedWasmFeature naming floating point, got {err:?}" + ); +} + +#[test] +fn float_local_only_external_is_rejected_at_the_feature_gate() { + // A float appears only as a *local* — no float operator, no float in any + // signature. The gate rejects on the float value type of the local. + let main = main_importing_f(); + let lib = wasm( + r#" + (module + (type (;0;) (func (param i32) (result i32))) + (func (;0;) (type 0) (param i32) (result i32) + (local f32) + local.get 0) + (export "f" (func 0))) + "#, + ); + let err = assert_clean_rejection(&main, &lib, "float local"); + assert!( + matches!(&err, LinkError::UnsupportedWasmFeature { details, .. } if details.contains("floating-point")), + "expected an UnsupportedWasmFeature naming floating point, got {err:?}" + ); +} + +#[test] +fn float_global_external_is_rejected_at_the_feature_gate() { + // An `f32` global declared in the external. The gate rejects on the float + // value type of the global, before the global-collection chokepoint in + // `parse::collect_global` is reached. + let main = main_importing_f(); + let lib = wasm( + r#" + (module + (type (;0;) (func (param i32) (result i32))) + (global (;0;) f32 (f32.const 1)) + (func (;0;) (type 0) (param i32) (result i32) + local.get 0) + (export "f" (func 0))) + "#, + ); + let err = assert_clean_rejection(&main, &lib, "float global"); + assert!( + matches!(&err, LinkError::UnsupportedWasmFeature { details, .. 
} if details.contains("floating-point")), + "expected an UnsupportedWasmFeature naming floating point, got {err:?}" + ); +} + +#[test] +fn float_const_only_external_is_rejected_at_the_feature_gate() { + // A lone `f64.const` (immediately dropped) with an otherwise-integer + // signature. The float constant is itself a float instruction, so the gate's + // feature pass rejects it. + let main = main_importing_f(); + let lib = wasm( + r#" + (module + (type (;0;) (func (param i32) (result i32))) + (func (;0;) (type 0) (param i32) (result i32) + f64.const 1 + drop + local.get 0) + (export "f" (func 0))) + "#, + ); + let err = assert_clean_rejection(&main, &lib, "float const"); + assert!( + matches!(&err, LinkError::UnsupportedWasmFeature { details, .. } if details.contains("floating-point")), + "expected an UnsupportedWasmFeature naming floating point, got {err:?}" + ); +} + +#[test] +fn main_module_carrying_a_float_is_rejected_cleanly() { + // The MAIN module is not passed through the feature gate (it is the linker's + // own codegen output on the live pipeline), but the public `link()` API + // accepts arbitrary main bytes. A main carrying a float operator must still be + // rejected with a clean `LinkError` — never a panic, never a silent merge. + // The allow-list backstop on the main re-encode path catches the float op. 
+ let main = wasm( + r#" + (module + (type (;0;) (func (param i32) (result i32))) + (import "lib" "f" (func (;0;) (type 0))) + (func (;1;) (type 0) (param i32) (result i32) + f32.const 1 + f32.const 1 + f32.add + drop + local.get 0 + call 0) + (export "run" (func 1))) + "#, + ); + let lib = wasm( + r#" + (module + (type (;0;) (func (param i32) (result i32))) + (func (;0;) (type 0) (param i32) (result i32) + local.get 0) + (export "f" (func 0))) + "#, + ); + match link(&main, &[&lib]) { + Ok(bytes) => panic!( + "a float-carrying main silently produced a {}-byte module; it must be rejected", + bytes.len() + ), + Err(LinkError::UnsupportedConstruct(msg)) => { + assert!( + msg.contains("floating-point"), + "expected a floating-point UnsupportedConstruct, got {msg:?}" + ); + } + Err(other) => panic!("expected a floating-point UnsupportedConstruct, got {other:?}"), + } +} + +// -- COV-2 (D3): GC and stack-switching externals are rejected --------------- +// +// `SUPPORTED_WASM_FEATURES` names the fork's baseline `GC_TYPES` value-type flag +// directly, but admits NO GC *proposal* construct: a GC type still needs the `GC` +// feature, which is off. These tests pin the ACTUAL +// rejection layer determined empirically — the GC type is caught by the gate's +// feature pass naming `gc`, the stack-switching construct by the gate's +// structural pass (continuation types are off even under default features). + +#[test] +fn gc_typed_external_is_rejected_naming_the_gc_feature() { + // A GC `struct` type is well-formed under the parser's default features but + // needs the `GC` proposal to validate, which is outside the supported subset. + // The gate's feature pass rejects it with an `UnsupportedWasmFeature` naming + // `gc`, before any body or opcode is examined. 
+ let main = main_importing_f(); + let lib = wasm( + r#" + (module + (type (;0;) (struct (field i32))) + (type (;1;) (func (param i32) (result i32))) + (func (;0;) (type 1) (param i32) (result i32) + local.get 0) + (export "f" (func 0))) + "#, + ); + let err = assert_clean_rejection(&main, &lib, "gc struct type"); + assert!( + matches!(&err, LinkError::UnsupportedWasmFeature { details, .. } if details.contains("gc")), + "expected an UnsupportedWasmFeature naming the gc feature, got {err:?}" + ); +} + +#[test] +fn externref_typed_external_is_rejected_naming_reference_types() { + // A GC reference value (`externref`, produced by `ref.null extern`) needs the + // reference-types proposal, which is off. The gate's feature pass rejects it + // naming reference types — a GC/reference *value type*, not an instruction the + // allow-list would see. + let main = main_importing_f(); + let lib = wasm( + r#" + (module + (type (;0;) (func (param i32) (result i32))) + (func (;0;) (type 0) (param i32) (result i32) + ref.null extern + drop + local.get 0) + (export "f" (func 0))) + "#, + ); + let err = assert_clean_rejection(&main, &lib, "externref value"); + assert!( + matches!(&err, LinkError::UnsupportedWasmFeature { details, .. } if details.contains("reference types")), + "expected an UnsupportedWasmFeature naming reference types, got {err:?}" + ); +} + +#[test] +fn stack_switching_external_is_rejected_naming_stack_switching() { + // The stack-switching proposal's continuation types are off even under the + // parser's *default* features in this fork, so a module declaring `(cont 0)` + // fails the gate's STRUCTURAL pass — surfaced as `Parse` naming stack + // switching, not as a feature-pass `UnsupportedWasmFeature`. The external + // exports a `(func)`-typed `f`, matching a dedicated no-arg main. 
+ let main = wasm( + r#" + (module + (type (;0;) (func)) + (import "lib" "f" (func (;0;) (type 0))) + (func (;1;) (type 0) + call 0) + (export "run" (func 1))) + "#, + ); + let lib = wasm( + r#" + (module + (type (;0;) (func)) + (type (;1;) (cont 0)) + (func (;0;) (type 0)) + (export "f" (func 0))) + "#, + ); + let err = assert_clean_rejection(&main, &lib, "stack-switching continuation type"); + assert!( + matches!(&err, LinkError::Parse(msg) if msg.contains("stack switching")), + "expected a Parse error naming stack switching, got {err:?}" + ); +} + +// -- COV-7 (D4): structural validation runs before the feature pass ---------- + +#[test] +fn malformed_post_1_0_external_is_parse_not_unsupported_feature() { + // COV-7: lock the two-pass ordering (structural-before-feature). The external + // uses a post-1.0 feature (SIMD `v128`) AND is structurally broken (its body + // returns an i32 where a `v128` result is declared). The structural pass runs + // first under the parser's default features (which include SIMD), so it + // catches the type mismatch and reports `Parse` — NOT `UnsupportedWasmFeature`. + // Were the order reversed, the restricted feature pass (no SIMD) would reject + // naming SIMD first, masking the real structural defect. + use inference_wasm_linker::validate_external; + + let lib = wasm( + r#" + (module + (type (;0;) (func (result v128))) + (func (;0;) (type 0) (result v128) + i32.const 0) + (export "f" (func 0))) + "#, + ); + + // At the validate_external entry: a Parse, naming the structural mismatch. + let direct = validate_external("lib", &lib) + .expect_err("a structurally-broken external must be rejected"); + assert!( + matches!(&direct, LinkError::Parse(_)), + "structural validation must run first, reporting Parse, got {direct:?}" + ); + + // And through the full `link` path: the same structural-first ordering holds. 
+ let main = main_importing_f(); + let via_link = link(&main, &[&lib]) + .expect_err("a structurally-broken external must fail the link"); + assert!( + matches!(&via_link, LinkError::Parse(_)), + "the link entry must report the structural defect as Parse, not a feature name, got {via_link:?}" + ); +} diff --git a/core/wasm-linker/tests/v_alignment.rs b/core/wasm-linker/tests/v_alignment.rs new file mode 100644 index 00000000..61553ec7 --- /dev/null +++ b/core/wasm-linker/tests/v_alignment.rs @@ -0,0 +1,617 @@ +//! Invariant: anything `inference_wasm_linker::link()` accepts can be lowered to +//! Rocq by `wasm-to-v` without panicking. +//! +//! The linker copies the main module's body verbatim and folds external function +//! bodies in after gating every operator through the fail-closed allow-list +//! (`crate::safety::check_operator`). The paired downstream phase, the `wasm-to-v` +//! translator, lowers that linked output to a Rocq `.v` proof artifact. The two +//! must agree on the instruction set: every operator the linker is willing to +//! emit into its output must have a translator lowering. An operator the linker +//! admits but the translator hits `todo!()` on is a latent SIGABRT on the `-v` +//! proof path — a clean link followed by an unrecoverable crash in the next phase. +//! +//! This test pins that agreement. For each allow-listed opcode family it links a +//! fixture that drives an operator of that family into the linked output, then +//! translates the output under `std::panic::catch_unwind`. A `todo!()` in the +//! translator surfaces here as a labeled test failure naming the family, rather +//! than as an opaque abort deep in a later compilation. +//! +//! ## Audit that motivated this test +//! +//! The allow-list was audited against the translator's operator match. Several +//! families were allow-listed (or admitted at the feature gate) yet reached a +//! `todo!()` in the translator; they have since been removed from the +//! 
allow-list / feature gate so the two phases agree: +//! +//! - **saturating float-to-int truncations** (8 opcodes: +//! `i32`/`i64`.`trunc_sat`_`f32`/`f64`_`s`/`u`), +//! - **sign-extension** (5 opcodes: `i32.extend8_s`, `i32.extend16_s`, +//! `i64.extend8_s`, `i64.extend16_s`, `i64.extend32_s`), +//! - **tail calls** (`return_call`, `return_call_indirect`), +//! - **segment-indexed table initialization** (`table.init`, `elem.drop`, +//! `table.copy`), +//! - **all floating-point** operators and value types (`f32`/`f64`). +//! +//! Each of those is now rejected before reaching the merge, so it can never enter +//! a linked output. The corpus below covers only what the linker still admits. +//! +//! ## How the corpus drives operators into the output +//! +//! Most families are exercised in the **main module**, which the linker re-encodes +//! verbatim into the output — the surest way to guarantee a specific operator +//! reaches the translator, since an external body must additionally survive tier +//! classification (a memory access through a non-parameter address is rejected as +//! Tier C). Direct `call` is inherent in every fixture (the main module calls the +//! satisfied import, whose body is merged in). The non-det/uzumaki proof-path +//! opcodes are legal only in the main module, which carries them as Rocq proof +//! scaffolding the merge preserves verbatim. + +use inference_wasm_linker::link as raw_link; +use inference_wasm_to_v_translator::wasm_parser::translate_bytes; +use rustc_hash::FxHashMap; +use std::panic::{catch_unwind, AssertUnwindSafe}; + +/// Assembles a `.wasm` binary from WAT source, panicking with the WAT on error. +fn wasm(wat: &str) -> Vec<u8> { + wat::parse_str(wat).unwrap_or_else(|e| panic!("invalid WAT fixture: {e}\n{wat}")) +} + +/// The pure `mathlib` external every fixture links against: it exports +/// `sum:(i32,i32)->i32`, the import each main module satisfies.
Reused so the +/// link always has a body to merge (exercising the direct-call path) without each +/// fixture restating the library. +fn mathlib_sum() -> Vec { + wasm( + r#" + (module + (type (;0;) (func (param i32 i32) (result i32))) + (func (;0;) (type 0) (param i32 i32) (result i32) + local.get 0 + local.get 1 + i32.add) + (export "sum" (func 0))) + "#, + ) +} + +/// Links `main` against `mathlib_sum`, satisfying its `mathlib::sum` import, and +/// asserts the link succeeds. The logical module label must match the import's +/// recorded module (`mathlib`) so the merge resolves the import against the +/// external. +fn link_against_mathlib(main: &[u8]) -> Vec { + let lib = mathlib_sum(); + raw_link(main, &[("mathlib", &lib)]) + .unwrap_or_else(|e| panic!("link must accept the fixture, got {e:?}")) +} + +/// The invariant check for one corpus entry: the linked output of `main` must +/// translate to Rocq without panicking. `translate_bytes` is run under +/// `catch_unwind`, so a `todo!()` for an unlowered operator surfaces as a labeled +/// failure naming the opcode family rather than an opaque process abort. +/// +/// `Ok(Ok(_))` is the only acceptance: the closure must not panic (a `todo!()` +/// would make `catch_unwind` return `Err`) *and* the translation must succeed (a +/// recoverable `Err` would mean the operator is rejected rather than lowered, +/// which for an allow-listed family is itself a phase-disagreement worth flagging). 
+fn assert_output_translates(label: &str, main: &[u8]) { + let linked = link_against_mathlib(main); + + let result = catch_unwind(AssertUnwindSafe(|| { + let empty: FxHashMap> = FxHashMap::default(); + translate_bytes("Prog", &linked, &empty) + })); + + match result { + Ok(Ok(_)) => {} + Ok(Err(e)) => panic!( + "{label}: the linker accepted this output but wasm-to-v rejected it with a \ + recoverable error ({e:?}); an allow-listed family must have a translator \ + lowering, so the allow-list and the translator have diverged" + ), + Err(_) => panic!( + "{label}: the linker accepted this output but wasm-to-v PANICKED translating it \ + (an unlowered operator hit `todo!()`); this family is allow-listed in \ + core/wasm-linker/src/safety.rs without a translator lowering — either add the \ + lowering in core/wasm-to-v/src/translator.rs or remove the family from the \ + allow-list" + ), + } +} + +/// A main module that imports `mathlib::sum`, runs `body` (a WAT instruction +/// sequence the fixture under test exercises), then calls the import so the link +/// always has a body to merge. `body` runs on a `(param i32 i32) (result i32)` +/// function with no memory; the call's result is the function's result. +/// +/// The body's stack effect must be net-zero (every value it pushes it must also +/// consume), so the trailing `call 0` leaves exactly the one `i32` result on the +/// stack. +fn main_with_body(body: &str) -> Vec { + wasm(&format!( + r#" + (module + (type (;0;) (func (param i32 i32) (result i32))) + (import "mathlib" "sum" (func (;0;) (type 0))) + (func (;1;) (type 0) (param i32 i32) (result i32) + {body} + local.get 0 + local.get 1 + call 0) + (export "compute" (func 1))) + "#, + )) +} + +/// Like [`main_with_body`] but the function additionally declares a `(memory 1)` +/// and `extra_locals` (e.g. `(local i64)`), so memory-touching and 64-bit +/// fixtures have an address space and scratch slots. 
The reconciled output keeps +/// this memory (the pure `mathlib` external declares none). +fn main_with_memory_body(extra_locals: &str, body: &str) -> Vec { + wasm(&format!( + r#" + (module + (type (;0;) (func (param i32 i32) (result i32))) + (import "mathlib" "sum" (func (;0;) (type 0))) + (memory (;0;) 1) + (func (;1;) (type 0) (param i32 i32) (result i32) + {extra_locals} + {body} + local.get 0 + local.get 1 + call 0) + (export "compute" (func 1))) + "#, + )) +} + +#[test] +fn structured_control_flow_translates() { + // block / loop / if / else / br / br_if / br_table / return / unreachable / + // nop. Each opens or targets a structured region the translator reconstructs + // into a nested Rocq expression. + let main = main_with_body( + r#" + nop + block + i32.const 1 + br_if 0 + br 0 + end + block + loop + br 1 + end + end + block + block + block + local.get 0 + br_table 0 1 2 + end + end + end + local.get 0 + i32.eqz + if (result i32) + local.get 0 + local.get 1 + call 0 + return + else + i32.const 0 + end + drop + local.get 0 + i32.eqz + if + unreachable + end + "#, + ); + assert_output_translates("structured control flow", &main); +} + +#[test] +fn parametric_ops_translate() { + // drop / select. + let main = main_with_body( + r#" + i32.const 7 + drop + i32.const 1 + i32.const 2 + i32.const 0 + select + drop + "#, + ); + assert_output_translates("parametric (drop/select)", &main); +} + +#[test] +fn locals_translate() { + // local.get / local.set / local.tee. The two params are the only locals on + // the base signature; `tee` leaves its value for the trailing `drop`. + let main = main_with_body( + r#" + local.get 0 + local.set 1 + local.get 1 + local.tee 0 + drop + "#, + ); + assert_output_translates("locals (get/set/tee)", &main); +} + +#[test] +fn direct_call_translates() { + // `call` is inherent in every fixture (the main body calls the satisfied + // import, whose body is merged in), so a bare fixture exercises it. 
The + // assertion is that the merged call site and the merged `sum` body both lower. + let main = main_with_body("nop"); + assert_output_translates("direct call", &main); +} + +#[test] +fn integer_loads_and_stores_translate() { + // Every integer load and store width: i32/i64 full-width, and the sub-width + // signed/unsigned loads and narrowing stores. Each reads or writes the single + // shared memory the merge folds onto. + let main = main_with_memory_body( + "(local i64)", + r#" + local.get 0 i32.load drop + local.get 0 i64.load drop + local.get 0 i32.load8_s drop + local.get 0 i32.load8_u drop + local.get 0 i32.load16_s drop + local.get 0 i32.load16_u drop + local.get 0 i64.load8_s drop + local.get 0 i64.load8_u drop + local.get 0 i64.load16_s drop + local.get 0 i64.load16_u drop + local.get 0 i64.load32_s drop + local.get 0 i64.load32_u drop + local.get 0 local.get 1 i32.store + local.get 0 local.get 1 i32.store8 + local.get 0 local.get 1 i32.store16 + local.get 0 local.get 2 i64.store + local.get 0 local.get 2 i64.store8 + local.get 0 local.get 2 i64.store16 + local.get 0 local.get 2 i64.store32 + "#, + ); + assert_output_translates("integer loads/stores", &main); +} + +#[test] +fn memory_ops_translate() { + // memory.size / memory.grow / memory.copy / memory.fill over the single + // shared memory. + let main = main_with_memory_body( + "", + r#" + memory.size drop + local.get 0 memory.grow drop + local.get 0 local.get 1 i32.const 4 memory.copy + local.get 0 i32.const 0 i32.const 4 memory.fill + "#, + ); + assert_output_translates("memory size/grow/copy/fill", &main); +} + +#[test] +fn integer_constants_translate() { + // i32.const / i64.const. + let main = main_with_body( + r#" + i32.const -1 + drop + i64.const 9223372036854775807 + drop + "#, + ); + assert_output_translates("integer constants", &main); +} + +#[test] +fn i32_comparisons_translate() { + // i32: eqz / eq / ne / lt_s / lt_u / gt_s / gt_u / le_s / le_u / ge_s / ge_u. 
+ let main = main_with_body( + r#" + local.get 0 i32.eqz drop + local.get 0 local.get 1 i32.eq drop + local.get 0 local.get 1 i32.ne drop + local.get 0 local.get 1 i32.lt_s drop + local.get 0 local.get 1 i32.lt_u drop + local.get 0 local.get 1 i32.gt_s drop + local.get 0 local.get 1 i32.gt_u drop + local.get 0 local.get 1 i32.le_s drop + local.get 0 local.get 1 i32.le_u drop + local.get 0 local.get 1 i32.ge_s drop + local.get 0 local.get 1 i32.ge_u drop + "#, + ); + assert_output_translates("i32 comparisons", &main); +} + +#[test] +fn i64_comparisons_translate() { + // i64: eqz / eq / ne / lt_s / lt_u / gt_s / gt_u / le_s / le_u / ge_s / ge_u. + let main = main_with_memory_body( + "(local i64) (local i64)", + r#" + local.get 2 i64.eqz drop + local.get 2 local.get 3 i64.eq drop + local.get 2 local.get 3 i64.ne drop + local.get 2 local.get 3 i64.lt_s drop + local.get 2 local.get 3 i64.lt_u drop + local.get 2 local.get 3 i64.gt_s drop + local.get 2 local.get 3 i64.gt_u drop + local.get 2 local.get 3 i64.le_s drop + local.get 2 local.get 3 i64.le_u drop + local.get 2 local.get 3 i64.ge_s drop + local.get 2 local.get 3 i64.ge_u drop + "#, + ); + assert_output_translates("i64 comparisons", &main); +} + +#[test] +fn i32_arithmetic_and_bitwise_translate() { + // i32: clz / ctz / popcnt / add / sub / mul / div_s / div_u / rem_s / rem_u / + // and / or / xor / shl / shr_s / shr_u / rotl / rotr. 
+ let main = main_with_body( + r#" + local.get 0 i32.clz drop + local.get 0 i32.ctz drop + local.get 0 i32.popcnt drop + local.get 0 local.get 1 i32.add drop + local.get 0 local.get 1 i32.sub drop + local.get 0 local.get 1 i32.mul drop + local.get 0 local.get 1 i32.div_s drop + local.get 0 local.get 1 i32.div_u drop + local.get 0 local.get 1 i32.rem_s drop + local.get 0 local.get 1 i32.rem_u drop + local.get 0 local.get 1 i32.and drop + local.get 0 local.get 1 i32.or drop + local.get 0 local.get 1 i32.xor drop + local.get 0 local.get 1 i32.shl drop + local.get 0 local.get 1 i32.shr_s drop + local.get 0 local.get 1 i32.shr_u drop + local.get 0 local.get 1 i32.rotl drop + local.get 0 local.get 1 i32.rotr drop + "#, + ); + assert_output_translates("i32 arithmetic/bitwise", &main); +} + +#[test] +fn i64_arithmetic_and_bitwise_translate() { + // i64: clz / ctz / popcnt / add / sub / mul / div_s / div_u / rem_s / rem_u / + // and / or / xor / shl / shr_s / shr_u / rotl / rotr. + let main = main_with_memory_body( + "(local i64) (local i64)", + r#" + local.get 2 i64.clz drop + local.get 2 i64.ctz drop + local.get 2 i64.popcnt drop + local.get 2 local.get 3 i64.add drop + local.get 2 local.get 3 i64.sub drop + local.get 2 local.get 3 i64.mul drop + local.get 2 local.get 3 i64.div_s drop + local.get 2 local.get 3 i64.div_u drop + local.get 2 local.get 3 i64.rem_s drop + local.get 2 local.get 3 i64.rem_u drop + local.get 2 local.get 3 i64.and drop + local.get 2 local.get 3 i64.or drop + local.get 2 local.get 3 i64.xor drop + local.get 2 local.get 3 i64.shl drop + local.get 2 local.get 3 i64.shr_s drop + local.get 2 local.get 3 i64.shr_u drop + local.get 2 local.get 3 i64.rotl drop + local.get 2 local.get 3 i64.rotr drop + "#, + ); + assert_output_translates("i64 arithmetic/bitwise", &main); +} + +#[test] +fn integer_width_conversions_translate() { + // The three kept conversions: i32.wrap_i64 / i64.extend_i32_s / + // i64.extend_i32_u. 
(The saturating truncations and sign-extensions were + // removed from the allow-list because the translator has no lowering for + // them — see the header audit.) + let main = main_with_memory_body( + "(local i64)", + r#" + local.get 2 i32.wrap_i64 drop + local.get 0 i64.extend_i32_s drop + local.get 0 i64.extend_i32_u drop + "#, + ); + assert_output_translates("integer width conversions", &main); +} + +#[test] +fn main_globals_translate() { + // global.get / global.set on a main-side mutable global. Globals live on the + // main module (a Tier-C external carrying its own globals is rejected), so the + // fixture declares the global itself. + let main = wasm( + r#" + (module + (type (;0;) (func (param i32 i32) (result i32))) + (import "mathlib" "sum" (func (;0;) (type 0))) + (global (;0;) (mut i32) (i32.const 0)) + (func (;1;) (type 0) (param i32 i32) (result i32) + local.get 0 + global.set 0 + global.get 0 + drop + local.get 0 + local.get 1 + call 0) + (export "compute" (func 1))) + "#, + ); + assert_output_translates("main globals (get/set)", &main); +} + +#[test] +fn ref_func_translates() { + // `ref.func ` pushes a function reference, which the translator lowers to + // `BI_ref_func`. It needs no table: the reference names an exported function + // (the export declares it referenceable), so the operator survives the link + // into the output even though the merge preserves no table section. The pushed + // reference is immediately dropped to keep the body's stack net-zero. Pinned + // so a future translator regression on this opcode surfaces here. + // + // Function 1 is this fixture's own exported local (`compute`); function 0 is + // the satisfied import, merged in. Referencing 1 keeps the reference valid in + // the linked output's index space. 
+ let main = main_with_body( + r#" + ref.func 1 + drop + "#, + ); + assert_output_translates("ref.func", &main); +} + +// `call_indirect` is allow-listed (and translatable: `wasm-to-v` lowers it), but +// it cannot appear in a linkable *output* today: the merge preserves no +// `TableSection`, and a main-side table is now rejected outright (alongside the +// already-rejected main-side element segment — see `merge::Plan::build`). So any +// output `call_indirect` would reference a non-existent table; there is no linked +// output in which to exercise the opcode, and a corpus entry would only assert +// the merge's table rejection, not v-alignment. When the merge gains table +// preservation, add a `call_indirect` entry here. + +#[test] +fn proof_path_nondet_and_uzumaki_translate() { + // The verification-only proof-path opcodes (forall/exists/assume/unique and + // i32.uzumaki/i64.uzumaki) are legal only in the main module, which carries + // them as Rocq proof scaffolding the merge preserves verbatim. `wat` cannot + // assemble these custom `0xfc`-prefixed opcodes, so the body is hand-encoded. + let main = proof_mode_main_with_nondet_and_uzumaki(); + assert_output_translates("non-det blocks + uzumaki (proof path)", &main); +} + +/// Builds a proof-mode MAIN module that imports `mathlib::sum` and whose own +/// exported body carries every verification-only opcode the proof path uses — the +/// four non-det blocks (`forall`/`exists`/`assume`/`unique`) and both uzumaki +/// rvalues (`i32.uzumaki`/`i64.uzumaki`) — alongside an executable `call` to the +/// import. `wat` cannot assemble the custom opcodes, so the module is hand-encoded +/// byte-by-byte, mirroring the encoding in `link.rs`. 
+fn proof_mode_main_with_nondet_and_uzumaki() -> Vec { + use wasm_encoder::{ + CodeSection, EntityType, ExportKind, ExportSection, Function, FunctionSection, ImportSection, + Instruction, Module, TypeSection, ValType, + }; + + let mut module = Module::new(); + + let mut types = TypeSection::new(); + types + .ty() + .function([ValType::I32, ValType::I32], [ValType::I32]); + module.section(&types); + + let mut imports = ImportSection::new(); + imports.import("mathlib", "sum", EntityType::Function(0)); + module.section(&imports); + + let mut funcs = FunctionSection::new(); + funcs.function(0); + module.section(&funcs); + + let mut exports = ExportSection::new(); + // The import is output index 0; the local function is index 1. + exports.export("compute", ExportKind::Func, 1); + module.section(&exports); + + let mut code = CodeSection::new(); + let mut f = Function::new([]); + // Each non-det block (`0xfc 0x40` = empty block type) opens a + // region closed by `End`; the empty block has no stack effect. + for sub_opcode in [0x3a_u8, 0x3b, 0x3c, 0x3d] { + f.raw([0xfc, sub_opcode, 0x40]); + f.instruction(&Instruction::End); + } + // Each uzumaki rvalue (`0xfc `) pushes a value, dropped to keep + // the stack balanced. + f.raw([0xfc, 0x31]); // i32.uzumaki + f.instruction(&Instruction::Drop); + f.raw([0xfc, 0x32]); // i64.uzumaki + f.instruction(&Instruction::Drop); + // Executable tail: sum(arg0, arg1) via the (to-be-merged) import. + f.instruction(&Instruction::LocalGet(0)); + f.instruction(&Instruction::LocalGet(1)); + f.instruction(&Instruction::Call(0)); + f.instruction(&Instruction::End); + code.function(&f); + module.section(&code); + + module.finish() +} + +/// A main module whose exported body nests `depth` empty `block` regions, then +/// calls `mathlib::sum` so the link has an executable tail. 
Used to pin the +/// structured-control-flow depth cap on the main re-encode path: the linker must +/// reject a body the downstream wasm-to-v translator (which recurses one frame +/// per level) cannot render, so the v-alignment invariant — anything linkable is +/// translatable — holds at the cap boundary as well as below it. +fn main_with_nested_blocks(depth: usize) -> Vec { + let mut body = String::new(); + for _ in 0..depth { + body.push_str("block "); + } + for _ in 0..depth { + body.push_str("end "); + } + wasm(&format!( + r#" + (module + (type (;0;) (func (param i32 i32) (result i32))) + (import "mathlib" "sum" (func (;0;) (type 0))) + (func (;1;) (type 0) (param i32 i32) (result i32) + {body} + local.get 0 + local.get 1 + call 0) + (export "compute" (func 1))) + "#, + )) +} + +#[test] +fn main_body_at_the_control_depth_cap_links_and_translates() { + // A main body nested one level below the cap must both link and translate. + // The closure scan and the wasm-to-v translator both admit nesting strictly + // below 256 levels; the main re-encode path must agree, so a legitimately + // deep (but in-bounds) body is never spuriously rejected. + let main = main_with_nested_blocks(255); + assert_output_translates("main body at the control-depth cap", &main); +} + +#[test] +fn main_body_past_the_control_depth_cap_is_rejected_before_translation() { + // A main body nested at the cap must be rejected by the linker, not linked + // and then rejected by wasm-to-v. The main re-encode path previously left + // the depth cap unenforced, so such a body linked cleanly and only failed + // downstream — violating the invariant that anything linkable is + // translatable. The link must now reject it up front. 
+ let main = main_with_nested_blocks(256); + let lib = mathlib_sum(); + let err = raw_link(&main, &[("mathlib", &lib)]) + .expect_err("a main body past the control-depth cap must be rejected by the linker"); + match err { + inference_wasm_linker::LinkError::UnsupportedConstruct(msg) => assert!( + msg.contains("256") && msg.contains("control"), + "expected an UnsupportedConstruct naming the control-depth limit, got {msg:?}" + ), + other => panic!("expected UnsupportedConstruct, got {other:?}"), + } +} diff --git a/core/wasm-to-v/Cargo.toml b/core/wasm-to-v/Cargo.toml index 203a781d..112b86d5 100644 --- a/core/wasm-to-v/Cargo.toml +++ b/core/wasm-to-v/Cargo.toml @@ -13,6 +13,5 @@ inference-wasm-codegen.workspace = true rustc-hash.workspace = true thiserror.workspace = true -[dependencies.uuid] -version = "1.10.0" -features = ["v4", "fast-rng", "macro-diagnostics"] +[dev-dependencies] +wat = "1.225.0" diff --git a/core/wasm-to-v/README.md b/core/wasm-to-v/README.md index 3182358d..6a532182 100644 --- a/core/wasm-to-v/README.md +++ b/core/wasm-to-v/README.md @@ -413,7 +413,6 @@ This crate has minimal dependencies to keep the build fast and secure: - **inference-wasm-codegen** (`workspace`): Source of the `SPEC_FUNCS_SECTION_NAME` and `SPEC_FUNCS_SECTION_VERSION` wire-format constants; this crate consumes them at the decode boundary so encoder and decoder share one source of truth - **rustc-hash** (`workspace`): `FxHashMap` for the `spec_funcs_by_spec` API type - **thiserror** (`workspace`): Derive macro for the `WasmToVError` enum in `errors.rs` -- **uuid** (`1.10.0`): Unique identifier generation for Rocq definitions (features: `v4`, `fast-rng`, `macro-diagnostics`) The `inf-wasmparser` fork is critical for parsing Inference's custom WASM instruction extensions. See [`tools/inf-wasmparser/`](../../tools/inf-wasmparser/README.md) for details. 
diff --git a/core/wasm-to-v/src/lib.rs b/core/wasm-to-v/src/lib.rs index b323e306..4fcb873a 100644 --- a/core/wasm-to-v/src/lib.rs +++ b/core/wasm-to-v/src/lib.rs @@ -318,3 +318,399 @@ mod tests { ); } } + +/// Robustness tests for the external `.wasm` static-linking path through +/// `wasm-to-v` (Issue #9 robustness audit, work unit 7). +/// +/// These assemble the kind of module a static merge produces — a merged +/// external inner function sharing a name with a main-module function, and +/// bodies bearing typed-reference / exception-handling operators copied +/// verbatim from an adversarial external — and assert the CLEAN outcome: +/// globally-unique Rocq `Definition`s, and a recoverable +/// [`WasmToVError::UnsupportedFeature`] instead of a panic. +#[cfg(test)] +mod link_robustness { + use super::errors::WasmToVError; + use super::wasm_parser::translate_bytes; + use rustc_hash::FxHashMap; + + fn translate(wat: &str) -> anyhow::Result { + let bytes = wat::parse_str(wat).expect("fixture WAT assembles"); + translate_bytes("Prog", &bytes, &FxHashMap::default()) + } + + /// H20: a merged module whose external inner function shares a name with a + /// main-module function must yield distinct Rocq `Definition`s (Coq cannot + /// overload), and the `mod_funcs` list must reference each unique name. + #[test] + #[cfg_attr(miri, ignore)] + fn duplicate_function_names_are_disambiguated() { + // A module whose `name` section maps both function indices to the + // identical string `add_three`, modelling a main-module `add_three` + // (index 0) next to a merged external `add_three` (index 1). 
+ let bytes = duplicate_named_module(); + let output = translate_bytes("Prog", &bytes, &FxHashMap::default()) + .expect("translation succeeds"); + + let definitions = output.matches("Definition add_three :").count(); + assert_eq!( + definitions, 1, + "exactly one `Definition add_three` may be emitted; got {definitions}:\n{output}", + ); + // The colliding second function must be emitted under a disambiguated + // name derived from its WASM function index. + assert!( + output.contains("Definition add_three_1 :"), + "second `add_three` should be disambiguated to `add_three_1`:\n{output}", + ); + // Both unique names must appear in the `mod_funcs` list so the proof + // deliverable references both bodies. + assert!( + output.contains("add_three ::") && output.contains("add_three_1 ::"), + "mod_funcs must list both disambiguated names:\n{output}", + ); + } + + /// Hand-encodes a 2-function module whose `name` section maps both function + /// indices to the identical string `add_three`. `wat` cannot express a + /// name-section collision from symbolic identifiers, so we emit the bytes + /// directly. + fn duplicate_named_module() -> Vec { + // Assemble a valid skeleton with `wat`, then append a `name` section + // naming both functions `add_three`. + let skeleton = wat::parse_str( + r#" + (module + (func (param i32) (result i32) local.get 0 i32.const 100 i32.add) + (func (param i32) (result i32) local.get 0 i32.const 3 i32.add)) + "#, + ) + .expect("skeleton assembles"); + + // name section: id=0 (custom), name "name"; subsection id=1 (function + // names) with 2 entries, both "add_three". 
+ let func_name = b"add_three"; + let mut func_subsec = Vec::new(); + func_subsec.push(2u8); // count + for idx in 0u8..2 { + func_subsec.push(idx); // func index (LEB128, single byte for <128) + func_subsec.push(func_name.len() as u8); + func_subsec.extend_from_slice(func_name); + } + let mut name_payload = Vec::new(); + name_payload.push(0x04); // length of "name" + name_payload.extend_from_slice(b"name"); + name_payload.push(0x01); // subsection id: function names + name_payload.push(func_subsec.len() as u8); + name_payload.extend_from_slice(&func_subsec); + + let mut bytes = skeleton; + bytes.push(0x00); // custom section id + bytes.push(name_payload.len() as u8); + bytes.extend_from_slice(&name_payload); + bytes + } + + /// H13: a `ref.null` copied verbatim from an adversarial external must + /// surface as a recoverable [`WasmToVError::UnsupportedFeature`], never a + /// `todo!()` panic. + #[test] + #[cfg_attr(miri, ignore)] + fn ref_null_is_unsupported_feature_not_panic() { + let err = translate( + r#" + (module + (func (export "f") (result i32) + ref.null func + drop + i32.const 0)) + "#, + ) + .expect_err("ref.null must be rejected"); + + let downcast = err.downcast_ref::(); + assert!( + matches!(downcast, Some(WasmToVError::UnsupportedFeature { .. })), + "ref.null should surface as UnsupportedFeature; got: {err:?}", + ); + } + + /// H13: `call_ref` likewise must be a recoverable error rather than a + /// panic on the `-v` path. + #[test] + #[cfg_attr(miri, ignore)] + fn call_ref_is_unsupported_feature_not_panic() { + let err = translate( + r#" + (module + (type $sig (func (result i32))) + (func (export "f") (result i32) + ref.null $sig + call_ref $sig)) + "#, + ) + .expect_err("call_ref must be rejected"); + + let downcast = err.downcast_ref::(); + assert!( + matches!(downcast, Some(WasmToVError::UnsupportedFeature { .. 
})), + "call_ref should surface as UnsupportedFeature; got: {err:?}", + ); + } + + /// Assembles a one-function module whose body nests `depth` empty `block`s, + /// mirroring the adversarially deep external the linker would otherwise + /// merge before handing it to the translator. + fn nested_blocks_module(depth: usize) -> Vec { + let mut body = String::new(); + for _ in 0..depth { + body.push_str("block "); + } + for _ in 0..depth { + body.push_str("end "); + } + let wat = format!(r#"(module (func (export "f") {body}))"#); + wat::parse_str(&wat).expect("nested-blocks WAT assembles") + } + + /// H-3: a deeply-nested external body must surface as a recoverable + /// [`WasmToVError::UnsupportedFeature`] rather than overflowing the + /// translator's stack (an unrecoverable SIGABRT) on the `-v` proof path. + /// + /// The translator recurses once per nesting level both when building the + /// expression tree (`translate_expression`) and when rendering it + /// (`print_with_offset`); without a depth bound a body of a few thousand + /// nested blocks aborts the process. A depth well past the cap must fail + /// cleanly. + #[test] + #[cfg_attr(miri, ignore)] + fn deeply_nested_body_is_unsupported_feature_not_stack_overflow() { + let bytes = nested_blocks_module(5_000); + let err = translate_bytes("Prog", &bytes, &FxHashMap::default()) + .expect_err("a deeply-nested body must be rejected, not abort"); + + let downcast = err.downcast_ref::(); + assert!( + matches!(downcast, Some(WasmToVError::UnsupportedFeature { .. })), + "deep nesting should surface as UnsupportedFeature; got: {err:?}", + ); + } + + /// H-3: a body nested *up to* the cap still translates cleanly, so the + /// guard rejects only pathological depth, never a legitimately nested + /// function. 
+ #[test] + #[cfg_attr(miri, ignore)] + fn body_nested_within_the_cap_translates() { + let bytes = nested_blocks_module(16); + translate_bytes("Prog", &bytes, &FxHashMap::default()) + .expect("a modestly-nested body translates"); + } + + /// Assembles a 2-function module with *no* name section: an exported `sum` + /// (index 0) that calls an anonymous inner `func 1`. Models the supply path + /// issue #9 serves — a third-party / `wasm-tools`-stripped external whose + /// inner callees carry no debug name. + fn nameless_two_function_module() -> Vec { + wat::parse_str( + r#" + (module + (func (export "sum") (param i32) (result i32) + local.get 0 call 1) + (func (param i32) (result i32) + local.get 0 i32.const 1 i32.add)) + "#, + ) + .expect("nameless module assembles") + } + + /// H-4: a nameless function must receive a deterministic name derived from + /// its output function index (`func_`), not a per-process random UUID, + /// so the `.v` is byte-identical across runs for byte-identical input. + #[test] + #[cfg_attr(miri, ignore)] + fn nameless_functions_get_deterministic_names_and_reproducible_v() { + let bytes = nameless_two_function_module(); + + let first = translate_bytes("Prog", &bytes, &FxHashMap::default()) + .expect("first translation succeeds"); + let second = translate_bytes("Prog", &bytes, &FxHashMap::default()) + .expect("second translation succeeds"); + + assert_eq!( + first, second, + "byte-identical input must produce a byte-identical `.v` across runs", + ); + // Every nameless function is named from its output index; no random UUID + // name leaks into the proof artifact. + assert!( + first.contains("Definition func_0 :") && first.contains("Definition func_1 :"), + "nameless functions should be named `func_0`/`func_1` from their index:\n{first}", + ); + } + + /// Assembles a 2-function module whose name section names only the exported + /// root (`func 0` = `sum`), leaving the inner callee (`func 1`) nameless. 
+ /// Mirrors a static-merge output with a named closure root next to a + /// nameless inner callee, exercising the translator's index-derived + /// fallback in isolation. + fn root_named_inner_nameless_module() -> Vec { + let skeleton = wat::parse_str( + r#" + (module + (func (param i32) (result i32) local.get 0 call 1) + (func (param i32) (result i32) local.get 0 i32.const 1 i32.add)) + "#, + ) + .expect("skeleton assembles"); + + // name section: id=0 (custom), name "name"; subsection id=1 (function + // names) with a single entry naming function 0 `sum`. + let func_name = b"sum"; + let mut func_subsec = Vec::new(); + func_subsec.push(1u8); // count + func_subsec.push(0u8); // func index 0 + func_subsec.push(func_name.len() as u8); + func_subsec.extend_from_slice(func_name); + + let mut name_payload = Vec::new(); + name_payload.push(0x04); // length of "name" + name_payload.extend_from_slice(b"name"); + name_payload.push(0x01); // subsection id: function names + name_payload.push(func_subsec.len() as u8); + name_payload.extend_from_slice(&func_subsec); + + let mut bytes = skeleton; + bytes.push(0x00); // custom section id + bytes.push(name_payload.len() as u8); + bytes.extend_from_slice(&name_payload); + bytes + } + + /// H-4: when only the closure root carries a name, the nameless inner + /// callee still gets a deterministic index-derived name and the artifact is + /// reproducible — the named root keeps `sum`, the inner callee is `func_1`, + /// and no UUID appears. 
+ #[test] + #[cfg_attr(miri, ignore)] + fn nameless_inner_callee_with_named_root_is_deterministic() { + let bytes = root_named_inner_nameless_module(); + + let first = translate_bytes("Prog", &bytes, &FxHashMap::default()) + .expect("first translation succeeds"); + let second = translate_bytes("Prog", &bytes, &FxHashMap::default()) + .expect("second translation succeeds"); + + assert_eq!( + first, second, + "byte-identical input must produce a byte-identical `.v` across runs", + ); + // The root keeps its source name (sanitized for Rocq — `sum` collides + // with a stdlib name and is suffixed to `sum_`), distinct from the + // index-derived fallback the inner callee receives. + assert!( + first.contains("Definition sum_ :"), + "the named root keeps its `sum`-derived name:\n{first}", + ); + assert!( + first.contains("Definition func_1 :"), + "the nameless inner callee should be `func_1` from its index:\n{first}", + ); + } + + /// D6: `function_bodies` is 0-based over the code section, but the name + /// section keys on the *absolute* WASM function index, which numbers + /// imported functions first. `translate_functions` offsets the body + /// position by the function-import count to recover the absolute index. + /// + /// This module imports `host` (absolute index 0) and defines `local` + /// (absolute index 1). The single code-section body is `local`; its + /// name-section entry lives under absolute index 1. Without the offset the + /// translator would look up index 0 and emit the body under the *import's* + /// name (`host`) — a silently mis-named proof obligation. The offset must + /// give it the correct name `local`. 
+ #[test] + #[cfg_attr(miri, ignore)] + fn function_import_offsets_the_name_lookup() { + let bytes = wat::parse_str( + r#" + (module + (import "env" "host" (func $host (param i32) (result i32))) + (func $local (param i32) (result i32) local.get 0 i32.const 1 i32.add)) + "#, + ) + .expect("import fixture WAT assembles"); + + let output = translate_bytes("Prog", &bytes, &FxHashMap::default()) + .expect("an import-bearing module translates"); + + assert!( + output.contains("Definition local :"), + "the sole defined function must be named from its absolute index (1 -> `local`), \ + not the import's index (0 -> `host`):\n{output}", + ); + assert!( + !output.contains("Definition host :"), + "the import's name must never be emitted as a defined `module_func`:\n{output}", + ); + } + + /// D6 companion: with no name section, the fallback name is derived from the + /// *absolute* index too, so the offset is exercised even without debug + /// names. The import occupies absolute index 0, so the single defined body + /// is `func_1`, never `func_0`. + #[test] + #[cfg_attr(miri, ignore)] + fn function_import_offsets_the_nameless_fallback() { + // The WAT below has no `$` identifiers, so the assembled module carries no + // name section and the translator falls back to index-derived names.
+ let with_names = wat::parse_str( + r#" + (module + (import "env" "host" (func (param i32) (result i32))) + (func (param i32) (result i32) local.get 0 i32.const 1 i32.add)) + "#, + ) + .expect("import fixture WAT assembles"); + + let output = translate_bytes("Prog", &with_names, &FxHashMap::default()) + .expect("an import-bearing nameless module translates"); + + assert!( + output.contains("Definition func_1 :"), + "the nameless defined body sits at absolute index 1, so it must be `func_1`:\n{output}", + ); + assert!( + !output.contains("Definition func_0 :"), + "absolute index 0 belongs to the import, so `func_0` must not be a defined \ + function:\n{output}", + ); + } + + /// D6 companion: a non-function import (a memory) does not occupy a function + /// index, so the function-import offset stays 0 and the sole defined body + /// keeps absolute index 0. Guards against over-counting non-function + /// imports in the offset. + #[test] + #[cfg_attr(miri, ignore)] + fn non_function_import_does_not_offset_function_indices() { + let bytes = wat::parse_str( + r#" + (module + (import "env" "mem" (memory 1)) + (func $only (param i32) (result i32) local.get 0 i32.const 1 i32.add)) + "#, + ) + .expect("memory-import fixture WAT assembles"); + + let output = translate_bytes("Prog", &bytes, &FxHashMap::default()) + .expect("a module whose only import is a memory translates"); + + // The defined function sits at absolute index 0 (no function imports), + // so it keeps its source name with no index perturbation. 
+ assert!( + output.contains("Definition only :"), + "a non-function import must not shift the defined function's index:\n{output}", + ); + } +} diff --git a/core/wasm-to-v/src/rocq_names.rs b/core/wasm-to-v/src/rocq_names.rs index a71081ab..586074fd 100644 --- a/core/wasm-to-v/src/rocq_names.rs +++ b/core/wasm-to-v/src/rocq_names.rs @@ -164,3 +164,158 @@ pub fn validate_rocq_identifier(name: &str) -> Result<(), WasmToVError> { Ok(()) } + +/// Rewrites an arbitrary WASM name-section symbol into a syntactically legal +/// Rocq identifier, returning a name that always satisfies +/// [`validate_rocq_identifier`]. +/// +/// This is the decode-boundary defense for function names copied verbatim +/// from a WASM `name` section. Such names are not constrained to Rocq's +/// identifier grammar: Inference's own codegen emits struct-method names like +/// `Point.sum_coords` (illegal `.`), and an adversarial external `.wasm` can +/// name an inner function with a Coq keyword (`fun`, `match`) or otherwise +/// illegal characters. Emitting any of these verbatim as `Definition <name>` +/// produces invalid Gallina with exit 0 — a silent miscompile of the proof +/// artifact. Sanitizing here guarantees every emitted `Definition` name is +/// well-formed; the emitter additionally de-duplicates the sanitized names so +/// distinct functions never collide on one Rocq `Definition`. +/// +/// Rewrite rules (each chosen to map the legal grammar to itself, so already +/// valid names are returned unchanged): +/// - Characters outside `[A-Za-z0-9_]` become `_`. +/// - A leading non-letter is prefixed with `f_` (Rocq reserves `_`-leading and +/// digit-leading identifiers). +/// - A `__` run is collapsed to `_` (the module/spec separator is reserved). +/// - A name colliding with a reserved keyword or stdlib name is suffixed `_`. +/// - An over-length name is truncated to the 255-character cap.
+/// +/// The result is never guaranteed globally unique on its own — that is the +/// caller's responsibility — but it is always individually well-formed. +#[must_use] +pub fn sanitize_rocq_identifier(name: &str) -> String { + let mut out = String::with_capacity(name.len().min(255)); + for c in name.chars() { + if c.is_ascii_alphanumeric() { + out.push(c); + } else { + out.push('_'); + } + } + + // Enforce a letter-leading identifier first; an empty or non-alpha start + // is prefixed rather than dropped so distinct inputs stay distinguishable. + // Done before the `__` collapse so the `f_` prefix joined to a leading `_` + // (`f_` + `_priv`) does not leave a `__` run behind. + let needs_prefix = out + .chars() + .next() + .is_none_or(|c| !c.is_ascii_alphabetic()); + if needs_prefix { + out.insert_str(0, "f_"); + } + + // Collapse `__` runs so the sanitized name cannot collide with the + // `__` separator grammar. + while out.contains("__") { + out = out.replace("__", "_"); + } + + if out.len() > 255 { + out.truncate(255); + // Truncation may leave a trailing `_` adjacent to the cap; that is + // still a legal identifier, so no further fix-up is needed. + } + + while REJECTED_ROCQ_KEYWORDS.contains(&out.as_str()) + || REJECTED_ROCQ_STDLIB_NAMES.contains(&out.as_str()) + { + out.push('_'); + } + + debug_assert!( + validate_rocq_identifier(&out).is_ok(), + "sanitized identifier `{out}` (from `{name}`) is still invalid", + ); + out +} + +#[cfg(test)] +mod tests { + use super::{sanitize_rocq_identifier, validate_rocq_identifier}; + + /// Every sanitized name must satisfy the validator — the sanitizer's core + /// contract. 
+ fn assert_sanitized_is_valid(input: &str) -> String { + let out = sanitize_rocq_identifier(input); + assert!( + validate_rocq_identifier(&out).is_ok(), + "sanitized `{out}` (from `{input}`) failed validation", + ); + out + } + + #[test] + fn already_valid_names_are_unchanged() { + for name in ["add_three", "main", "Geometry", "f0", "x_y_z"] { + assert_eq!(sanitize_rocq_identifier(name), name); + } + } + + #[test] + fn dotted_method_name_becomes_valid_identifier() { + // Inference emits struct-method names like `Point.sum_coords`. + let out = assert_sanitized_is_valid("Point.sum_coords"); + assert_eq!(out, "Point_sum_coords"); + } + + #[test] + fn illegal_characters_become_underscores() { + let out = assert_sanitized_is_valid("a-b/c:d"); + assert_eq!(out, "a_b_c_d"); + } + + #[test] + fn leading_non_letter_is_prefixed() { + assert_eq!(assert_sanitized_is_valid("0abc"), "f_0abc"); + assert_eq!(assert_sanitized_is_valid("_priv"), "f_priv"); + // A digit-only name is prefixed, not emptied. + assert_eq!(assert_sanitized_is_valid("123"), "f_123"); + } + + #[test] + fn empty_name_is_prefixed_to_a_legal_identifier() { + assert_eq!(assert_sanitized_is_valid(""), "f_"); + } + + #[test] + fn double_underscore_runs_are_collapsed() { + // `__` is the reserved module/spec separator. + let out = assert_sanitized_is_valid("a__b"); + assert!(!out.contains("__"), "must not retain `__`: {out}"); + assert_eq!(out, "a_b"); + // A run of illegal chars collapsing to many underscores still collapses. 
+ assert_eq!(assert_sanitized_is_valid("a...b"), "a_b"); + } + + #[test] + fn coq_keywords_are_escaped() { + for kw in ["fun", "match", "Definition", "forall"] { + let out = assert_sanitized_is_valid(kw); + assert_ne!(out, kw, "keyword `{kw}` must be escaped"); + } + } + + #[test] + fn stdlib_names_are_escaped() { + for name in ["nat", "Nat", "list", "Some"] { + let out = assert_sanitized_is_valid(name); + assert_ne!(out, name, "stdlib name `{name}` must be escaped"); + } + } + + #[test] + fn over_length_names_are_truncated() { + let out = assert_sanitized_is_valid(&"a".repeat(400)); + assert!(out.len() <= 255, "must respect the 255-char cap: {}", out.len()); + } +} diff --git a/core/wasm-to-v/src/translator.rs b/core/wasm-to-v/src/translator.rs index 9d233a6a..2f1f8dc4 100644 --- a/core/wasm-to-v/src/translator.rs +++ b/core/wasm-to-v/src/translator.rs @@ -117,7 +117,8 @@ //! Generated Rocq identifiers follow these rules: //! //! - **Named functions**: Use names from custom name section if available -//! - **Anonymous functions**: Generate unique names using UUID (`func_`) +//! - **Anonymous functions**: Deterministically named `func_` from the +//! output function index, so the `.v` is reproducible for identical input //! - **Module name**: Use name from custom section, or parameter to `translate_bytes` //! //! ## Output Format @@ -158,16 +159,14 @@ //! |}. //! 
``` -use core::fmt; -use std::{collections::HashMap, fmt::Display}; +use std::collections::HashMap; use inf_wasmparser::{ BlockType, CompositeInnerType, Data, DataKind, Element, ElementItems, ElementKind, Export, FunctionBody, Global, Import, MemoryType, Operator, OperatorsIterator, OperatorsReader, RecGroup, RefType, Table, TableType, TypeRef, ValType as wpValType, }; -use rustc_hash::FxHashMap; -use uuid::Uuid; +use rustc_hash::{FxHashMap, FxHashSet}; use crate::errors::WasmToVError; @@ -297,9 +296,12 @@ impl WasmParseData<'_> { /// /// # Error Recovery /// - /// Unlike the parser, this method uses error recovery: it collects translation - /// errors from all sections and returns the first error only if no sections - /// succeeded. This provides better diagnostics for complex failures. + /// This method collects translation errors from every section so a single + /// failure does not mask later ones, but it is fail-closed: if any section + /// failed, the assembled module is discarded and the first error is + /// returned. The emitted `.v` is a mission-critical proof artifact, so a + /// partial translation (e.g. a module missing a function body) must never + /// be returned as success. /// /// # Returns /// @@ -572,21 +574,69 @@ impl WasmParseData<'_> { } res.push('\n'); res.push_str("End Host.\n"); + + // Fail-closed: any section error means the assembled module is + // incomplete (e.g. a function body that hit an unsupported operator). + // Returning it as success would emit a corrupt proof artifact, so + // surface the first collected error instead. + if let Some(first) = errors.into_iter().next() { + return Err(first); + } Ok(res) } + /// Number of imported functions, which occupy the lowest function indices + /// in WASM's index space before any locally-defined (code-section) function. + /// + /// The static-merge linker removes every import before `-v`, so this is `0` + /// for every artifact the pipeline produces (the always-link invariant). 
It + /// is non-zero only when a pre-link or third-party module is translated + /// directly; the offset below keeps that case correctly indexed rather than + /// relying on the invariant for soundness. + fn func_import_count(&self) -> usize { + self.imports + .iter() + .filter(|import| matches!(import.ty, TypeRef::Func(_))) + .count() + } + //Record module_func fn translate_functions(&mut self) -> anyhow::Result<()> { + // Rocq `Definition`s are not overloadable, so every emitted function + // name must be globally unique. A static merge can fold an external + // library's private function (carrying its own debug name) next to a + // main-module function of the same name. We disambiguate by appending + // the WASM function index on collision, deriving the `Definition` and + // the matching `mod_funcs` entry from the same per-index name. + // + // `function_bodies` is indexed 0-based over the *code section*, but the + // name section, start/export descriptors, and the + // `inference.spec_funcs` map key on the *absolute* WASM function index, + // which numbers imported functions first. Offset the body position by + // the function-import count to recover the absolute index for those + // lookups. `mod_funcs` order itself stays body-relative (it excludes + // imports, which appear via `mod_imports`). With no imports — every + // post-link artifact — the offset is zero and output is unchanged. 
+ let func_import_base = self.func_import_count(); + let mut used_names: FxHashSet = FxHashSet::default(); for (index, function_body) in self.function_bodies.iter().enumerate() { let modfunc_type = *self.function_type_indexes.get(index).unwrap_or(&0); - let func_name = if let Some(func_names_map) = &self.func_names_map { - func_names_map - .get(&(index as u32)) - .unwrap_or(&format!("func_{}", get_id())) - .to_owned() - } else { - format!("func_{}", get_id()) + let abs_index = (func_import_base + index) as u32; + // A function with no name-section entry is named deterministically + // from its absolute index (`func_<abs_index>`) rather than a + // per-process random UUID, so the `.v` is byte-identical across runs + // for byte-identical input (reproducible builds, content-addressed + // proof caches, CI diffs). The linker fills every merged inner + // callee's name, so this fallback fires only for an unnamed function + // reaching the translator directly. + let base_name = match &self.func_names_map { + Some(func_names_map) => func_names_map + .get(&abs_index) + .cloned() + .unwrap_or_else(|| format!("func_{abs_index}")), + None => format!("func_{abs_index}"), }; + let func_name = unique_function_name(base_name, abs_index, &mut used_names); self.translated_function_names.push(func_name.clone()); let mut modfunc_locals = String::new(); @@ -711,6 +761,27 @@ fn translate_table_type_limits(table_type: &TableType) -> anyhow::Result //Record limits fn translate_memory_type_limits(memory_type: &MemoryType) -> anyhow::Result { + // The target model (`Mm {|lim_min; lim_max|}`) has no field for `memory64`, + // `shared`, or a custom page size, so any such memory would be silently + // re-encoded as a 32-bit, non-shared, default-page-size machine — a `.v` + // describing a machine the `.wasm` is not. Reject rather than miscompile the + // proof artifact (defense in depth behind the linker's shape guard; audit + // C-4/L-1).
+ if memory_type.memory64 { + return Err(anyhow::anyhow!(WasmToVError::UnsupportedFeature { + description: "memory64 (i64-addressed) linear memory".into(), + })); + } + if memory_type.shared { + return Err(anyhow::anyhow!(WasmToVError::UnsupportedFeature { + description: "shared linear memory (threads proposal)".into(), + })); + } + if memory_type.page_size_log2.is_some() { + return Err(anyhow::anyhow!(WasmToVError::UnsupportedFeature { + description: "linear memory with a custom page size".into(), + })); + } let lim_min = format!("{}%N", memory_type.initial); let lim_max = match memory_type.maximum { Some(max) => format!("Some({max}%N)"), @@ -806,7 +877,19 @@ impl Expression<'_> { self.parts.last() } - fn print_with_offset(&self, tabs_count: usize) -> anyhow::Result { + /// Renders this expression tree to its Rocq list form, indenting nested + /// blocks by `tabs_count` levels. + /// + /// `depth` bounds the self-recursion independently of the indentation: a + /// body nested deeper than [`MAX_EXPRESSION_DEPTH`] is rejected with a + /// recoverable [`WasmToVError::UnsupportedFeature`] rather than recursing to + /// stack exhaustion (an unrecoverable `abort()`). The bound mirrors the one + /// in `translate_expression`, so a body that built its tree without + /// overflowing also renders without overflowing. 
+ fn print_with_offset(&self, tabs_count: usize, depth: usize) -> anyhow::Result { + if depth >= MAX_EXPRESSION_DEPTH { + return Err(too_deeply_nested_err()); + } let mut res = String::new(); let offset = " ".repeat(tabs_count); for part in &self.parts { @@ -825,7 +908,7 @@ impl Expression<'_> { translate_basic_operator(&block.label, &self.local_name_map)?.as_str(), ); res.push_str(" (\n"); - res.push_str(block.parts.print_with_offset(tabs_count + 1)?.as_str()); + res.push_str(block.parts.print_with_offset(tabs_count + 1, depth + 1)?.as_str()); res.push_str(") "); res.push_str("::\n"); } @@ -835,9 +918,9 @@ impl Expression<'_> { translate_basic_operator(&cond.label, &self.local_name_map)?.as_str(), ); res.push_str(" (\n"); - res.push_str(cond.then_arm.print_with_offset(tabs_count + 1)?.as_str()); + res.push_str(cond.then_arm.print_with_offset(tabs_count + 1, depth + 1)?.as_str()); res.push_str(") (\n"); - res.push_str(cond.else_arm.print_with_offset(tabs_count + 1)?.as_str()); + res.push_str(cond.else_arm.print_with_offset(tabs_count + 1, depth + 1)?.as_str()); res.push_str(") "); res.push_str("::\n"); } @@ -848,20 +931,35 @@ impl Expression<'_> { } } -impl Display for Expression<'_> { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!( - f, - "{}", - self.print_with_offset(2) - .unwrap_or(String::from("Error rendering expression")) - ) - } +/// Maximum structured-control-flow nesting depth the translator recurses +/// through before rejecting a body as too deeply nested. +/// +/// `translate_expression` (tree build) and [`Expression::print_with_offset`] +/// (render) are mutually-bounded self-recursive: a body of N nested blocks +/// recurses N deep. A Rust stack overflow is an `abort()` that bypasses every +/// `?`/`Err` path, so an adversarial external `.wasm` with thousands of nested +/// blocks would crash the proof path (SIGABRT) instead of failing cleanly. 
+/// Capping the depth turns that DoS into a recoverable +/// [`WasmToVError::UnsupportedFeature`]. The bound is far above any nesting a +/// real Inference function produces and comfortably below the depth at which +/// either pass would exhaust even a small (2 MiB) thread stack. +const MAX_EXPRESSION_DEPTH: usize = 256; + +fn too_deeply_nested_err() -> anyhow::Error { + anyhow::anyhow!(WasmToVError::UnsupportedFeature { + description: format!( + "function body nests structured control flow deeper than {MAX_EXPRESSION_DEPTH} levels" + ), + }) } fn translate_expression<'a>( operators_reader: &mut OperatorsIterator<'a>, + depth: usize, ) -> anyhow::Result> { + if depth >= MAX_EXPRESSION_DEPTH { + return Err(too_deeply_nested_err()); + } let mut result = Expression::default(); while let Some(next_operator) = operators_reader.next() { let next_operator = next_operator.as_ref().unwrap(); @@ -872,8 +970,7 @@ fn translate_expression<'a>( | inf_wasmparser::Operator::Exists { .. } | inf_wasmparser::Operator::Assume { .. } | inf_wasmparser::Operator::Unique { .. } => { - // operators_reader.next(); - let block_operations = translate_expression(operators_reader)?; + let block_operations = translate_expression(operators_reader, depth + 1)?; let block = BlockExpr { label: next_operator.to_owned(), parts: block_operations, @@ -881,15 +978,14 @@ fn translate_expression<'a>( result.parts.push(ExpressionPart::Block(block)); } inf_wasmparser::Operator::If { .. } => { - // operators_reader.next(); - let then_arm = translate_expression(operators_reader)?; + let then_arm = translate_expression(operators_reader, depth + 1)?; let else_arm = if matches!( then_arm.last_part().unwrap(), ExpressionPart::Operator(Operator::End) ) { Expression::default() } else { - translate_expression(operators_reader)? + translate_expression(operators_reader, depth + 1)? 
}; let condition = ConditionExpr { @@ -918,9 +1014,12 @@ fn translate_expr( local_name_map: Option>, ) -> anyhow::Result { let mut peekable_operators_reader = operators_reader.clone().into_iter(); - let mut expression = translate_expression(&mut peekable_operators_reader)?; + let mut expression = translate_expression(&mut peekable_operators_reader, 0)?; expression.local_name_map = local_name_map; - Ok(expression.to_string()) + // Render through the fallible `print_with_offset` directly rather than the + // `Display` impl, so that an unsupported operator surfaces as a returned + // `WasmToVError` instead of being swallowed into placeholder text. + expression.print_with_offset(2, 0) } fn translate_block_type(block_type: &BlockType) -> anyhow::Result { @@ -1459,7 +1558,11 @@ fn translate_basic_operator( Operator::ElemDrop { .. } => todo!(), Operator::TableCopy { .. } => todo!(), Operator::TypedSelect { .. } => todo!(), - Operator::RefNull { .. } => todo!(), + Operator::RefNull { .. } => { + return Err(anyhow::anyhow!(WasmToVError::UnsupportedFeature { + description: "ref.null (typed reference instruction)".into(), + })); + } Operator::RefIsNull => "BI_ref_is_null".to_string(), Operator::RefFunc { function_index } => format!("BI_ref_func {function_index}%N"), Operator::TableFill { table } => format!("BI_table_fill {table}%N"), @@ -1895,8 +1998,16 @@ fn translate_basic_operator( Operator::I16x8RelaxedQ15mulrS => todo!(), Operator::I16x8RelaxedDotI8x16I7x16S => todo!(), Operator::I32x4RelaxedDotI8x16I7x16AddS => todo!(), - Operator::TryTable { .. } => todo!(), - Operator::Throw { .. } => todo!(), + Operator::TryTable { .. } => { + return Err(anyhow::anyhow!(WasmToVError::UnsupportedFeature { + description: "try_table (exception-handling instruction)".into(), + })); + } + Operator::Throw { .. 
} => { + return Err(anyhow::anyhow!(WasmToVError::UnsupportedFeature { + description: "throw (exception-handling instruction)".into(), + })); + } Operator::ThrowRef => todo!(), Operator::Try { .. } => todo!(), Operator::Catch { .. } => todo!(), @@ -2059,8 +2170,16 @@ fn translate_basic_operator( })); } Operator::RefI31Shared => todo!(), - Operator::CallRef { .. } => todo!(), - Operator::ReturnCallRef { .. } => todo!(), + Operator::CallRef { .. } => { + return Err(anyhow::anyhow!(WasmToVError::UnsupportedFeature { + description: "call_ref (typed function reference instruction)".into(), + })); + } + Operator::ReturnCallRef { .. } => { + return Err(anyhow::anyhow!(WasmToVError::UnsupportedFeature { + description: "return_call_ref (typed function reference instruction)".into(), + })); + } Operator::RefAsNonNull => todo!(), Operator::BrOnNull { .. } => todo!(), Operator::BrOnNonNull { .. } => todo!(), @@ -2100,8 +2219,72 @@ fn translate_data(data: &Data) -> anyhow::Result { Ok(res) } -fn get_id() -> String { - let uuid = Uuid::new_v4().to_string(); - let mut parts = uuid.split('-'); - parts.next().unwrap().to_string() +/// Returns a Rocq `Definition` name guaranteed not to collide with any name +/// already in `used_names`, recording the chosen name. On collision the WASM +/// function `index` is appended (`<base_name>_<index>`); should that already be +/// taken, a monotonically increasing suffix is added until the name is free.
+fn unique_function_name( + base_name: String, + index: u32, + used_names: &mut FxHashSet, +) -> String { + if used_names.insert(base_name.clone()) { + return base_name; + } + let mut candidate = format!("{base_name}_{index}"); + let mut disambiguator = 0u32; + while !used_names.insert(candidate.clone()) { + candidate = format!("{base_name}_{index}_{disambiguator}"); + disambiguator += 1; + } + candidate +} + +#[cfg(test)] +mod tests { + use super::*; + + fn mem(memory64: bool, shared: bool, page_size_log2: Option) -> MemoryType { + MemoryType { memory64, shared, initial: 1, maximum: Some(1), page_size_log2 } + } + + fn assert_unsupported(result: anyhow::Result, needle: &str) { + let err = result.expect_err("a non-32-bit memory must be rejected"); + let Some(WasmToVError::UnsupportedFeature { description }) = + err.downcast_ref::() + else { + panic!("expected UnsupportedFeature, got {err:?}"); + }; + assert!(description.contains(needle), "description names the feature: {description}"); + } + + #[test] + fn a_32_bit_memory_translates() { + // The default 32-bit, non-shared, default-page-size memory is the only + // shape the model encodes; it must still translate cleanly. + let limits = translate_memory_type_limits(&mem(false, false, None)) + .expect("a standard 32-bit memory translates"); + assert_eq!(limits, "{|lim_min := 1%N; lim_max := Some(1%N)|}"); + } + + #[test] + fn a_memory64_memory_is_rejected() { + // C-4: the translator must never silently encode a 64-bit machine as the + // 32-bit `Mm` record, which has no memory64 field. + assert_unsupported(translate_memory_type_limits(&mem(true, false, None)), "memory64"); + } + + #[test] + fn a_shared_memory_is_rejected() { + // L-1: a shared memory has no representable flag in the target model. 
+ assert_unsupported(translate_memory_type_limits(&mem(false, true, None)), "shared"); + } + + #[test] + fn a_custom_page_size_memory_is_rejected() { + assert_unsupported( + translate_memory_type_limits(&mem(false, false, Some(0))), + "custom page size", + ); + } } diff --git a/core/wasm-to-v/src/wasm_parser.rs b/core/wasm-to-v/src/wasm_parser.rs index f35b0202..48991128 100644 --- a/core/wasm-to-v/src/wasm_parser.rs +++ b/core/wasm-to-v/src/wasm_parser.rs @@ -96,7 +96,7 @@ use rustc_hash::FxHashMap; use std::collections::HashMap; use crate::errors::WasmToVError; -use crate::rocq_names::validate_rocq_identifier; +use crate::rocq_names::{sanitize_rocq_identifier, validate_rocq_identifier}; use crate::translator::WasmParseData; /// Translates WebAssembly bytecode into Rocq (Coq) formal verification code. @@ -348,8 +348,18 @@ fn parse( let mut func_names_map = HashMap::new(); for func_name in func_names { let func_name = func_name?; - func_names_map - .insert(func_name.index, func_name.name.to_string()); + // Function names are emitted verbatim as + // `Definition <name>`, so they must be + // legal Rocq identifiers. WASM names are + // not (Inference emits `Struct.method`; an + // adversarial external may use a Coq + // keyword). Sanitize at the decode boundary + // so no illegal identifier reaches Gallina; + // the translator de-duplicates the result.
+ func_names_map.insert( + func_name.index, + sanitize_rocq_identifier(func_name.name), + ); } if !func_names_map.is_empty() { wasm_parse_data.func_names_map = Some(func_names_map); diff --git a/tests/Cargo.toml b/tests/Cargo.toml index 4d6a0a95..9384bcdd 100644 --- a/tests/Cargo.toml +++ b/tests/Cargo.toml @@ -24,3 +24,4 @@ wasmprinter = "0.245.1" [dev-dependencies] cov-mark = { workspace = true, features = ["enable"] } +wat = "1.225.0" diff --git a/tests/src/analysis/rules_a024.rs b/tests/src/analysis/rules_a024.rs index 0ec16f88..a301f67f 100644 --- a/tests/src/analysis/rules_a024.rs +++ b/tests/src/analysis/rules_a024.rs @@ -167,6 +167,171 @@ mod analysis_rules_tests { assert_eq!(a024_count, 2, "expected 2 ExternFunctionCall errors for nested calls, got: {errors:?}"); } + #[test] + fn a024_call_to_bound_extern_accepted() { + // An extern bound to a source module via `use … from` lowers to a + // linker-satisfied import (issue #9, Phase 4), so calling it must NOT + // trigger A024 — only unbound bare externs remain uncompilable. + let source = r#" + external fn sum(a: i32, b: i32) -> i32; + use { sum } from arith; + fn main() -> i32 { return sum(1, 2); } + "#; + let result = analyze(source); + if let Err(ref e) = result { + let has_a024 = e + .errors() + .iter() + .any(|e| matches!(e, AnalysisDiagnostic::ExternFunctionCall { .. })); + assert!( + !has_a024, + "a bound extern call must not trigger A024, got: {:?}", + e.errors() + ); + } + } + + #[test] + fn a024_unbound_extern_rejected_when_other_extern_is_bound() { + // With one bound extern and one unbound bare extern, only the call to + // the unbound one is rejected. + let source = r#" + external fn sum(a: i32, b: i32) -> i32; + use { sum } from arith; + external fn raw(x: i32) -> i32; + fn main() -> i32 { return sum(1, 2) + raw(3); } + "#; + let errors = expect_errors(source); + let offending: Vec<&str> = errors + .iter() + .filter_map(|e| match e { + AnalysisDiagnostic::ExternFunctionCall { name, .. 
} => Some(name.as_str()), + _ => None, + }) + .collect(); + assert_eq!( + offending, + vec!["raw"], + "only the unbound extern `raw` should be rejected, got: {errors:?}" + ); + } + + #[test] + fn a024_top_level_use_does_not_bind_spec_inner_extern() { + // H8: a top-level `use { sort } from sorting;` is file-global but binds + // only top-level externs. With no top-level `sort` declared, the `use` + // names an undeclared top-level extern, so the type checker reports + // ExternImportNotDeclared rather than silently binding the spec-inner + // `sort` (which previously suppressed A024 and crashed proof-mode + // codegen). + let source = r#" + use { sort } from sorting; + spec Ms { + external fn sort(a: i32) -> i32; + fn run(x: i32) -> i32 { return sort(x); } + } + "#; + let arena = build_ast(source.to_string()); + let rendered = match inference_type_checker::TypeCheckerBuilder::build_typed_context(arena) + { + Ok(_) => panic!("a top-level use of an undeclared top-level extern must be rejected"), + Err(err) => format!("{err:#}"), + }; + assert!( + rendered.contains("sort") && rendered.contains("no `external fn"), + "expected ExternImportNotDeclared for `sort`, got: {rendered}" + ); + } + + #[test] + fn a024_spec_inner_extern_unbound_despite_same_named_bound_top_level() { + // H9/H10: a bound top-level `external fn sort` and a same-named, distinct + // spec-inner `external fn sort`. The `use` binds only the top-level + // declaration (resolution is by DefId, not name), so the spec-inner + // `sort` stays unbound and its call is A024-rejected — preventing the + // proof-mode miscompile where the spec body would call the merged + // top-level `sort` with a mismatched signature. 
+ let source = r#" + external fn sort(a: i32) -> i32; + use { sort } from sorting; + spec Ms { + external fn sort(a: i64, b: i64) -> i64; + fn run(x: i64, y: i64) -> i64 { return sort(x, y); } + } + "#; + let errors = expect_errors(source); + let has_sort_rejection = errors.iter().any(|e| { + matches!(e, AnalysisDiagnostic::ExternFunctionCall { name, .. } if name == "sort") + }); + assert!( + has_sort_rejection, + "the unbound spec-inner `sort` must be A024-rejected even though a same-named top-level extern is bound, got: {errors:?}" + ); + } + + #[test] + fn a024_bound_top_level_extern_call_not_flagged_when_unbound_spec_inner_shadows_it() { + // H1 (round-2 regression): a bound top-level `external fn sort` (via + // `use … from`) called from a top-level function MUST NOT be flagged + // just because a same-named, distinct, unbound spec-inner + // `external fn sort` exists. Resolution is scope-aware: the top-level + // call binds to the bound top-level declaration, the spec-inner call + // binds to the unbound spec-inner one. Only the latter is A024-rejected. + // A name-keyed check let the unbound spec-inner declaration poison the + // valid top-level call site (the round-2 H-1 false positive). + let source = r#" + external fn sort(a: i32) -> i32; + use { sort } from sorting; + fn main() -> i32 { return sort(7); } + spec Ms { + external fn sort(a: i64, b: i64) -> i64; + fn run(x: i64, y: i64) -> i64 { return sort(x, y); } + } + "#; + let errors = expect_errors(source); + let sort_rejections = errors + .iter() + .filter( + |e| matches!(e, AnalysisDiagnostic::ExternFunctionCall { name, .. 
} if name == "sort"), + ) + .count(); + assert_eq!( + sort_rejections, 1, + "exactly the unbound spec-inner `sort` call must be A024-rejected; the bound \ + top-level `sort(7)` call must NOT be flagged, got: {errors:?}" + ); + } + + #[test] + fn a024_bound_top_level_extern_call_accepted_despite_unbound_spec_inner_same_name() { + // H1 (round-2 regression), positive form: with ONLY the bound top-level + // `sort` called (the spec-inner `sort` is declared but never called), + // analysis must succeed — the uncalled unbound spec-inner declaration + // must not poison the valid top-level call. + let source = r#" + external fn sort(a: i32) -> i32; + use { sort } from sorting; + fn main() -> i32 { return sort(7); } + spec Ms { + external fn sort(a: i64, b: i64) -> i64; + fn pure_run(x: i64) -> i64 { return x; } + } + "#; + let result = analyze(source); + if let Err(ref e) = result { + let has_a024 = e + .errors() + .iter() + .any(|e| matches!(e, AnalysisDiagnostic::ExternFunctionCall { .. })); + assert!( + !has_a024, + "a bound top-level extern call must compile even when a same-named unbound \ + spec-inner extern is declared but uncalled, got: {:?}", + e.errors() + ); + } + } + #[test] fn a024_extern_function_call_in_const_array_inside_function() { let source = r#" diff --git a/tests/src/codegen/wasm/extern_import.rs b/tests/src/codegen/wasm/extern_import.rs new file mode 100644 index 00000000..c93a27e7 --- /dev/null +++ b/tests/src/codegen/wasm/extern_import.rs @@ -0,0 +1,381 @@ +//! Golden + structural tests for `external fn` import emission (issue #9, Phase 2). +//! +//! Each `external fn` bound to a source module via `use … from ` is +//! emitted as a WASM function import. Imports occupy the lowest function indices +//! (`0..N`), so every locally defined function is shifted by the import count and +//! every extern call lowers to its import index. +//! +//! These tests run codegen WITHOUT analysis to exercise the import-emission +//! 
shape in isolation, mirroring how the non-det golden tests bypass analysis. +//! (Analysis would also accept these fixtures: rule A024 rejects only *unbound* +//! externs, and every extern here is bound via `use … from`.) + +#[cfg(test)] +mod extern_import_tests { + use crate::utils::{ + assert_wasms_modules_equivalence, assert_wat_equivalence, get_test_file_path, + get_test_wasm_path, wasm_codegen_no_analysis, + }; + use inf_wasmparser::{ExternalKind, Operator, Parser, Payload, TypeRef, ValType}; + + /// A single `(module, field, type_idx)` triple read back from the import + /// section, plus the `Call` operands found in each defined function body. + struct ModuleShape { + imports: Vec<(String, String, u32)>, + /// Type index of every locally defined function, in definition order. + defined_func_types: Vec, + /// `(export_name, function_index)` for every exported function. + func_exports: Vec<(String, u32)>, + /// `Call` operands for each defined function body, in definition order. + calls_per_defined_func: Vec>, + } + + fn read_shape(wasm: &[u8]) -> ModuleShape { + let mut imports = Vec::new(); + let mut defined_func_types = Vec::new(); + let mut func_exports = Vec::new(); + let mut calls_per_defined_func = Vec::new(); + + for payload in Parser::new(0).parse_all(wasm) { + match payload.expect("valid wasm payload") { + Payload::ImportSection(reader) => { + for import in reader { + let import = import.expect("valid import"); + if let TypeRef::Func(type_idx) = import.ty { + imports.push(( + import.module.to_string(), + import.name.to_string(), + type_idx, + )); + } else { + panic!("unexpected non-function import: {import:?}"); + } + } + } + Payload::FunctionSection(reader) => { + for type_idx in reader { + defined_func_types.push(type_idx.expect("valid function type idx")); + } + } + Payload::ExportSection(reader) => { + for export in reader { + let export = export.expect("valid export"); + if export.kind == ExternalKind::Func { + 
func_exports.push((export.name.to_string(), export.index)); + } + } + } + Payload::CodeSectionEntry(body) => { + let mut calls = Vec::new(); + let reader = body.get_operators_reader().expect("operators reader"); + for op in reader { + // Fail fast on a malformed operator stream: silently + // skipping decode errors could mask broken codegen output + // and let a structural assertion pass on garbage. + if let Operator::Call { function_index } = op.expect("operator decodes") { + calls.push(function_index); + } + } + calls_per_defined_func.push(calls); + } + _ => {} + } + } + + ModuleShape { + imports, + defined_func_types, + func_exports, + calls_per_defined_func, + } + } + + /// Reads every function type in the type section, in type-index order, as + /// `(params, results)`. Lets a test resolve an import's `type_idx` to the + /// concrete WASM value types of the emitted import signature. + fn read_func_types(wasm: &[u8]) -> Vec<(Vec, Vec)> { + let mut func_types = Vec::new(); + for payload in Parser::new(0).parse_all(wasm) { + if let Payload::TypeSection(reader) = payload.expect("valid wasm payload") { + for func_type in reader.into_iter_err_on_gc_types() { + let func_type = func_type.expect("function type"); + func_types.push(( + func_type.params().to_vec(), + func_type.results().to_vec(), + )); + } + } + } + func_types + } + + fn compile(test_name: &str) -> Vec { + let test_file_path = get_test_file_path(module_path!(), test_name); + let source_code = std::fs::read_to_string(&test_file_path) + .unwrap_or_else(|_| panic!("Failed to read test file: {test_file_path:?}")); + let wasm = wasm_codegen_no_analysis(&source_code); + inf_wasmparser::validate(&wasm) + .unwrap_or_else(|e| panic!("Generated Wasm module is invalid for {test_name}: {e}")); + wasm + } + + fn assert_matches_golden(test_name: &str, actual: &[u8]) { + let expected_path = get_test_wasm_path(module_path!(), test_name); + let expected = std::fs::read(&expected_path) + .unwrap_or_else(|_| panic!("Failed 
to read expected wasm for: {test_name}")); + assert_wasms_modules_equivalence(&expected, actual); + assert_wat_equivalence(actual, module_path!(), test_name); + } + + /// One extern, one local function. The import takes function index 0 and the + /// local `add_three` is shifted to index 1; the call to `sum` lowers to the + /// import index 0. + #[test] + fn single_import_test() { + cov_mark::check!(wasm_codegen_emit_import_section); + cov_mark::check!(wasm_codegen_emit_extern_call); + let test_name = "single_import"; + let actual = compile(test_name); + assert_matches_golden(test_name, &actual); + + let shape = read_shape(&actual); + assert_eq!( + shape.imports, + vec![("arith".to_string(), "sum".to_string(), 0)], + "expected one import (arith.sum) referencing type 0" + ); + // One local function, shifted past the single import. + assert_eq!(shape.defined_func_types.len(), 1, "one local function"); + assert_eq!( + shape.func_exports, + vec![("add_three".to_string(), 1)], + "local add_three is shifted to index 1 (after the import)" + ); + assert_eq!( + shape.calls_per_defined_func, + vec![vec![0]], + "the call to sum lowers to import index 0" + ); + } + + /// Two externs, called in a nested expression. Imports take indices 0 and 1, + /// the local `compute` is shifted to index 2, and `sum(neg(x), 3)` lowers to + /// `call 1` (neg) then `call 0` (sum). 
+ #[test] + fn multi_import_test() { + let test_name = "multi_import"; + let actual = compile(test_name); + assert_matches_golden(test_name, &actual); + + let shape = read_shape(&actual); + assert_eq!( + shape.imports, + vec![ + ("arith".to_string(), "sum".to_string(), 0), + ("arith".to_string(), "neg".to_string(), 1), + ], + "two imports in declaration order at indices 0 and 1" + ); + assert_eq!( + shape.func_exports, + vec![("compute".to_string(), 2)], + "local compute is shifted to index 2 (after both imports)" + ); + assert_eq!( + shape.calls_per_defined_func, + vec![vec![1, 0]], + "nested call order: neg (import 1) evaluated before sum (import 0)" + ); + } + + /// One extern plus two local functions. Both locals shift past the import: + /// `helper` -> index 1, `entry` -> index 2. `entry` calls `helper` (local + /// index 1) and `ext_double` (import index 0). + #[test] + fn import_with_locals_test() { + let test_name = "import_with_locals"; + let actual = compile(test_name); + assert_matches_golden(test_name, &actual); + + let shape = read_shape(&actual); + assert_eq!( + shape.imports, + vec![("helpers".to_string(), "ext_double".to_string(), 0)], + "single import at index 0" + ); + assert_eq!(shape.defined_func_types.len(), 2, "two local functions"); + assert_eq!( + shape.func_exports, + vec![("helper".to_string(), 1), ("entry".to_string(), 2)], + "both locals shift past the import (helper -> 1, entry -> 2)" + ); + // entry is the second defined function body; it calls helper (local + // index 1) then ext_double (import index 0). + assert_eq!( + shape.calls_per_defined_func[1], + vec![1, 0], + "entry calls local helper (idx 1) then extern ext_double (import idx 0)" + ); + } + + /// Two externs with an identical signature share a single type entry: both + /// `inc` and `dec` reference type 0, while the local `run` body interns its + /// own type. Verifies import-against-import type deduplication. 
+ #[test] + fn import_dedup_test() { + let test_name = "import_dedup"; + let actual = compile(test_name); + assert_matches_golden(test_name, &actual); + + let shape = read_shape(&actual); + assert_eq!( + shape.imports, + vec![ + ("arith".to_string(), "inc".to_string(), 0), + ("arith".to_string(), "dec".to_string(), 0), + ], + "both same-signature imports dedup onto type 0" + ); + assert_eq!( + shape.func_exports, + vec![("run".to_string(), 2)], + "local run is shifted to index 2" + ); + assert_eq!( + shape.calls_per_defined_func, + vec![vec![1, 0]], + "inc(dec(x)): dec (import 1) evaluated first, then inc (import 0)" + ); + } + + /// A bound `external fn` that is never called still emits its import and + /// still shifts local functions: import emission is driven by the + /// declaration + binding, not by call sites. + #[test] + fn uncalled_bound_extern_still_emits_import() { + let source = "\ +external fn unused(a: i32) -> i32; +use { unused } from lib; + +pub fn run(x: i32) -> i32 { + return x; +} +"; + let wasm = wasm_codegen_no_analysis(source); + inf_wasmparser::validate(&wasm).expect("invalid wasm"); + let shape = read_shape(&wasm); + assert_eq!( + shape.imports, + vec![("lib".to_string(), "unused".to_string(), 0)], + "uncalled but bound extern is still imported" + ); + assert_eq!( + shape.func_exports, + vec![("run".to_string(), 1)], + "local run is still shifted past the import" + ); + assert_eq!( + shape.calls_per_defined_func, + vec![Vec::::new()], + "run makes no calls" + ); + } + + /// A bare `external fn` with no binding `use` carries no provenance, so it is + /// skipped: no import is emitted and local functions keep index `0` — the + /// output is identical to a program with no externs at all. 
+ #[test] + fn unbound_extern_emits_no_import() { + let source = "\ +external fn bare(a: i32) -> i32; + +pub fn run(x: i32) -> i32 { + return x; +} +"; + let wasm = wasm_codegen_no_analysis(source); + inf_wasmparser::validate(&wasm).expect("invalid wasm"); + let shape = read_shape(&wasm); + assert!( + shape.imports.is_empty(), + "unbound extern must not be emitted as an import: {:?}", + shape.imports + ); + assert_eq!( + shape.func_exports, + vec![("run".to_string(), 0)], + "with no imports the local function keeps index 0" + ); + } + + /// An ignored extern parameter (`_: i32`) still occupies an ABI slot: the + /// call site pushes the argument and the real `.wasm` export declares the + /// parameter, so it must appear in the emitted import signature. This locks + /// codegen's `import_param_types` in lock-step with the validator's + /// `lower_extern_signature`, which already treats `Ignored` as a real param. + #[test] + fn ignored_extern_param_present_in_import_signature() { + let source = "\ +external fn f(_: i32, x: i64) -> i32; +use { f } from m; + +pub fn main() -> i32 { + let a: i32 = 7; + let b: i64 = 9; + return f(a, b); +} +"; + let wasm = wasm_codegen_no_analysis(source); + inf_wasmparser::validate(&wasm).expect("invalid wasm"); + + let shape = read_shape(&wasm); + assert_eq!( + shape.imports, + vec![("m".to_string(), "f".to_string(), 0)], + "the bound extern f is imported from module m at type 0" + ); + + let func_types = read_func_types(&wasm); + let (params, results) = &func_types[shape.imports[0].2 as usize]; + assert_eq!( + params.as_slice(), + &[ValType::I32, ValType::I64], + "the ignored first parameter is present: import params are [i32, i64]" + ); + assert_eq!( + results.as_slice(), + &[ValType::I32], + "import result is [i32]" + ); + } + + /// Regenerates the golden `.wasm` and `.wat` for every extern-import test. + /// Run with `--ignored` after intentional codegen changes. 
+ #[test] + #[ignore] + fn regenerate_extern_import_wasm() { + use crate::utils::{get_test_data_path, regenerate_wat}; + + for test_name in [ + "single_import", + "multi_import", + "import_with_locals", + "import_dedup", + ] { + let dir = get_test_data_path() + .join("codegen") + .join("wasm") + .join("extern_import") + .join(test_name); + let source_code = std::fs::read_to_string(dir.join(format!("{test_name}.inf"))) + .unwrap_or_else(|_| panic!("Failed to read {test_name}.inf")); + let actual = wasm_codegen_no_analysis(&source_code); + inf_wasmparser::validate(&actual) + .unwrap_or_else(|e| panic!("Generated Wasm module is invalid: {e}")); + let wasm_path = dir.join(format!("{test_name}.wasm")); + std::fs::write(&wasm_path, &actual) + .unwrap_or_else(|e| panic!("Failed to write {}: {e}", wasm_path.display())); + regenerate_wat(&actual, &dir, test_name); + } + } +} diff --git a/tests/src/codegen/wasm/extern_link.rs b/tests/src/codegen/wasm/extern_link.rs new file mode 100644 index 00000000..e5ce7826 --- /dev/null +++ b/tests/src/codegen/wasm/extern_link.rs @@ -0,0 +1,323 @@ +//! End-to-end test for external `.wasm` linking (issue #9, Phase 4). +//! +//! Drives the full pipeline an `infc` invocation with `-L` runs: a program that +//! `use`s a function from an external module is compiled, the external module is +//! resolved off a search path and statically merged in, and the result is +//! translated to Rocq. The two end-to-end guarantees the merge makes are +//! asserted directly: +//! +//! - the unified `.wasm` has **no cross-module imports** — the external bytes +//! are folded in, not referenced; and +//! - the unified `.v` carries the merged function as an **ordinary named +//! definition** with **no orphan `Mi` import record** for the merged module. +//! +//! The external fixture is itself produced by the Inference compiler: a tiny +//! library exporting `pub fn sum` lowers to a `.wasm` whose `sum` export backs +//! the main program's `external fn sum`. 
+ +#[cfg(test)] +mod extern_link_tests { + use std::path::{Path, PathBuf}; + + use inference::wasm_link::{resolve_external_modules, SearchPath}; + use inference::{codegen, link, parse, type_check, wasm_to_v, FxHashMap}; + use inf_wasmparser::{Parser, Payload, TypeRef}; + + /// Compiles `source` to a `.wasm` with the default settings, skipping the + /// analysis phase — this codegen path does not need it. (The library sources + /// passed here define no externs; in the main programs that do, A024 accepts + /// the bound externs and rejects only unbound ones.) + fn compile_wasm(source: &str, module_name: &str) -> Vec { + let arena = parse(source).expect("library source parses"); + let typed = type_check(arena).expect("library source type-checks"); + let output = codegen(&typed, module_name).expect("library codegen succeeds"); + output.wasm().to_vec() + } + + /// A throwaway directory under the system temp dir, unique to this test run, + /// removed on drop. + struct TempLibDir { + path: PathBuf, + } + + impl TempLibDir { + fn new(tag: &str) -> Self { + let nanos = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .as_nanos(); + let path = std::env::temp_dir().join(format!( + "inference-extern-link-{tag}-{}-{nanos}", + std::process::id() + )); + std::fs::create_dir_all(&path).expect("create temp lib dir"); + TempLibDir { path } + } + + /// Writes `bytes` to `/`, creating parent directories. + fn write_module(&self, relative: &Path, bytes: &[u8]) { + let dest = self.path.join(relative); + if let Some(parent) = dest.parent() { + std::fs::create_dir_all(parent).expect("create module parent dir"); + } + std::fs::write(dest, bytes).expect("write external module"); + } + + fn path(&self) -> &Path { + &self.path + } + } + + impl Drop for TempLibDir { + fn drop(&mut self) { + let _ = std::fs::remove_dir_all(&self.path); + } + } + + /// The `(module, field)` of every function import in `wasm`. 
+ fn function_imports(wasm: &[u8]) -> Vec<(String, String)> { + let mut imports = Vec::new(); + for payload in Parser::new(0).parse_all(wasm) { + if let Payload::ImportSection(reader) = payload.expect("valid payload") { + for import in reader { + let import = import.expect("valid import"); + if matches!(import.ty, TypeRef::Func(_)) { + imports.push((import.module.to_string(), import.name.to_string())); + } + } + } + } + imports + } + + /// Runs the pipeline an `infc -L main.inf -v` invocation runs and + /// returns the unified `.wasm` together with its Rocq translation. + fn compile_and_link(main_source: &str, lib_dir: &Path, module_name: &str) -> (Vec, String) { + let arena = parse(main_source).expect("main source parses"); + let typed = type_check(arena).expect("main source type-checks"); + + let mut search_path = SearchPath::new(); + search_path.push_lib_dir(lib_dir.to_path_buf()); + let externals = resolve_external_modules(&typed, &search_path, None) + .expect("external modules resolve and validate"); + let external_bytes: Vec<(&str, &[u8])> = externals + .iter() + .map(|m| (m.logical_module.as_str(), m.bytes.as_slice())) + .collect(); + + let codegen_output = codegen(&typed, module_name).expect("main codegen succeeds"); + + // Sanity guard: the *unlinked* codegen output carries the import, so its + // translation contains an `Mi` record. This makes the post-link absence + // of `Mi` a real difference rather than a vacuous pass. 
+ let empty: FxHashMap> = FxHashMap::default(); + let pre_link_rocq = wasm_to_v(module_name, codegen_output.wasm(), &empty) + .expect("unlinked wasm-to-v succeeds"); + assert!( + pre_link_rocq.contains("Mi "), + "the unlinked module must still carry an import record; .v was:\n{pre_link_rocq}" + ); + + let unified = link(codegen_output.wasm(), &external_bytes).expect("link succeeds"); + let rocq = wasm_to_v(module_name, &unified, &empty).expect("wasm-to-v succeeds"); + (unified, rocq) + } + + #[test] + fn single_extern_links_to_self_contained_wasm_and_v() { + // The external library exports `sum`; the main program binds it via + // `use { sum } from arith;` and calls it. + let lib_wasm = compile_wasm( + "pub fn sum(a: i32, b: i32) -> i32 { return a + b; }", + "arith", + ); + + let lib_dir = TempLibDir::new("single"); + // Logical module `arith` resolves to `/arith.wasm`. + lib_dir.write_module(Path::new("arith.wasm"), &lib_wasm); + + let main_source = "external fn sum(a: i32, b: i32) -> i32;\n\ + use { sum } from arith;\n\ + pub fn add_three(x: i32) -> i32 { return sum(x, 3); }"; + + let (unified, rocq) = compile_and_link(main_source, lib_dir.path(), "extern_link"); + + // The unified module is valid and self-contained: no import references + // the external module any more. + inf_wasmparser::validate(&unified).expect("unified module is valid wasm"); + assert!( + function_imports(&unified).is_empty(), + "unified module must have no cross-module imports, found {:?}", + function_imports(&unified) + ); + + // The merged `sum` reads as an ordinary named Rocq definition, prefixed + // with its logical module so two libraries exporting the same field can + // never collide in the name section. The linker emits `arith.sum`, which + // wasm-to-v sanitizes (every non-alphanumeric to `_`) to `arith_sum`. 
+ assert!( + rocq.contains("Definition arith_sum"), + "merged function must be a module-prefixed named Rocq definition; .v was:\n{rocq}" + ); + + // No orphan import record survives for the merged module: the linker + // removed the import section, so wasm-to-v emits no `Mi` for it. + assert!( + !rocq.contains("Mi \"arith\""), + "merged module must leave no orphan `Mi` import record; .v was:\n{rocq}" + ); + assert!( + !rocq.contains("MID_func"), + "a self-contained module imports nothing, so no `MID_func` should appear; .v was:\n{rocq}" + ); + } + + #[test] + fn nested_module_path_resolves_and_links() { + // A `::`-separated logical module must resolve to a nested file + // (`crypto::adder` -> `/crypto/adder.wasm`) and link identically. + let lib_wasm = compile_wasm( + "pub fn combine(a: i32, b: i32) -> i32 { return a + b; }", + "adder", + ); + + let lib_dir = TempLibDir::new("nested"); + lib_dir.write_module(Path::new("crypto").join("adder.wasm").as_path(), &lib_wasm); + + let main_source = "external fn combine(a: i32, b: i32) -> i32;\n\ + use { combine } from crypto::adder;\n\ + pub fn run(x: i32) -> i32 { return combine(x, x); }"; + + let (unified, rocq) = compile_and_link(main_source, lib_dir.path(), "nested_link"); + + inf_wasmparser::validate(&unified).expect("unified module is valid wasm"); + assert!( + function_imports(&unified).is_empty(), + "no cross-module imports may remain, found {:?}", + function_imports(&unified) + ); + // The merged `combine` is prefixed with its `::`-separated logical + // module: the linker emits `crypto::adder.combine`, which wasm-to-v + // sanitizes (every non-alphanumeric to `_`, then `__` runs collapsed) to + // `crypto_adder_combine`. 
+ assert!( + rocq.contains("Definition crypto_adder_combine"), + "merged function must be a module-prefixed named Rocq definition; .v was:\n{rocq}" + ); + assert!( + !rocq.contains("Mi \"crypto::adder\""), + "no orphan `Mi` import record for the merged module; .v was:\n{rocq}" + ); + } + + #[test] + fn program_without_externs_is_unchanged_by_the_link_step() { + // A program that binds no externs resolves to an empty external set and + // the link step is a byte-identical pass-through of codegen output. + let main_source = "pub fn double(x: i32) -> i32 { return x + x; }"; + let arena = parse(main_source).expect("parses"); + let typed = type_check(arena).expect("type-checks"); + + let externals = + resolve_external_modules(&typed, &SearchPath::new(), None).expect("no externs"); + assert!(externals.is_empty(), "program binds no external modules"); + + let codegen_output = codegen(&typed, "plain").expect("codegen succeeds"); + let unified = link(codegen_output.wasm(), &[]).expect("link is a no-op"); + assert_eq!( + unified, + codegen_output.wasm(), + "the link step must not alter an extern-free module" + ); + } + + #[test] + fn link_with_no_externals_does_not_silently_pass_through_dangling_imports() { + // Fail-closed: the empty-externals fast path is keyed on the module being + // import-free, not merely on the externals slice being empty. A module + // that still carries an import but is given no externals to satisfy it + // must error (unsatisfied import), never pass through with the import + // intact. (In the CLI flow externals are always resolved first; this + // guards the public `inference::link` contract against misuse.) 
+ let import_bearing = wat::parse_str( + r#"(module + (import "arith" "sum" (func (param i32 i32) (result i32))) + (func (export "run") (result i32) + i32.const 1 i32.const 2 call 0))"#, + ) + .expect("fixture assembles"); + assert!( + link(&import_bearing, &[]).is_err(), + "a module with an unsatisfied import must not pass through as Ok" + ); + + // And malformed bytes must surface a parse error, not Ok(garbage). + assert!( + link(&[0x00, 0x61, 0x73, 0x6d, 0xff], &[]).is_err(), + "malformed main bytes must be a link error, not a silent pass-through" + ); + } + + #[test] + fn proof_mode_spec_indices_name_the_spec_function_not_the_merged_extern() { + // C1: a proof-mode program that binds an extern AND declares a spec. + // Codegen records the spec function's index in the *pre-link* space, + // which counts the import (`spec_func_base = import_count + ...`). After + // the link removes the import and shifts indices down, the embedded + // `inference.spec_funcs` section the linker rewrites must name the spec + // function `check` (post-link index 1), not the merged extern `sum` + // (post-link index 2). Translating with an empty explicit map makes the + // translator adopt the embedded post-link section as the source of truth. 
+ let lib_wasm = compile_wasm( + "pub fn sum(a: i32, b: i32) -> i32 { return a + b; }", + "arith", + ); + let lib_dir = TempLibDir::new("c1_spec"); + lib_dir.write_module(Path::new("arith.wasm"), &lib_wasm); + + let main_source = "external fn sum(a: i32, b: i32) -> i32;\n\ + use { sum } from arith;\n\ + pub fn add_three(x: i32) -> i32 { return sum(x, 3); }\n\ + spec MySpec {\n\ + fn check(x: i32) -> i32 { return sum(x, x); }\n\ + }"; + + let arena = parse(main_source).expect("main parses"); + let typed = type_check(arena).expect("main type-checks"); + + let mut search_path = SearchPath::new(); + search_path.push_lib_dir(lib_dir.path().to_path_buf()); + let externals = resolve_external_modules(&typed, &search_path, None) + .expect("external modules resolve"); + let external_bytes: Vec<(&str, &[u8])> = externals + .iter() + .map(|m| (m.logical_module.as_str(), m.bytes.as_slice())) + .collect(); + + let target = inference_wasm_codegen::Target::default(); + let mode = inference_wasm_codegen::CompilationMode::Proof; + let codegen_output = inference_wasm_codegen::codegen( + &typed, + target, + mode, + target.default_opt_level(), + "c1prog", + ) + .expect("proof-mode codegen succeeds"); + + let unified = link(codegen_output.wasm(), &external_bytes).expect("link succeeds"); + inf_wasmparser::validate(&unified).expect("unified module is valid wasm"); + + // Empty explicit map: the post-link embedded section is the source of + // truth (the pre-link codegen indices would be stale here). + let empty: FxHashMap> = FxHashMap::default(); + let rocq = wasm_to_v("c1prog", &unified, &empty).expect("wasm-to-v succeeds"); + + // Post-link indices: add_three=0, check=1, merged sum=2. 
+ assert!( + rocq.contains("Definition c1prog__MySpec_specs : list N := (1 :: nil)%N."), + "MySpec_specs must name `check` at post-link index 1, not the merged \ + extern at 2; .v was:\n{rocq}" + ); + } +} diff --git a/tests/src/codegen/wasm/extern_link_exec.rs b/tests/src/codegen/wasm/extern_link_exec.rs new file mode 100644 index 00000000..b30fd442 --- /dev/null +++ b/tests/src/codegen/wasm/extern_link_exec.rs @@ -0,0 +1,321 @@ +//! Semantic execution tests for the static-merge linker (issue #9, audit S5). +//! +//! Every other linker test asserts only *structure* plus `inf_wasmparser::validate` +//! passing. That leaves a soundness hole the audit named S5: a re-index bug that +//! produced a **validating** module which nonetheless wired an import onto the +//! *wrong* same-signature body would pass every structural assertion. Two external +//! functions with identical signatures (`sum`/`sub`, `store_at`/`load_at`) are the +//! canonical trap — swapping their merged bodies keeps the module valid but changes +//! what it computes. +//! +//! These tests close that hole by **executing** the merged module. Each fixture is +//! assembled from inline WAT (mirroring the linker's own integration tests), driven +//! through the real `inference::link`, instantiated in `wasmtime`, and asserted on +//! the *computed result* — a value chosen to distinguish correct wiring from the +//! plausible swap. The merged module exports its shared memory and its entry +//! function, so a Tier-B round-trip can be observed directly through that memory. + +#[cfg(test)] +mod extern_link_exec_tests { + use inference::link; + use wasmtime::{Engine, Instance, Module, Store, TypedFunc}; + + /// Assembles a `.wasm` binary from WAT, panicking with the WAT on error. 
+ fn wasm(wat: &str) -> Vec { + wat::parse_str(wat).unwrap_or_else(|e| panic!("invalid WAT fixture: {e}\n{wat}")) + } + + /// Instantiates `wasm` with no imports (the merge removes them all) and hands + /// the caller the live `Store`/`Instance` to read exports from. + fn instantiate(wasm: &[u8]) -> (Store<()>, Instance) { + let engine = Engine::default(); + let module = + Module::new(&engine, wasm).unwrap_or_else(|e| panic!("merged module rejected: {e}")); + let mut store = Store::new(&engine, ()); + let instance = Instance::new(&mut store, &module, &[]) + .unwrap_or_else(|e| panic!("merged module failed to instantiate: {e}")); + (store, instance) + } + + #[test] + fn tier_a_merge_wires_each_same_signature_extern_to_its_own_body() { + // Two externals with the *same* `(i32,i32)->i32` signature. `run` calls + // both, so a re-index that swapped the merged bodies would still validate + // but compute a different number. `run(a,b) = sum(a,b) - sub(a,b)`: + // correct = (a+b) - (a-b) = 2b + // swapped = (a-b) - (a+b) = -2b + // run(7,3) is 6 when wired correctly and -6 under a swap — distinct, and + // distinct from the also-plausible "both wired to sum" (0) and "both wired + // to sub" (0) miswirings. 
+ let main = wasm( + r#" + (module + (type (;0;) (func (param i32 i32) (result i32))) + (import "arith" "sum" (func (;0;) (type 0))) + (import "arith" "sub" (func (;1;) (type 0))) + (func (;2;) (type 0) (param i32 i32) (result i32) + local.get 0 + local.get 1 + call 0 + local.get 0 + local.get 1 + call 1 + i32.sub) + (export "run" (func 2))) + "#, + ); + let lib = wasm( + r#" + (module + (type (;0;) (func (param i32 i32) (result i32))) + (func (;0;) (type 0) (param i32 i32) (result i32) + local.get 0 + local.get 1 + i32.add) + (func (;1;) (type 0) (param i32 i32) (result i32) + local.get 0 + local.get 1 + i32.sub) + (export "sum" (func 0)) + (export "sub" (func 1))) + "#, + ); + + let linked = link(&main, &[("arith", &lib)]).expect("Tier-A merge succeeds"); + inf_wasmparser::validate(&linked).expect("merged module is valid wasm"); + + let (mut store, instance) = instantiate(&linked); + let run: TypedFunc<(i32, i32), i32> = instance + .get_typed_func(&mut store, "run") + .expect("merged module exports `run`"); + + assert_eq!( + run.call(&mut store, (7, 3)).expect("run(7,3) executes"), + 6, + "run(a,b) must be sum-then-sub = 2b; a swapped wiring would yield -6" + ); + assert_eq!( + run.call(&mut store, (10, 4)).expect("run(10,4) executes"), + 8, + "second point pins 2b again, ruling out a constant-offset coincidence" + ); + assert_eq!( + run.call(&mut store, (3, 0)).expect("run(3,0) executes"), + 0, + "b=0 collapses both directions to 0, guarding the sign of the wiring" + ); + } + + #[test] + fn tier_b_store_load_round_trips_through_shared_memory() { + // Two externals over the main module's shared memory: `store_at(ptr,val)` + // writes, `load_at(ptr)` reads. Both touch memory only through their + // caller-passed pointer — Tier B. `run(ptr,val)` stores then loads back the + // same address, so a correct merge round-trips the value. 
Storing at one + // address and loading another (`isolate`) confirms the two distinct-but- + // same-family bodies were not collapsed onto one address. + let main = wasm( + r#" + (module + (type (;0;) (func (param i32 i32))) + (type (;1;) (func (param i32) (result i32))) + (type (;2;) (func (param i32 i32) (result i32))) + (import "memlib" "store_at" (func (;0;) (type 0))) + (import "memlib" "load_at" (func (;1;) (type 1))) + (memory (;0;) 1 1) + (func (;2;) (type 2) (param i32 i32) (result i32) + local.get 0 + local.get 1 + call 0 + local.get 0 + call 1) + (func (;3;) (type 2) (param i32 i32) (result i32) + local.get 0 + local.get 1 + call 0 + local.get 0 + i32.const 4 + i32.add + call 1) + (export "memory" (memory 0)) + (export "run" (func 2)) + (export "isolate" (func 3))) + "#, + ); + let lib = wasm( + r#" + (module + (type (;0;) (func (param i32 i32))) + (type (;1;) (func (param i32) (result i32))) + (memory (;0;) 1) + (func (;0;) (type 0) (param i32 i32) + local.get 0 + local.get 1 + i32.store) + (func (;1;) (type 1) (param i32) (result i32) + local.get 0 + i32.load) + (export "store_at" (func 0)) + (export "load_at" (func 1))) + "#, + ); + + let linked = link(&main, &[("memlib", &lib)]).expect("Tier-B merge succeeds"); + inf_wasmparser::validate(&linked).expect("merged module is valid wasm"); + + let (mut store, instance) = instantiate(&linked); + let run: TypedFunc<(i32, i32), i32> = instance + .get_typed_func(&mut store, "run") + .expect("merged module exports `run`"); + + for &(ptr, val) in &[(0_i32, 42_i32), (16, -7), (256, 1_000_000)] { + assert_eq!( + run.call(&mut store, (ptr, val)).expect("store-then-load executes"), + val, + "store_at then load_at over shared memory must round-trip the value" + ); + } + + // Storing at `ptr` and loading `ptr+4` reads a slot `run` never wrote: the + // store and load are wired to genuinely different addresses, not collapsed. 
+ let isolate: TypedFunc<(i32, i32), i32> = instance + .get_typed_func(&mut store, "isolate") + .expect("merged module exports `isolate`"); + assert_eq!( + isolate.call(&mut store, (64, 99)).expect("isolate executes"), + 0, + "loading an untouched neighbouring slot must read the memory's zero init" + ); + + // The store is observable in the module's *own* exported memory — direct + // confirmation the merge folded both bodies onto the one shared memory. + run.call(&mut store, (128, 0x1234)).expect("seed memory"); + let memory = instance + .get_memory(&mut store, "memory") + .expect("merged module exports its shared memory"); + let mut slot = [0u8; 4]; + memory + .read(&store, 128, &mut slot) + .expect("read the written slot"); + assert_eq!( + i32::from_le_bytes(slot), + 0x1234, + "the merged store must be visible in the shared exported memory" + ); + } + + #[test] + fn interprocedural_tier_b_sort_pair_moves_the_right_bytes() { + // The multi-function param-addressed case: `sort_pair(ptr)` reads the two + // i32s at `[ptr]` and `[ptr+4]` and, when out of order, calls `swap(ptr)` + // to exchange them. Every address is derived from the caller's `ptr`, so + // the interprocedural provenance fixpoint admits it to Tier B. Correctness + // here is byte movement through shared memory: a re-index that mis-wired + // `sort_pair`'s `call` to `swap` (or swapped the load/store offsets) would + // leave the pair unsorted or corrupt it. `run` writes a pair, sorts it, and + // returns both slots packed so a single result pins the whole outcome. 
+ let main = wasm( + r#" + (module + (type (;0;) (func (param i32))) + (type (;1;) (func (param i32 i32 i32) (result i32))) + (import "sortlib" "sort_pair" (func (;0;) (type 0))) + (memory (;0;) 1 1) + (func (;1;) (type 1) (param i32 i32 i32) (result i32) + ;; write lo-candidate and hi-candidate at base and base+4 + local.get 0 + local.get 1 + i32.store + local.get 0 + i32.const 4 + i32.add + local.get 2 + i32.store + ;; sort the pair in place + local.get 0 + call 0 + ;; pack: low slot in the high half, high slot in the low half so the + ;; ordering is visible in one i32 result (lo*1000 + hi). + local.get 0 + i32.load + i32.const 1000 + i32.mul + local.get 0 + i32.const 4 + i32.add + i32.load + i32.add) + (export "memory" (memory 0)) + (export "run" (func 1))) + "#, + ); + let lib = wasm( + r#" + (module + (type (;0;) (func (param i32))) + (memory (;0;) 1) + ;; swap(ptr): exchange [ptr] and [ptr+4] + (func (;0;) (type 0) (param i32) + (local i32 i32) + local.get 0 + i32.load + local.set 1 + local.get 0 + i32.const 4 + i32.add + i32.load + local.set 2 + local.get 0 + local.get 2 + i32.store + local.get 0 + i32.const 4 + i32.add + local.get 1 + i32.store) + ;; sort_pair(ptr): if [ptr] > [ptr+4], swap + (func (;1;) (type 0) (param i32) + local.get 0 + i32.load + local.get 0 + i32.const 4 + i32.add + i32.load + i32.gt_s + if + local.get 0 + call 0 + end) + (export "sort_pair" (func 1))) + "#, + ); + + let linked = link(&main, &[("sortlib", &lib)]).expect("interprocedural Tier-B merge succeeds"); + inf_wasmparser::validate(&linked).expect("merged module is valid wasm"); + + let (mut store, instance) = instantiate(&linked); + let run: TypedFunc<(i32, i32, i32), i32> = instance + .get_typed_func(&mut store, "run") + .expect("merged module exports `run`"); + + // Already sorted: stays (3, 9) -> 3*1000 + 9. 
+ assert_eq!( + run.call(&mut store, (0, 3, 9)).expect("run executes"), + 3009, + "an already-ordered pair must be left untouched" + ); + // Out of order: (9, 3) must become (3, 9) -> 3009. A mis-wired call or a + // bad offset would leave 9003 or some corrupt mix. + assert_eq!( + run.call(&mut store, (16, 9, 3)).expect("run executes"), + 3009, + "an out-of-order pair must be sorted by the merged swap" + ); + // Negative values exercise the signed comparison through the merge. + assert_eq!( + run.call(&mut store, (32, 5, -5)).expect("run executes"), + -4995, + "sorting must place -5 low and 5 high: -5*1000 + 5" + ); + } +} diff --git a/tests/src/codegen/wasm/mod.rs b/tests/src/codegen/wasm/mod.rs index 07f9af98..6a250293 100644 --- a/tests/src/codegen/wasm/mod.rs +++ b/tests/src/codegen/wasm/mod.rs @@ -17,6 +17,9 @@ mod binops_unary_combos; mod binops_u32; mod bounds_check; mod expr_deep_nesting; +mod extern_import; +mod extern_link; +mod extern_link_exec; mod inference_wrapper; mod loops; mod negative; diff --git a/tests/src/spec_propagation.rs b/tests/src/spec_propagation.rs index e3e78354..a5963420 100644 --- a/tests/src/spec_propagation.rs +++ b/tests/src/spec_propagation.rs @@ -1493,3 +1493,79 @@ mod scenario_10_wasm_to_v_compile_mode { ); } } + +// ============================================================================ +// Scenario 11: Over-long spec name rejected at codegen (D2) +// ============================================================================ +#[cfg(test)] +mod scenario_11_overlong_spec_name { + use crate::utils::build_ast; + use inference_type_checker::TypeCheckerBuilder; + use inference_wasm_codegen::{CompilationMode, OptLevel, Target}; + + /// Both `inference.spec_funcs` decoders (the linker and the Rocq + /// translator) reject a spec name longer than 255 bytes. Codegen must + /// refuse to emit such a name up front rather than produce a `.wasm` + /// artifact that fails its own downstream link/translate step. 
+ #[test] + fn spec_name_over_255_bytes_is_rejected_at_codegen() { + let long_name = "S".repeat(256); + let source = format!( + "fn foo(x: i32) -> i32 {{ return x; }}\n\ + spec {long_name} {{\n \ + fn prop() forall {{\n \ + let i: i32 = @;\n \ + assert(foo(i) == i);\n \ + }}\n\ + }}\n" + ); + + let arena = build_ast(source); + let typed_context = TypeCheckerBuilder::build_typed_context(arena) + .expect("type check should succeed") + .typed_context(); + let err = inference_wasm_codegen::codegen( + &typed_context, + Target::Wasm32, + CompilationMode::Proof, + OptLevel::O3, + "output", + ) + .expect_err("codegen must reject a spec name exceeding 255 bytes"); + + let msg = err.to_string(); + assert!( + msg.contains("256") && msg.contains("255"), + "expected a spec-name-length diagnostic citing 256 and the 255 cap; got: {msg}" + ); + } + + /// A spec name at exactly the 255-byte cap is emitted normally: the limit + /// is inclusive, mirroring both decoders' `len() > MAX` rejection. + #[test] + fn spec_name_at_255_bytes_is_accepted() { + let name = "S".repeat(255); + let source = format!( + "fn foo(x: i32) -> i32 {{ return x; }}\n\ + spec {name} {{\n \ + fn prop() forall {{\n \ + let i: i32 = @;\n \ + assert(foo(i) == i);\n \ + }}\n\ + }}\n" + ); + + let arena = build_ast(source); + let typed_context = TypeCheckerBuilder::build_typed_context(arena) + .expect("type check should succeed") + .typed_context(); + inference_wasm_codegen::codegen( + &typed_context, + Target::Wasm32, + CompilationMode::Proof, + OptLevel::O3, + "output", + ) + .expect("a 255-byte spec name is at the cap and must be accepted"); + } +} diff --git a/tests/src/type_checker/coverage.rs b/tests/src/type_checker/coverage.rs index 4be9a212..35d255a5 100644 --- a/tests/src/type_checker/coverage.rs +++ b/tests/src/type_checker/coverage.rs @@ -1409,6 +1409,92 @@ mod type_validation_coverage { } } +/// Validation of `external fn` signatures (issue #9 robustness audit, H6/H7). 
+/// +/// An `external fn` is signature-only, but its declared types must still be +/// real and it must not declare a `self` receiver. Before these checks an +/// undeclared `Custom` type lowered to `i32` and `todo!()`-panicked codegen +/// (H6), and a `self` receiver compiled to a silently-invalid `.wasm` (H7). +#[cfg(test)] +mod extern_signature_validation { + use super::*; + + #[test] + fn undeclared_param_type_is_rejected() { + let source = r#"external fn f(a: Undeclared) -> i32;"#; + let Err(err) = try_type_check(source) else { + panic!("extern with an undeclared parameter type must be rejected"); + }; + let msg = err.to_string(); + assert!( + msg.contains("Undeclared") || msg.contains("unknown type"), + "error should name the unknown type, got: {msg}" + ); + } + + #[test] + fn undeclared_return_type_is_rejected() { + let source = r#"external fn f(a: i32) -> Undeclared;"#; + let Err(err) = try_type_check(source) else { + panic!("extern with an undeclared return type must be rejected"); + }; + let msg = err.to_string(); + assert!( + msg.contains("Undeclared") || msg.contains("unknown type"), + "error should name the unknown type, got: {msg}" + ); + } + + #[test] + fn declared_struct_type_in_signature_is_accepted() { + // A `Custom` type that resolves (here a struct) stays valid: the new + // validation only rejects *unknown* types, it does not reject all + // `Custom` forms. 
+ let source = r#"struct S { x: i32; } external fn f(a: S) -> S;"#; + let result = try_type_check(source); + assert!( + result.is_ok(), + "extern over a declared struct type should type-check, got: {:?}", + result.err() + ); + } + + #[test] + fn self_receiver_is_rejected() { + let source = r#"external fn f(self, a: i32) -> i32;"#; + let Err(err) = try_type_check(source) else { + panic!("extern declaring a `self` receiver must be rejected"); + }; + let msg = err.to_string(); + assert!( + msg.contains("self reference"), + "error should be a self-reference diagnostic, got: {msg}" + ); + } + + #[test] + fn spec_inner_undeclared_param_type_is_rejected() { + // Spec-inner externs recurse back through the same collection arm, so + // the validation must hold inside a `spec` body too. + let source = r#"spec S { external fn f(a: Undeclared) -> i32; }"#; + let result = try_type_check(source); + assert!( + result.is_err(), + "spec-inner extern with an undeclared parameter type must be rejected" + ); + } + + #[test] + fn spec_inner_self_receiver_is_rejected() { + let source = r#"spec S { external fn f(self, a: i32) -> i32; }"#; + let result = try_type_check(source); + assert!( + result.is_err(), + "spec-inner extern declaring a `self` receiver must be rejected" + ); + } +} + #[cfg(test)] mod function_registration_coverage { use super::*; diff --git a/tests/src/type_checker/type_checker.rs b/tests/src/type_checker/type_checker.rs index 2e888569..6899c0ed 100644 --- a/tests/src/type_checker/type_checker.rs +++ b/tests/src/type_checker/type_checker.rs @@ -3317,6 +3317,261 @@ mod external_function_tests { } } +/// Phase 1 of issue #9: extern provenance binding. +/// +/// An `external fn` is bound to the source module named by a `use … from` +/// clause. The binding is exposed on [`TypedContext`] via `extern_origin` and +/// `is_extern_function`. 
A name imported from two distinct modules is an +/// ambiguity error; a `use … from` naming an undeclared extern is a dangling +/// import error; a bare extern (no binding `use`) stays valid but unbound. +#[cfg(test)] +mod extern_provenance_tests { + use super::*; + + fn err_string(source: &str) -> String { + match try_type_check(source) { + Ok(_) => panic!("type checking should fail"), + Err(e) => e.to_string(), + } + } + + // --- Binding succeeds --- + + #[test] + fn binds_extern_to_single_module() { + let source = r#" + use { sort } from collections; + external fn sort(a: i32, b: i32) -> i32; + fn main() -> i32 { return 0; } + "#; + let ctx = try_type_check(source).expect("binding a single module should type-check"); + let origin = ctx + .extern_origin("sort") + .expect("sort should carry a bound origin"); + assert_eq!(origin.logical_module, "collections"); + assert_eq!(origin.export_field, "sort"); + assert!( + origin.resolved_path.is_none(), + "Phase 1 leaves resolved_path unset; the driver fills it" + ); + assert!(ctx.is_extern_function("sort")); + } + + #[test] + fn binds_extern_to_nested_module_path() { + let source = r#" + use { hash } from crypto::sha256; + external fn hash(b: i32) -> i32; + fn main() -> i32 { return 0; } + "#; + let ctx = try_type_check(source).expect("nested module path should type-check"); + let origin = ctx.extern_origin("hash").expect("hash should be bound"); + assert_eq!( + origin.logical_module, "crypto::sha256", + "nested path joins with `::`, never an OS separator" + ); + } + + #[test] + fn binds_multiple_fields_from_one_use() { + let source = r#" + use { sort, search } from collections; + external fn sort(a: i32) -> i32; + external fn search(a: i32) -> i32; + fn main() -> i32 { return 0; } + "#; + let ctx = try_type_check(source).expect("multi-field use should type-check"); + assert_eq!( + ctx.extern_origin("sort").expect("sort bound").logical_module, + "collections" + ); + assert_eq!( + ctx.extern_origin("search") + 
.expect("search bound") + .logical_module, + "collections" + ); + } + + #[test] + fn binds_same_field_from_repeated_identical_module_without_ambiguity() { + // Two `use` clauses naming the same field from the *same* module are + // redundant, not ambiguous: there is still exactly one source module. + let source = r#" + use { sort } from collections; + use { sort } from collections; + external fn sort(a: i32) -> i32; + fn main() -> i32 { return 0; } + "#; + let ctx = try_type_check(source).expect("repeated identical import should bind"); + assert_eq!( + ctx.extern_origin("sort").expect("sort bound").logical_module, + "collections" + ); + } + + // --- Unbound extern stays valid --- + + #[test] + fn bare_extern_without_use_is_unbound_but_valid() { + let source = r#" + external fn add(a: i32, b: i32) -> i32; + fn main() -> i32 { return 0; } + "#; + let ctx = try_type_check(source).expect("a bare extern declaration is valid"); + assert!( + ctx.extern_origin("add").is_none(), + "an extern with no binding `use` has no provenance" + ); + assert!( + ctx.is_extern_function("add"), + "an unbound extern is still discriminated as extern, not local" + ); + } + + #[test] + fn local_function_is_not_extern() { + let source = r#"fn helper() -> i32 { return 1; } fn main() -> i32 { return helper(); }"#; + let ctx = try_type_check(source).expect("local functions type-check"); + assert!(!ctx.is_extern_function("helper")); + assert!(ctx.extern_origin("helper").is_none()); + } + + // --- Ambiguity errors --- + + #[test] + fn ambiguous_extern_from_two_modules_errors() { + let source = r#" + use { sort } from collections; + use { sort } from algorithms; + external fn sort(a: i32) -> i32; + fn main() -> i32 { return 0; } + "#; + let err = err_string(source); + assert!( + err.contains("external function `sort` is bound to multiple modules"), + "expected ambiguity diagnostic, got: {err}" + ); + assert!( + err.contains("collections") && err.contains("algorithms"), + "ambiguity diagnostic should 
list both modules, got: {err}" + ); + } + + #[test] + fn ambiguous_extern_leaves_binding_unset() { + // Ambiguity must reject the whole program rather than pick an arbitrary + // module; only the rejection itself is asserted here (no context to inspect). + let source = r#" + use { sort } from collections; + use { sort } from algorithms; + external fn sort(a: i32) -> i32; + fn main() -> i32 { return 0; } + "#; + let result = try_type_check(source); + assert!(result.is_err(), "ambiguous extern must be rejected"); + } + + // --- Missing / dangling import errors --- + + #[test] + fn use_from_naming_undeclared_extern_errors() { + let source = r#" + use { missing } from collections; + fn main() -> i32 { return 0; } + "#; + let err = err_string(source); + assert!( + err.contains("imports `missing` from module `collections`") + && err.contains("no `external fn missing` is declared"), + "expected dangling-import diagnostic, got: {err}" + ); + } + + #[test] + fn use_from_with_some_undeclared_fields_errors_only_on_missing() { + let source = r#" + use { sort, missing } from collections; + external fn sort(a: i32) -> i32; + fn main() -> i32 { return 0; } + "#; + let err = err_string(source); + assert!( + err.contains("`missing`"), + "the undeclared field should be reported, got: {err}" + ); + assert!( + !err.contains("no `external fn sort` is declared"), + "the declared field must not be reported as dangling, got: {err}" + ); + } + + // --- Provenance inside spec and module bodies --- + + #[test] + fn top_level_use_does_not_bind_a_spec_inner_extern() { + // H8: a `use … from` clause is file-global but binds only TOP-LEVEL + // externs. A spec-inner `external fn mix` is a different scope; naming it + // from a top-level `use` with no matching top-level extern is a dangling + // import (`ExternImportNotDeclared`), not a silent bind. The prior + // behavior bound it, suppressing A024 and crashing proof-mode codegen.
+ let source = r#" + use { mix } from crypto; + spec s { + external fn mix(a: i32, b: i32) -> i32; + } + fn main() -> i32 { return 0; } + "#; + let err = err_string(source); + assert!( + err.contains("imports `mix` from module `crypto`") + && err.contains("no `external fn mix` is declared"), + "a top-level use of a spec-inner extern must be a dangling import, got: {err}" + ); + } + + #[test] + fn top_level_use_binds_only_the_top_level_extern_when_a_spec_shadows_it() { + // H9: a bound top-level `mix` and a same-named spec-inner `mix` are + // distinct declarations. The `use` binds the top-level one; the bound + // origin recovered by name resolves to the top-level (root-scope) + // declaration that the use clause actually attaches to. + let source = r#" + external fn mix(a: i32, b: i32) -> i32; + use { mix } from crypto; + spec s { + external fn mix(a: i32) -> i32; + } + fn main() -> i32 { return 0; } + "#; + let ctx = try_type_check(source).expect("top-level mix binds; spec mix stays unbound"); + assert_eq!( + ctx.extern_origin("mix").expect("top-level mix is bound").logical_module, + "crypto" + ); + } + + #[test] + fn spec_nested_use_from_naming_undeclared_extern_errors() { + // A spec-only extern does NOT satisfy a top-level `use`: the binding + // scan is top-level-only, so a `use` naming a spec-only extern is a + // dangling import. 
+ let source = r#" + use { present } from crypto; + spec s { + external fn present(a: i32) -> i32; + } + fn main() -> i32 { return 0; } + "#; + let err = err_string(source); + assert!( + err.contains("`present`") + && err.contains("no `external fn present` is declared"), + "a spec-only extern must not satisfy a top-level use, got: {err}" + ); + } +} + /// Tests for generic type parameters in variable definitions #[cfg(test)] mod generic_type_param_in_vardef { diff --git a/tests/test_data/codegen/wasm/extern_import/import_dedup/import_dedup.inf b/tests/test_data/codegen/wasm/extern_import/import_dedup/import_dedup.inf new file mode 100644 index 00000000..b5d19d84 --- /dev/null +++ b/tests/test_data/codegen/wasm/extern_import/import_dedup/import_dedup.inf @@ -0,0 +1,7 @@ +external fn inc(a: i32) -> i32; +external fn dec(a: i32) -> i32; +use { inc, dec } from arith; + +pub fn run(x: i32) -> i32 { + return inc(dec(x)); +} diff --git a/tests/test_data/codegen/wasm/extern_import/import_dedup/import_dedup.wasm b/tests/test_data/codegen/wasm/extern_import/import_dedup/import_dedup.wasm new file mode 100644 index 00000000..891d1307 Binary files /dev/null and b/tests/test_data/codegen/wasm/extern_import/import_dedup/import_dedup.wasm differ diff --git a/tests/test_data/codegen/wasm/extern_import/import_dedup/import_dedup.wat b/tests/test_data/codegen/wasm/extern_import/import_dedup/import_dedup.wat new file mode 100644 index 00000000..6c36228c --- /dev/null +++ b/tests/test_data/codegen/wasm/extern_import/import_dedup/import_dedup.wat @@ -0,0 +1,14 @@ +(module $output + (type (;0;) (func (param i32) (result i32))) + (type (;1;) (func (param i32) (result i32))) + (import "arith" "inc" (func (;0;) (type 0))) + (import "arith" "dec" (func (;1;) (type 0))) + (export "run" (func $run)) + (func $run (;2;) (type 1) (param $x i32) (result i32) + local.get $x + call 1 + call 0 + return + unreachable + ) +) diff --git 
a/tests/test_data/codegen/wasm/extern_import/import_with_locals/import_with_locals.inf b/tests/test_data/codegen/wasm/extern_import/import_with_locals/import_with_locals.inf new file mode 100644 index 00000000..ffadd546 --- /dev/null +++ b/tests/test_data/codegen/wasm/extern_import/import_with_locals/import_with_locals.inf @@ -0,0 +1,10 @@ +external fn ext_double(a: i32) -> i32; +use { ext_double } from helpers; + +pub fn helper(x: i32) -> i32 { + return x + 1; +} + +pub fn entry(x: i32) -> i32 { + return ext_double(helper(x)); +} diff --git a/tests/test_data/codegen/wasm/extern_import/import_with_locals/import_with_locals.wasm b/tests/test_data/codegen/wasm/extern_import/import_with_locals/import_with_locals.wasm new file mode 100644 index 00000000..591bfa3f Binary files /dev/null and b/tests/test_data/codegen/wasm/extern_import/import_with_locals/import_with_locals.wasm differ diff --git a/tests/test_data/codegen/wasm/extern_import/import_with_locals/import_with_locals.wat b/tests/test_data/codegen/wasm/extern_import/import_with_locals/import_with_locals.wat new file mode 100644 index 00000000..8cd48333 --- /dev/null +++ b/tests/test_data/codegen/wasm/extern_import/import_with_locals/import_with_locals.wat @@ -0,0 +1,22 @@ +(module $output + (type (;0;) (func (param i32) (result i32))) + (type (;1;) (func (param i32) (result i32))) + (type (;2;) (func (param i32) (result i32))) + (import "helpers" "ext_double" (func (;0;) (type 0))) + (export "helper" (func $helper)) + (export "entry" (func $entry)) + (func $helper (;1;) (type 1) (param $x i32) (result i32) + local.get $x + i32.const 1 + i32.add + return + unreachable + ) + (func $entry (;2;) (type 2) (param $x i32) (result i32) + local.get $x + call $helper + call 0 + return + unreachable + ) +) diff --git a/tests/test_data/codegen/wasm/extern_import/multi_import/multi_import.inf b/tests/test_data/codegen/wasm/extern_import/multi_import/multi_import.inf new file mode 100644 index 00000000..ef308808 --- /dev/null 
+++ b/tests/test_data/codegen/wasm/extern_import/multi_import/multi_import.inf @@ -0,0 +1,7 @@ +external fn sum(a: i32, b: i32) -> i32; +external fn neg(a: i32) -> i32; +use { sum, neg } from arith; + +pub fn compute(x: i32) -> i32 { + return sum(neg(x), 3); +} diff --git a/tests/test_data/codegen/wasm/extern_import/multi_import/multi_import.wasm b/tests/test_data/codegen/wasm/extern_import/multi_import/multi_import.wasm new file mode 100644 index 00000000..894a89d8 Binary files /dev/null and b/tests/test_data/codegen/wasm/extern_import/multi_import/multi_import.wasm differ diff --git a/tests/test_data/codegen/wasm/extern_import/multi_import/multi_import.wat b/tests/test_data/codegen/wasm/extern_import/multi_import/multi_import.wat new file mode 100644 index 00000000..93ba9ddc --- /dev/null +++ b/tests/test_data/codegen/wasm/extern_import/multi_import/multi_import.wat @@ -0,0 +1,16 @@ +(module $output + (type (;0;) (func (param i32 i32) (result i32))) + (type (;1;) (func (param i32) (result i32))) + (type (;2;) (func (param i32) (result i32))) + (import "arith" "sum" (func (;0;) (type 0))) + (import "arith" "neg" (func (;1;) (type 1))) + (export "compute" (func $compute)) + (func $compute (;2;) (type 2) (param $x i32) (result i32) + local.get $x + call 1 + i32.const 3 + call 0 + return + unreachable + ) +) diff --git a/tests/test_data/codegen/wasm/extern_import/single_import/single_import.inf b/tests/test_data/codegen/wasm/extern_import/single_import/single_import.inf new file mode 100644 index 00000000..85a3c8bb --- /dev/null +++ b/tests/test_data/codegen/wasm/extern_import/single_import/single_import.inf @@ -0,0 +1,6 @@ +external fn sum(a: i32, b: i32) -> i32; +use { sum } from arith; + +pub fn add_three(x: i32) -> i32 { + return sum(x, 3); +} diff --git a/tests/test_data/codegen/wasm/extern_import/single_import/single_import.wasm b/tests/test_data/codegen/wasm/extern_import/single_import/single_import.wasm new file mode 100644 index 00000000..7ec320d2 Binary 
files /dev/null and b/tests/test_data/codegen/wasm/extern_import/single_import/single_import.wasm differ diff --git a/tests/test_data/codegen/wasm/extern_import/single_import/single_import.wat b/tests/test_data/codegen/wasm/extern_import/single_import/single_import.wat new file mode 100644 index 00000000..c167d543 --- /dev/null +++ b/tests/test_data/codegen/wasm/extern_import/single_import/single_import.wat @@ -0,0 +1,13 @@ +(module $output + (type (;0;) (func (param i32 i32) (result i32))) + (type (;1;) (func (param i32) (result i32))) + (import "arith" "sum" (func (;0;) (type 0))) + (export "add_three" (func $add_three)) + (func $add_three (;1;) (type 1) (param $x i32) (result i32) + local.get $x + i32.const 3 + call 0 + return + unreachable + ) +) diff --git a/tests/test_data/inf/example.inf b/tests/test_data/inf/example.inf index 4fd75285..a080ba70 100644 --- a/tests/test_data/inf/example.inf +++ b/tests/test_data/inf/example.inf @@ -90,8 +90,8 @@ struct identity { //Use use inference::std; use inference::std::algorithms::sort; -use { sort } from "./sort.rs"; -use { sort, hash } from "./sort.rs"; +use { sort } from sorting; +use { sort, hash } from sorting; use inference::std::algorithms::{sort,hash}; //Binary Expression fn spec_assign() -> () { @@ -281,7 +281,7 @@ struct Account { } } use inference::std::algorithms::sort; -use { sort, hash } from "./sort.0.wasm"; +use { sort, hash } from sorting; use inference::std::algorithms::{sort, hash}; fn example() -> u32 { let a: u32 = 42; @@ -374,7 +374,7 @@ fn bubble_sort(arr: [i32;10], compare_function: fn(left: i32, right: i32) -> i32 } } } -use { hash } from "./cryptography.0.wasm"; +use { hash } from cryptography; spec HashContext { type HashFunction = fn([u8; 100]) -> [u8; 32]; fn verify_hash_transitivity(hash_f: HashFunction) -> () { diff --git a/tests/test_data/inf/test_parse_source_file_1.inf b/tests/test_data/inf/test_parse_source_file_1.inf index d884accd..c0918647 100644 --- 
a/tests/test_data/inf/test_parse_source_file_1.inf +++ b/tests/test_data/inf/test_parse_source_file_1.inf @@ -1,7 +1,7 @@ use inference::std; use inference::std::collections::{ Array, Set }; use inference::std::types::Address; -use { sorting_function } from "./sort.rs"; +use { sorting_function } from sorting; ///this is the sort function Inference spec context sort_function_context {