diff --git a/crates/synth-backend/src/arm_backend.rs b/crates/synth-backend/src/arm_backend.rs index 710d1f5..f2f653d 100644 --- a/crates/synth-backend/src/arm_backend.rs +++ b/crates/synth-backend/src/arm_backend.rs @@ -559,6 +559,24 @@ fn compile_wasm_to_arm( arm_instrs }; + // VCR-RA uxth/uxtb fold (#428, #242): `movw rM,#0xffff; and rD,rN,rM` → + // `uxth rD,rN` (and the 0xff/uxtb form), removing the dead `movw` — −1 + // instruction, −1 live register per 16/8-bit mask. 0xffff is not a Thumb-2 modified + // immediate, so the selector materializes it into a register (0xff is encodable, but + // is folded too if `movw`-materialized); the zero-extend does the same masking inline. + // Removal-only and rewrite-in-place (offset-neutral). FLAG-OFF by default (opt-in: any + // value of `SYNTH_UXTH_FOLD` enables it) ⇒ bit-identical (frozen gate green); the + // byte-changing default-on flip is the separate on-target-gated step, like the prior levers. + let arm_instrs = if std::env::var("SYNTH_UXTH_FOLD").is_ok() { + let (out, folds) = synth_synthesis::liveness::fold_uxth(&arm_instrs); + if std::env::var("SYNTH_FUSE_STATS").is_ok() { + eprintln!("[uxth-fold] {folds} mask-and folded to uxth/uxtb, movw dropped"); + } + out + } else { + arm_instrs + }; + // ISA feature gate: validate that all generated instructions are supported // by the target. This catches FPU instructions on no-FPU targets, double-precision // instructions on single-precision targets, etc. 
diff --git a/crates/synth-backend/src/arm_encoder.rs b/crates/synth-backend/src/arm_encoder.rs index f2dbcbf..3fa19af 100644 --- a/crates/synth-backend/src/arm_encoder.rs +++ b/crates/synth-backend/src/arm_encoder.rs @@ -391,6 +391,20 @@ impl ArmEncoder { 0xE6BF0070 | (rd_bits << 12) | rm_bits } + ArmOp::Uxtb { rd, rm } => { + let rd_bits = reg_to_bits(rd); + let rm_bits = reg_to_bits(rm); + // UXTB encoding: cond | 01101110 1111 Rd rotate 00 0111 Rm (rotate=00) + 0xE6EF0070 | (rd_bits << 12) | rm_bits + } + + ArmOp::Uxth { rd, rm } => { + let rd_bits = reg_to_bits(rd); + let rm_bits = reg_to_bits(rm); + // UXTH encoding: cond | 01101111 1111 Rd rotate 00 0111 Rm (rotate=00) + 0xE6FF0070 | (rd_bits << 12) | rm_bits + } + // Move instructions ArmOp::Mov { rd, op2 } => { let rd_bits = reg_to_bits(rd); @@ -2178,6 +2192,42 @@ impl ArmEncoder { } } + // UXTB Rd,Rm — zero-extend byte (rd = rm & 0xff) + ArmOp::Uxtb { rd, rm } => { + let rd_bits = reg_to_bits(rd) as u16; + let rm_bits = reg_to_bits(rm) as u16; + if rd_bits < 8 && rm_bits < 8 { + // UXTB Rd, Rm (16-bit): 1011 0010 11 Rm Rd + let instr: u16 = 0xB2C0 | (rm_bits << 3) | rd_bits; + Ok(instr.to_le_bytes().to_vec()) + } else { + // Thumb-2 UXTB.W: FA5F F(rd)80 (rm) + let hw1: u16 = 0xFA5F; + let hw2: u16 = (0xF080 | ((rd_bits as u32) << 8) | rm_bits as u32) as u16; + let mut bytes = hw1.to_le_bytes().to_vec(); + bytes.extend_from_slice(&hw2.to_le_bytes()); + Ok(bytes) + } + } + + // UXTH Rd,Rm — zero-extend halfword (rd = rm & 0xffff) + ArmOp::Uxth { rd, rm } => { + let rd_bits = reg_to_bits(rd) as u16; + let rm_bits = reg_to_bits(rm) as u16; + if rd_bits < 8 && rm_bits < 8 { + // UXTH Rd, Rm (16-bit): 1011 0010 10 Rm Rd + let instr: u16 = 0xB280 | (rm_bits << 3) | rd_bits; + Ok(instr.to_le_bytes().to_vec()) + } else { + // Thumb-2 UXTH.W: FA1F F(rd)80 (rm) + let hw1: u16 = 0xFA1F; + let hw2: u16 = (0xF080 | ((rd_bits as u32) << 8) | rm_bits as u32) as u16; + let mut bytes = hw1.to_le_bytes().to_vec(); + 
bytes.extend_from_slice(&hw2.to_le_bytes()); + Ok(bytes) + } + } + // CMP (can be 16-bit for low registers) ArmOp::Cmp { rn, op2 } => { let rn_bits = reg_to_bits(rn) as u16; diff --git a/crates/synth-synthesis/src/liveness.rs b/crates/synth-synthesis/src/liveness.rs index 82de9bf..16b3cae 100644 --- a/crates/synth-synthesis/src/liveness.rs +++ b/crates/synth-synthesis/src/liveness.rs @@ -111,7 +111,9 @@ pub fn reg_effect(op: &ArmOp) -> Option { | Rbit { rd, rm } | Popcnt { rd, rm } | Sxtb { rd, rm } - | Sxth { rd, rm } => def_use(vec![*rd], vec![*rm]), + | Sxth { rd, rm } + | Uxtb { rd, rm } + | Uxth { rd, rm } => def_use(vec![*rd], vec![*rm]), // flag-setting compares: read operands, write no GP register Cmp { rn, op2 } | Cmn { rn, op2 } => { @@ -587,6 +589,93 @@ pub fn fold_immediate_shifts(instrs: &[ArmInstruction]) -> (Vec, (folded, folds) } +/// VCR-RA peephole (#428, #242): fold a 16/8-bit mask materialized into a scratch +/// register and consumed by `AND` into the dedicated zero-extend instruction. +/// +/// ```text +/// movw rM, #0xffff ; and rD, rN, rM -> uxth rD, rN (rD = rN & 0xffff) +/// movw rM, #0x00ff ; and rD, rN, rM -> uxtb rD, rN (rD = rN & 0x00ff) +/// ``` +/// +/// `0xffff` is not a Thumb-2 modified immediate, so the selector must materialize it into +/// a register (`movw`) and use the register form of AND (`0xff` is encodable as one, but +/// a `movw #0xff` mask is folded the same way); `uxth`/`uxtb` express the same masking in +/// one instruction, freeing the scratch register. Sound unconditionally: `r & 0xffff` and +/// `r & 0xff` are exactly what UXTH/UXTB compute (rotation 0). AND is commutative, so the +/// masked source is whichever operand is not the materialized mask register. +/// +/// Same soundness scaffolding as [`fold_immediate_shifts`]: the mask register `M` +/// must be untouched between the `movw` and the `and`, and **dead after** the `and` +/// (`reg_dead_by_redef`) for the `movw` to be a removable dead store. 
Removal-only +/// and rewrite-in-place, so offset-neutral (no labels/branches move). Pure +/// function; callers opt in (flag-gated wiring is a separate, oracle-gated step). +pub fn fold_uxth(instrs: &[ArmInstruction]) -> (Vec, usize) { + use crate::rules::Operand2; + let n = instrs.len(); + let mut out = instrs.to_vec(); + let mut drop_movw: Vec = vec![false; n]; + let mut folds = 0usize; + + for i in 0..n { + // [i] must be `movw rM, #0xffff` (→ uxth) or `#0xff` (→ uxtb). + let (m, wide) = match &instrs[i].op { + ArmOp::Movw { rd, imm16: 0xffff } => (*rd, true), + ArmOp::Movw { rd, imm16: 0x00ff } => (*rd, false), + _ => continue, + }; + for j in (i + 1)..n { + // The AND consuming M as one operand; the source is the OTHER operand. + let extended = match &out[j].op { + ArmOp::And { + rd, + rn, + op2: Operand2::Reg(rm), + } if *rm == m && *rn != m => Some((*rd, *rn)), + ArmOp::And { + rd, + rn, + op2: Operand2::Reg(rm), + } if *rn == m && *rm != m => Some((*rd, *rm)), + _ => None, + }; + if let Some((rd, src)) = extended { + // The movw must be a dead store once the AND (its only consumer of + // M in this window) becomes a `uxth/uxtb` that doesn't read M. That + // holds if either the fold's destination IS M — `and M,src,M` + // ⇒ `uxth M,src` redefines M, killing the movw — or M is otherwise + // dead after the AND. + if rd == m || reg_dead_by_redef(m, &instrs[j + 1..]) { + out[j].op = if wide { + ArmOp::Uxth { rd, rm: src } + } else { + ArmOp::Uxtb { rd, rm: src } + }; + drop_movw[i] = true; + folds += 1; + } + break; // M consumed (folded or declined) — done with this movw. + } + // Any other read/redef of M, or an unmodeled op, ends this movw's window. 
+ match reg_effect(&instrs[j].op) { + Some(eff) if eff.uses.contains(&m) || eff.defs.contains(&m) => break, + Some(_) => {} + None => break, + } + } + } + + if folds == 0 { + return (out, 0); + } + let folded: Vec = out + .into_iter() + .enumerate() + .filter(|(i, _)| !drop_movw[*i]) + .map(|(_, ins)| ins) + .collect(); + (folded, folds) +} + /// True if `op` returns from the function: `Bx LR`, or a `Pop`/`Ldmia` that loads /// `PC` (the `pop {…, pc}` epilogue). At such a point the only registers live out /// of the function are the ABI result registers — see [`RETURN_VALUE_REGS`]. @@ -1898,6 +1987,14 @@ fn rewrite_op( rd: d(rd), rm: u(rm), }, + Uxtb { rd, rm } => Uxtb { + rd: d(rd), + rm: u(rm), + }, + Uxth { rd, rm } => Uxth { + rd: d(rd), + rm: u(rm), + }, Cmp { rn, op2 } => Cmp { rn: u(rn), op2: op2_map(op2, use_map), @@ -2814,6 +2911,14 @@ fn rename_use(op: &ArmOp, from: Reg, to: Reg) -> Option { rd: *rd, rm: sub(*rm), }, + Uxtb { rd, rm } => Uxtb { + rd: *rd, + rm: sub(*rm), + }, + Uxth { rd, rm } => Uxth { + rd: *rd, + rm: sub(*rm), + }, Cmp { rn, op2 } => Cmp { rn: sub(*rn), op2: op2_rename(op2, from, to), @@ -3573,6 +3678,189 @@ mod tests { assert_eq!(n, 0); } + // ---- fold_uxth (#428) ---- + + #[test] + fn fold_uxth_basic_folds_mask_and_into_uxth() { + // movw r7,#0xffff ; and r8,r6,r7 ; movw r7,#0 (redef ⇒ r7 dead) + // ⇒ uxth r8,r6 and the movw is removed. + let seq = vec![ + ins(ArmOp::Movw { + rd: Reg::R7, + imm16: 0xffff, + }), + ins(ArmOp::And { + rd: Reg::R8, + rn: Reg::R6, + op2: Operand2::Reg(Reg::R7), + }), + ins(ArmOp::Movw { + rd: Reg::R7, + imm16: 0, + }), + ]; + let (out, n) = fold_uxth(&seq); + assert_eq!(n, 1); + assert_eq!(out.len(), seq.len() - 1, "the folded movw is removed"); + assert!( + out.iter().any(|i| matches!( + i.op, + ArmOp::Uxth { + rd: Reg::R8, + rm: Reg::R6 + } + )), + "mask-and folded to uxth r8,r6" + ); + assert!( + !out.iter().any(|i| matches!(i.op, ArmOp::And { .. 
})), + "no AND remains" + ); + } + + #[test] + fn fold_uxth_uxtb_for_0xff_mask() { + let seq = vec![ + ins(ArmOp::Movw { + rd: Reg::R4, + imm16: 0x00ff, + }), + ins(ArmOp::And { + rd: Reg::R2, + rn: Reg::R3, + op2: Operand2::Reg(Reg::R4), + }), + ins(ArmOp::Movw { + rd: Reg::R4, + imm16: 0, + }), + ]; + let (out, n) = fold_uxth(&seq); + assert_eq!(n, 1); + assert!( + out.iter().any(|i| matches!( + i.op, + ArmOp::Uxtb { + rd: Reg::R2, + rm: Reg::R3 + } + )), + "0xff mask folds to uxtb" + ); + } + + #[test] + fn fold_uxth_handles_commutative_mask_in_rn() { + // and r8,r7,r6 with the mask in rn (r7) ⇒ source is r6. + let seq = vec![ + ins(ArmOp::Movw { + rd: Reg::R7, + imm16: 0xffff, + }), + ins(ArmOp::And { + rd: Reg::R8, + rn: Reg::R7, + op2: Operand2::Reg(Reg::R6), + }), + ins(ArmOp::Movw { + rd: Reg::R7, + imm16: 0, + }), + ]; + let (out, n) = fold_uxth(&seq); + assert_eq!(n, 1); + assert!( + out.iter().any(|i| matches!( + i.op, + ArmOp::Uxth { + rd: Reg::R8, + rm: Reg::R6 + } + )), + "commutative: mask in rn ⇒ uxth of the op2 source" + ); + } + + #[test] + fn fold_uxth_folds_when_and_writes_back_to_mask_reg() { + // The common codegen shape: `and r3,r0,r3` reuses the mask reg as the dest. + // The fold `uxth r3,r0` redefines r3, so the movw is dead even though r3 is + // read afterward (it then holds the AND result, not the mask). 
+ let seq = vec![ + ins(ArmOp::Movw { + rd: Reg::R3, + imm16: 0xffff, + }), + ins(ArmOp::And { + rd: Reg::R3, + rn: Reg::R0, + op2: Operand2::Reg(Reg::R3), + }), + ins(ArmOp::LslReg { + rd: Reg::R3, + rn: Reg::R3, + rm: Reg::R5, + }), // reads r3 (the AND result) — must NOT block the fold + ]; + let (out, n) = fold_uxth(&seq); + assert_eq!(n, 1, "and-writes-back-to-mask still folds"); + assert!( + out.iter().any(|i| matches!( + i.op, + ArmOp::Uxth { + rd: Reg::R3, + rm: Reg::R0 + } + )), + "folded to uxth r3,r0" + ); + } + + #[test] + fn fold_uxth_declines_non_mask_constant() { + // 0x1234 is neither 0xffff nor 0xff ⇒ not a zero-extend ⇒ no fold. + let seq = vec![ + ins(ArmOp::Movw { + rd: Reg::R7, + imm16: 0x1234, + }), + ins(ArmOp::And { + rd: Reg::R8, + rn: Reg::R6, + op2: Operand2::Reg(Reg::R7), + }), + ins(ArmOp::Movw { + rd: Reg::R7, + imm16: 0, + }), + ]; + let (_, n) = fold_uxth(&seq); + assert_eq!(n, 0); + } + + #[test] + fn fold_uxth_declines_when_mask_live_after_and() { + // r7 is read again after the AND ⇒ the movw is not a dead store ⇒ no fold. + let seq = vec![ + ins(ArmOp::Movw { + rd: Reg::R7, + imm16: 0xffff, + }), + ins(ArmOp::And { + rd: Reg::R8, + rn: Reg::R6, + op2: Operand2::Reg(Reg::R7), + }), + ins(ArmOp::Add { + rd: Reg::R0, + rn: Reg::R7, + op2: Operand2::Imm(1), + }), // reads r7 ⇒ live + ]; + let (_, n) = fold_uxth(&seq); + assert_eq!(n, 0); + } + #[test] fn forward_reload_basic_becomes_mov() { // str r0,[sp,4] ; ; ldr r1,[sp,4] ⇒ the ldr becomes mov r1,r0. diff --git a/crates/synth-synthesis/src/optimizer_bridge.rs b/crates/synth-synthesis/src/optimizer_bridge.rs index 150e609..863ae39 100644 --- a/crates/synth-synthesis/src/optimizer_bridge.rs +++ b/crates/synth-synthesis/src/optimizer_bridge.rs @@ -5035,8 +5035,11 @@ impl OptimizerBridge { ArmOp::Adc { .. } | ArmOp::Sbc { .. } => 4, // CLZ, RBIT are always 32-bit Thumb-2 ArmOp::Clz { .. } | ArmOp::Rbit { .. 
} => 4, - // SXTB, SXTH can be 16-bit for low registers - ArmOp::Sxtb { rd, rm } | ArmOp::Sxth { rd, rm } => { + // SXTB, SXTH, UXTB, UXTH can be 16-bit for low registers + ArmOp::Sxtb { rd, rm } + | ArmOp::Sxth { rd, rm } + | ArmOp::Uxtb { rd, rm } + | ArmOp::Uxth { rd, rm } => { let rd_bits = reg_num(rd); let rm_bits = reg_num(rm); if rd_bits < 8 && rm_bits < 8 { 2 } else { 4 } diff --git a/crates/synth-synthesis/src/rules.rs b/crates/synth-synthesis/src/rules.rs index de1b6c6..962bdbd 100644 --- a/crates/synth-synthesis/src/rules.rs +++ b/crates/synth-synthesis/src/rules.rs @@ -220,6 +220,14 @@ pub enum ArmOp { rd: Reg, rm: Reg, }, // Sign-extend halfword (16-bit to 32-bit) + Uxtb { + rd: Reg, + rm: Reg, + }, // Zero-extend byte (rd = rm & 0xff) — the fold target for `movw #0xff; and` + Uxth { + rd: Reg, + rm: Reg, + }, // Zero-extend halfword (rd = rm & 0xffff) — fold target for `movw #0xffff; and` // Move Mov { diff --git a/scripts/repro/uxth_fold.wat b/scripts/repro/uxth_fold.wat new file mode 100644 index 0000000..f805146 --- /dev/null +++ b/scripts/repro/uxth_fold.wat @@ -0,0 +1,13 @@ +;; VCR-RA uxth/uxtb fold validation (#428, #242). `movw #0xffff; and` (and #0xff) +;; for a 16/8-bit mask folds to `uxth`/`uxtb`, dropping the dead movw. Enough live +;; masks that they land in scratch r2-r8 (not the r0/r1 result regs the conservative +;; dead-analysis must keep), so the fold actually fires. Result identical flag-off +;; vs flag-on vs wasmtime. 
pack(a,b,c) = ((a&0xffff)<<8) ^ (b&0xffff) ^ (c&0xff) +(module + (memory 1) + (func (export "pack") (param $a i32) (param $b i32) (param $c i32) (result i32) + (i32.xor + (i32.xor + (i32.shl (i32.and (local.get $a) (i32.const 65535)) (i32.const 8)) + (i32.and (local.get $b) (i32.const 65535))) + (i32.and (local.get $c) (i32.const 255))))) diff --git a/scripts/repro/uxth_fold_differential.py b/scripts/repro/uxth_fold_differential.py new file mode 100644 index 0000000..d66214e --- /dev/null +++ b/scripts/repro/uxth_fold_differential.py @@ -0,0 +1,130 @@ +#!/usr/bin/env python3 +"""VCR-RA uxth/uxtb fold validation oracle (#428, epic #242). + +The fold rewrites `movw rM,#0xffff; and rD,rN,rM` -> `uxth rD,rN` (and the 0xff / +uxtb form), dropping the dead movw. `uxth` zero-extends the low halfword, i.e. +`rN & 0xffff` — exactly what the AND computes — so the result must be identical. + +wasmtime is ground truth; unicorn runs synth's ARM (`--relocatable` path). This +harness compiles `uxth_fold.wat` BOTH ways — flag-off (SYNTH_UXTH_FOLD unset, the +shipped lowering) and flag-on (SYNTH_UXTH_FOLD=1, with the fold) — and asserts +flag-off == flag-on == wasmtime over a vector sweep. flag-on must also actually +fold (the fold is non-vacuous on this fixture: 2 uxth folds). + +Run (venv with wasmtime+unicorn+capstone+pyelftools, e.g. 
/tmp/armv): + /tmp/armv/bin/python scripts/repro/uxth_fold_differential.py +""" + +import re +import subprocess +import sys + +import wasmtime +from elftools.elf.elffile import ELFFile +from unicorn import UC_ARCH_ARM, UC_MODE_THUMB, Uc, UcError +from unicorn.arm_const import ( + UC_ARM_REG_LR, + UC_ARM_REG_R0, + UC_ARM_REG_R1, + UC_ARM_REG_R2, + UC_ARM_REG_R11, + UC_ARM_REG_SP, +) + +WAT = "scripts/repro/uxth_fold.wat" +SYNTH = "./target/debug/synth" + +CODE, LIN, STK, RET = 0x200000, 0x40000, 0x180000, 0x300000 + +VECTORS = [ + (0, 0, 0), + (1, 2, 3), + (0xFFFF, 0xFFFF, 0xFF), + (0x12345678, 0xABCDEF01, 0x99), + (0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF), + (0x0001FFFF, 0x7FFF8000, 0x100), + (0xDEADBEEF, 0xCAFEBABE, 0x42), +] + + +def compile_variant(fold_on): + elf = f"/tmp/uxth_{'on' if fold_on else 'off'}.elf" + env = {"PATH": "/usr/bin:/bin"} + if fold_on: + env["SYNTH_UXTH_FOLD"] = "1" + env["SYNTH_FUSE_STATS"] = "1" + r = subprocess.run( + [SYNTH, "compile", WAT, "-o", elf, "--target", "cortex-m4", + "--relocatable", "--all-exports"], + capture_output=True, text=True, env=env, + ) + assert r.returncode == 0, f"compile (fold={fold_on}) failed: {r.stderr}" + folds = 0 + m = re.search(r"\[uxth-fold\] (\d+) ", r.stderr) + if m: + folds = int(m.group(1)) + return elf, folds + + +def fn_addr_and_code(elf): + dis = subprocess.run([SYNTH, "disasm", elf], capture_output=True, text=True).stdout + syms = {m.group(2): int(m.group(1), 16) + for m in re.finditer(r"^([0-9a-f]{8}) <(\w+)>:", dis, re.M)} + fa = syms.get("func_0") or syms.get("pack") + text = ELFFile(open(elf, "rb")).get_section_by_name(".text") + return fa, text.data(), text["sh_addr"] + + +def arm(elf, a, b, c): + fa, code, base = fn_addr_and_code(elf) + mu = Uc(UC_ARCH_ARM, UC_MODE_THUMB) + mu.mem_map(CODE, 0x10000) + mu.mem_map(LIN, 0x10000) + mu.mem_map(STK - 0x8000, 0x10000) + mu.mem_map(RET, 0x1000) + mu.mem_write(CODE, code) + mu.reg_write(UC_ARM_REG_SP, STK) + mu.reg_write(UC_ARM_REG_R11, LIN) + 
mu.reg_write(UC_ARM_REG_R0, a & 0xFFFFFFFF) + mu.reg_write(UC_ARM_REG_R1, b & 0xFFFFFFFF) + mu.reg_write(UC_ARM_REG_R2, c & 0xFFFFFFFF) + mu.reg_write(UC_ARM_REG_LR, RET | 1) + try: + mu.emu_start((CODE + fa - base) | 1, RET, count=10000) + except UcError as e: + return f"ERR:{e}" + return mu.reg_read(UC_ARM_REG_R0) & 0xFFFFFFFF + + +def main(): + off_elf, off_folds = compile_variant(False) + on_elf, on_folds = compile_variant(True) + assert off_folds == 0, f"flag-off must not fold (got {off_folds})" + assert on_folds >= 1, f"flag-on must fold (non-vacuous); got {on_folds}" + print(f"folds: flag-off={off_folds} flag-on={on_folds}") + + engine = wasmtime.Engine() + module = wasmtime.Module(engine, wasmtime.wat2wasm(open(WAT).read())) + + def wt(a, b, c): + store = wasmtime.Store(engine) + inst = wasmtime.Instance(store, module, []) + return inst.exports(store)["pack"](store, a, b, c) & 0xFFFFFFFF + + fails = 0 + for (a, b, c) in VECTORS: + gt = wt(a, b, c) + off = arm(off_elf, a, b, c) + on = arm(on_elf, a, b, c) + ok = isinstance(off, int) and isinstance(on, int) and off == gt and on == gt + fails += 0 if ok else 1 + print(f"pack(0x{a:08X},0x{b:08X},0x{c:08X}) off={off if isinstance(off,str) else f'0x{off:08X}'}" + f" on={on if isinstance(on,str) else f'0x{on:08X}'} (wasmtime 0x{gt:08X})" + f"{'' if ok else ' <-- MISMATCH'}") + print(f"\n{len(VECTORS) - fails}/{len(VECTORS)} match (flag-off == flag-on == wasmtime)") + print("ORACLE: PASS" if fails == 0 else f"ORACLE: FAIL ({fails})") + sys.exit(1 if fails else 0) + + +if __name__ == "__main__": + main()