diff --git a/Cargo.lock b/Cargo.lock
index 6e85543..79ac2d0 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1,6 +1,6 @@
 # This file is automatically @generated by Cargo.
 # It is not intended for manual editing.
-version = 3
+version = 4
 
 [[package]]
 name = "adler2"
@@ -9,25 +9,25 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa"
 
 [[package]]
-name = "aom-decode"
-version = "0.2.13"
+name = "archmage"
+version = "0.8.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "045795fa770d3b7ec2a98d96d8a4a3a81402d1e7bb71a70968f0d799729deda0"
+checksum = "c0fb818f2e02ca08cc6416725044fadc84bcf5a96e2eb7ab103c966123667d86"
 dependencies = [
- "avif-parse",
- "imgref",
- "libaom-sys",
- "log",
- "quick-error",
- "rgb",
- "yuv",
+ "archmage-macros",
+ "safe_unaligned_simd",
 ]
 
 [[package]]
-name = "arrayvec"
-version = "0.7.6"
+name = "archmage-macros"
+version = "0.8.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50"
+checksum = "1886761e37995842cfbaf8c24d483e62d9b8c174254d29146f596d4d87efff2d"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
 
 [[package]]
 name = "autocfg"
@@ -36,66 +36,22 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8"
 
 [[package]]
-name = "avif-parse"
-version = "1.4.0"
+name = "bitflags"
+version = "2.11.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3f85ce2a7cd14ac0a30dc29a115de22466aeb8a029410f9f1e4f283443c959d1"
-dependencies = [
- "arrayvec",
- "bitreader",
- "byteorder",
- "fallible_collections",
- "leb128",
- "log",
-]
-
-[[package]]
-name = "bitreader"
-version = "0.3.11"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "886559b1e163d56c765bc3a985febb4eee8009f625244511d8ee3c432e08c066"
-dependencies = [
- "cfg-if",
-]
+checksum = "843867be96c8daad0d758b57df9392b6d8d271134fce549de6ce169ff98a92af"
 
 [[package]]
 name = "bytemuck"
-version = "1.23.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3995eaeebcdf32f91f980d360f78732ddc061097ab4e39991ae7a6ace9194677"
-
-[[package]]
-name = "byteorder"
-version = "1.5.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
-
-[[package]]
-name = "cc"
-version = "1.2.38"
+version = "1.25.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "80f41ae168f955c12fb8960b057d70d0ca153fb83182b57d86380443527be7e9"
-dependencies = [
- "find-msvc-tools",
- "jobserver",
- "libc",
- "shlex",
-]
+checksum = "c8efb64bd706a16a1bdde310ae86b351e4d21550d98d056f22f8a7f7a2183fec"
 
 [[package]]
 name = "cfg-if"
-version = "1.0.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2fd1289c04a9ea8cb22300a459a72a385d7c73d3259e2ed7dcb2af674838cfa9"
-
-[[package]]
-name = "cmake"
-version = "0.1.54"
+version = "1.0.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e7caa3f9de89ddbe2c607f4101924c5abec803763ae9534e4f4d7d8f84aa81f0"
-dependencies = [
- "cc",
-]
+checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"
 
 [[package]]
 name = "crc32fast"
@@ -148,30 +104,28 @@ dependencies = [
  "dssim-core",
  "getopts",
  "imgref",
- "load_image",
- "lodepng",
+ "moxcms",
  "ordered-channel",
+ "png",
  "rayon",
  "rgb",
+ "zenbitmaps",
+ "zune-jpeg",
 ]
 
 [[package]]
 name = "dssim-core"
 version = "3.4.0"
 dependencies = [
+ "archmage",
  "imgref",
  "itertools",
  "lodepng",
+ "magetypes",
  "rayon",
  "rgb",
 ]
 
-[[package]]
-name = "dunce"
-version = "1.0.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813"
-
 [[package]]
 name = "either"
 version = "1.15.0"
@@ -179,55 +133,31 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719"
 
 [[package]]
-name = "fallible_collections"
-version = "0.5.1"
+name = "enough"
+version = "0.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8b3e85d14d419ba3e1db925519461c0d17a49bdd2d67ea6316fa965ca7acdf74"
+checksum = "177a59811828ea5dbc8927662d444ffee7b55a44696486f1b1a245ae9e652533"
 
 [[package]]
-name = "find-msvc-tools"
-version = "0.1.2"
+name = "fdeflate"
+version = "0.3.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1ced73b1dacfc750a6db6c0a0c3a3853c8b41997e2e2c563dc90804ae6867959"
+checksum = "1e6853b52649d4ac5c0bd02320cddc5ba956bdb407c4b75a2c6b75bf51500f8c"
+dependencies = [
+ "simd-adler32",
+]
 
 [[package]]
 name = "flate2"
-version = "1.1.2"
+version = "1.1.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4a3d7db9596fecd151c5f638c0ee5d5bd487b6e0ea232e5dc96d5250f6f94b1d"
+checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c"
 dependencies = [
  "crc32fast",
- "libz-rs-sys",
  "miniz_oxide",
+ "zlib-rs",
 ]
 
-[[package]]
-name = "foreign-types"
-version = "0.5.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d737d9aa519fb7b749cbc3b962edcf310a8dd1f4b67c91c4f83975dbdd17d965"
-dependencies = [
- "foreign-types-macros",
- "foreign-types-shared",
-]
-
-[[package]]
-name = "foreign-types-macros"
-version = "0.2.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1a5c6c585bc94aaf2c7b51dd4c2ba22680844aba4c687be581871a6f518c5742"
-dependencies = [
- "proc-macro2",
- "quote",
- "syn",
-]
-
-[[package]]
-name = "foreign-types-shared"
-version = "0.3.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "aa9a19cbb55df58761df49b23516a86d432839add4af60fc256da840f66ed35b"
-
 [[package]]
 name = "getopts"
 version = "0.2.24"
@@ -237,23 +167,11 @@ dependencies = [
  "unicode-width",
 ]
 
-[[package]]
-name = "getrandom"
-version = "0.3.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "26145e563e54f2cadc477553f1ec5ee650b00862f0a58bcd12cbdc5f0ea2d2f4"
-dependencies = [
- "cfg-if",
- "libc",
- "r-efi",
- "wasi",
-]
-
 [[package]]
 name = "imgref"
-version = "1.11.0"
+version = "1.12.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d0263a3d970d5c054ed9312c0057b4f3bde9c0b33836d3637361d4a9e6e7a408"
+checksum = "e7c5cedc30da3a610cac6b4ba17597bdf7152cf974e8aab3afb3d54455e371c8"
 
 [[package]]
 name = "itertools"
@@ -264,124 +182,17 @@ dependencies = [
  "either",
 ]
 
-[[package]]
-name = "jobserver"
-version = "0.1.34"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33"
-dependencies = [
- "getrandom",
- "libc",
-]
-
-[[package]]
-name = "jpeg-decoder"
-version = "0.3.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "00810f1d8b74be64b13dbf3db89ac67740615d6c891f0e7b6179326533011a07"
-dependencies = [
- "rayon",
-]
-
-[[package]]
-name = "lcms2"
-version = "6.1.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b75877b724685dd49310bdbadbf973fc69b1d01992a6d4a861b928fc3943f87b"
-dependencies = [
- "bytemuck",
- "foreign-types",
- "lcms2-sys",
-]
-
-[[package]]
-name = "lcms2-sys"
-version = "4.0.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1c2604b23848ca80b2add60f0fb2270fd980e622c25029b6597fa01cfd5f8d5f"
-dependencies = [
- "cc",
- "dunce",
- "libc",
- "pkg-config",
-]
-
-[[package]]
-name = "leb128"
-version = "0.2.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "884e2677b40cc8c339eaefcb701c32ef1fd2493d71118dc0ca4b6a736c93bd67"
-
-[[package]]
-name = "libaom-sys"
-version = "0.17.2+libaom.3.11.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7c4fce8aaf0c2d8534529b0698ed98ec1da8d35319588d20b0d79ce6446e4194"
-dependencies = [
- "cmake",
-]
-
 [[package]]
 name = "libc"
-version = "0.2.175"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6a82ae493e598baaea5209805c49bbf2ea7de956d50d7da0da1164f9c6d28543"
-
-[[package]]
-name = "libwebp"
-version = "0.1.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c7792a82f95b5b2528d9fe1642231f972d2cdd73b91ccbcb6ab214c2cf1a74f4"
-dependencies = [
- "libwebp-sys2",
-]
-
-[[package]]
-name = "libwebp-sys2"
-version = "0.1.11"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4790186411a6843ecc0a141c8948c8e623a0bb5e886834b1b6c90f3dfa85bb99"
-dependencies = [
- "cc",
- "cfg-if",
- "libc",
- "pkg-config",
- "vcpkg",
-]
-
-[[package]]
-name = "libz-rs-sys"
-version = "0.5.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "840db8cf39d9ec4dd794376f38acc40d0fc65eec2a8f484f7fd375b84602becd"
-dependencies = [
- "zlib-rs",
-]
-
-[[package]]
-name = "load_image"
-version = "3.3.1"
+version = "0.2.182"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "13af80aa49d27d6334430fcc8aa55c01acdf23209da94f670e217650320644c0"
-dependencies = [
- "aom-decode",
- "bytemuck",
- "imgref",
- "jpeg-decoder",
- "lcms2",
- "libwebp",
- "lodepng",
- "mozjpeg",
- "quick-error",
- "rexif",
- "rgb",
-]
+checksum = "6800badb6cb2082ffd7b6a67e6125bb39f18782f793520caee8cb8846be06112"
 
 [[package]]
 name = "lodepng"
-version = "3.12.1"
+version = "3.12.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "77a32335d22e44238e2bb0b4d726964d18952ce1f1279ec3305305d2c61539eb"
+checksum = "fe7982db11054edc023a1b424dddcc65be18f71fa46ec6bde2efcfc1fb6b22da"
 dependencies = [
  "crc32fast",
  "flate2",
@@ -390,10 +201,13 @@ dependencies = [
 ]
 
 [[package]]
-name = "log"
-version = "0.4.28"
+name = "magetypes"
+version = "0.8.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "34080505efa8e45a4b816c349525ebe327ceaa8559756f0356cba97ef3bf7432"
+checksum = "9233ec8c05f4844b1b4d92b5fd55ae179e8f7d1c09415ec45ceeda294237c42c"
+dependencies = [
+ "archmage",
+]
 
 [[package]]
 name = "miniz_oxide"
@@ -402,41 +216,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316"
 dependencies = [
  "adler2",
+ "simd-adler32",
 ]
 
 [[package]]
-name = "mozjpeg"
-version = "0.10.13"
+name = "moxcms"
+version = "0.8.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b7891b80aaa86097d38d276eb98b3805d6280708c4e0a1e6f6aed9380c51fec9"
+checksum = "04e5f643aebb3c117fa77e268557e3bf19e250769062820fcaf3ff33ea560adf"
 dependencies = [
- "arrayvec",
- "bytemuck",
- "libc",
- "mozjpeg-sys",
- "rgb",
-]
-
-[[package]]
-name = "mozjpeg-sys"
-version = "2.2.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7f0dc668bf9bf888c88e2fb1ab16a406d2c380f1d082b20d51dd540ab2aa70c1"
-dependencies = [
- "cc",
- "dunce",
- "libc",
- "nasm-rs",
-]
-
-[[package]]
-name = "nasm-rs"
-version = "0.3.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "34f676553b60ccbb76f41f9ae8f2428dac3f259ff8f1c2468a174778d06a1af9"
-dependencies = [
- "jobserver",
- "log",
+ "num-traits",
+ "pxfm",
 ]
 
 [[package]]
@@ -455,41 +245,42 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "95be4d57809897b5a7539fc15a7dfe0e84141bc3dfaa2e9b1b27caa90acf61ab"
 
 [[package]]
-name = "pkg-config"
-version = "0.3.32"
+name = "png"
+version = "0.18.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c"
+checksum = "60769b8b31b2a9f263dae2776c37b1b28ae246943cf719eb6946a1db05128a61"
+dependencies = [
+ "bitflags",
+ "crc32fast",
+ "fdeflate",
+ "flate2",
+ "miniz_oxide",
+]
 
 [[package]]
 name = "proc-macro2"
-version = "1.0.101"
+version = "1.0.106"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "89ae43fd86e4158d6db51ad8e2b80f313af9cc74f5c0e03ccb87de09998732de"
+checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934"
 dependencies = [
  "unicode-ident",
 ]
 
 [[package]]
-name = "quick-error"
-version = "2.0.1"
+name = "pxfm"
+version = "0.1.28"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a993555f31e5a609f617c12db6250dedcac1b0a85076912c436e6fc9b2c8e6a3"
+checksum = "b5a041e753da8b807c9255f28de81879c78c876392ff2469cde94799b2896b9d"
 
 [[package]]
 name = "quote"
-version = "1.0.40"
+version = "1.0.45"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d"
+checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924"
 dependencies = [
  "proc-macro2",
 ]
 
-[[package]]
-name = "r-efi"
-version = "5.3.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f"
-
 [[package]]
 name = "rayon"
 version = "1.11.0"
@@ -510,32 +301,32 @@ dependencies = [
  "crossbeam-utils",
 ]
 
-[[package]]
-name = "rexif"
-version = "0.7.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "be932047c168919c8d5af065b16fa7d4bd24835ffa256bf0cf1ff463f91c15df"
-
 [[package]]
 name = "rgb"
-version = "0.8.52"
+version = "0.8.53"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0c6a884d2998352bb4daf0183589aec883f16a6da1f4dde84d8e2e9a5409a1ce"
+checksum = "47b34b781b31e5d73e9fbc8689c70551fd1ade9a19e3e28cfec8580a79290cc4"
 dependencies = [
  "bytemuck",
 ]
 
 [[package]]
-name = "shlex"
-version = "1.3.0"
+name = "safe_unaligned_simd"
+version = "0.2.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
+checksum = "f9c874fb993656d484725d58fbfddb39dfcf98ebcadc1a1d09568aa6b39bafba"
+
+[[package]]
+name = "simd-adler32"
+version = "0.3.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e320a6c5ad31d271ad523dcf3ad13e2767ad8b1cb8f047f75a8aeaf8da139da2"
 
 [[package]]
 name = "syn"
-version = "2.0.106"
+version = "2.0.117"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ede7c438028d4436d71104916910f5bb611972c5cfd7f89b8300a8186e6fada6"
+checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -543,59 +334,64 @@ dependencies = [
 ]
 
 [[package]]
-name = "unicode-ident"
-version = "1.0.19"
+name = "thiserror"
+version = "2.0.18"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f63a545481291138910575129486daeaf8ac54aee4387fe7906919f7830c7d9d"
+checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4"
+dependencies = [
+ "thiserror-impl",
+]
 
 [[package]]
-name = "unicode-width"
-version = "0.2.1"
+name = "thiserror-impl"
+version = "2.0.18"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4a1a07cc7db3810833284e8d372ccdc6da29741639ecc70c9ec107df0fa6154c"
+checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
 
 [[package]]
-name = "vcpkg"
-version = "0.2.15"
+name = "unicode-ident"
+version = "1.0.24"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"
+checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75"
 
 [[package]]
-name = "wasi"
-version = "0.14.7+wasi-0.2.4"
+name = "unicode-width"
+version = "0.2.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "883478de20367e224c0090af9cf5f9fa85bed63a95c1abf3afc5c083ebc06e8c"
-dependencies = [
- "wasip2",
-]
+checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254"
 
 [[package]]
-name = "wasip2"
-version = "1.0.1+wasi-0.2.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0562428422c63773dad2c345a1882263bbf4d65cf3f42e90921f787ef5ad58e7"
+name = "zenbitmaps"
+version = "0.1.0"
+source = "git+https://github.com/imazen/zenbitmaps.git?rev=818992c#818992c0e56b5b5d544880964d796037cbc49eba"
 dependencies = [
- "wit-bindgen",
+ "enough",
+ "rgb",
+ "thiserror",
 ]
 
 [[package]]
-name = "wit-bindgen"
-version = "0.46.0"
+name = "zlib-rs"
+version = "0.6.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f17a85883d4e6d00e8a97c586de764dabcc06133f7f1d55dce5cdc070ad7fe59"
+checksum = "3be3d40e40a133f9c916ee3f9f4fa2d9d63435b5fbe1bfc6d9dae0aa0ada1513"
 
 [[package]]
-name = "yuv"
-version = "0.1.9"
+name = "zune-core"
+version = "0.5.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8cbe2d856acbe6d86c0fa6f458b73e962834061ca2f7f94c6e4633afc9efd4d4"
-dependencies = [
- "num-traits",
- "rgb",
-]
+checksum = "cb8a0807f7c01457d0379ba880ba6322660448ddebc890ce29bb64da71fb40f9"
 
 [[package]]
-name = "zlib-rs"
-version = "0.5.2"
+name = "zune-jpeg"
+version = "0.5.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2f06ae92f42f5e5c42443fd094f245eb656abf56dd7cce9b8b263236565e00f2"
+checksum = "410e9ecef634c709e3831c2cfdb8d9c32164fae1c67496d5b68fff728eec37fe"
+dependencies = [
+ "zune-core",
+]
diff --git a/Cargo.toml b/Cargo.toml
index ffed2a4..3e51cb8 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -11,8 +11,8 @@ name = "dssim"
 readme = "README.md"
 repository = "https://github.com/kornelski/dssim.git"
 version = "3.4.1"
-edition = "2021"
-rust-version = "1.72"
+edition = "2024"
+rust-version = "1.85"
 
 [[bin]]
 doctest = false
@@ -25,20 +25,17 @@ imgref = "1.11.0"
 getopts = "0.2.24"
 rayon = { version = "1.10.0", optional = true }
 rgb = "0.8.52"
-lodepng = { version = "3.12", default-features = false, features = ["rust_backend"] }
-load_image = { version = "3.3", features = ["lcms2-static"] }
+png = "0.18"
+zune-jpeg = "0.5"
+moxcms = "0.8"
 crossbeam-channel = "0.5.15"
 ordered-channel = { version = "1.2.0" }
+zenbitmaps = { git = "https://github.com/imazen/zenbitmaps.git", rev = "818992c", features = ["rgb"] }
 
 [features]
-default = ["threads", "dssim-core/default", "no-macos-vimage"]
+default = ["threads", "dssim-core/default", "no-macos-vimage", "fma"]
 threads = ["dep:rayon", "dssim-core/threads"]
-avif = ["load_image/avif"]
-# Support comparing WebP files directly
-webp = ["load_image/webp"]
-webp-static = ["load_image/webp-static"]
-# Decode JPEGs using libjpeg-turbo derivative (closer to the reference decoder)
-mozjpeg = ["load_image/mozjpeg"]
+fma = ["dssim-core/fma"]
 no-macos-vimage = ["dssim-core/no-macos-vimage"]
 
 [package.metadata.release]
diff --git a/benches/compare.rs b/benches/compare.rs
index b973e58..afdb7c8 100644
--- a/benches/compare.rs
+++ b/benches/compare.rs
@@ -1,25 +1,29 @@
 #![feature(test)]
 
 extern crate test;
-use dssim::{ToRGBAPLU, RGBAPLU};
+use dssim::{RGBAPLU, ToRGBAPLU};
 use imgref::{Img, ImgVec};
 use test::Bencher;
 
 fn load(path: &str) -> Result<ImgVec<RGBAPLU>, lodepng::Error> {
     let image = lodepng::decode32_file(path)?;
-    Ok(Img::new(image.buffer.to_rgbaplu(), image.width, image.height))
+    Ok(Img::new(
+        image.buffer.to_rgbaplu(),
+        image.width,
+        image.height,
+    ))
 }
 
 #[bench]
 fn compare(bench: &mut Bencher) {
     let attr = dssim::Dssim::new();
     let other = &load("tests/test1-sm.png").unwrap();
-    let orig = attr.create_image(&load("tests/test2-sm.png").unwrap()).unwrap();
+    let orig = attr
+        .create_image(&load("tests/test2-sm.png").unwrap())
+        .unwrap();
     let modif = attr.create_image(other).unwrap();
 
-    bench.iter(|| {
-        attr.compare(&orig, modif.clone())
-    });
+    bench.iter(|| attr.compare(&orig, modif.clone()));
 }
 
 #[bench]
@@ -27,7 +31,5 @@ fn create_image(bench: &mut Bencher) {
     let attr = dssim::Dssim::new();
     let img = &load("tests/test1-sm.png").unwrap();
 
-    bench.iter(|| {
-        attr.create_image(img)
-    });
+    bench.iter(|| attr.create_image(img));
 }
diff --git a/dssim-core/Cargo.toml b/dssim-core/Cargo.toml
index 6edc493..874f5c0 100644
--- a/dssim-core/Cargo.toml
+++ b/dssim-core/Cargo.toml
@@ -11,7 +11,7 @@ name = "dssim-core"
 readme = "README.md"
 repository = "https://github.com/kornelski/dssim.git"
 version = "3.4.0"
-edition = "2021"
+edition = "2024"
 
 [lib]
 crate-type = ["lib", "staticlib"]
@@ -19,6 +19,8 @@ crate-type = ["lib", "staticlib"]
 [dependencies]
 imgref = "1.11.0"
 itertools = "0.14"
+archmage = { version = "0.8.6", optional = true }
+magetypes = { version = "0.8.6", optional = true }
 rayon = { version = "1.11.0", optional = true }
 rgb = "0.8.52"
 
@@ -28,6 +30,7 @@ lodepng = "3.12.1"
 [features]
 default = ["threads"]
 threads = ["dep:rayon"]
+fma = ["dep:archmage", "dep:magetypes"]
 no-macos-vimage = []
 # internal for cargo-c only
 capi = []
diff --git a/dssim-core/src/blur.rs b/dssim-core/src/blur.rs
index 3f63618..69fa1d5 100644
--- a/dssim-core/src/blur.rs
+++ b/dssim-core/src/blur.rs
@@ -1,21 +1,23 @@
-
+#[cfg(all(target_os = "macos", not(feature = "no-macos-vimage")))]
 const KERNEL: [f32; 9] = [
-    0.095332, 0.118095, 0.095332,
-    0.118095, 0.146293, 0.118095,
-    0.095332, 0.118095, 0.095332,
+    0.095332, 0.118095, 0.095332, 0.118095, 0.146293, 0.118095, 0.095332, 0.118095, 0.095332,
 ];
 
+#[inline]
+pub(crate) fn zeroed_f32_vec(len: usize) -> Vec<f32> {
+    vec![0.0f32; len]
+}
+
 #[cfg(all(target_os = "macos", not(feature = "no-macos-vimage")))]
 mod mac {
     use super::KERNEL;
-    use crate::ffi::vImageConvolve_PlanarF;
-    use crate::ffi::vImagePixelCount;
     use crate::ffi::vImage_Buffer;
     use crate::ffi::vImage_Flags::kvImageEdgeExtend;
+    use crate::ffi::vImageConvolve_PlanarF;
+    use crate::ffi::vImagePixelCount;
     use imgref::*;
-    use std::mem::MaybeUninit;
 
-    pub fn blur(src: ImgRef<'_, f32>, tmp: &mut [MaybeUninit<f32>]) -> ImgVec<f32> {
+    pub fn blur(src: ImgRef<'_, f32>, tmp: &mut [f32]) -> ImgVec<f32> {
         let width = src.width();
         let height = src.height();
 
@@ -25,22 +27,19 @@ mod mac {
             rowBytes: src.stride() * std::mem::size_of::<f32>(),
             data: src.buf().as_ptr(),
         };
-        let mut dst_vec = Vec::with_capacity(width * height);
+        let mut dst_vec = vec![0f32; width * height];
         let mut dstbuf = vImage_Buffer {
             width: width as vImagePixelCount,
             height: height as vImagePixelCount,
             rowBytes: width * std::mem::size_of::<f32>(),
-            data: dst_vec.spare_capacity_mut().as_mut_ptr().cast(),
+            data: dst_vec.as_mut_ptr(),
         };
 
         do_blur(&srcbuf, tmp, &mut dstbuf, width, height);
-        unsafe {
-            dst_vec.set_len(dst_vec.capacity());
-        }
         ImgVec::new(dst_vec, width, height)
     }
 
-    pub fn blur_in_place(mut srcdst: ImgRefMut<'_, f32>, tmp: &mut [MaybeUninit<f32>]) {
+    pub fn blur_in_place(mut srcdst: ImgRefMut<'_, f32>, tmp: &mut [f32]) {
         let srcbuf = vImage_Buffer {
             width: srcdst.width() as vImagePixelCount,
             height: srcdst.height() as vImagePixelCount,
@@ -57,7 +56,27 @@ mod mac {
         do_blur(&srcbuf, tmp, &mut dstbuf, srcdst.width(), srcdst.height());
     }
 
-    fn do_blur(srcbuf: &vImage_Buffer<*const f32>, tmp: &mut [MaybeUninit<f32>], dstbuf: &mut vImage_Buffer<*mut f32>, width: usize, height: usize) {
+    /// Blur the element-wise product of two images. On macOS, falls back to
+    /// multiply then blur since vImage has no fused variant.
+    pub fn blur_mul(src1: ImgRef<'_, f32>, src2: ImgRef<'_, f32>, tmp: &mut [f32]) -> Vec<f32> {
+        let width = src1.width();
+        let height = src1.height();
+        let mut product: Vec<f32> = src1
+            .pixels()
+            .zip(src2.pixels())
+            .map(|(a, b)| a * b)
+            .collect();
+        blur_in_place(ImgRefMut::new(&mut product, width, height), tmp);
+        product
+    }
+
+    fn do_blur(
+        srcbuf: &vImage_Buffer<*const f32>,
+        tmp: &mut [f32],
+        dstbuf: &mut vImage_Buffer<*mut f32>,
+        width: usize,
+        height: usize,
+    ) {
         assert_eq!(tmp.len(), width * height);
 
         unsafe {
@@ -65,18 +84,40 @@ mod mac {
                 width: width as vImagePixelCount,
                 height: height as vImagePixelCount,
                 rowBytes: width * std::mem::size_of::<f32>(),
-                data: tmp.as_mut_ptr().cast::<f32>(),
+                data: tmp.as_mut_ptr(),
             };
-            let res = vImageConvolve_PlanarF(srcbuf, &mut tmpwrbuf, std::ptr::null_mut(), 0, 0, KERNEL.as_ptr(), 3, 3, 0., kvImageEdgeExtend);
+            let res = vImageConvolve_PlanarF(
+                srcbuf,
+                &mut tmpwrbuf,
+                std::ptr::null_mut(),
+                0,
+                0,
+                KERNEL.as_ptr(),
+                3,
+                3,
+                0.,
+                kvImageEdgeExtend,
+            );
             assert_eq!(0, res);
 
             let tmprbuf = vImage_Buffer {
                 width: width as vImagePixelCount,
                 height: height as vImagePixelCount,
                 rowBytes: width * std::mem::size_of::<f32>(),
-                data: tmp.as_ptr().cast::<f32>(),
+                data: tmp.as_ptr(),
             };
-            let res = vImageConvolve_PlanarF(&tmprbuf, dstbuf, std::ptr::null_mut(), 0, 0, KERNEL.as_ptr(), 3, 3, 0., kvImageEdgeExtend);
+            let res = vImageConvolve_PlanarF(
+                &tmprbuf,
+                dstbuf,
+                std::ptr::null_mut(),
+                0,
+                0,
+                KERNEL.as_ptr(),
+                3,
+                3,
+                0.,
+                kvImageEdgeExtend,
+            );
             assert_eq!(0, res);
         }
     }
@@ -84,101 +125,459 @@ mod mac {
 
 #[cfg(not(all(target_os = "macos", not(feature = "no-macos-vimage"))))]
 mod portable {
-    use super::KERNEL;
+    use super::zeroed_f32_vec;
     use imgref::*;
-    use std::cmp::min;
-    use std::mem::MaybeUninit;
 
-    #[inline]
-    unsafe fn do3f(prev: &[f32], curr: &[f32], next: &[f32], i: usize) -> f32 {
-        debug_assert!(i > 0);
+    // 1D kernel from separable decomposition of the 3×3 kernel.
+    // Symmetric: K1D = [K_SIDE, K_CENTER, K_SIDE].
+    const K_SIDE: f32 = 0.308_758_86;
+    const K_CENTER: f32 = 0.382_482_8;
+
+    // Fused double-blur 5-tap kernel: convolving K1D with itself.
+    // K5 = [K5_OUTER, K5_INNER, K5_MID, K5_INNER, K5_OUTER]
+    // This makes H→V→H→V equivalent to a single H5→V5 pass, halving memory traffic.
+    const K5_OUTER: f32 = K_SIDE * K_SIDE;
+    const K5_INNER: f32 = 2.0 * K_SIDE * K_CENTER;
+    const K5_MID: f32 = 2.0 * K_SIDE * K_SIDE + K_CENTER * K_CENTER;
+
+    /// Horizontal 5-tap blur. Equivalent to two sequential 3-tap horizontal blurs.
+    #[inline(never)]
+    fn blur_h5(src: &[f32], dst: &mut [f32], width: usize, height: usize, src_stride: usize) {
+        let last = width - 1;
+        for y in 0..height {
+            let row = &src[y * src_stride..][..width];
+            let out = &mut dst[y * width..][..width];
+
+            // Left edge pixels (0, 1): clamp negative indices to 0
+            for i in 0..2.min(width) {
+                let m2 = row[0]; // i-2 and i-1 clamp to 0
+                let m1 = if i >= 1 { row[i - 1] } else { row[0] };
+                let p1 = if i < last { row[i + 1] } else { row[last] };
+                let p2 = if i + 2 <= last { row[i + 2] } else { row[last] };
+                out[i] = (m2 + p2) * K5_OUTER + (m1 + p1) * K5_INNER + row[i] * K5_MID;
+            }
 
-        let c0 = i - 1;
-        let c1 = i;
-        let c2 = i + 1;
+            // Inner pixels: 2 <= i <= width-3
+            for i in 2..width.saturating_sub(2) {
+                out[i] = (row[i - 2] + row[i + 2]) * K5_OUTER
+                    + (row[i - 1] + row[i + 1]) * K5_INNER
+                    + row[i] * K5_MID;
+            }
 
-        (prev.get_unchecked(c0)*KERNEL[0] + prev.get_unchecked(c1)*KERNEL[1] + prev.get_unchecked(c2)*KERNEL[2]) +
-        (curr.get_unchecked(c0)*KERNEL[3] + curr.get_unchecked(c1)*KERNEL[4] + curr.get_unchecked(c2)*KERNEL[5]) +
-        (next.get_unchecked(c0)*KERNEL[6] + next.get_unchecked(c1)*KERNEL[7] + next.get_unchecked(c2)*KERNEL[8])
+            // Right edge pixels: clamp beyond-end indices to last
+            for i in width.saturating_sub(2).max(2)..width {
+                let p1 = if i < last { row[i + 1] } else { row[last] };
+                let p2 = if i + 2 <= last { row[i + 2] } else { row[last] };
+                out[i] =
+                    (row[i - 2] + p2) * K5_OUTER + (row[i - 1] + p1) * K5_INNER + row[i] * K5_MID;
+            }
+        }
     }
 
-    fn do3(prev: &[f32], curr: &[f32], next: &[f32], i: usize, width: usize) -> f32 {
-        let c0 = if i > 0 { i - 1 } else { 0 };
-        let c1 = i;
-        let c2 = min(i + 1, width - 1);
+    /// Vertical 5-tap blur. Equivalent to two sequential 3-tap vertical blurs.
+    #[inline(never)]
+    fn blur_v5(src: &[f32], dst: &mut [f32], width: usize, height: usize, dst_stride: usize) {
+        let last_y = height - 1;
 
-        prev[c2].mul_add(KERNEL[2], prev[c0].mul_add(KERNEL[0], prev[c1] * KERNEL[1])) +
-        curr[c2].mul_add(KERNEL[5], curr[c0].mul_add(KERNEL[3], curr[c1] * KERNEL[4])) +
-        next[c2].mul_add(KERNEL[8], next[c0].mul_add(KERNEL[6], next[c1] * KERNEL[7]))
+        for y in 0..height {
+            let ym2 = y.saturating_sub(2);
+            let ym1 = y.saturating_sub(1);
+            let yp1 = (y + 1).min(last_y);
+            let yp2 = (y + 2).min(last_y);
+
+            let rm2 = &src[ym2 * width..][..width];
+            let rm1 = &src[ym1 * width..][..width];
+            let rc = &src[y * width..][..width];
+            let rp1 = &src[yp1 * width..][..width];
+            let rp2 = &src[yp2 * width..][..width];
+
+            let out = &mut dst[y * dst_stride..][..width];
+
+            for x in 0..width {
+                out[x] =
+                    (rm2[x] + rp2[x]) * K5_OUTER + (rm1[x] + rp1[x]) * K5_INNER + rc[x] * K5_MID;
+            }
+        }
     }
 
-    pub fn blur(src: ImgRef<'_, f32>, tmp: &mut [MaybeUninit<f32>]) -> ImgVec<f32> {
-        let width = src.width();
-        let height = src.height();
-        let tmp_dst = ImgRefMut::new(tmp, width, height);
-        let tmp_src = do_blur(src, tmp_dst);
-        let mut dst_vec = Vec::with_capacity(width * height);
-        do_blur(tmp_src.as_ref(), ImgRefMut::new(dst_vec.spare_capacity_mut(), width, height));
-        unsafe {
-            dst_vec.set_len(dst_vec.capacity());
+    /// Horizontal 5-tap blur with fused element-wise multiply.
+    /// Computes blur(src1 * src2) in a single H5 pass.
+    #[allow(clippy::too_many_arguments)]
+    #[inline(never)]
+    fn blur_h5_mul(
+        src1: &[f32],
+        src2: &[f32],
+        dst: &mut [f32],
+        width: usize,
+        height: usize,
+        stride1: usize,
+        stride2: usize,
+    ) {
+        let last = width - 1;
+        for y in 0..height {
+            let r1 = &src1[y * stride1..][..width];
+            let r2 = &src2[y * stride2..][..width];
+            let out = &mut dst[y * width..][..width];
+
+            // General clamped access for edge pixels
+            let clamp = |i: isize| i.max(0).min(last as isize) as usize;
+            let prod = |i: isize| r1[clamp(i)] * r2[clamp(i)];
+
+            // Left edge pixels
+            for i in 0..2.min(width) {
+                let ii = i as isize;
+                out[i] = (prod(ii - 2) + prod(ii + 2)) * K5_OUTER
+                    + (prod(ii - 1) + prod(ii + 1)) * K5_INNER
+                    + (r1[i] * r2[i]) * K5_MID;
+            }
+
+            // Inner pixels
+            for i in 2..width.saturating_sub(2) {
+                let pm2 = r1[i - 2] * r2[i - 2];
+                let pm1 = r1[i - 1] * r2[i - 1];
+                let pc = r1[i] * r2[i];
+                let pp1 = r1[i + 1] * r2[i + 1];
+                let pp2 = r1[i + 2] * r2[i + 2];
+                out[i] = (pm2 + pp2) * K5_OUTER + (pm1 + pp1) * K5_INNER + pc * K5_MID;
+            }
+
+            // Right edge pixels
+            for i in width.saturating_sub(2).max(2)..width {
+                let ii = i as isize;
+                out[i] = (prod(ii - 2) + prod(ii + 2)) * K5_OUTER
+                    + (prod(ii - 1) + prod(ii + 1)) * K5_INNER
+                    + (r1[i] * r2[i]) * K5_MID;
+            }
         }
-        ImgVec::new(dst_vec, width, height)
     }
 
-    fn do_blur<'d>(src: ImgRef<'_, f32>, mut dst: ImgRefMut<'d, MaybeUninit<f32>>) -> ImgRefMut<'d, f32> {
-        assert_eq!(src.width(), dst.width());
-        assert_eq!(src.height(), dst.height());
-        assert!(src.width() > 0);
-        assert!(src.width() < 1 << 24);
-        assert!(src.height() > 0);
-        assert!(src.height() < 1 << 24);
-        debug_assert!(src.pixels().all(|p| p.is_finite()));
+    // ── AVX2+FMA SIMD path ──────────────────────────────────────────────
+    #[cfg(all(feature = "fma", target_arch = "x86_64"))]
+    mod simd {
+        use super::{K5_INNER, K5_MID, K5_OUTER};
+        use archmage::prelude::*;
+        use magetypes::simd::f32x8;
+
+        #[arcane]
+        pub fn blur_avx2(
+            t: X64V3Token,
+            buf: &[f32],
+            tmp: &mut [f32],
+            dst: &mut [f32],
+            w: usize,
+            h: usize,
+            stride: usize,
+        ) {
+            blur_h5_simd(t, buf, tmp, w, h, stride);
+            blur_v5_simd(t, tmp, dst, w, h, w);
+        }
 
-        let width = src.width();
-        let height = src.height();
-        let src_stride = src.stride();
-        let dst_stride = dst.stride();
-        let src = src.buf();
-        let dst = dst.buf_mut();
-
-        let mut prev = &src[0..width];
-        let mut curr = prev;
-        let mut next = prev;
-        for y in 0..height {
-            prev = curr;
-            curr = next;
-            let next_start = (y+1)*src_stride;
-            next = if y+1 < height {&src[next_start..next_start+width]} else {curr};
+        #[arcane]
+        pub fn blur_in_place_avx2(
+            t: X64V3Token,
+            buf: &mut [f32],
+            tmp: &mut [f32],
+            w: usize,
+            h: usize,
+            stride: usize,
+        ) {
+            blur_h5_simd(t, buf, tmp, w, h, stride);
+            blur_v5_simd(t, tmp, buf, w, h, stride);
+        }
 
-            let dstrow = &mut dst[y*dst_stride..y*dst_stride+width];
+        #[allow(clippy::too_many_arguments)]
+        #[arcane]
+        pub fn blur_mul_avx2(
+            t: X64V3Token,
+            src1: &[f32],
+            src2: &[f32],
+            tmp: &mut [f32],
+            dst: &mut [f32],
+            w: usize,
+            h: usize,
+            stride1: usize,
+            stride2: usize,
+        ) {
+            blur_h5_mul_simd(t, src1, src2, tmp, w, h, stride1, stride2);
+            blur_v5_simd(t, tmp, dst, w, h, w);
+        }
 
-            dstrow[0].write(do3(prev, curr, next, 0, width));
-            for i in 1..width-1 {
-                unsafe {
-                    dstrow[i].write(do3f(prev, curr, next, i));
+        #[rite]
+        fn blur_h5_simd(
+            t: X64V3Token,
+            src: &[f32],
+            dst: &mut [f32],
+            width: usize,
+            height: usize,
+            src_stride: usize,
+        ) {
+            let vk_outer = f32x8::splat(t, K5_OUTER);
+            let vk_inner = f32x8::splat(t, K5_INNER);
+            let vk_mid = f32x8::splat(t, K5_MID);
+            let last = width - 1;
+
+            for y in 0..height {
+                let row = &src[y * src_stride..][..width];
+                let out = &mut dst[y * width..][..width];
+
+                // Left edge: scalar (2 pixels)
+                for i in 0..2.min(width) {
+                    let ii = i as isize;
+                    let get = |j: isize| row[j.max(0).min(last as isize) as usize];
+                    out[i] = (get(ii - 2) + get(ii + 2)) * K5_OUTER
+                        + (get(ii - 1) + get(ii + 1)) * K5_INNER
+                        + row[i] * K5_MID;
+                }
+
+                // SIMD loop: needs row[i-2..i+10] valid
+                let mut i = 2;
+                while i + 10 <= width {
+                    let far_left = f32x8::load(t, (&row[i - 2..i + 6]).try_into().unwrap());
+                    let left = f32x8::load(t, (&row[i - 1..i + 7]).try_into().unwrap());
+                    let center = f32x8::load(t, (&row[i..i + 8]).try_into().unwrap());
+                    let right = f32x8::load(t, (&row[i + 1..i + 9]).try_into().unwrap());
+                    let far_right = f32x8::load(t, (&row[i + 2..i + 10]).try_into().unwrap());
+                    let outer_sum = far_left + far_right;
+                    let inner_sum = left + right;
+                    let result =
+                        outer_sum.mul_add(vk_outer, inner_sum.mul_add(vk_inner, center * vk_mid));
+                    result.store((&mut out[i..i + 8]).try_into().unwrap());
+                    i += 8;
+                }
+
+                // Scalar tail + right edges
+                while i < width {
+                    let ii = i as isize;
+                    let get = |j: isize| row[j.max(0).min(last as isize) as usize];
+                    out[i] = (get(ii - 2) + get(ii + 2)) * K5_OUTER
+                        + (get(ii - 1) + get(ii + 1)) * K5_INNER
+                        + row[i] * K5_MID;
+                    i += 1;
                 }
             }
-            if width > 1 {
-                dstrow[width-1].write(do3(prev, curr, next, width-1, width));
+        }
+
+        #[rite]
+        fn blur_v5_simd(
+            t: X64V3Token,
+            src: &[f32],
+            dst: &mut [f32],
+            width: usize,
+            height: usize,
+            dst_stride: usize,
+        ) {
+            let vk_outer = f32x8::splat(t, K5_OUTER);
+            let vk_inner = f32x8::splat(t, K5_INNER);
+            let vk_mid = f32x8::splat(t, K5_MID);
+            let last_y = height - 1;
+
+            for y in 0..height {
+                let ym2 = y.saturating_sub(2);
+                let ym1 = y.saturating_sub(1);
+                let yp1 = (y + 1).min(last_y);
+                let yp2 = (y + 2).min(last_y);
+
+                let rm2 = &src[ym2 * width..][..width];
+                let rm1 = &src[ym1 * width..][..width];
+                let rc = &src[y * width..][..width];
+                let rp1 = &src[yp1 * width..][..width];
+                let rp2 = &src[yp2 * width..][..width];
+
+                let out = &mut dst[y * dst_stride..][..width];
+
+                let mut x = 0;
+                while x + 8 <= width {
+                    let vm2 = f32x8::load(t, (&rm2[x..x + 8]).try_into().unwrap());
+                    let vm1 = f32x8::load(t, (&rm1[x..x + 8]).try_into().unwrap());
+                    let vc = f32x8::load(t, (&rc[x..x + 8]).try_into().unwrap());
+                    let vp1 = f32x8::load(t, (&rp1[x..x + 8]).try_into().unwrap());
+                    let vp2 = f32x8::load(t, (&rp2[x..x + 8]).try_into().unwrap());
+                    let outer = vm2 + vp2;
+                    let inner = vm1 + vp1;
+                    let result = outer.mul_add(vk_outer, inner.mul_add(vk_inner, vc * vk_mid));
+                    result.store((&mut out[x..x + 8]).try_into().unwrap());
+                    x += 8;
+                }
+
+                // Scalar tail
+                while x < width {
+                    out[x] = (rm2[x] + rp2[x]) * K5_OUTER
+                        + (rm1[x] + rp1[x]) * K5_INNER
+                        + rc[x] * K5_MID;
+                    x += 1;
+                }
             }
         }
 
-        // assumes init after writing all the data
-        unsafe {
-            ImgRefMut::new_stride(std::slice::from_raw_parts_mut(dst.as_mut_ptr().cast(), dst.len()), width, height, dst_stride)
+        #[allow(clippy::too_many_arguments)]
+        #[rite]
+        fn blur_h5_mul_simd(
+            t: X64V3Token,
+            src1: &[f32],
+            src2: &[f32],
+            dst: &mut [f32],
+            width: usize,
+            height: usize,
+            stride1: usize,
+            stride2: usize,
+        ) {
+            let vk_outer = f32x8::splat(t, K5_OUTER);
+            let vk_inner = f32x8::splat(t, K5_INNER);
+            let vk_mid = f32x8::splat(t, K5_MID);
+            let last = width - 1;
+
+            for y in 0..height {
+                let r1 = &src1[y * stride1..][..width];
+                let r2 = &src2[y * stride2..][..width];
+                let out = &mut dst[y * width..][..width];
+
+                // Left edge: scalar (2 pixels)
+                for i in 0..2.min(width) {
+                    let ii = i as isize;
+                    let cl = |j: isize| j.max(0).min(last as isize) as usize;
+                    let p = |j: isize| r1[cl(j)] * r2[cl(j)];
+                    out[i] = (p(ii - 2) + p(ii + 2)) * K5_OUTER
+                        + (p(ii - 1) + p(ii + 1)) * K5_INNER
+                        + (r1[i] * r2[i]) * K5_MID;
+                }
+
+                // SIMD loop: needs [i-2..i+10] valid from both sources
+                let mut i = 2;
+                while i + 10 <= width {
+                    let l1m2 = f32x8::load(t, (&r1[i - 2..i + 6]).try_into().unwrap());
+                    let l2m2 = f32x8::load(t, (&r2[i - 2..i + 6]).try_into().unwrap());
+                    let l1m1 = f32x8::load(t, (&r1[i - 1..i + 7]).try_into().unwrap());
+                    let l2m1 = f32x8::load(t, (&r2[i - 1..i + 7]).try_into().unwrap());
+                    let l1c = f32x8::load(t, (&r1[i..i + 8]).try_into().unwrap());
+                    let l2c = f32x8::load(t, (&r2[i..i + 8]).try_into().unwrap());
+                    let l1p1 = f32x8::load(t, (&r1[i + 1..i + 9]).try_into().unwrap());
+                    let l2p1 = f32x8::load(t, (&r2[i + 1..i + 9]).try_into().unwrap());
+                    let l1p2 = f32x8::load(t, (&r1[i + 2..i + 10]).try_into().unwrap());
+                    let l2p2 = f32x8::load(t, (&r2[i + 2..i + 10]).try_into().unwrap());
+
+                    let p_m2 = l1m2 * l2m2;
+                    let p_m1 = l1m1 * l2m1;
+                    let p_c = l1c * l2c;
+                    let p_p1 = l1p1 * l2p1;
+                    let p_p2 = l1p2 * l2p2;
+
+                    let outer = p_m2 + p_p2;
+                    let inner = p_m1 + p_p1;
+                    let result = outer.mul_add(vk_outer, inner.mul_add(vk_inner, p_c * vk_mid));
+                    result.store((&mut out[i..i + 8]).try_into().unwrap());
+                    i += 8;
+                }
+
+                // Scalar tail + right edges
+                while i < width {
+                    let ii = i as isize;
+                    let cl = |j: isize| j.max(0).min(last as isize) as usize;
+                    let p = |j: isize| r1[cl(j)] * r2[cl(j)];
+                    out[i] = (p(ii - 2) + p(ii + 2)) * K5_OUTER
+                        + (p(ii - 1) + p(ii + 1)) * K5_INNER
+                        + (r1[i] * r2[i]) * K5_MID;
+                    i += 1;
+                }
+            }
         }
     }
 
-    pub fn blur_in_place(srcdst: ImgRefMut<'_, f32>, tmp: &mut [MaybeUninit<f32>]) {
-        let tmp_dst = ImgRefMut::new(tmp, srcdst.width(), srcdst.height());
-        let tmp_src = do_blur(srcdst.as_ref(), tmp_dst);
-        do_blur(tmp_src.as_ref(), as_maybe_uninit(srcdst));
+    pub fn blur(src: ImgRef<'_, f32>, tmp: &mut [f32]) -> ImgVec<f32> {
+        let width = src.width();
+        let height = src.height();
+        assert!(width > 0 && width < 1 << 24);
+        assert!(height > 0 && height < 1 << 24);
+        debug_assert!(src.pixels().all(|p| p.is_finite()));
+
+        let pixels = width * height;
+        assert!(tmp.len() >= pixels);
+        let tmp = &mut tmp[..pixels];
+        let mut dst = zeroed_f32_vec(pixels);
+
+        #[cfg(all(feature = "fma", target_arch = "x86_64"))]
+        {
+            use archmage::SimdToken as _;
+            if let Some(token) = archmage::X64V3Token::summon() {
+                simd::blur_avx2(token, src.buf(), tmp, &mut dst, width, height, src.stride());
+                return ImgVec::new(dst, width, height);
+            }
+        }
+
+        blur_h5(src.buf(), tmp, width, height, src.stride());
+        blur_v5(tmp, &mut dst, width, height, width);
+
+        ImgVec::new(dst, width, height)
+    }
+
+    pub fn blur_in_place(mut srcdst: ImgRefMut<'_, f32>, tmp: &mut [f32]) {
+        let width = srcdst.width();
+        let height = srcdst.height();
+        let stride = srcdst.stride();
+        let pixels = width * height;
+
+        assert!(tmp.len() >= pixels);
+        let tmp = &mut tmp[..pixels];
+        let buf = srcdst.buf_mut();
+
+        #[cfg(all(feature = "fma", target_arch = "x86_64"))]
+        {
+            use archmage::SimdToken as _;
+            if let Some(token) = archmage::X64V3Token::summon() {
+                simd::blur_in_place_avx2(token, buf, tmp, width, height, stride);
+                return;
+            }
+        }
+
+        blur_h5(buf, tmp, width, height, stride);
+        blur_v5(tmp, buf, width, height, stride);
     }
 
-    fn as_maybe_uninit(img: ImgRefMut<'_, f32>) -> ImgRefMut<'_, MaybeUninit<f32>> {
-        img.map_buf(|dst| unsafe {
-            std::slice::from_raw_parts_mut(dst.as_mut_ptr().cast::<MaybeUninit<f32>>(), dst.len())
-        })
+    /// Blur the element-wise product of two images: blur(src1 * src2).
+    /// Fuses the multiply into the horizontal pass, then does a single vertical pass.
+    pub fn blur_mul(src1: ImgRef<'_, f32>, src2: ImgRef<'_, f32>, tmp: &mut [f32]) -> Vec<f32> {
+        let width = src1.width();
+        let height = src1.height();
+        debug_assert_eq!(width, src2.width());
+        debug_assert_eq!(height, src2.height());
+        assert!(width > 0 && width < 1 << 24);
+        assert!(height > 0 && height < 1 << 24);
+
+        let pixels = width * height;
+        assert!(tmp.len() >= pixels);
+        let tmp = &mut tmp[..pixels];
+        let mut dst = zeroed_f32_vec(pixels);
+
+        #[cfg(all(feature = "fma", target_arch = "x86_64"))]
+        {
+            use archmage::SimdToken as _;
+            if let Some(token) = archmage::X64V3Token::summon() {
+                simd::blur_mul_avx2(
+                    token,
+                    src1.buf(),
+                    src2.buf(),
+                    tmp,
+                    &mut dst,
+                    width,
+                    height,
+                    src1.stride(),
+                    src2.stride(),
+                );
+                return dst;
+            }
+        }
+
+        blur_h5_mul(
+            src1.buf(),
+            src2.buf(),
+            tmp,
+            width,
+            height,
+            src1.stride(),
+            src2.stride(),
+        );
+        blur_v5(tmp, &mut dst, width, height, width);
+
+        dst
     }
 }
 
@@ -196,9 +595,9 @@ fn blur_zero() {
     let src = vec![0.25];
     let mut src2 = src.clone();
 
-    let mut tmp = vec![-55.; 1]; tmp.clear();
-    let dst = blur(ImgRef::new(&src[..], 1,1), tmp.spare_capacity_mut());
-    blur_in_place(ImgRefMut::new(&mut src2[..], 1, 1), tmp.spare_capacity_mut());
+    let mut tmp = vec![0.; 1];
+    let dst = blur(ImgRef::new(&src[..], 1, 1), &mut tmp);
+    blur_in_place(ImgRefMut::new(&mut src2[..], 1, 1), &mut tmp);
 
     assert_eq!(&src2, dst.buf());
     assert!((0.25 - dst.buf()[0]).abs() < 0.00001);
@@ -206,40 +605,43 @@ fn blur_zero() {
 
 #[test]
 fn blur_one() {
-    blur_one_compare(Img::new(vec![
-        0.,0.,0.,0.,0.,
-        0.,0.,0.,0.,0.,
-        0.,0.,1.,0.,0.,
-        0.,0.,0.,0.,0.,
-        0.,0.,0.,0.,0.,
-    ], 5, 5));
+    blur_one_compare(Img::new(
+        vec![
+            0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
+            0., 0., 0.,
+        ],
+        5,
+        5,
+    ));
 }
 
 #[test]
 fn blur_one_stride() {
-    let nan = 1./0.;
-    blur_one_compare(Img::new_stride(vec![
-        0.,0.,0.,0.,0., nan, -11.,
-        0.,0.,0.,0.,0., 333., nan,
-        0.,0.,1.,0.,0., nan, -11.,
-        0.,0.,0.,0.,0., 333., nan,
-        0.,0.,0.,0.,0., nan,
-    ], 5, 5, 7));
+    let nan = 1. / 0.;
+    blur_one_compare(Img::new_stride(
+        vec![
+            0., 0., 0., 0., 0., nan, -11., 0., 0., 0., 0., 0., 333., nan, 0., 0., 1., 0., 0., nan,
+            -11., 0., 0., 0., 0., 0., 333., nan, 0., 0., 0., 0., 0., nan,
+        ],
+        5,
+        5,
+        7,
+    ));
 }
 
 #[cfg(test)]
 fn blur_one_compare(src: ImgVec<f32>) {
     let mut src2 = src.clone();
 
-    let mut tmp = vec![-55.; 5*5]; tmp.clear();
-    let dst = blur(src.as_ref(), tmp.spare_capacity_mut());
-    blur_in_place(src2.as_mut(), tmp.spare_capacity_mut());
+    let mut tmp = vec![0.; 5 * 5];
+    let dst = blur(src.as_ref(), &mut tmp);
+    blur_in_place(src2.as_mut(), &mut tmp);
 
     assert_eq!(&src2.pixels().collect::<Vec<_>>(), dst.buf());
 
-    assert!((1./110. - dst.buf()[0]).abs() < 0.0001, "{dst:?}");
-    assert!((1./110. - dst.buf()[5*5-1]).abs() < 0.0001, "{dst:?}");
-    assert!((0.11354011 - dst.buf()[2*5+2]).abs() < 0.0001);
+    assert!((1. / 110. - dst.buf()[0]).abs() < 0.0001, "{dst:?}");
+    assert!((1. / 110. - dst.buf()[5 * 5 - 1]).abs() < 0.0001, "{dst:?}");
+    assert!((0.11354011 - dst.buf()[2 * 5 + 2]).abs() < 0.0001);
 }
 
 #[test]
@@ -247,9 +649,9 @@ fn blur_1x1() {
     let src = vec![1.];
     let mut src2 = src.clone();
 
-    let mut tmp = vec![-999.; 1]; tmp.clear();
-    let dst = blur(ImgRef::new(&src[..], 1,1), tmp.spare_capacity_mut());
-    blur_in_place(ImgRefMut::new(&mut src2[..], 1,1), tmp.spare_capacity_mut());
+    let mut tmp = vec![0.; 1];
+    let dst = blur(ImgRef::new(&src[..], 1, 1), &mut tmp);
+    blur_in_place(ImgRefMut::new(&mut src2[..], 1, 1), &mut tmp);
 
     assert!((dst.buf()[0] - 1.).abs() < 0.00001);
     assert!((src2[0] - 1.).abs() < 0.00001);
@@ -258,38 +660,61 @@ fn blur_1x1() {
 #[test]
 fn blur_two() {
     let src = vec![
-    0.,1.,1.,1.,
-    1.,1.,1.,1.,
-    1.,1.,1.,1.,
-    1.,1.,1.,1.,
+        0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
     ];
     let mut src2 = src.clone();
 
-    let mut tmp = vec![-55.; 4*4]; tmp.clear();
-    let dst = blur(ImgRef::new(&src[..], 4,4), tmp.spare_capacity_mut());
-    blur_in_place(ImgRefMut::new(&mut src2[..], 4,4), tmp.spare_capacity_mut());
+    let mut tmp = vec![0.; 4 * 4];
+    let dst = blur(ImgRef::new(&src[..], 4, 4), &mut tmp);
+    blur_in_place(ImgRefMut::new(&mut src2[..], 4, 4), &mut tmp);
 
     assert_eq!(&src2, dst.buf());
 
-    let z00 = 0.*KERNEL[0] + 0.*KERNEL[1] + 1.*KERNEL[2] +
-              0.*KERNEL[3] + 0.*KERNEL[4] + 1.*KERNEL[5] +
-              1.*KERNEL[6] + 1.*KERNEL[7] + 1.*KERNEL[8];
-    let z01 =                                   0.*KERNEL[0] + 1.*KERNEL[1] + 1.*KERNEL[2] +
-                                                0.*KERNEL[3] + 1.*KERNEL[4] + 1.*KERNEL[5] +
-                                                1.*KERNEL[6] + 1.*KERNEL[7] + 1.*KERNEL[8];
-
-    let z10 = 0.*KERNEL[0] + 0.*KERNEL[1] + 1.*KERNEL[2] +
-              1.*KERNEL[3] + 1.*KERNEL[4] + 1.*KERNEL[5] +
-              1.*KERNEL[6] + 1.*KERNEL[7] + 1.*KERNEL[8];
-    let z11 =                                   0.*KERNEL[0] + 1.*KERNEL[1] + 1.*KERNEL[2] +
-                                                1.*KERNEL[3] + 1.*KERNEL[4] + 1.*KERNEL[5] +
-                                                1.*KERNEL[6] + 1.*KERNEL[7] + 1.*KERNEL[8];
-    let exp = z00*KERNEL[0] + z00*KERNEL[1] + z01*KERNEL[2] +
-              z00*KERNEL[3] + z00*KERNEL[4] + z01*KERNEL[5] +
-              z10*KERNEL[6] + z10*KERNEL[7] + z11*KERNEL[8];
-
+    // All-1 corners should remain 1.0 (kernel is normalized)
     assert!((1. - dst.buf()[3]).abs() < 0.0001, "{}", dst.buf()[3]);
-    assert!((1. - dst.buf()[3 * 4]).abs() < 0.0001, "{}", dst.buf()[3 * 4]);
-    assert!((1. - dst.buf()[4 * 4 - 1]).abs() < 0.0001, "{}", dst.buf()[4 * 4 - 1]);
-    assert!((f64::from(exp) - f64::from(dst.buf()[0])).abs() < 0.0000001);
+    assert!(
+        (1. - dst.buf()[3 * 4]).abs() < 0.0001,
+        "{}",
+        dst.buf()[3 * 4]
+    );
+    assert!(
+        (1. - dst.buf()[4 * 4 - 1]).abs() < 0.0001,
+        "{}",
+        dst.buf()[4 * 4 - 1]
+    );
+
+    // Reference 5-tap computation for corner [0][0]
+    let k_side: f32 = 0.308_758_86;
+    let k_center: f32 = 0.382_482_8;
+    let k5o = k_side * k_side;
+    let k5i = 2.0 * k_side * k_center;
+    let k5m = 2.0 * k_side * k_side + k_center * k_center;
+    let cl = |i: isize, max: usize| i.max(0).min(max as isize) as usize;
+
+    // H5 pass on 4×4
+    let mut h = [0.0f32; 16];
+    for y in 0..4 {
+        for x in 0..4usize {
+            let xi = x as isize;
+            h[y * 4 + x] = (src[y * 4 + cl(xi - 2, 3)] + src[y * 4 + cl(xi + 2, 3)]) * k5o
+                + (src[y * 4 + cl(xi - 1, 3)] + src[y * 4 + cl(xi + 1, 3)]) * k5i
+                + src[y * 4 + x] * k5m;
+        }
+    }
+    // V5 pass
+    let mut exp_all = [0.0f32; 16];
+    for y in 0..4usize {
+        for x in 0..4 {
+            let yi = y as isize;
+            exp_all[y * 4 + x] = (h[cl(yi - 2, 3) * 4 + x] + h[cl(yi + 2, 3) * 4 + x]) * k5o
+                + (h[cl(yi - 1, 3) * 4 + x] + h[cl(yi + 1, 3) * 4 + x]) * k5i
+                + h[y * 4 + x] * k5m;
+        }
+    }
+    let exp = exp_all[0];
+    assert!(
+        (f64::from(exp) - f64::from(dst.buf()[0])).abs() < 0.0001,
+        "expected {exp}, got {}",
+        dst.buf()[0]
+    );
 }
diff --git a/dssim-core/src/c_api.rs b/dssim-core/src/c_api.rs
index 64c0904..38c31de 100644
--- a/dssim-core/src/c_api.rs
+++ b/dssim-core/src/c_api.rs
@@ -4,19 +4,21 @@ use rgb::{RGB8, RGBA8};
 pub type DssimImage = crate::DssimImage<f32>;
 
 /// Create new context for comparisons
-#[no_mangle]
+#[unsafe(no_mangle)]
 pub extern "C" fn dssim_new() -> *mut Dssim {
     let d = Box::new(crate::new());
     Box::into_raw(d)
 }
 
 /// Free the context
-#[no_mangle]
+#[unsafe(no_mangle)]
 pub unsafe extern "C" fn dssim_free(d: *mut Dssim) {
     if d.is_null() {
         return;
     }
-    let _ = Box::from_raw(d);
+    unsafe {
+        let _ = Box::from_raw(d);
+    }
 }
 
 /// Take sRGB RGBA pixels (non-premultiplied, alpha last) and preprocess them into image format that can be compared.
@@ -24,11 +26,16 @@ pub unsafe extern "C" fn dssim_free(d: *mut Dssim) {
 /// Pixels are copied. Returns NULL on error.
 ///
 /// Call `dssim_free_image` to free memory when the image is no longer needed.
-#[no_mangle]
-pub unsafe extern "C" fn dssim_create_image_rgba(dssim: &mut Dssim, pixels: *const u8, width: u32, height: u32) -> *mut DssimImage {
+#[unsafe(no_mangle)]
+pub unsafe extern "C" fn dssim_create_image_rgba(
+    dssim: &mut Dssim,
+    pixels: *const u8,
+    width: u32,
+    height: u32,
+) -> *mut DssimImage {
     let width = width as usize;
     let height = height as usize;
-    let pixels = std::slice::from_raw_parts(pixels.cast::<RGBA8>(), width * height);
+    let pixels = unsafe { std::slice::from_raw_parts(pixels.cast::<RGBA8>(), width * height) };
     match dssim.create_image_rgba(pixels, width, height) {
         Some(img) => Box::into_raw(Box::new(img)),
         None => std::ptr::null_mut(),
@@ -40,11 +47,16 @@ pub unsafe extern "C" fn dssim_create_image_rgba(dssim: &mut Dssim, pixels: *con
 /// Pixels are copied. Returns NULL on error.
 ///
 /// Call `dssim_free_image` to free memory when the image is no longer needed.
-#[no_mangle]
-pub unsafe extern "C" fn dssim_create_image_rgb(dssim: &mut Dssim, pixels: *const u8, width: u32, height: u32) -> *mut DssimImage {
+#[unsafe(no_mangle)]
+pub unsafe extern "C" fn dssim_create_image_rgb(
+    dssim: &mut Dssim,
+    pixels: *const u8,
+    width: u32,
+    height: u32,
+) -> *mut DssimImage {
     let width = width as usize;
     let height = height as usize;
-    let pixels = std::slice::from_raw_parts(pixels.cast::<RGB8>(), width * height);
+    let pixels = unsafe { std::slice::from_raw_parts(pixels.cast::<RGB8>(), width * height) };
     match dssim.create_image_rgb(pixels, width, height) {
         Some(img) => Box::into_raw(Box::new(img)),
         None => std::ptr::null_mut(),
@@ -52,12 +64,14 @@ pub unsafe extern "C" fn dssim_create_image_rgb(dssim: &mut Dssim, pixels: *cons
 }
 
 /// Free image data
-#[no_mangle]
+#[unsafe(no_mangle)]
 pub unsafe extern "C" fn dssim_free_image(img: *mut DssimImage) {
     if img.is_null() {
         return;
     }
-    let _ = Box::from_raw(img);
+    unsafe {
+        let _ = Box::from_raw(img);
+    }
 }
 
 /// Compare these two images.
@@ -65,10 +79,14 @@ pub unsafe extern "C" fn dssim_free_image(img: *mut DssimImage) {
 /// `img1` can be reused for multiple comparisons.
 ///
 /// Don't forget to free the images and the DSSIM context when done.
-#[no_mangle]
-pub unsafe extern "C" fn dssim_compare(dssim: &mut Dssim, img1: *const DssimImage, img2: *const DssimImage) -> f64 {
-    let img1 = img1.as_ref().unwrap();
-    let img2 = img2.as_ref().unwrap();
+#[unsafe(no_mangle)]
+pub unsafe extern "C" fn dssim_compare(
+    dssim: &mut Dssim,
+    img1: *const DssimImage,
+    img2: *const DssimImage,
+) -> f64 {
+    let img1 = unsafe { img1.as_ref().unwrap() };
+    let img2 = unsafe { img2.as_ref().unwrap() };
     let (val, _) = dssim.compare(img1, img2);
     val.into()
 }
diff --git a/dssim-core/src/dssim.rs b/dssim-core/src/dssim.rs
index 5905efa..4ef604e 100644
--- a/dssim-core/src/dssim.rs
+++ b/dssim-core/src/dssim.rs
@@ -21,23 +21,21 @@
 
 use crate::blur;
 use crate::image::*;
+#[cfg(not(feature = "threads"))]
+use crate::lieon as rayon;
 use crate::linear::ToRGBAPLU;
 pub use crate::tolab::ToLABBitmap;
 pub use crate::val::Dssim as Val;
 use imgref::*;
-use itertools::multizip;
-#[cfg(not(feature = "threads"))]
-use crate::lieon as rayon;
 use rayon::prelude::*;
 use rgb::{RGB, RGBA};
 use std::borrow::Borrow;
-use std::mem::MaybeUninit;
 use std::ops;
 use std::ops::Deref;
 use std::sync::Arc;
 
 trait Channable<T, I> {
-    fn img1_img2_blur(&self, modified: &Self, tmp: &mut [MaybeUninit<I>]) -> Vec<T>;
+    fn img1_img2_blur(&self, modified: &Self, tmp: &mut [I]) -> Vec<T>;
 }
 
 #[derive(Clone)]
@@ -102,7 +100,11 @@ pub fn new() -> Dssim {
 
 impl DssimChan<f32> {
     pub fn new(bitmap: ImgVec<f32>, is_chroma: bool) -> Self {
-        debug_assert!(bitmap.pixels().all(|i| i.is_finite() && i >= 0.0 && i <= 1.0));
+        debug_assert!(
+            bitmap
+                .pixels()
+                .all(|i| i.is_finite() && i >= 0.0 && i <= 1.0)
+        );
 
         Self {
             width: bitmap.width(),
@@ -116,7 +118,7 @@ impl DssimChan<f32> {
 }
 
 impl DssimChan<f32> {
-    fn preprocess(&mut self, tmp: &mut [MaybeUninit<f32>]) {
+    fn preprocess(&mut self, tmp: &mut [f32]) {
         let width = self.width;
         let height = self.height;
         assert!(width > 0);
@@ -132,42 +134,15 @@ impl DssimChan<f32> {
         let (mu, _, _) = blur::blur(img.as_ref(), tmp).into_contiguous_buf();
         self.mu = mu;
 
-        self.img_sq_blur = img.pixels().map(|i| {
-            debug_assert!(i <= 1.0 && i >= 0.0);
-            i * i
-        }).collect();
-        blur::blur_in_place(ImgRefMut::new(&mut self.img_sq_blur[..], width, height), tmp);
-    }
-}
-
-impl Channable<LAB, f32> for [DssimChan<f32>] {
-    fn img1_img2_blur(&self, modified: &Self, tmp32: &mut [MaybeUninit<f32>]) -> Vec<LAB> {
-
-        let blurred:Vec<_> = self.iter().zip(modified.iter()).map(|(o,m)|{
-            o.img1_img2_blur(m, tmp32)
-        }).collect();
-
-        multizip((blurred[0].iter().copied(), blurred[1].iter().copied(), blurred[2].iter().copied())).map(|(l,a,b)| {
-            LAB {l,a,b}
-        }).collect()
+        self.img_sq_blur = blur::blur_mul(img.as_ref(), img.as_ref(), tmp);
     }
 }
 
 impl Channable<f32, f32> for DssimChan<f32> {
-    fn img1_img2_blur(&self, modified: &Self, tmp32: &mut [MaybeUninit<f32>]) -> Vec<f32> {
+    fn img1_img2_blur(&self, modified: &Self, tmp32: &mut [f32]) -> Vec<f32> {
+        let src = self.img.as_ref().unwrap();
         let modified_img = modified.img.as_ref().unwrap();
-
-        let mut out = self.img.as_ref().unwrap().pixels().zip(modified_img.pixels()).map(|(px1, px2)| {
-            debug_assert!(px1 <= 1.0 && px1 >= 0.0);
-            debug_assert!(px2 <= 1.0 && px2 >= 0.0);
-            px1 * px2
-        }).collect::<Vec<_>>();
-
-        let width = modified_img.width();
-        let height = modified_img.height();
-        debug_assert_eq!(out.len(), width * height);
-        blur::blur_in_place(ImgRefMut::new(&mut out, width, height), tmp32);
-        out
+        blur::blur_mul(src.as_ref(), modified_img.as_ref(), tmp32)
     }
 }
 
@@ -195,7 +170,12 @@ impl Dssim {
     ///
     /// If you have a slice of `u8`, then see `rgb` crate's `as_rgba()`.
     #[must_use]
-    pub fn create_image_rgba(&self, bitmap: &[RGBA<u8>], width: usize, height: usize) -> Option<DssimImage<f32>> {
+    pub fn create_image_rgba(
+        &self,
+        bitmap: &[RGBA<u8>],
+        width: usize,
+        height: usize,
+    ) -> Option<DssimImage<f32>> {
         if width * height < bitmap.len() {
             return None;
         }
@@ -207,7 +187,12 @@ impl Dssim {
     ///
     /// If you have a slice of `u8`, then see `rgb` crate's `as_rgb()`.
     #[must_use]
-    pub fn create_image_rgb(&self, bitmap: &[RGB<u8>], width: usize, height: usize) -> Option<DssimImage<f32>> {
+    pub fn create_image_rgb(
+        &self,
+        bitmap: &[RGB<u8>],
+        width: usize,
+        height: usize,
+    ) -> Option<DssimImage<f32>> {
         if width * height < bitmap.len() {
             return None;
         }
@@ -239,42 +224,57 @@ impl Dssim {
     }
 
     #[inline(never)]
-    fn make_scales_recursive<InBitmap, OutBitmap>(scales_left: usize, image: MaybeArc<'_, InBitmap>, scales: &mut Vec<DssimChanScale<f32>>)
-    where
+    fn make_scales_recursive<InBitmap, OutBitmap>(
+        scales_left: usize,
+        image: MaybeArc<'_, InBitmap>,
+        scales: &mut Vec<DssimChanScale<f32>>,
+    ) where
         InBitmap: ToLABBitmap + Send + Sync + Downsample<Output = OutBitmap>,
         OutBitmap: ToLABBitmap + Send + Sync + Downsample<Output = OutBitmap>,
     {
         // Run to_lab and next downsampling in parallel
-        let (chan, _) = rayon::join({
-            let image = image.clone();
-            move || {
-                let lab = image.to_lab();
-                drop(image); // Free larger RGB image ASAP
-                DssimChanScale {
-                    chan: lab.into_par_iter().with_max_len(1).enumerate().map(|(n,l)| {
-                        let w = l.width();
-                        let h = l.height();
-                        let mut ch = DssimChan::new(l, n > 0);
-
-                        let pixels = w * h;
-                        let mut tmp = Vec::with_capacity(pixels);
-                        ch.preprocess(&mut tmp.spare_capacity_mut()[..pixels]);
-                        ch
-                    }).collect(),
+        let (chan, _) = rayon::join(
+            {
+                let image = image.clone();
+                move || {
+                    let lab = image.to_lab();
+                    drop(image); // Free larger RGB image ASAP
+                    DssimChanScale {
+                        chan: lab
+                            .into_par_iter()
+                            .with_max_len(1)
+                            .enumerate()
+                            .map(|(n, l)| {
+                                let w = l.width();
+                                let h = l.height();
+                                let mut ch = DssimChan::new(l, n > 0);
+
+                                let pixels = w * h;
+                                let mut tmp = blur::zeroed_f32_vec(pixels);
+                                ch.preprocess(&mut tmp);
+                                ch
+                            })
+                            .collect(),
+                    }
                 }
-            }
-        }, {
-            let scales = &mut *scales;
-            move || {
-                if scales_left > 0 {
-                    let down = image.downsample();
-                    drop(image);
-                    if let Some(downsampled) = down {
-                        Self::make_scales_recursive(scales_left - 1, MaybeArc::Owned(Arc::new(downsampled)), scales);
+            },
+            {
+                let scales = &mut *scales;
+                move || {
+                    if scales_left > 0 {
+                        let down = image.downsample();
+                        drop(image);
+                        if let Some(downsampled) = down {
+                            Self::make_scales_recursive(
+                                scales_left - 1,
+                                MaybeArc::Owned(Arc::new(downsampled)),
+                                scales,
+                            );
+                        }
                     }
                 }
-            }
-        });
+            },
+        );
         scales.push(chan);
     }
 
@@ -283,53 +283,85 @@ impl Dssim {
     /// The `SsimMap`s are returned only if you've enabled them first.
     ///
     /// `Val` is a fancy wrapper for `f64`
-    pub fn compare<M: Borrow<DssimImage<f32>>>(&self, original_image: &DssimImage<f32>, modified_image: M) -> (Val, Vec<SsimMap>) {
+    pub fn compare<M: Borrow<DssimImage<f32>>>(
+        &self,
+        original_image: &DssimImage<f32>,
+        modified_image: M,
+    ) -> (Val, Vec<SsimMap>) {
         self.compare_inner(original_image, modified_image.borrow())
     }
 
     #[inline(never)]
-    fn compare_inner(&self, original_image: &DssimImage<f32>, modified_image: &DssimImage<f32>) -> (Val, Vec<SsimMap>) {
+    fn compare_inner(
+        &self,
+        original_image: &DssimImage<f32>,
+        modified_image: &DssimImage<f32>,
+    ) -> (Val, Vec<SsimMap>) {
         let scaled_images_iter = modified_image.scale.iter().zip(original_image.scale.iter());
-        let combined_iter = self.scale_weights.iter().copied().zip(scaled_images_iter).enumerate();
-
-        let res: Vec<_> = combined_iter.par_bridge().map(|(n, (weight, (modified_image_scale, original_image_scale)))| {
-            let scale_width = original_image_scale.chan[0].width;
-            let scale_height = original_image_scale.chan[0].height;
-            let mut tmp = Vec::with_capacity(scale_width * scale_height);
-            let tmp = &mut tmp.spare_capacity_mut()[0 .. scale_width*scale_height];
-
-            let ssim_map = match original_image_scale.chan.len() {
-                3 => {
-                    let (original_lab, (img1_img2_blur, modified_lab)) = rayon::join(
-                    || Self::lab_chan(original_image_scale),
-                    || rayon::join(
-                        || original_image_scale.chan.img1_img2_blur(&modified_image_scale.chan, tmp),
-                        || Self::lab_chan(modified_image_scale)));
-
-                    Self::compare_scale(&original_lab, &modified_lab, &img1_img2_blur)
+        let combined_iter = self
+            .scale_weights
+            .iter()
+            .copied()
+            .zip(scaled_images_iter)
+            .enumerate();
+
+        let res: Vec<_> = combined_iter
+            .par_bridge()
+            .map(
+                |(n, (weight, (modified_image_scale, original_image_scale)))| {
+                    let scale_width = original_image_scale.chan[0].width;
+                    let scale_height = original_image_scale.chan[0].height;
+                    let ssim_map = match original_image_scale.chan.len() {
+                        3 => {
+                            let pixels = scale_width * scale_height;
+                            let img1_img2_blur: Vec<Vec<f32>> = (0..3usize)
+                                .into_par_iter()
+                                .map(|c| {
+                                    let mut tmp = blur::zeroed_f32_vec(pixels);
+                                    original_image_scale.chan[c]
+                                        .img1_img2_blur(&modified_image_scale.chan[c], &mut tmp)
+                                })
+                                .collect();
+                            Self::compare_scale_3ch(
+                                original_image_scale,
+                                modified_image_scale,
+                                &img1_img2_blur,
+                            )
+                        }
+                        1 => {
+                            let mut tmp = blur::zeroed_f32_vec(scale_width * scale_height);
+                            let img1_img2_blur = original_image_scale.chan[0]
+                                .img1_img2_blur(&modified_image_scale.chan[0], &mut tmp);
+                            Self::compare_scale(
+                                &original_image_scale.chan[0],
+                                &modified_image_scale.chan[0],
+                                &img1_img2_blur,
+                            )
+                        }
+                        _ => panic!(),
+                    };
+
+                    let sum = ssim_map.pixels().fold(0., |sum, i| sum + f64::from(i));
+                    let len = (ssim_map.width() * ssim_map.height()) as f64;
+                    let avg = (sum / len).max(0.0).powf((0.5_f64).powf(n as f64));
+                    let score = 1.0
+                        - (ssim_map
+                            .pixels()
+                            .fold(0., |sum, i| sum + (avg - f64::from(i)).abs())
+                            / len);
+
+                    let map = if self.save_maps_scales as usize > n {
+                        Some(SsimMap {
+                            map: ssim_map,
+                            ssim: score,
+                        })
+                    } else {
+                        None
+                    };
+                    (score, weight, map)
                 },
-                1 => {
-                    let img1_img2_blur = original_image_scale.chan[0].img1_img2_blur(&modified_image_scale.chan[0], tmp);
-                    Self::compare_scale(&original_image_scale.chan[0], &modified_image_scale.chan[0], &img1_img2_blur)
-                },
-                _ => panic!(),
-            };
-
-            let sum = ssim_map.pixels().fold(0., |sum, i| sum + f64::from(i));
-            let len = (ssim_map.width()*ssim_map.height()) as f64;
-            let avg = (sum / len).max(0.0).powf((0.5_f64).powf(n as f64));
-            let score = 1.0 - (ssim_map.pixels().fold(0., |sum, i| sum + (avg - f64::from(i)).abs()) / len);
-
-            let map = if self.save_maps_scales as usize > n {
-                Some(SsimMap {
-                    map: ssim_map,
-                    ssim: score,
-                })
-            } else {
-                None
-            };
-            (score, weight, map)
-        }).collect();
+            )
+            .collect();
 
         let mut ssim_sum = 0.0;
         let mut weight_sum = 0.0;
@@ -345,31 +377,138 @@ impl Dssim {
         (to_dssim(ssim_sum / weight_sum).into(), ssim_maps)
     }
 
-    fn lab_chan(scale: &DssimChanScale<f32>) -> DssimChan<LAB> {
-        let l = &scale.chan[0];
-        let a = &scale.chan[1];
-        let b = &scale.chan[2];
-        assert_eq!(l.width, a.width);
-        assert_eq!(b.width, a.width);
-        DssimChan {
-            img_sq_blur: multizip((l.img_sq_blur.iter().copied(), a.img_sq_blur.iter().copied(), b.img_sq_blur.iter().copied()))
-                .map(|(l,a,b)|LAB {l,a,b}).collect(),
-            img: if let (Some(l),Some(a),Some(b)) = (&l.img, &a.img, &b.img) {
-                let buf = multizip((l.pixels(), a.pixels(), b.pixels())).map(|(l,a,b)|{
-                    debug_assert!(l.is_finite() && a.is_finite() && b.is_finite());
-                    LAB {l,a,b}
-                }).collect();
-                Some(ImgVec::new(buf, l.width(), l.height()))
-            } else {None},
-            mu: multizip((l.mu.iter().copied(), a.mu.iter().copied(), b.mu.iter().copied())).map(|(l,a,b)|LAB {l,a,b}).collect(),
-            is_chroma: false,
-            width: l.width,
-            height: l.height,
+    /// Specialized 3-channel comparison with manually unrolled channel loop.
+    /// Eliminates zip iterator overhead and branch mispredictions from
+    /// iterating over exactly 3 channels.
+    #[inline(never)]
+    fn compare_scale_3ch(
+        original: &DssimChanScale<f32>,
+        modified: &DssimChanScale<f32>,
+        img1_img2_blur: &[Vec<f32>],
+    ) -> ImgVec<f32> {
+        let width = original.chan[0].width;
+        let height = original.chan[0].height;
+        let pixels = width * height;
+
+        // Extract all slice references up front to avoid repeated Vec/struct indexing
+        let (o0, o1, o2) = (&original.chan[0], &original.chan[1], &original.chan[2]);
+        let (m0, m1, m2) = (&modified.chan[0], &modified.chan[1], &modified.chan[2]);
+
+        let o0_mu = &o0.mu[..pixels];
+        let o1_mu = &o1.mu[..pixels];
+        let o2_mu = &o2.mu[..pixels];
+        let m0_mu = &m0.mu[..pixels];
+        let m1_mu = &m1.mu[..pixels];
+        let m2_mu = &m2.mu[..pixels];
+        let o0_sq = &o0.img_sq_blur[..pixels];
+        let o1_sq = &o1.img_sq_blur[..pixels];
+        let o2_sq = &o2.img_sq_blur[..pixels];
+        let m0_sq = &m0.img_sq_blur[..pixels];
+        let m1_sq = &m1.img_sq_blur[..pixels];
+        let m2_sq = &m2.img_sq_blur[..pixels];
+        let i12_0 = &img1_img2_blur[0][..pixels];
+        let i12_1 = &img1_img2_blur[1][..pixels];
+        let i12_2 = &img1_img2_blur[2][..pixels];
+
+        #[cfg(all(feature = "fma", target_arch = "x86_64"))]
+        {
+            use archmage::SimdToken as _;
+            if let Some(token) = archmage::X64V3Token::summon() {
+                let mut map_out = blur::zeroed_f32_vec(pixels);
+                map_out
+                    .par_chunks_mut(1024)
+                    .enumerate()
+                    .for_each(|(ci, chunk)| {
+                        let off = ci * 1024;
+                        let len = chunk.len();
+                        crate::ssim_simd::compare_3ch_avx2(
+                            token,
+                            [
+                                &o0_mu[off..off + len],
+                                &o1_mu[off..off + len],
+                                &o2_mu[off..off + len],
+                            ],
+                            [
+                                &m0_mu[off..off + len],
+                                &m1_mu[off..off + len],
+                                &m2_mu[off..off + len],
+                            ],
+                            [
+                                &o0_sq[off..off + len],
+                                &o1_sq[off..off + len],
+                                &o2_sq[off..off + len],
+                            ],
+                            [
+                                &m0_sq[off..off + len],
+                                &m1_sq[off..off + len],
+                                &m2_sq[off..off + len],
+                            ],
+                            [
+                                &i12_0[off..off + len],
+                                &i12_1[off..off + len],
+                                &i12_2[off..off + len],
+                            ],
+                            chunk,
+                        );
+                    });
+                return ImgVec::new(map_out, width, height);
+            }
         }
+
+        let c1: f32 = 0.01 * 0.01;
+        let c2: f32 = 0.03 * 0.03;
+        let inv3: f32 = 1.0 / 3.0;
+
+        let map_out: Vec<f32> = (0..pixels)
+            .into_par_iter()
+            .with_min_len(1 << 10)
+            .map(|i| {
+                // Channel 0 (L)
+                let mu1_0 = o0_mu[i];
+                let mu2_0 = m0_mu[i];
+                let mu1mu1_0 = mu1_0 * mu1_0;
+                let mu2mu2_0 = mu2_0 * mu2_0;
+                let mu1mu2_0 = mu1_0 * mu2_0;
+
+                // Channel 1 (a)
+                let mu1_1 = o1_mu[i];
+                let mu2_1 = m1_mu[i];
+                let mu1mu1_1 = mu1_1 * mu1_1;
+                let mu2mu2_1 = mu2_1 * mu2_1;
+                let mu1mu2_1 = mu1_1 * mu2_1;
+
+                // Channel 2 (b)
+                let mu1_2 = o2_mu[i];
+                let mu2_2 = m2_mu[i];
+                let mu1mu1_2 = mu1_2 * mu1_2;
+                let mu2mu2_2 = mu2_2 * mu2_2;
+                let mu1mu2_2 = mu1_2 * mu2_2;
+
+                let mu1_sq = (mu1mu1_0 + mu1mu1_1 + mu1mu1_2) * inv3;
+                let mu2_sq = (mu2mu2_0 + mu2mu2_1 + mu2mu2_2) * inv3;
+                let mu1_mu2 = (mu1mu2_0 + mu1mu2_1 + mu1mu2_2) * inv3;
+
+                let sigma1_sq =
+                    ((o0_sq[i] - mu1mu1_0) + (o1_sq[i] - mu1mu1_1) + (o2_sq[i] - mu1mu1_2)) * inv3;
+                let sigma2_sq =
+                    ((m0_sq[i] - mu2mu2_0) + (m1_sq[i] - mu2mu2_1) + (m2_sq[i] - mu2mu2_2)) * inv3;
+                let sigma12 =
+                    ((i12_0[i] - mu1mu2_0) + (i12_1[i] - mu1mu2_1) + (i12_2[i] - mu1mu2_2)) * inv3;
+
+                (2.0 * mu1_mu2 + c1) * (2.0 * sigma12 + c2)
+                    / ((mu1_sq + mu2_sq + c1) * (sigma1_sq + sigma2_sq + c2))
+            })
+            .collect();
+
+        ImgVec::new(map_out, width, height)
     }
 
     #[inline(never)]
-    fn compare_scale<L>(original: &DssimChan<L>, modified: &DssimChan<L>, img1_img2_blur: &[L]) -> ImgVec<f32>
+    fn compare_scale<L>(
+        original: &DssimChan<L>,
+        modified: &DssimChan<L>,
+        img1_img2_blur: &[L],
+    ) -> ImgVec<f32>
     where
         L: Send + Sync + Clone + Copy + ops::Mul<Output = L> + ops::Sub<Output = L> + 'static,
         f32: From<L>,
@@ -388,30 +527,64 @@ impl Dssim {
         debug_assert_eq!(img1_img2_blur.len(), original.mu.len());
         debug_assert_eq!(img1_img2_blur.len(), original.img_sq_blur.len());
 
-        let mu_iter = original.mu.as_slice().par_iter().with_min_len(1<<10).cloned().zip_eq(modified.mu.as_slice().par_iter().with_min_len(1<<10).cloned());
-        let sq_iter = original.img_sq_blur.as_slice().par_iter().with_min_len(1<<10).cloned().zip_eq(modified.img_sq_blur.as_slice().par_iter().with_min_len(1<<10).cloned());
-        let map_out = img1_img2_blur.par_iter().with_min_len(1<<10).cloned().zip_eq(mu_iter).zip_eq(sq_iter)
-        .map(|((img1_img2_blur, (mu1, mu2)), (img1_sq_blur, img2_sq_blur))| {
-            let mu1mu1 = mu1 * mu1;
-            let mu1mu2 = mu1 * mu2;
-            let mu2mu2 = mu2 * mu2;
-            let mu1_sq: f32 = mu1mu1.into();
-            let mu2_sq: f32 = mu2mu2.into();
-            let mu1_mu2: f32 = mu1mu2.into();
-            let sigma1_sq: f32 = (img1_sq_blur - mu1mu1).into();
-            let sigma2_sq: f32 = (img2_sq_blur - mu2mu2).into();
-            let sigma12: f32 = (img1_img2_blur - mu1mu2).into();
-
-            2.0f32.mul_add(mu1_mu2, c1) * 2.0f32.mul_add(sigma12, c2) /
-                       ((mu1_sq + mu2_sq + c1) * (sigma1_sq + sigma2_sq + c2))
-        }).collect();
+        let mu_iter = original
+            .mu
+            .as_slice()
+            .par_iter()
+            .with_min_len(1 << 10)
+            .cloned()
+            .zip_eq(
+                modified
+                    .mu
+                    .as_slice()
+                    .par_iter()
+                    .with_min_len(1 << 10)
+                    .cloned(),
+            );
+        let sq_iter = original
+            .img_sq_blur
+            .as_slice()
+            .par_iter()
+            .with_min_len(1 << 10)
+            .cloned()
+            .zip_eq(
+                modified
+                    .img_sq_blur
+                    .as_slice()
+                    .par_iter()
+                    .with_min_len(1 << 10)
+                    .cloned(),
+            );
+        let map_out = img1_img2_blur
+            .par_iter()
+            .with_min_len(1 << 10)
+            .cloned()
+            .zip_eq(mu_iter)
+            .zip_eq(sq_iter)
+            .map(
+                |((img1_img2_blur, (mu1, mu2)), (img1_sq_blur, img2_sq_blur))| {
+                    let mu1mu1 = mu1 * mu1;
+                    let mu1mu2 = mu1 * mu2;
+                    let mu2mu2 = mu2 * mu2;
+                    let mu1_sq: f32 = mu1mu1.into();
+                    let mu2_sq: f32 = mu2mu2.into();
+                    let mu1_mu2: f32 = mu1mu2.into();
+                    let sigma1_sq: f32 = (img1_sq_blur - mu1mu1).into();
+                    let sigma2_sq: f32 = (img2_sq_blur - mu2mu2).into();
+                    let sigma12: f32 = (img1_img2_blur - mu1mu2).into();
+
+                    (2.0 * mu1_mu2 + c1) * (2.0 * sigma12 + c2)
+                        / ((mu1_sq + mu2_sq + c1) * (sigma1_sq + sigma2_sq + c2))
+                },
+            )
+            .collect();
 
         ImgVec::new(map_out, width, height)
     }
 }
 
 fn to_dssim(ssim: f64) -> f64 {
-    1.0 / ssim.max(std::f64::EPSILON) - 1.0
+    1.0 / ssim.max(f64::EPSILON) - 1.0
 }
 
 #[test]
@@ -425,27 +598,41 @@ fn png_compare() {
 
     let buf1 = &file1.buffer.to_rgbaplu()[..];
     let buf2 = &file2.buffer.to_rgbaplu()[..];
-    let img1 = d.create_image(&Img::new(buf1, file1.width, file1.height)).unwrap();
-    let img2 = d.create_image(&Img::new(buf2, file2.width, file2.height)).unwrap();
+    let img1 = d
+        .create_image(&Img::new(buf1, file1.width, file1.height))
+        .unwrap();
+    let img2 = d
+        .create_image(&Img::new(buf2, file2.width, file2.height))
+        .unwrap();
 
     let (res, _) = d.compare(&img1, img2);
     assert!((0.001 - res).abs() < 0.0005, "res is {res}");
 
-    let img1b = d.create_image(&Img::new(buf1, file1.width, file1.height)).unwrap();
+    let img1b = d
+        .create_image(&Img::new(buf1, file1.width, file1.height))
+        .unwrap();
     let (res, _) = d.compare(&img1, img1b);
 
     assert!(0.000000000000001 > res);
     assert!(res < 0.000000000000001);
     assert_eq!(res, res);
 
-    let sub_img1 = d.create_image(&Img::new(buf1, file1.width, file1.height).sub_image(2,3,44,33)).unwrap();
-    let sub_img2 = d.create_image(&Img::new(buf2, file2.width, file2.height).sub_image(17,9,44,33)).unwrap();
+    let sub_img1 = d
+        .create_image(&Img::new(buf1, file1.width, file1.height).sub_image(2, 3, 44, 33))
+        .unwrap();
+    let sub_img2 = d
+        .create_image(&Img::new(buf2, file2.width, file2.height).sub_image(17, 9, 44, 33))
+        .unwrap();
     // Test passing second image directly
     let (res, _) = d.compare(&sub_img1, sub_img2);
     assert!(res > 0.1);
 
-    let sub_img1 = d.create_image(&Img::new(buf1, file1.width, file1.height).sub_image(22,8,61,40)).unwrap();
-    let sub_img2 = d.create_image(&Img::new(buf2, file2.width, file2.height).sub_image(22,8,61,40)).unwrap();
+    let sub_img1 = d
+        .create_image(&Img::new(buf1, file1.width, file1.height).sub_image(22, 8, 61, 40))
+        .unwrap();
+    let sub_img2 = d
+        .create_image(&Img::new(buf2, file2.width, file2.height).sub_image(22, 8, 61, 40))
+        .unwrap();
     // Test passing second image as reference
     let (res, _) = d.compare(&sub_img1, sub_img2);
     assert!(res < 0.01);
@@ -478,18 +665,18 @@ impl<T> Deref for MaybeArc<'_, T> {
 
 #[test]
 fn poison() {
-    let a = RGBAPLU::new(1.,1.,1.,1.);
-    let b = RGBAPLU::new(0.,0.,0.,0.);
-    let n = 1./0.;
-    let n = RGBAPLU::new(n,n,n,n);
-    let buf = vec![
-      b,a,a,b,n,n,
-      a,b,b,a,n,n,
-      b,a,a,b,n,
-    ];
+    let a = RGBAPLU::new(1., 1., 1., 1.);
+    let b = RGBAPLU::new(0., 0., 0., 0.);
+    let n = 1. / 0.;
+    let n = RGBAPLU::new(n, n, n, n);
+    let buf = vec![b, a, a, b, n, n, a, b, b, a, n, n, b, a, a, b, n];
     let img = ImgVec::new_stride(buf, 4, 3, 6);
     assert!(img.pixels().all(|p| p.r.is_finite() && p.a.is_finite()));
-    assert!(img.as_ref().pixels().all(|p| p.g.is_finite() && p.b.is_finite()));
+    assert!(
+        img.as_ref()
+            .pixels()
+            .all(|p| p.g.is_finite() && p.b.is_finite())
+    );
 
     let d = new();
     let sub_img1 = d.create_image(&img.as_ref()).unwrap();
diff --git a/dssim-core/src/ffi.rs b/dssim-core/src/ffi.rs
index bf0c837..863760a 100644
--- a/dssim-core/src/ffi.rs
+++ b/dssim-core/src/ffi.rs
@@ -1,6 +1,7 @@
 #![allow(dead_code)]
 #![allow(non_camel_case_types)]
 #![allow(non_snake_case)]
+#![allow(clippy::enum_variant_names)]
 
 use std::os::raw::c_ulong;
 
@@ -8,17 +9,19 @@ pub type dssim_px_t = f32;
 
 #[link(name = "Accelerate", kind = "framework")]
 #[cfg(target_os = "macos")]
-extern "C" {
-    pub fn vImageConvolve_PlanarF(src: *const vImage_Buffer<*const f32>,
-                                  dest: *mut vImage_Buffer<*mut f32>,
-                                  tempBuffer: *mut f32,
-                                  srcOffsetToROI_X: vImagePixelCount,
-                                  srcOffsetToROI_Y: vImagePixelCount,
-                                  kernel: *const f32,
-                                  kernel_height: u32,
-                                  kernel_width: u32,
-                                  backgroundColor: Pixel_F,
-                                  flags: vImage_Flags) -> vImage_Error;
+unsafe extern "C" {
+    pub fn vImageConvolve_PlanarF(
+        src: *const vImage_Buffer<*const f32>,
+        dest: *mut vImage_Buffer<*mut f32>,
+        tempBuffer: *mut f32,
+        srcOffsetToROI_X: vImagePixelCount,
+        srcOffsetToROI_Y: vImagePixelCount,
+        kernel: *const f32,
+        kernel_height: u32,
+        kernel_width: u32,
+        backgroundColor: Pixel_F,
+        flags: vImage_Flags,
+    ) -> vImage_Error;
 }
 
 pub type vImagePixelCount = c_ulong;
@@ -29,51 +32,50 @@ pub type Pixel_F = f32;
 pub enum vImage_Flags {
     kvImageNoFlags = 0,
 
-     /* Operate on red, green and blue channels only. Alpha is copied from source
-        to destination. For Interleaved formats only. */
+    /* Operate on red, green and blue channels only. Alpha is copied from source
+    to destination. For Interleaved formats only. */
     kvImageLeaveAlphaUnchanged = 1,
 
-     /* Copy edge pixels. Convolution Only. */
+    /* Copy edge pixels. Convolution Only. */
     kvImageCopyInPlace = 2,
 
     /* Use the background color for missing pixels. */
-    kvImageBackgroundColorFill  = 4,
+    kvImageBackgroundColorFill = 4,
 
     /* Use the nearest pixel for missing pixels. */
     kvImageEdgeExtend = 8,
 
     /* Pass to turn off internal tiling and disable internal multithreading. Use this if
-       you want to do your own tiling, or to use the Min/Max filters in place. */
-    kvImageDoNotTile =   16,
+    you want to do your own tiling, or to use the Min/Max filters in place. */
+    kvImageDoNotTile = 16,
 
     /* Use a higher quality, slower resampling filter for Geometry operations
-       (shear, scale, rotate, affine transform, etc.) */
-    kvImageHighQualityResampling =   32,
+    (shear, scale, rotate, affine transform, etc.) */
+    kvImageHighQualityResampling = 32,
 
-     /* Use only the part of the kernel that overlaps the image. For integer kernels,
-        real_divisor = divisor * (sum of used kernel elements) / (sum of kernel elements).
-        This should preserve image brightness at the edges. Convolution only. */
-    kvImageTruncateKernel  =   64,
+    /* Use only the part of the kernel that overlaps the image. For integer kernels,
+    real_divisor = divisor * (sum of used kernel elements) / (sum of kernel elements).
+    This should preserve image brightness at the edges. Convolution only. */
+    kvImageTruncateKernel = 64,
 
     /* The function will return the number of bytes required for the temp buffer.
-       If this value is negative, it is an error, per standard usage. */
-    kvImageGetTempBufferSize =  128,
+    If this value is negative, it is an error, per standard usage. */
+    kvImageGetTempBufferSize = 128,
 
     /* Some functions such as vImageConverter_CreateWithCGImageFormat have so many possible error conditions
-       that developers may need more help than a simple error code to diagnose problems. When this
-       flag is set and an error is encountered, an informative error message will be logged to the Apple
-       System Logger (ASL).  The output should be visible in Console.app. */
-    kvImagePrintDiagnosticsToConsole =  256,
+    that developers may need more help than a simple error code to diagnose problems. When this
+    flag is set and an error is encountered, an informative error message will be logged to the Apple
+    System Logger (ASL).  The output should be visible in Console.app. */
+    kvImagePrintDiagnosticsToConsole = 256,
 
     /* Pass this flag to prevent vImage from allocating additional storage. */
-    kvImageNoAllocate =  512,
+    kvImageNoAllocate = 512,
 
     /* Use methods that are HDR-aware, capable of providing correct results for input images with pixel values
-       outside the otherwise limited (typically [-2,2]) range. This may be slower. */
-    kvImageHDRContent =  1024
+    outside the otherwise limited (typically [-2,2]) range. This may be slower. */
+    kvImageHDRContent = 1024,
 }
 
-
 #[repr(C)]
 pub struct vImage_Buffer<T> {
     pub data: T,
@@ -81,4 +83,3 @@ pub struct vImage_Buffer<T> {
     pub width: vImagePixelCount,
     pub rowBytes: usize,
 }
-
diff --git a/dssim-core/src/image.rs b/dssim-core/src/image.rs
index 2074448..7ead0f6 100644
--- a/dssim-core/src/image.rs
+++ b/dssim-core/src/image.rs
@@ -190,7 +190,10 @@ pub trait Downsample {
     fn downsample(&self) -> Option<Self::Output>;
 }
 
-impl<T> Downsample for ImgVec<T> where T: Average4 + Copy + Sync + Send {
+impl<T> Downsample for ImgVec<T>
+where
+    T: Average4 + Copy + Sync + Send,
+{
     type Output = ImgVec<T>;
 
     fn downsample(&self) -> Option<Self::Output> {
@@ -198,7 +201,10 @@ impl<T> Downsample for ImgVec<T> where T: Average4 + Copy + Sync + Send {
     }
 }
 
-impl<T> Downsample for ImgRef<'_, T> where T: Average4 + Copy + Sync + Send {
+impl<T> Downsample for ImgRef<'_, T>
+where
+    T: Average4 + Copy + Sync + Send,
+{
     type Output = ImgVec<T>;
 
     fn downsample(&self) -> Option<Self::Output> {
@@ -214,13 +220,20 @@ impl<T> Downsample for ImgRef<'_, T> where T: Average4 + Copy + Sync + Send {
         let half_width = width / 2;
 
         let mut scaled = Vec::with_capacity(half_width * half_height);
-        scaled.extend(self.buf().chunks(stride * 2).take(half_height).flat_map(|pair| {
-            let (top, bot) = pair.split_at(stride);
-            let top = &top[0..half_width * 2];
-            let bot = &bot[0..half_width * 2];
-
-            top.chunks_exact(2).zip(bot.chunks_exact(2)).map(|(a, b)| Average4::average4(a[0], a[1], b[0], b[1]))
-        }));
+        scaled.extend(
+            self.buf()
+                .chunks(stride * 2)
+                .take(half_height)
+                .flat_map(|pair| {
+                    let (top, bot) = pair.split_at(stride);
+                    let top = &top[0..half_width * 2];
+                    let bot = &bot[0..half_width * 2];
+
+                    top.chunks_exact(2)
+                        .zip(bot.chunks_exact(2))
+                        .map(|(a, b)| Average4::average4(a[0], a[1], b[0], b[1]))
+                }),
+        );
 
         assert_eq!(half_width * half_height, scaled.len());
         Some(Img::new(scaled, half_width, half_height))
@@ -238,15 +251,21 @@ pub(crate) fn worst(input: ImgRef<'_, f32>) -> ImgVec<f32> {
     }
 
     let mut scaled = Vec::with_capacity(half_width * half_height);
-    scaled.extend(input.buf().chunks(stride * 2).take(half_height).flat_map(|pair| {
-        let (top, bot) = pair.split_at(stride);
-        let top = &top[0..half_width * 2];
-        let bot = &bot[0..half_width * 2];
-
-        top.chunks_exact(2).zip(bot.chunks_exact(2)).map(|(a,b)| {
-            a[0].min(a[1]).min(b[0].min(b[1]))
-        })
-    }));
+    scaled.extend(
+        input
+            .buf()
+            .chunks(stride * 2)
+            .take(half_height)
+            .flat_map(|pair| {
+                let (top, bot) = pair.split_at(stride);
+                let top = &top[0..half_width * 2];
+                let bot = &bot[0..half_width * 2];
+
+                top.chunks_exact(2)
+                    .zip(bot.chunks_exact(2))
+                    .map(|(a, b)| a[0].min(a[1]).min(b[0].min(b[1])))
+            }),
+    );
 
     assert_eq!(half_width * half_height, scaled.len());
     Img::new(scaled, half_width, half_height)
@@ -263,15 +282,22 @@ pub(crate) fn avgworst(input: ImgRef<'_, f32>) -> ImgVec<f32> {
     }
 
     let mut scaled = Vec::with_capacity(half_width * half_height);
-    scaled.extend(input.buf().chunks(stride * 2).take(half_height).flat_map(|pair| {
-        let (top, bot) = pair.split_at(stride);
-        let top = &top[0..half_width * 2];
-        let bot = &bot[0..half_width * 2];
-
-        top.chunks_exact(2).zip(bot.chunks_exact(2)).map(|(a,b)| {
-            (a[0] + a[1] + b[0] + b[1]).mul_add(0.25, a[0].min(a[1]).min(b[0].min(b[1])))*0.5
-        })
-    }));
+    scaled.extend(
+        input
+            .buf()
+            .chunks(stride * 2)
+            .take(half_height)
+            .flat_map(|pair| {
+                let (top, bot) = pair.split_at(stride);
+                let top = &top[0..half_width * 2];
+                let bot = &bot[0..half_width * 2];
+
+                top.chunks_exact(2).zip(bot.chunks_exact(2)).map(|(a, b)| {
+                    (a[0] + a[1] + b[0] + b[1]).mul_add(0.25, a[0].min(a[1]).min(b[0].min(b[1])))
+                        * 0.5
+                })
+            }),
+    );
 
     assert_eq!(half_width * half_height, scaled.len());
     Img::new(scaled, half_width, half_height)
@@ -288,15 +314,21 @@ pub(crate) fn avg(input: ImgRef<'_, f32>) -> ImgVec<f32> {
     }
 
     let mut scaled = Vec::with_capacity(half_width * half_height);
-    scaled.extend(input.buf().chunks(stride * 2).take(half_height).flat_map(|pair| {
-        let (top, bot) = pair.split_at(stride);
-        let top = &top[0..half_width * 2];
-        let bot = &bot[0..half_width * 2];
-
-        top.chunks_exact(2).zip(bot.chunks_exact(2)).map(|(a,b)| {
-            (a[0] + a[1] + b[0] + b[1]) * 0.25
-        })
-    }));
+    scaled.extend(
+        input
+            .buf()
+            .chunks(stride * 2)
+            .take(half_height)
+            .flat_map(|pair| {
+                let (top, bot) = pair.split_at(stride);
+                let top = &top[0..half_width * 2];
+                let bot = &bot[0..half_width * 2];
+
+                top.chunks_exact(2)
+                    .zip(bot.chunks_exact(2))
+                    .map(|(a, b)| (a[0] + a[1] + b[0] + b[1]) * 0.25)
+            }),
+    );
 
     assert_eq!(half_width * half_height, scaled.len());
     Img::new(scaled, half_width, half_height)
diff --git a/dssim-core/src/lib.rs b/dssim-core/src/lib.rs
index 1288900..36847d8 100644
--- a/dssim-core/src/lib.rs
+++ b/dssim-core/src/lib.rs
@@ -2,18 +2,22 @@
 //! support several pixel types. It also allows replacing some parts of the algorithm with different implementations
 //! (if you need higher accuracy or higher speed).
 #![doc(html_logo_url = "https://kornel.ski/dssim/logo.png")]
+#![deny(unsafe_code)]
 #![allow(clippy::manual_range_contains)]
 #![allow(clippy::new_without_default)]
 
 mod blur;
+#[allow(unsafe_code)]
 mod c_api;
 mod dssim;
 /// cbindgen:ignore
+#[allow(unsafe_code)]
 mod ffi;
 mod image;
 #[cfg(not(feature = "threads"))]
 mod lieon;
 mod linear;
+mod ssim_simd;
 mod tolab;
 mod val;
 
diff --git a/dssim-core/src/lieon.rs b/dssim-core/src/lieon.rs
index b3ee837..14ab8ad 100644
--- a/dssim-core/src/lieon.rs
+++ b/dssim-core/src/lieon.rs
@@ -10,13 +10,18 @@ pub mod prelude {
 }
 
 pub trait ParIterator: Sized {
-    fn with_min_len(self, _one: usize) -> Self { self }
-    fn with_max_len(self, _one: usize) -> Self { self }
-    fn par_bridge(self) -> Self { self }
+    fn with_min_len(self, _one: usize) -> Self {
+        self
+    }
+    fn with_max_len(self, _one: usize) -> Self {
+        self
+    }
+    fn par_bridge(self) -> Self {
+        self
+    }
 }
 
-impl<T: Iterator> ParIterator for T {
-}
+impl<T: Iterator> ParIterator for T {}
 
 pub trait ParSliceLie<T> {
     fn par_chunks(&self, n: usize) -> std::slice::Chunks<'_, T>;
diff --git a/dssim-core/src/linear.rs b/dssim-core/src/linear.rs
index d6d4ad1..72473d2 100644
--- a/dssim-core/src/linear.rs
+++ b/dssim-core/src/linear.rs
@@ -6,7 +6,7 @@ use rgb::*;
 #[doc(hidden)]
 pub trait GammaComponent {
     type Lut;
-    fn max_value() -> usize;
+    const COMPONENT_MAX: f32;
     fn to_linear(&self, lut: &Self::Lut) -> f32;
     fn make_lut() -> Self::Lut;
 }
@@ -52,7 +52,7 @@ pub trait ToRGBAPLU {
 
 impl GammaComponent for u8 {
     type Lut = [f32; 256];
-    fn max_value() -> usize { 255 }
+    const COMPONENT_MAX: f32 = 255.;
     #[inline(always)]
     fn to_linear(&self, lut: &Self::Lut) -> f32 {
         lut[*self as usize]
@@ -62,7 +62,7 @@ impl GammaComponent for u8 {
     fn make_lut() -> Self::Lut {
         let mut out = [0.; 256];
         for (i, o) in out.iter_mut().enumerate() {
-            *o = to_linear(i as f32 / f32::from(Self::max_value()));
+            *o = to_linear(i as f32 / Self::COMPONENT_MAX);
         }
         out
     }
@@ -70,7 +70,7 @@ impl GammaComponent for u8 {
 
 impl GammaComponent for u16 {
     type Lut = [f32; 65536];
-    fn max_value() -> usize { 65535 }
+    const COMPONENT_MAX: f32 = 65535.;
     #[inline(always)]
     fn to_linear(&self, lut: &Self::Lut) -> f32 {
         lut[*self as usize]
@@ -80,18 +80,21 @@ impl GammaComponent for u16 {
     fn make_lut() -> Self::Lut {
         let mut out = [0.; 65536];
         for (i, o) in out.iter_mut().enumerate() {
-            *o = to_linear(i as f32 / f32::from(Self::max_value()));
+            *o = to_linear(i as f32 / Self::COMPONENT_MAX);
         }
         out
     }
 }
 
-impl<M> GammaPixel for RGBA<M> where M: Clone + Into<f32> + GammaComponent {
+impl<M> GammaPixel for RGBA<M>
+where
+    M: Clone + Into<f32> + GammaComponent,
+{
     type Component = M;
     type Output = RGBAPLU;
     #[inline]
     fn to_linear(&self, gamma_lut: &M::Lut) -> RGBAPLU {
-        let a_unit = self.a.clone().into() / M::max_value() as f32;
+        let a_unit = self.a.clone().into() / M::COMPONENT_MAX;
         RGBAPLU {
             r: self.r.to_linear(gamma_lut) * a_unit,
             g: self.g.to_linear(gamma_lut) * a_unit,
@@ -101,13 +104,16 @@ impl<M> GammaPixel for RGBA<M> where M: Clone + Into<f32> + GammaComponent {
     }
 }
 
-impl<M> GammaPixel for BGRA<M> where M: Clone + Into<f32> + GammaComponent {
+impl<M> GammaPixel for BGRA<M>
+where
+    M: Clone + Into<f32> + GammaComponent,
+{
     type Component = M;
     type Output = RGBAPLU;
 
     #[inline]
     fn to_linear(&self, gamma_lut: &M::Lut) -> RGBAPLU {
-        let a_unit = self.a.clone().into() / M::max_value() as f32;
+        let a_unit = self.a.clone().into() / M::COMPONENT_MAX;
         RGBAPLU {
             r: self.r.to_linear(gamma_lut) * a_unit,
             g: self.g.to_linear(gamma_lut) * a_unit,
@@ -117,7 +123,10 @@ impl<M> GammaPixel for BGRA<M> where M: Clone + Into<f32> + GammaComponent {
     }
 }
 
-impl<M> GammaPixel for RGB<M> where M: GammaComponent {
+impl<M> GammaPixel for RGB<M>
+where
+    M: GammaComponent,
+{
     type Component = M;
     type Output = RGBAPLU;
     #[inline]
@@ -131,7 +140,10 @@ impl<M> GammaPixel for RGB<M> where M: GammaComponent {
     }
 }
 
-impl<M> GammaPixel for BGR<M> where M: GammaComponent {
+impl<M> GammaPixel for BGR<M>
+where
+    M: GammaComponent,
+{
     type Component = M;
     type Output = RGBAPLU;
 
@@ -146,12 +158,15 @@ impl<M> GammaPixel for BGR<M> where M: GammaComponent {
     }
 }
 
-impl<M> GammaPixel for GrayAlpha<M> where M: Copy + Clone + Into<f32> + GammaComponent {
+impl<M> GammaPixel for GrayAlpha<M>
+where
+    M: Copy + Clone + Into<f32> + GammaComponent,
+{
     type Component = M;
     type Output = RGBAPLU;
 
     fn to_linear(&self, gamma_lut: &M::Lut) -> RGBAPLU {
-        let a_unit = self.value().into() / M::max_value() as f32;
+        let a_unit = self.value().into() / M::COMPONENT_MAX;
         let g = self.value().to_linear(gamma_lut);
         RGBAPLU {
             r: g * a_unit,
@@ -162,7 +177,10 @@ impl<M> GammaPixel for GrayAlpha<M> where M: Copy + Clone + Into<f32> + GammaCom
     }
 }
 
-impl<M> GammaPixel for M where M: GammaComponent {
+impl<M> GammaPixel for M
+where
+    M: GammaComponent,
+{
     type Component = M;
     type Output = f32;
 
@@ -172,18 +190,29 @@ impl<M> GammaPixel for M where M: GammaComponent {
     }
 }
 
-impl<M> GammaPixel for Gray<M> where M: Copy + GammaComponent {
+impl<M> GammaPixel for Gray<M>
+where
+    M: Copy + GammaComponent,
+{
     type Component = M;
     type Output = RGBAPLU;
 
     #[inline(always)]
     fn to_linear(&self, gamma_lut: &M::Lut) -> RGBAPLU {
         let g = self.value().to_linear(gamma_lut);
-        RGBAPLU { r: g, g, b: g, a: 1.0 }
+        RGBAPLU {
+            r: g,
+            g,
+            b: g,
+            a: 1.0,
+        }
     }
 }
 
-impl<P> ToRGBAPLU for [P] where P: GammaPixel<Output=RGBAPLU> {
+impl<P> ToRGBAPLU for [P]
+where
+    P: GammaPixel<Output = RGBAPLU>,
+{
     fn to_rgbaplu(&self) -> Vec<RGBAPLU> {
         let gamma_lut = P::make_lut();
         self.iter().map(|px| px.to_linear(&gamma_lut)).collect()
@@ -191,6 +220,8 @@ impl<P> ToRGBAPLU for [P] where P: GammaPixel<Output=RGBAPLU> {
 
     fn to_rgblu(&self) -> Vec<RGBLU> {
         let gamma_lut = P::make_lut();
-        self.iter().map(|px| px.to_linear(&gamma_lut).rgb()).collect()
+        self.iter()
+            .map(|px| px.to_linear(&gamma_lut).rgb())
+            .collect()
     }
 }
diff --git a/dssim-core/src/ssim_simd.rs b/dssim-core/src/ssim_simd.rs
new file mode 100644
index 0000000..f96eaa3
--- /dev/null
+++ b/dssim-core/src/ssim_simd.rs
@@ -0,0 +1,121 @@
+// AVX2+FMA SIMD for 3-channel SSIM comparison.
+// Processes 8 pixels per iteration using 256-bit vectors.
+
+#[cfg(all(feature = "fma", target_arch = "x86_64"))]
+mod simd {
+    use archmage::prelude::*;
+    use magetypes::simd::f32x8;
+
+    #[arcane]
+    pub fn compare_3ch_avx2(
+        t: X64V3Token,
+        o_mu: [&[f32]; 3],
+        m_mu: [&[f32]; 3],
+        o_sq: [&[f32]; 3],
+        m_sq: [&[f32]; 3],
+        i12: [&[f32]; 3],
+        out: &mut [f32],
+    ) {
+        let len = out.len();
+        let vc1 = f32x8::splat(t, 0.01 * 0.01);
+        let vc2 = f32x8::splat(t, 0.03 * 0.03);
+        let vinv3 = f32x8::splat(t, 1.0 / 3.0);
+        let vtwo = f32x8::splat(t, 2.0);
+
+        let mut i = 0;
+        while i + 8 <= len {
+            // 15 loads: 6 mu + 6 sq + 3 i12
+            let omu0 = f32x8::load(t, (&o_mu[0][i..i + 8]).try_into().unwrap());
+            let omu1 = f32x8::load(t, (&o_mu[1][i..i + 8]).try_into().unwrap());
+            let omu2 = f32x8::load(t, (&o_mu[2][i..i + 8]).try_into().unwrap());
+            let mmu0 = f32x8::load(t, (&m_mu[0][i..i + 8]).try_into().unwrap());
+            let mmu1 = f32x8::load(t, (&m_mu[1][i..i + 8]).try_into().unwrap());
+            let mmu2 = f32x8::load(t, (&m_mu[2][i..i + 8]).try_into().unwrap());
+
+            let osq0 = f32x8::load(t, (&o_sq[0][i..i + 8]).try_into().unwrap());
+            let osq1 = f32x8::load(t, (&o_sq[1][i..i + 8]).try_into().unwrap());
+            let osq2 = f32x8::load(t, (&o_sq[2][i..i + 8]).try_into().unwrap());
+            let msq0 = f32x8::load(t, (&m_sq[0][i..i + 8]).try_into().unwrap());
+            let msq1 = f32x8::load(t, (&m_sq[1][i..i + 8]).try_into().unwrap());
+            let msq2 = f32x8::load(t, (&m_sq[2][i..i + 8]).try_into().unwrap());
+
+            let iv0 = f32x8::load(t, (&i12[0][i..i + 8]).try_into().unwrap());
+            let iv1 = f32x8::load(t, (&i12[1][i..i + 8]).try_into().unwrap());
+            let iv2 = f32x8::load(t, (&i12[2][i..i + 8]).try_into().unwrap());
+
+            // Per-channel mu products (9 muls)
+            let mu1mu1_0 = omu0 * omu0;
+            let mu2mu2_0 = mmu0 * mmu0;
+            let mu1mu2_0 = omu0 * mmu0;
+
+            let mu1mu1_1 = omu1 * omu1;
+            let mu2mu2_1 = mmu1 * mmu1;
+            let mu1mu2_1 = omu1 * mmu1;
+
+            let mu1mu1_2 = omu2 * omu2;
+            let mu2mu2_2 = mmu2 * mmu2;
+            let mu1mu2_2 = omu2 * mmu2;
+
+            // Average across channels (* inv3)
+            let mu1_sq = (mu1mu1_0 + mu1mu1_1 + mu1mu1_2) * vinv3;
+            let mu2_sq = (mu2mu2_0 + mu2mu2_1 + mu2mu2_2) * vinv3;
+            let mu1_mu2 = (mu1mu2_0 + mu1mu2_1 + mu1mu2_2) * vinv3;
+
+            // sigma = sq_blur - mu^2, averaged across channels
+            let sigma1_sq = ((osq0 - mu1mu1_0) + (osq1 - mu1mu1_1) + (osq2 - mu1mu1_2)) * vinv3;
+            let sigma2_sq = ((msq0 - mu2mu2_0) + (msq1 - mu2mu2_1) + (msq2 - mu2mu2_2)) * vinv3;
+            let sigma12 = ((iv0 - mu1mu2_0) + (iv1 - mu1mu2_1) + (iv2 - mu1mu2_2)) * vinv3;
+
+            // SSIM = (2*mu1_mu2 + c1)*(2*sigma12 + c2) / ((mu1_sq+mu2_sq+c1)*(sigma1_sq+sigma2_sq+c2))
+            let num = vtwo.mul_add(mu1_mu2, vc1) * vtwo.mul_add(sigma12, vc2);
+            let den = (mu1_sq + mu2_sq + vc1) * (sigma1_sq + sigma2_sq + vc2);
+            let result = num / den;
+
+            result.store((&mut out[i..i + 8]).try_into().unwrap());
+            i += 8;
+        }
+
+        // Scalar tail for remaining pixels
+        let c1: f32 = 0.01 * 0.01;
+        let c2: f32 = 0.03 * 0.03;
+        let inv3: f32 = 1.0 / 3.0;
+        for i in i..len {
+            let mu1_0 = o_mu[0][i];
+            let mu2_0 = m_mu[0][i];
+            let mu1mu1_0 = mu1_0 * mu1_0;
+            let mu2mu2_0 = mu2_0 * mu2_0;
+            let mu1mu2_0 = mu1_0 * mu2_0;
+
+            let mu1_1 = o_mu[1][i];
+            let mu2_1 = m_mu[1][i];
+            let mu1mu1_1 = mu1_1 * mu1_1;
+            let mu2mu2_1 = mu2_1 * mu2_1;
+            let mu1mu2_1 = mu1_1 * mu2_1;
+
+            let mu1_2 = o_mu[2][i];
+            let mu2_2 = m_mu[2][i];
+            let mu1mu1_2 = mu1_2 * mu1_2;
+            let mu2mu2_2 = mu2_2 * mu2_2;
+            let mu1mu2_2 = mu1_2 * mu2_2;
+
+            let mu1_sq = (mu1mu1_0 + mu1mu1_1 + mu1mu1_2) * inv3;
+            let mu2_sq = (mu2mu2_0 + mu2mu2_1 + mu2mu2_2) * inv3;
+            let mu1_mu2 = (mu1mu2_0 + mu1mu2_1 + mu1mu2_2) * inv3;
+
+            let sigma1_sq =
+                ((o_sq[0][i] - mu1mu1_0) + (o_sq[1][i] - mu1mu1_1) + (o_sq[2][i] - mu1mu1_2))
+                    * inv3;
+            let sigma2_sq =
+                ((m_sq[0][i] - mu2mu2_0) + (m_sq[1][i] - mu2mu2_1) + (m_sq[2][i] - mu2mu2_2))
+                    * inv3;
+            let sigma12 =
+                ((i12[0][i] - mu1mu2_0) + (i12[1][i] - mu1mu2_1) + (i12[2][i] - mu1mu2_2)) * inv3;
+
+            out[i] = (2.0 * mu1_mu2 + c1) * (2.0 * sigma12 + c2)
+                / ((mu1_sq + mu2_sq + c1) * (sigma1_sq + sigma2_sq + c2));
+        }
+    }
+}
+
+#[cfg(all(feature = "fma", target_arch = "x86_64"))]
+pub(crate) use simd::compare_3ch_avx2;
diff --git a/dssim-core/src/tolab.rs b/dssim-core/src/tolab.rs
index 69ea6fa..6809e58 100644
--- a/dssim-core/src/tolab.rs
+++ b/dssim-core/src/tolab.rs
@@ -1,12 +1,12 @@
 #![allow(non_snake_case)]
 #![allow(non_upper_case_globals)]
 
-use crate::image::ToRGB;
 use crate::image::RGBAPLU;
 use crate::image::RGBLU;
-use imgref::*;
+use crate::image::ToRGB;
 #[cfg(not(feature = "threads"))]
 use crate::lieon as rayon;
+use imgref::*;
 use rayon::prelude::*;
 
 const D65x: f32 = 0.9505;
@@ -20,7 +20,7 @@ pub(crate) trait ToLAB {
 
 #[inline(always)]
 fn fma_matrix(r: f32, rx: f32, g: f32, gx: f32, b: f32, bx: f32) -> f32 {
-    b.mul_add(bx, g.mul_add(gx, r * rx))
+    r * rx + g * gx + b * bx
 }
 
 const EPSILON: f32 = 216. / 24389.;
@@ -28,18 +28,51 @@ const K: f32 = 24389. / (27. * 116.); // http://www.brucelindbloom.com/LContinui
 
 impl ToLAB for RGBLU {
     fn to_lab(&self) -> (f32, f32, f32) {
-        let fx = fma_matrix(self.r, 0.4124 / D65x, self.g, 0.3576 / D65x, self.b, 0.1805 / D65x);
-        let fy = fma_matrix(self.r, 0.2126 / D65y, self.g, 0.7152 / D65y, self.b, 0.0722 / D65y);
-        let fz = fma_matrix(self.r, 0.0193 / D65z, self.g, 0.1192 / D65z, self.b, 0.9505 / D65z);
+        let fx = fma_matrix(
+            self.r,
+            0.4124 / D65x,
+            self.g,
+            0.3576 / D65x,
+            self.b,
+            0.1805 / D65x,
+        );
+        let fy = fma_matrix(
+            self.r,
+            0.2126 / D65y,
+            self.g,
+            0.7152 / D65y,
+            self.b,
+            0.0722 / D65y,
+        );
+        let fz = fma_matrix(
+            self.r,
+            0.0193 / D65z,
+            self.g,
+            0.1192 / D65z,
+            self.b,
+            0.9505 / D65z,
+        );
 
-        let X = if fx > EPSILON { cbrt_poly(fx) - 16. / 116. } else { K * fx };
-        let Y = if fy > EPSILON { cbrt_poly(fy) - 16. / 116. } else { K * fy };
-        let Z = if fz > EPSILON { cbrt_poly(fz) - 16. / 116. } else { K * fz };
+        let X = if fx > EPSILON {
+            cbrt_poly(fx) - 16. / 116.
+        } else {
+            K * fx
+        };
+        let Y = if fy > EPSILON {
+            cbrt_poly(fy) - 16. / 116.
+        } else {
+            K * fy
+        };
+        let Z = if fz > EPSILON {
+            cbrt_poly(fz) - 16. / 116.
+        } else {
+            K * fz
+        };
 
         let lab = (
             (Y * 1.05f32), // 1.05 instead of 1.16 to boost color importance without pushing colors outside of 1.0 range
-            (500.0 / 220.0f32).mul_add(X - Y, 86.2 / 220.0f32), /* 86 is a fudge to make the value positive */
-            (200.0 / 220.0f32).mul_add(Y - Z, 107.9 / 220.0f32), /* 107 is a fudge to make the value positive */
+            (500.0 / 220.0) * (X - Y) + (86.2 / 220.0), /* 86 is a fudge to make the value positive */
+            (200.0 / 220.0) * (Y - Z) + (107.9 / 220.0), /* 107 is a fudge to make the value positive */
         );
         debug_assert!(lab.0 <= 1.0 && lab.1 <= 1.0 && lab.2 <= 1.0);
         lab
@@ -53,15 +86,208 @@ fn cbrt_poly(x: f32) -> f32 {
     let y = (poly[2] * x + poly[1]) * x + poly[0];
 
     // 2x Halley's Method
-    let y3 = y*y*y;
+    let y3 = y * y * y;
     let y = y * (y3 + 2. * x) / (2. * y3 + x);
-    let y3 = y*y*y;
+    let y3 = y * y * y;
     let y = y * (y3 + 2. * x) / (2. * y3 + x);
     debug_assert!(y < 1.001);
     debug_assert!(x < 216. / 24389. || y >= 16. / 116.);
     y
 }
 
+// ── AVX2+FMA SIMD path ──────────────────────────────────────────────
+#[cfg(all(feature = "fma", target_arch = "x86_64"))]
+mod simd {
+    use super::{EPSILON, GBitmap, K, RGBLU, ToLAB, cbrt_poly};
+    #[cfg(not(feature = "threads"))]
+    use crate::lieon::prelude::*;
+    use archmage::prelude::*;
+    use imgref::*;
+    use magetypes::simd::f32x8;
+    #[cfg(feature = "threads")]
+    use rayon::prelude::*;
+
+    #[rite]
+    fn cbrt_poly_x8(t: X64V3Token, x: f32x8) -> f32x8 {
+        let two = f32x8::splat(t, 2.0);
+        let half_neg = f32x8::splat(t, -0.5);
+        let c151 = f32x8::splat(t, 1.51);
+        let c02 = f32x8::splat(t, 0.2);
+
+        // Polynomial guess: y = (-0.5*x + 1.51)*x + 0.2
+        let y = half_neg.mul_add(x, c151).mul_add(x, c02);
+
+        // 1st Halley's: y = y*(y³+2x) / (2y³+x)
+        let y3 = y * y * y;
+        let y = y * two.mul_add(x, y3) / two.mul_add(y3, x);
+
+        // 2nd Halley's
+        let y3 = y * y * y;
+        y * two.mul_add(x, y3) / two.mul_add(y3, x)
+    }
+
+    #[rite]
+    fn to_lab_x8(t: X64V3Token, vr: f32x8, vg: f32x8, vb: f32x8) -> (f32x8, f32x8, f32x8) {
+        // Matrix constants (pre-divided by D65)
+        let m00 = f32x8::splat(t, 0.4124 / 0.9505);
+        let m01 = f32x8::splat(t, 0.3576 / 0.9505);
+        let m02 = f32x8::splat(t, 0.1805 / 0.9505);
+        let m10 = f32x8::splat(t, 0.2126);
+        let m11 = f32x8::splat(t, 0.7152);
+        let m12 = f32x8::splat(t, 0.0722);
+        let m20 = f32x8::splat(t, 0.0193 / 1.089);
+        let m21 = f32x8::splat(t, 0.1192 / 1.089);
+        let m22 = f32x8::splat(t, 0.9505 / 1.089);
+
+        // RGB → XYZ/D65 (6 FMAs + 3 muls)
+        let fx = vr.mul_add(m00, vg.mul_add(m01, vb * m02));
+        let fy = vr.mul_add(m10, vg.mul_add(m11, vb * m12));
+        let fz = vr.mul_add(m20, vg.mul_add(m21, vb * m22));
+
+        // Conditional cbrt: if f > EPSILON { cbrt(f) - 16/116 } else { K*f }
+        let veps = f32x8::splat(t, EPSILON);
+        let vk = f32x8::splat(t, K);
+        let v16_116 = f32x8::splat(t, 16.0 / 116.0);
+
+        let cbrt_x = cbrt_poly_x8(t, fx) - v16_116;
+        let linear_x = vk * fx;
+        let big_x = f32x8::blend(fx.simd_gt(veps), cbrt_x, linear_x);
+
+        let cbrt_y = cbrt_poly_x8(t, fy) - v16_116;
+        let linear_y = vk * fy;
+        let big_y = f32x8::blend(fy.simd_gt(veps), cbrt_y, linear_y);
+
+        let cbrt_z = cbrt_poly_x8(t, fz) - v16_116;
+        let linear_z = vk * fz;
+        let big_z = f32x8::blend(fz.simd_gt(veps), cbrt_z, linear_z);
+
+        // LAB scaling
+        let l = big_y * f32x8::splat(t, 1.05);
+        let a = (big_x - big_y) * f32x8::splat(t, 500.0 / 220.0) + f32x8::splat(t, 86.2 / 220.0);
+        let b = (big_y - big_z) * f32x8::splat(t, 200.0 / 220.0) + f32x8::splat(t, 107.9 / 220.0);
+
+        (l, a, b)
+    }
+
+    #[arcane]
+    pub fn process_row_avx2(
+        t: X64V3Token,
+        in_row: &[RGBLU],
+        l_row: &mut [f32],
+        a_row: &mut [f32],
+        b_row: &mut [f32],
+    ) {
+        let width = in_row.len();
+        let mut x = 0;
+
+        // SIMD loop: 8 pixels at a time
+        while x + 8 <= width {
+            // AoS→SoA gather
+            let mut rs = [0f32; 8];
+            let mut gs = [0f32; 8];
+            let mut bs = [0f32; 8];
+            for i in 0..8 {
+                rs[i] = in_row[x + i].r;
+                gs[i] = in_row[x + i].g;
+                bs[i] = in_row[x + i].b;
+            }
+            let vr = f32x8::from_array(t, rs);
+            let vg = f32x8::from_array(t, gs);
+            let vb = f32x8::from_array(t, bs);
+
+            let (vl, va, vb_out) = to_lab_x8(t, vr, vg, vb);
+
+            vl.store((&mut l_row[x..x + 8]).try_into().unwrap());
+            va.store((&mut a_row[x..x + 8]).try_into().unwrap());
+            vb_out.store((&mut b_row[x..x + 8]).try_into().unwrap());
+            x += 8;
+        }
+
+        // Scalar tail
+        for x in x..width {
+            let (l, a, b) = in_row[x].to_lab();
+            l_row[x] = l;
+            a_row[x] = a;
+            b_row[x] = b;
+        }
+    }
+
+    #[arcane]
+    pub fn process_gray_row_avx2(t: X64V3Token, in_row: &[f32], out_row: &mut [f32]) {
+        let width = in_row.len();
+        let veps = f32x8::splat(t, EPSILON);
+        let vk_116 = f32x8::splat(t, K * 1.16);
+        let v16_116 = f32x8::splat(t, 16.0 / 116.0);
+        let v116 = f32x8::splat(t, 1.16);
+
+        let mut x = 0;
+        while x + 8 <= width {
+            let fy = f32x8::load(t, (&in_row[x..x + 8]).try_into().unwrap());
+            let cbrt_path = (cbrt_poly_x8(t, fy) - v16_116) * v116;
+            let linear_path = vk_116 * fy;
+            let result = f32x8::blend(fy.simd_gt(veps), cbrt_path, linear_path);
+            result.store((&mut out_row[x..x + 8]).try_into().unwrap());
+            x += 8;
+        }
+
+        // Scalar tail
+        for x in x..width {
+            let fy = in_row[x];
+            out_row[x] = if fy > EPSILON {
+                (cbrt_poly(fy) - 16. / 116.) * 1.16
+            } else {
+                (K * 1.16) * fy
+            };
+        }
+    }
+
+    pub fn rgb_to_lab_simd(token: X64V3Token, img: ImgRef<'_, RGBLU>) -> Vec<GBitmap> {
+        let width = img.width();
+        let height = img.height();
+        assert!(width > 0);
+        let area = width * height;
+
+        let mut out_l = vec![0f32; area];
+        let mut out_a = vec![0f32; area];
+        let mut out_b = vec![0f32; area];
+
+        out_l
+            .par_chunks_exact_mut(width)
+            .zip(
+                out_a
+                    .par_chunks_exact_mut(width)
+                    .zip(out_b.par_chunks_exact_mut(width)),
+            )
+            .enumerate()
+            .for_each(|(y, (l_row, (a_row, b_row)))| {
+                let in_row = &img.rows().nth(y).unwrap()[0..width];
+                process_row_avx2(token, in_row, l_row, a_row, b_row);
+            });
+
+        vec![
+            Img::new(out_l, width, height),
+            Img::new(out_a, width, height),
+            Img::new(out_b, width, height),
+        ]
+    }
+
+    pub fn gray_to_lab_simd(token: X64V3Token, img: &GBitmap) -> Vec<GBitmap> {
+        let width = img.width();
+        let height = img.height();
+        let area = width * height;
+        let mut out = vec![0f32; area];
+
+        out.par_chunks_exact_mut(width)
+            .enumerate()
+            .for_each(|(y, out_row)| {
+                let in_row = &img.rows().nth(y).unwrap()[0..width];
+                process_gray_row_avx2(token, in_row, out_row);
+            });
+
+        vec![Img::new(out, width, height)]
+    }
+}
+
 /// Convert image to L\*a\*b\* planar
 ///
 /// It should return 1 (gray) or 3 (color) planes.
@@ -85,14 +311,28 @@ impl ToLABBitmap for ImgVec<RGBLU> {
 impl ToLABBitmap for GBitmap {
     fn to_lab(&self) -> Vec<GBitmap> {
         debug_assert!(self.width() > 0);
+
+        #[cfg(all(feature = "fma", target_arch = "x86_64"))]
+        {
+            use archmage::SimdToken as _;
+            if let Some(token) = archmage::X64V3Token::summon() {
+                return simd::gray_to_lab_simd(token, self);
+            }
+        }
+
         let f = |fy| {
-            if fy > EPSILON { (cbrt_poly(fy) - 16. / 116.) * 1.16 } else { (K * 1.16) * fy }
+            if fy > EPSILON {
+                (cbrt_poly(fy) - 16. / 116.) * 1.16
+            } else {
+                (K * 1.16) * fy
+            }
         };
 
         #[cfg(feature = "threads")]
-        let out = (0..self.height()).into_par_iter().flat_map_iter(|y| {
-            self[y].iter().map(|&fy| f(fy))
-        }).collect();
+        let out = (0..self.height())
+            .into_par_iter()
+            .flat_map_iter(|y| self[y].iter().map(|&fy| f(fy)))
+            .collect();
 
         #[cfg(not(feature = "threads"))]
         let out = self.pixels().map(f).collect();
@@ -103,39 +343,42 @@ impl ToLABBitmap for GBitmap {
 
 #[inline(never)]
 fn rgb_to_lab<T: Copy + Sync + Send + 'static, F>(img: ImgRef<'_, T>, cb: F) -> Vec<GBitmap>
-    where F: Fn(T, usize) -> (f32, f32, f32) + Sync + Send + 'static
+where
+    F: Fn(T, usize) -> (f32, f32, f32) + Sync + Send + 'static,
 {
     let width = img.width();
     assert!(width > 0);
     let height = img.height();
     let area = width * height;
 
-    let mut out_l = Vec::with_capacity(area);
-    let mut out_a = Vec::with_capacity(area);
-    let mut out_b = Vec::with_capacity(area);
+    let mut out_l = vec![0f32; area];
+    let mut out_a = vec![0f32; area];
+    let mut out_b = vec![0f32; area];
 
     // For output width == stride
-    out_l.spare_capacity_mut().par_chunks_exact_mut(width).take(height).zip(
-        out_a.spare_capacity_mut().par_chunks_exact_mut(width).take(height).zip(
-            out_b.spare_capacity_mut().par_chunks_exact_mut(width).take(height))
-    ).enumerate()
-    .for_each(|(y, (l_row, (a_row, b_row)))| {
-        let in_row = &img.rows().nth(y).unwrap()[0..width];
-        let l_row = &mut l_row[0..width];
-        let a_row = &mut a_row[0..width];
-        let b_row = &mut b_row[0..width];
-        for x in 0..width {
-            let n = (x+11) ^ (y+11);
-            let (l,a,b) = cb(in_row[x], n);
-            l_row[x].write(l);
-            a_row[x].write(a);
-            b_row[x].write(b);
-        }
-    });
-
-    unsafe { out_l.set_len(area) };
-    unsafe { out_a.set_len(area) };
-    unsafe { out_b.set_len(area) };
+    out_l
+        .par_chunks_exact_mut(width)
+        .take(height)
+        .zip(
+            out_a
+                .par_chunks_exact_mut(width)
+                .take(height)
+                .zip(out_b.par_chunks_exact_mut(width).take(height)),
+        )
+        .enumerate()
+        .for_each(|(y, (l_row, (a_row, b_row)))| {
+            let in_row = &img.rows().nth(y).unwrap()[0..width];
+            let l_row = &mut l_row[0..width];
+            let a_row = &mut a_row[0..width];
+            let b_row = &mut b_row[0..width];
+            for x in 0..width {
+                let n = (x + 11) ^ (y + 11);
+                let (l, a, b) = cb(in_row[x], n);
+                l_row[x] = l;
+                a_row[x] = a;
+                b_row[x] = b;
+            }
+        });
 
     vec![
         Img::new(out_l, width, height),
@@ -147,18 +390,21 @@ fn rgb_to_lab<T: Copy + Sync + Send + 'static, F>(img: ImgRef<'_, T>, cb: F) ->
 impl ToLABBitmap for ImgRef<'_, RGBAPLU> {
     #[inline]
     fn to_lab(&self) -> Vec<GBitmap> {
-        rgb_to_lab(*self, |px, n|{
-            px.to_rgb(n).to_lab()
-        })
+        rgb_to_lab(*self, |px, n| px.to_rgb(n).to_lab())
     }
 }
 
 impl ToLABBitmap for ImgRef<'_, RGBLU> {
     #[inline]
     fn to_lab(&self) -> Vec<GBitmap> {
-        rgb_to_lab(*self, |px, _n|{
-            px.to_lab()
-        })
+        #[cfg(all(feature = "fma", target_arch = "x86_64"))]
+        {
+            use archmage::SimdToken as _;
+            if let Some(token) = archmage::X64V3Token::summon() {
+                return simd::rgb_to_lab_simd(token, *self);
+            }
+        }
+        rgb_to_lab(*self, |px, _n| px.to_lab())
     }
 }
 
@@ -172,9 +418,13 @@ fn cbrts1() {
         let actual = a * a * a;
         let expected = x;
         let absdiff = (expected as f64 - actual as f64).abs();
-        assert!(absdiff < 0.0002, "{expected} - {actual} = {} @ {x}", expected - actual);
+        assert!(
+            absdiff < 0.0002,
+            "{expected} - {actual} = {} @ {x}",
+            expected - actual
+        );
         if i % 400 == 0 {
-            println!("{:+0.3}", (expected - actual)*255.);
+            println!("{:+0.3}", (expected - actual) * 255.);
         }
         totaldiff += absdiff;
         maxdiff = maxdiff.max(absdiff);
@@ -194,7 +444,11 @@ fn cbrts2() {
         let absdiff = (expected - actual).abs();
         totaldiff += absdiff;
         maxdiff = maxdiff.max(absdiff);
-        assert!(absdiff < 0.0000005, "{expected} - {actual} = {} @ {x}", expected - actual);
+        assert!(
+            absdiff < 0.0000005,
+            "{expected} - {actual} = {} @ {x}",
+            expected - actual
+        );
     }
     println!("2={totaldiff:0.6}; {maxdiff:0.8}");
     assert!(totaldiff < 0.0025, "{totaldiff}");
diff --git a/dssim-core/src/val.rs b/dssim-core/src/val.rs
index f6165df..adca3c8 100644
--- a/dssim-core/src/val.rs
+++ b/dssim-core/src/val.rs
@@ -128,10 +128,18 @@ impl PartialOrd<f64> for Dssim {
         self.0.partial_cmp(other)
     }
 
-    fn lt(&self, other: &f64) -> bool { self.0.lt(other) }
-    fn le(&self, other: &f64) -> bool { self.0.le(other) }
-    fn gt(&self, other: &f64) -> bool { self.0.gt(other) }
-    fn ge(&self, other: &f64) -> bool { self.0.ge(other) }
+    fn lt(&self, other: &f64) -> bool {
+        self.0.lt(other)
+    }
+    fn le(&self, other: &f64) -> bool {
+        self.0.le(other)
+    }
+    fn gt(&self, other: &f64) -> bool {
+        self.0.gt(other)
+    }
+    fn ge(&self, other: &f64) -> bool {
+        self.0.ge(other)
+    }
 }
 
 impl PartialOrd<Dssim> for f64 {
@@ -139,8 +147,16 @@ impl PartialOrd<Dssim> for f64 {
         self.partial_cmp(&other.0)
     }
 
-    fn lt(&self, other: &Dssim) -> bool { self.lt(&other.0) }
-    fn le(&self, other: &Dssim) -> bool { self.le(&other.0) }
-    fn gt(&self, other: &Dssim) -> bool { self.gt(&other.0) }
-    fn ge(&self, other: &Dssim) -> bool { self.ge(&other.0) }
+    fn lt(&self, other: &Dssim) -> bool {
+        self.lt(&other.0)
+    }
+    fn le(&self, other: &Dssim) -> bool {
+        self.le(&other.0)
+    }
+    fn gt(&self, other: &Dssim) -> bool {
+        self.gt(&other.0)
+    }
+    fn ge(&self, other: &Dssim) -> bool {
+        self.ge(&other.0)
+    }
 }
diff --git a/src/lib.rs b/src/lib.rs
index aefef03..d6ad9f3 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -4,25 +4,32 @@
 
 pub use dssim_core::*;
 use imgref::Img;
-use load_image::ImageData;
 use std::path::Path;
 
-fn load(attr: &Dssim, path: &Path) -> Result<DssimImage<f32>, load_image::Error> {
-    let img = load_image::load_path(path)?;
-    Ok(match img.bitmap {
-        ImageData::RGB8(ref bitmap) => attr.create_image(&Img::new(bitmap.to_rgblu(), img.width, img.height)),
-        ImageData::RGB16(ref bitmap) => attr.create_image(&Img::new(bitmap.to_rgblu(), img.width, img.height)),
-        ImageData::RGBA8(ref bitmap) => attr.create_image(&Img::new(bitmap.to_rgbaplu(), img.width, img.height)),
-        ImageData::RGBA16(ref bitmap) => attr.create_image(&Img::new(bitmap.to_rgbaplu(), img.width, img.height)),
-        ImageData::GRAY8(ref bitmap) => attr.create_image(&Img::new(bitmap.to_rgblu(), img.width, img.height)),
-        ImageData::GRAY16(ref bitmap) => attr.create_image(&Img::new(bitmap.to_rgblu(), img.width, img.height)),
-        ImageData::GRAYA8(ref bitmap) => attr.create_image(&Img::new(bitmap.to_rgbaplu(), img.width, img.height)),
-        ImageData::GRAYA16(ref bitmap) => attr.create_image(&Img::new(bitmap.to_rgbaplu(), img.width, img.height)),
-    }.expect("infallible"))
+pub mod load;
+
+use load::PixelData;
+
+fn load_impl(attr: &Dssim, path: &Path) -> Result<DssimImage<f32>, load::LoadError> {
+    let (width, height, pixels) = load::load_path(path)?;
+    Ok(match pixels {
+        PixelData::Rgb8(ref bm) => attr.create_image(&Img::new(bm.to_rgblu(), width, height)),
+        PixelData::Rgb16(ref bm) => attr.create_image(&Img::new(bm.to_rgblu(), width, height)),
+        PixelData::Rgba8(ref bm) => attr.create_image(&Img::new(bm.to_rgbaplu(), width, height)),
+        PixelData::Rgba16(ref bm) => attr.create_image(&Img::new(bm.to_rgbaplu(), width, height)),
+        PixelData::Gray8(ref bm) => attr.create_image(&Img::new(bm.to_rgblu(), width, height)),
+        PixelData::Gray16(ref bm) => attr.create_image(&Img::new(bm.to_rgblu(), width, height)),
+        PixelData::GrayA8(ref bm) => attr.create_image(&Img::new(bm.to_rgbaplu(), width, height)),
+        PixelData::GrayA16(ref bm) => attr.create_image(&Img::new(bm.to_rgbaplu(), width, height)),
+    }
+    .expect("infallible"))
 }
 
 /// Load PNG or JPEG image from the given path. Applies color profiles and converts to `sRGB`.
 #[inline]
-pub fn load_image(attr: &Dssim, path: impl AsRef<Path>) -> Result<DssimImage<f32>, load_image::Error> {
-    load(attr, path.as_ref())
+pub fn load_image(
+    attr: &Dssim,
+    path: impl AsRef<Path>,
+) -> Result<DssimImage<f32>, load::LoadError> {
+    load_impl(attr, path.as_ref())
 }
diff --git a/src/load.rs b/src/load.rs
new file mode 100644
index 0000000..8b02a82
--- /dev/null
+++ b/src/load.rs
@@ -0,0 +1,475 @@
+use rgb::{ComponentBytes, FromSlice, Gray, GrayAlpha, RGB, RGBA};
+use std::io::Cursor;
+use std::path::Path;
+
+#[derive(Debug)]
+pub enum LoadError {
+    Io(std::io::Error),
+    Png(png::DecodingError),
+    Jpeg(zune_jpeg::errors::DecodeErrors),
+    ColorProfile(moxcms::CmsError),
+    Pnm(zenbitmaps::BitmapError),
+    UnsupportedFormat,
+}
+
+impl std::fmt::Display for LoadError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            Self::Io(e) => write!(f, "I/O error: {e}"),
+            Self::Png(e) => write!(f, "PNG error: {e}"),
+            Self::Jpeg(e) => write!(f, "JPEG error: {e}"),
+            Self::ColorProfile(e) => write!(f, "Color profile error: {e}"),
+            Self::Pnm(e) => write!(f, "PNM error: {e}"),
+            Self::UnsupportedFormat => {
+                write!(f, "Unsupported image format (expected PNG, JPEG, or PNM)")
+            }
+        }
+    }
+}
+
+impl std::error::Error for LoadError {
+    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
+        match self {
+            Self::Io(e) => Some(e),
+            Self::Png(e) => Some(e),
+            Self::Jpeg(e) => Some(e),
+            Self::ColorProfile(e) => Some(e),
+            Self::Pnm(e) => Some(e),
+            Self::UnsupportedFormat => None,
+        }
+    }
+}
+
+impl From<std::io::Error> for LoadError {
+    fn from(e: std::io::Error) -> Self {
+        Self::Io(e)
+    }
+}
+
+impl From<png::DecodingError> for LoadError {
+    fn from(e: png::DecodingError) -> Self {
+        Self::Png(e)
+    }
+}
+
+impl From<zune_jpeg::errors::DecodeErrors> for LoadError {
+    fn from(e: zune_jpeg::errors::DecodeErrors) -> Self {
+        Self::Jpeg(e)
+    }
+}
+
+impl From<moxcms::CmsError> for LoadError {
+    fn from(e: moxcms::CmsError) -> Self {
+        Self::ColorProfile(e)
+    }
+}
+
+impl From<zenbitmaps::BitmapError> for LoadError {
+    fn from(e: zenbitmaps::BitmapError) -> Self {
+        Self::Pnm(e)
+    }
+}
+
+pub enum PixelData {
+    Rgb8(Vec<RGB<u8>>),
+    Rgba8(Vec<RGBA<u8>>),
+    Gray8(Vec<Gray<u8>>),
+    GrayA8(Vec<GrayAlpha<u8>>),
+    Rgb16(Vec<RGB<u16>>),
+    Rgba16(Vec<RGBA<u16>>),
+    Gray16(Vec<Gray<u16>>),
+    GrayA16(Vec<GrayAlpha<u16>>),
+}
+
+fn is_graya_opaque_u8(pixels: &[GrayAlpha<u8>]) -> bool {
+    pixels.iter().all(|p| p.a == 255)
+}
+
+fn is_graya_opaque_u16(pixels: &[GrayAlpha<u16>]) -> bool {
+    pixels.iter().all(|p| p.a == 65535)
+}
+
+fn is_rgba_opaque_u8(pixels: &[RGBA<u8>]) -> bool {
+    pixels.iter().all(|p| p.a == 255)
+}
+
+fn is_rgba_opaque_u16(pixels: &[RGBA<u16>]) -> bool {
+    pixels.iter().all(|p| p.a == 65535)
+}
+
+pub fn load_path(path: &Path) -> Result<(usize, usize, PixelData), LoadError> {
+    let data = std::fs::read(path)?;
+    if data.starts_with(b"\x89PNG") {
+        load_png(&data)
+    } else if data.starts_with(&[0xFF, 0xD8]) {
+        load_jpeg(&data)
+    } else if data.len() >= 2
+        && data[0] == b'P'
+        && matches!(data[1], b'5' | b'6' | b'7' | b'f' | b'F')
+    {
+        load_pnm(&data)
+    } else {
+        Err(LoadError::UnsupportedFormat)
+    }
+}
+
+fn load_png(data: &[u8]) -> Result<(usize, usize, PixelData), LoadError> {
+    let mut decoder = png::Decoder::new(Cursor::new(data));
+    decoder.set_transformations(png::Transformations::EXPAND);
+    let mut reader = decoder.read_info()?;
+
+    let info = reader.info();
+    let icc_profile = info.icc_profile.as_ref().map(|c| c.to_vec());
+    let has_srgb_chunk = info.srgb.is_some();
+
+    let (color_type, bit_depth) = reader.output_color_type();
+    let buf_size = reader.output_buffer_size().unwrap_or(0);
+    let width = info.width as usize;
+    let height = info.height as usize;
+
+    let mut buf = vec![0u8; buf_size];
+    reader.next_frame(&mut buf)?;
+
+    let is_16bit = bit_depth == png::BitDepth::Sixteen;
+
+    // Apply ICC profile if present and no sRGB chunk
+    let needs_icc = !has_srgb_chunk && icc_profile.is_some();
+    let icc_data = icc_profile.as_deref();
+
+    match (color_type, is_16bit) {
+        (png::ColorType::Grayscale, false) => {
+            let mut pixels: Vec<Gray<u8>> = buf[..width * height].as_gray().to_vec();
+            if needs_icc {
+                apply_icc_8bit(
+                    pixels.as_mut_slice().as_bytes_mut(),
+                    icc_data.unwrap(),
+                    moxcms::Layout::Gray,
+                )?;
+            }
+            Ok((width, height, PixelData::Gray8(pixels)))
+        }
+        (png::ColorType::Grayscale, true) => {
+            let mut pixels = bytes_to_gray16_be(&buf, width * height);
+            if needs_icc {
+                apply_icc_16bit(
+                    pixels.as_mut_slice().as_bytes_mut(),
+                    icc_data.unwrap(),
+                    moxcms::Layout::Gray,
+                )?;
+            }
+            Ok((width, height, PixelData::Gray16(pixels)))
+        }
+        (png::ColorType::GrayscaleAlpha, false) => {
+            let mut pixels: Vec<GrayAlpha<u8>> = buf[..width * height * 2].as_gray_alpha().to_vec();
+            if is_graya_opaque_u8(&pixels) {
+                // Strip alpha for fully-opaque images (matches load_image behavior)
+                let mut gray: Vec<Gray<u8>> = pixels.iter().map(|p| Gray::new(p.v)).collect();
+                if needs_icc {
+                    apply_icc_8bit(
+                        gray.as_mut_slice().as_bytes_mut(),
+                        icc_data.unwrap(),
+                        moxcms::Layout::Gray,
+                    )?;
+                }
+                Ok((width, height, PixelData::Gray8(gray)))
+            } else {
+                if needs_icc {
+                    apply_icc_8bit(
+                        pixels.as_mut_slice().as_bytes_mut(),
+                        icc_data.unwrap(),
+                        moxcms::Layout::GrayAlpha,
+                    )?;
+                }
+                Ok((width, height, PixelData::GrayA8(pixels)))
+            }
+        }
+        (png::ColorType::GrayscaleAlpha, true) => {
+            let mut pixels = bytes_to_graya16_be(&buf, width * height);
+            if is_graya_opaque_u16(&pixels) {
+                let mut gray: Vec<Gray<u16>> = pixels.iter().map(|p| Gray::new(p.v)).collect();
+                if needs_icc {
+                    apply_icc_16bit(
+                        gray.as_mut_slice().as_bytes_mut(),
+                        icc_data.unwrap(),
+                        moxcms::Layout::Gray,
+                    )?;
+                }
+                Ok((width, height, PixelData::Gray16(gray)))
+            } else {
+                if needs_icc {
+                    apply_icc_16bit(
+                        pixels.as_mut_slice().as_bytes_mut(),
+                        icc_data.unwrap(),
+                        moxcms::Layout::GrayAlpha,
+                    )?;
+                }
+                Ok((width, height, PixelData::GrayA16(pixels)))
+            }
+        }
+        (png::ColorType::Rgb, false) => {
+            let mut pixels: Vec<RGB<u8>> = buf[..width * height * 3].as_rgb().to_vec();
+            if needs_icc {
+                apply_icc_8bit(
+                    pixels.as_mut_slice().as_bytes_mut(),
+                    icc_data.unwrap(),
+                    moxcms::Layout::Rgb,
+                )?;
+            }
+            Ok((width, height, PixelData::Rgb8(pixels)))
+        }
+        (png::ColorType::Rgb, true) => {
+            let mut pixels = bytes_to_rgb16_be(&buf, width * height);
+            if needs_icc {
+                apply_icc_16bit(
+                    pixels.as_mut_slice().as_bytes_mut(),
+                    icc_data.unwrap(),
+                    moxcms::Layout::Rgb,
+                )?;
+            }
+            Ok((width, height, PixelData::Rgb16(pixels)))
+        }
+        (png::ColorType::Rgba, false) => {
+            let mut pixels: Vec<RGBA<u8>> = buf[..width * height * 4].as_rgba().to_vec();
+            if is_rgba_opaque_u8(&pixels) {
+                let mut rgb: Vec<RGB<u8>> =
+                    pixels.iter().map(|p| RGB::new(p.r, p.g, p.b)).collect();
+                if needs_icc {
+                    apply_icc_8bit(
+                        rgb.as_mut_slice().as_bytes_mut(),
+                        icc_data.unwrap(),
+                        moxcms::Layout::Rgb,
+                    )?;
+                }
+                Ok((width, height, PixelData::Rgb8(rgb)))
+            } else {
+                if needs_icc {
+                    apply_icc_8bit(
+                        pixels.as_mut_slice().as_bytes_mut(),
+                        icc_data.unwrap(),
+                        moxcms::Layout::Rgba,
+                    )?;
+                }
+                Ok((width, height, PixelData::Rgba8(pixels)))
+            }
+        }
+        (png::ColorType::Rgba, true) => {
+            let mut pixels = bytes_to_rgba16_be(&buf, width * height);
+            if is_rgba_opaque_u16(&pixels) {
+                let mut rgb: Vec<RGB<u16>> =
+                    pixels.iter().map(|p| RGB::new(p.r, p.g, p.b)).collect();
+                if needs_icc {
+                    apply_icc_16bit(
+                        rgb.as_mut_slice().as_bytes_mut(),
+                        icc_data.unwrap(),
+                        moxcms::Layout::Rgb,
+                    )?;
+                }
+                Ok((width, height, PixelData::Rgb16(rgb)))
+            } else {
+                if needs_icc {
+                    apply_icc_16bit(
+                        pixels.as_mut_slice().as_bytes_mut(),
+                        icc_data.unwrap(),
+                        moxcms::Layout::Rgba,
+                    )?;
+                }
+                Ok((width, height, PixelData::Rgba16(pixels)))
+            }
+        }
+        _ => Err(LoadError::UnsupportedFormat),
+    }
+}
+
+fn load_jpeg(data: &[u8]) -> Result<(usize, usize, PixelData), LoadError> {
+    use zune_jpeg::zune_core::bytestream::ZCursor;
+    use zune_jpeg::zune_core::colorspace::ColorSpace;
+    use zune_jpeg::zune_core::options::DecoderOptions;
+
+    let mut decoder = zune_jpeg::JpegDecoder::new(ZCursor::new(data));
+    decoder.decode_headers()?;
+
+    let icc_profile = decoder.icc_profile();
+    let (width, height) = decoder.dimensions().ok_or(LoadError::UnsupportedFormat)?;
+    let input_cs = decoder
+        .input_colorspace()
+        .ok_or(LoadError::UnsupportedFormat)?;
+
+    // For grayscale JPEGs, decode as Luma to match the GRAY ICC profile.
+    // zune-jpeg defaults to RGB output even for 1-component images.
+    if input_cs == ColorSpace::Luma {
+        decoder.set_options(DecoderOptions::default().jpeg_set_out_colorspace(ColorSpace::Luma));
+    }
+
+    let mut pixels = decoder.decode()?;
+    let output_cs = decoder
+        .output_colorspace()
+        .ok_or(LoadError::UnsupportedFormat)?;
+
+    match output_cs {
+        ColorSpace::Luma => {
+            if let Some(ref icc_data) = icc_profile {
+                apply_icc_8bit(&mut pixels, icc_data, moxcms::Layout::Gray)?;
+            }
+            let gray: Vec<Gray<u8>> = pixels.into_iter().map(Gray::new).collect();
+            Ok((width, height, PixelData::Gray8(gray)))
+        }
+        ColorSpace::RGB => {
+            if let Some(ref icc_data) = icc_profile {
+                apply_icc_8bit(&mut pixels, icc_data, moxcms::Layout::Rgb)?;
+            }
+            let rgb: Vec<RGB<u8>> = pixels
+                .chunks_exact(3)
+                .map(|c| RGB::new(c[0], c[1], c[2]))
+                .collect();
+            Ok((width, height, PixelData::Rgb8(rgb)))
+        }
+        _ => Err(LoadError::UnsupportedFormat),
+    }
+}
+
+fn load_pnm(data: &[u8]) -> Result<(usize, usize, PixelData), LoadError> {
+    let decoded = zenbitmaps::decode(data, zenbitmaps::Unstoppable)?;
+    let w = decoded.width as usize;
+    let h = decoded.height as usize;
+    let pixels = decoded.pixels();
+
+    match decoded.layout {
+        zenbitmaps::PixelLayout::Gray8 => Ok((w, h, PixelData::Gray8(pixels.as_gray().to_vec()))),
+        zenbitmaps::PixelLayout::Rgb8 => Ok((w, h, PixelData::Rgb8(pixels.as_rgb().to_vec()))),
+        zenbitmaps::PixelLayout::Rgba8 => {
+            let rgba = pixels.as_rgba();
+            if is_rgba_opaque_u8(rgba) {
+                Ok((
+                    w,
+                    h,
+                    PixelData::Rgb8(rgba.iter().map(|p| RGB::new(p.r, p.g, p.b)).collect()),
+                ))
+            } else {
+                Ok((w, h, PixelData::Rgba8(rgba.to_vec())))
+            }
+        }
+        zenbitmaps::PixelLayout::Gray16 => {
+            // zenbitmaps provides native-endian u16
+            let gray: Vec<Gray<u16>> = pixels
+                .chunks_exact(2)
+                .map(|c| Gray::new(u16::from_ne_bytes([c[0], c[1]])))
+                .collect();
+            Ok((w, h, PixelData::Gray16(gray)))
+        }
+        _ => Err(LoadError::UnsupportedFormat),
+    }
+}
+
+/// ICC profile colorspace is at bytes 16-19
+fn icc_is_gray(icc_data: &[u8]) -> bool {
+    icc_data.len() >= 20 && &icc_data[16..20] == b"GRAY"
+}
+
+/// Create the appropriate sRGB-equivalent destination profile.
+/// Gray ICC profiles need a gray destination; RGB profiles need sRGB.
+fn srgb_destination(is_gray: bool) -> moxcms::ColorProfile {
+    if is_gray {
+        // Build a gray profile with the exact sRGB parametric TRC.
+        // new_gray_with_gamma(2.2) is a poor approximation — sRGB uses a
+        // piecewise curve: f(x) = (a*x+b)^g for x >= d, f(x) = e*x for x < d
+        let mut profile = moxcms::ColorProfile::new_gray_with_gamma(2.2);
+        profile.gray_trc = Some(moxcms::ToneReprCurve::Parametric(vec![
+            2.4,
+            1. / 1.055,
+            0.055 / 1.055,
+            1. / 12.92,
+            0.04045,
+        ]));
+        profile
+    } else {
+        moxcms::ColorProfile::new_srgb()
+    }
+}
+
+fn apply_icc_8bit(
+    buf: &mut [u8],
+    icc_data: &[u8],
+    layout: moxcms::Layout,
+) -> Result<(), LoadError> {
+    let src = moxcms::ColorProfile::new_from_slice(icc_data)?;
+    let dst = srgb_destination(icc_is_gray(icc_data));
+    let transform =
+        src.create_transform_8bit(layout, &dst, layout, moxcms::TransformOptions::default())?;
+    let mut out = vec![0u8; buf.len()];
+    transform.transform(buf, &mut out)?;
+    buf.copy_from_slice(&out);
+    Ok(())
+}
+
+fn apply_icc_16bit(
+    buf: &mut [u8],
+    icc_data: &[u8],
+    layout: moxcms::Layout,
+) -> Result<(), LoadError> {
+    let src = moxcms::ColorProfile::new_from_slice(icc_data)?;
+    let dst = srgb_destination(icc_is_gray(icc_data));
+    let transform =
+        src.create_transform_16bit(layout, &dst, layout, moxcms::TransformOptions::default())?;
+    let pixel_count = buf.len() / 2;
+    let src_u16: Vec<u16> = buf
+        .chunks_exact(2)
+        .map(|c| u16::from_ne_bytes([c[0], c[1]]))
+        .collect();
+    let mut dst_u16 = vec![0u16; pixel_count];
+    transform.transform(&src_u16, &mut dst_u16)?;
+    for (chunk, &val) in buf.chunks_exact_mut(2).zip(dst_u16.iter()) {
+        let bytes = val.to_ne_bytes();
+        chunk[0] = bytes[0];
+        chunk[1] = bytes[1];
+    }
+    Ok(())
+}
+
+// PNG stores 16-bit values in big-endian byte order.
+// Convert to native-endian typed pixels.
+
+fn bytes_to_gray16_be(buf: &[u8], count: usize) -> Vec<Gray<u16>> {
+    buf[..count * 2]
+        .chunks_exact(2)
+        .map(|c| Gray::new(u16::from_be_bytes([c[0], c[1]])))
+        .collect()
+}
+
+fn bytes_to_graya16_be(buf: &[u8], count: usize) -> Vec<GrayAlpha<u16>> {
+    buf[..count * 4]
+        .chunks_exact(4)
+        .map(|c| {
+            GrayAlpha::new(
+                u16::from_be_bytes([c[0], c[1]]),
+                u16::from_be_bytes([c[2], c[3]]),
+            )
+        })
+        .collect()
+}
+
+fn bytes_to_rgb16_be(buf: &[u8], count: usize) -> Vec<RGB<u16>> {
+    buf[..count * 6]
+        .chunks_exact(6)
+        .map(|c| {
+            RGB::new(
+                u16::from_be_bytes([c[0], c[1]]),
+                u16::from_be_bytes([c[2], c[3]]),
+                u16::from_be_bytes([c[4], c[5]]),
+            )
+        })
+        .collect()
+}
+
+fn bytes_to_rgba16_be(buf: &[u8], count: usize) -> Vec<RGBA<u16>> {
+    buf[..count * 8]
+        .chunks_exact(8)
+        .map(|c| {
+            RGBA::new(
+                u16::from_be_bytes([c[0], c[1]]),
+                u16::from_be_bytes([c[2], c[3]]),
+                u16::from_be_bytes([c[4], c[5]]),
+                u16::from_be_bytes([c[6], c[7]]),
+            )
+        })
+        .collect()
+}
diff --git a/src/main.rs b/src/main.rs
index f7929fc..e636c60 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -25,20 +25,27 @@ use std::path::PathBuf;
 use std::process::ExitCode;
 
 fn usage(argv0: &str) {
-    eprintln!("\
-       Usage: {argv0} original.png modified.png [modified.png...]\
-     \n   or: {argv0} -o difference.png original.png modified.png\n\n\
+    eprintln!(
+        "\
+       Usage: {argv0} original modified [modified...]\
+     \n   or: {argv0} -o difference.png original modified\n\n\
        Compares first image against subsequent images, and outputs\n\
        1/SSIM-1 difference for each of them in order (0 = identical).\n\n\
        Images must have identical size, but may have different gamma & depth.\n\
-       \nVersion {} https://kornel.ski/dssim\n", env!("CARGO_PKG_VERSION"));
+       \nVersion {} https://kornel.ski/dssim\n",
+        env!("CARGO_PKG_VERSION")
+    );
 }
 
 #[inline(always)]
 fn to_byte(i: f32) -> u8 {
-    if i <= 0.0 {0}
-    else if i >= 255.0/256.0 {255}
-    else {(i * 256.0) as u8}
+    if i <= 0.0 {
+        0
+    } else if i >= 255.0 / 256.0 {
+        255
+    } else {
+        (i * 256.0) as u8
+    }
 }
 
 fn main() -> ExitCode {
@@ -87,11 +94,17 @@ fn run() -> Result<(), Box<dyn std::error::Error>> {
     std::thread::scope(|scope| {
         let decode_thread = || {
             let images_send = images_send; // ensure it's moved, and attr isn't
-            filenames_recv.into_iter().try_for_each(|(i, file): (usize, PathBuf)| {
-                dssim::load_image(&attr, &file)
-                    .map_err(|e| format!("Can't load {}, because: {e}", file.display()))
-                    .and_then(|image| images_send.send(i, (file, image)).map_err(|_| "Aborted".into()))
-            })
+            filenames_recv
+                .into_iter()
+                .try_for_each(|(i, file): (usize, PathBuf)| {
+                    dssim::load_image(&attr, &file)
+                        .map_err(|e| format!("Can't load {}, because: {e}", file.display()))
+                        .and_then(|image| {
+                            images_send
+                                .send(i, (file, image))
+                                .map_err(|_| "Aborted".into())
+                        })
+                })
         };
 
         let threads = [
@@ -100,16 +113,26 @@ fn run() -> Result<(), Box<dyn std::error::Error>> {
         ];
 
         let result = (|| {
-            files.into_iter().map(PathBuf::from).enumerate()
+            files
+                .into_iter()
+                .map(PathBuf::from)
+                .enumerate()
                 .try_for_each(move |f| filenames_send.send(f))?;
 
             let (file1, original) = images_recv.next().ok_or("Can't load any images")?;
 
             for (file2, modified) in images_recv {
                 if original.width() != modified.width() || original.height() != modified.height() {
-                    return Err(format!("Image {} has a different size ({}x{}) than {} ({}x{})\n",
-                        file2.display(), modified.width(), modified.height(),
-                        file1.display(), original.width(), original.height()).into());
+                    return Err(format!(
+                        "Image {} has a different size ({}x{}) than {} ({}x{})\n",
+                        file2.display(),
+                        modified.width(),
+                        modified.height(),
+                        file1.display(),
+                        original.width(),
+                        original.height()
+                    )
+                    .into());
                 }
 
                 let (dssim, ssim_maps) = attr.compare(&original, modified);
@@ -123,36 +146,66 @@ fn run() -> Result<(), Box<dyn std::error::Error>> {
             Ok(())
         })();
 
-        threads.into_iter().try_for_each(|t| t.join().map_err(|_| "thread panicked; this is a bug")?)?;
+        threads
+            .into_iter()
+            .try_for_each(|t| t.join().map_err(|_| "thread panicked; this is a bug")?)?;
         result
     })
 }
 
-fn write_ssim_maps(ssim_maps: Vec<dssim_core::SsimMap>, map_output_file: &str) -> Result<(), Box<dyn std::error::Error>> {
+fn write_ssim_maps(
+    ssim_maps: Vec<dssim_core::SsimMap>,
+    map_output_file: &str,
+) -> Result<(), Box<dyn std::error::Error>> {
     #[cfg(feature = "threads")]
     let ssim_maps_iter = ssim_maps.par_iter();
     #[cfg(not(feature = "threads"))]
     let ssim_maps_iter = ssim_maps.iter();
     ssim_maps_iter.enumerate().try_for_each(|(n, map_meta)| {
         let avgssim = map_meta.ssim as f32;
-        let out: Vec<_> = map_meta.map.pixels().map(|ssim|{
-            let max = 1_f32 - ssim;
-            let maxsq = max * max;
-            rgb::RGBA8 {
-                r: to_byte(maxsq * 16.0),
-                g: to_byte(max * 3.0),
-                b: to_byte(max / ((1_f32 - avgssim) * 4_f32)),
-                a: 255,
-            }
-        }).collect();
-        lodepng::encode32_file(format!("{map_output_file}-{n}.png"), &out, map_meta.map.width(), map_meta.map.height())
-            .map_err(|e| {
-                format!("Can't write {map_output_file}: {e}")
+        let out: Vec<_> = map_meta
+            .map
+            .pixels()
+            .map(|ssim| {
+                let max = 1_f32 - ssim;
+                let maxsq = max * max;
+                rgb::RGBA8 {
+                    r: to_byte(maxsq * 16.0),
+                    g: to_byte(max * 3.0),
+                    b: to_byte(max / ((1_f32 - avgssim) * 4_f32)),
+                    a: 255,
+                }
             })
+            .collect();
+        write_png_rgba(
+            &format!("{map_output_file}-{n}.png"),
+            &out,
+            map_meta.map.width(),
+            map_meta.map.height(),
+        )
+        .map_err(|e| format!("Can't write {map_output_file}: {e}"))
     })?;
     Ok(())
 }
 
+fn write_png_rgba(
+    path: &str,
+    data: &[rgb::RGBA8],
+    width: usize,
+    height: usize,
+) -> Result<(), Box<dyn std::error::Error>> {
+    use rgb::ComponentBytes;
+
+    let file = std::io::BufWriter::new(std::fs::File::create(path)?);
+    let mut encoder = png::Encoder::new(file, width as u32, height as u32);
+    encoder.set_color(png::ColorType::Rgba);
+    encoder.set_depth(png::BitDepth::Eight);
+    let mut writer = encoder.write_header()?;
+    writer.write_image_data(data.as_bytes())?;
+    writer.finish()?;
+    Ok(())
+}
+
 #[test]
 fn image_gray() {
     let attr = dssim::Dssim::new();