diff --git a/Cargo.lock b/Cargo.lock index 6e85543..79ac2d0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1,6 +1,6 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. -version = 3 +version = 4 [[package]] name = "adler2" @@ -9,25 +9,25 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" [[package]] -name = "aom-decode" -version = "0.2.13" +name = "archmage" +version = "0.8.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "045795fa770d3b7ec2a98d96d8a4a3a81402d1e7bb71a70968f0d799729deda0" +checksum = "c0fb818f2e02ca08cc6416725044fadc84bcf5a96e2eb7ab103c966123667d86" dependencies = [ - "avif-parse", - "imgref", - "libaom-sys", - "log", - "quick-error", - "rgb", - "yuv", + "archmage-macros", + "safe_unaligned_simd", ] [[package]] -name = "arrayvec" -version = "0.7.6" +name = "archmage-macros" +version = "0.8.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" +checksum = "1886761e37995842cfbaf8c24d483e62d9b8c174254d29146f596d4d87efff2d" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] [[package]] name = "autocfg" @@ -36,66 +36,22 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" [[package]] -name = "avif-parse" -version = "1.4.0" +name = "bitflags" +version = "2.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f85ce2a7cd14ac0a30dc29a115de22466aeb8a029410f9f1e4f283443c959d1" -dependencies = [ - "arrayvec", - "bitreader", - "byteorder", - "fallible_collections", - "leb128", - "log", -] - -[[package]] -name = "bitreader" -version = "0.3.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "886559b1e163d56c765bc3a985febb4eee8009f625244511d8ee3c432e08c066" -dependencies = [ - "cfg-if", -] +checksum = "843867be96c8daad0d758b57df9392b6d8d271134fce549de6ce169ff98a92af" [[package]] name = "bytemuck" -version = "1.23.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3995eaeebcdf32f91f980d360f78732ddc061097ab4e39991ae7a6ace9194677" - -[[package]] -name = "byteorder" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" - -[[package]] -name = "cc" -version = "1.2.38" +version = "1.25.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "80f41ae168f955c12fb8960b057d70d0ca153fb83182b57d86380443527be7e9" -dependencies = [ - "find-msvc-tools", - "jobserver", - "libc", - "shlex", -] +checksum = "c8efb64bd706a16a1bdde310ae86b351e4d21550d98d056f22f8a7f7a2183fec" [[package]] name = "cfg-if" -version = "1.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2fd1289c04a9ea8cb22300a459a72a385d7c73d3259e2ed7dcb2af674838cfa9" - -[[package]] -name = "cmake" -version = "0.1.54" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7caa3f9de89ddbe2c607f4101924c5abec803763ae9534e4f4d7d8f84aa81f0" -dependencies = [ - "cc", -] +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" [[package]] name = "crc32fast" @@ -148,30 +104,28 @@ dependencies = [ "dssim-core", "getopts", "imgref", - "load_image", - "lodepng", + "moxcms", "ordered-channel", + "png", "rayon", "rgb", + "zenbitmaps", + "zune-jpeg", ] [[package]] name = "dssim-core" version = "3.4.0" dependencies = [ + "archmage", "imgref", "itertools", "lodepng", + "magetypes", "rayon", "rgb", ] -[[package]] -name = "dunce" -version = "1.0.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813" - [[package]] name = "either" version = "1.15.0" @@ -179,55 +133,31 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" [[package]] -name = "fallible_collections" -version = "0.5.1" +name = "enough" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b3e85d14d419ba3e1db925519461c0d17a49bdd2d67ea6316fa965ca7acdf74" +checksum = "177a59811828ea5dbc8927662d444ffee7b55a44696486f1b1a245ae9e652533" [[package]] -name = "find-msvc-tools" -version = "0.1.2" +name = "fdeflate" +version = "0.3.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ced73b1dacfc750a6db6c0a0c3a3853c8b41997e2e2c563dc90804ae6867959" +checksum = "1e6853b52649d4ac5c0bd02320cddc5ba956bdb407c4b75a2c6b75bf51500f8c" +dependencies = [ + "simd-adler32", +] [[package]] name = "flate2" -version = "1.1.2" +version = "1.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a3d7db9596fecd151c5f638c0ee5d5bd487b6e0ea232e5dc96d5250f6f94b1d" +checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c" dependencies = [ "crc32fast", - "libz-rs-sys", "miniz_oxide", + "zlib-rs", ] -[[package]] -name = "foreign-types" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d737d9aa519fb7b749cbc3b962edcf310a8dd1f4b67c91c4f83975dbdd17d965" -dependencies = [ - "foreign-types-macros", - "foreign-types-shared", -] - -[[package]] -name = "foreign-types-macros" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a5c6c585bc94aaf2c7b51dd4c2ba22680844aba4c687be581871a6f518c5742" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "foreign-types-shared" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa9a19cbb55df58761df49b23516a86d432839add4af60fc256da840f66ed35b" - [[package]] name = "getopts" version = "0.2.24" @@ -237,23 +167,11 @@ dependencies = [ "unicode-width", ] -[[package]] -name = "getrandom" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26145e563e54f2cadc477553f1ec5ee650b00862f0a58bcd12cbdc5f0ea2d2f4" -dependencies = [ - "cfg-if", - "libc", - "r-efi", - "wasi", -] - [[package]] name = "imgref" -version = "1.11.0" +version = "1.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d0263a3d970d5c054ed9312c0057b4f3bde9c0b33836d3637361d4a9e6e7a408" +checksum = "e7c5cedc30da3a610cac6b4ba17597bdf7152cf974e8aab3afb3d54455e371c8" [[package]] name = "itertools" @@ -264,124 +182,17 @@ dependencies = [ "either", ] -[[package]] -name = "jobserver" -version = "0.1.34" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33" -dependencies = [ - "getrandom", - "libc", -] - -[[package]] -name = "jpeg-decoder" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "00810f1d8b74be64b13dbf3db89ac67740615d6c891f0e7b6179326533011a07" -dependencies = [ - "rayon", -] - -[[package]] -name = "lcms2" -version = "6.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b75877b724685dd49310bdbadbf973fc69b1d01992a6d4a861b928fc3943f87b" -dependencies = [ - "bytemuck", - "foreign-types", - "lcms2-sys", -] - -[[package]] -name = "lcms2-sys" -version = "4.0.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1c2604b23848ca80b2add60f0fb2270fd980e622c25029b6597fa01cfd5f8d5f" -dependencies = [ - "cc", - "dunce", - "libc", - "pkg-config", -] - -[[package]] -name = "leb128" -version = "0.2.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "884e2677b40cc8c339eaefcb701c32ef1fd2493d71118dc0ca4b6a736c93bd67" - -[[package]] -name = "libaom-sys" -version = "0.17.2+libaom.3.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7c4fce8aaf0c2d8534529b0698ed98ec1da8d35319588d20b0d79ce6446e4194" -dependencies = [ - "cmake", -] - [[package]] name = "libc" -version = "0.2.175" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a82ae493e598baaea5209805c49bbf2ea7de956d50d7da0da1164f9c6d28543" - -[[package]] -name = "libwebp" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7792a82f95b5b2528d9fe1642231f972d2cdd73b91ccbcb6ab214c2cf1a74f4" -dependencies = [ - "libwebp-sys2", -] - -[[package]] -name = "libwebp-sys2" -version = "0.1.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4790186411a6843ecc0a141c8948c8e623a0bb5e886834b1b6c90f3dfa85bb99" -dependencies = [ - "cc", - "cfg-if", - "libc", - "pkg-config", - "vcpkg", -] - -[[package]] -name = "libz-rs-sys" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "840db8cf39d9ec4dd794376f38acc40d0fc65eec2a8f484f7fd375b84602becd" -dependencies = [ - "zlib-rs", -] - -[[package]] -name = "load_image" -version = "3.3.1" +version = "0.2.182" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13af80aa49d27d6334430fcc8aa55c01acdf23209da94f670e217650320644c0" -dependencies = [ - "aom-decode", - "bytemuck", - "imgref", - "jpeg-decoder", - "lcms2", - "libwebp", - "lodepng", - "mozjpeg", - "quick-error", - "rexif", - "rgb", -] +checksum = "6800badb6cb2082ffd7b6a67e6125bb39f18782f793520caee8cb8846be06112" [[package]] name = "lodepng" -version = "3.12.1" +version = "3.12.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77a32335d22e44238e2bb0b4d726964d18952ce1f1279ec3305305d2c61539eb" +checksum = "fe7982db11054edc023a1b424dddcc65be18f71fa46ec6bde2efcfc1fb6b22da" dependencies = [ "crc32fast", "flate2", @@ -390,10 +201,13 @@ dependencies = [ ] [[package]] -name = "log" -version = "0.4.28" +name = "magetypes" +version = "0.8.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34080505efa8e45a4b816c349525ebe327ceaa8559756f0356cba97ef3bf7432" +checksum = "9233ec8c05f4844b1b4d92b5fd55ae179e8f7d1c09415ec45ceeda294237c42c" +dependencies = [ + "archmage", +] [[package]] name = "miniz_oxide" @@ -402,41 +216,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316" dependencies = [ "adler2", + "simd-adler32", ] [[package]] -name = "mozjpeg" -version = "0.10.13" +name = "moxcms" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b7891b80aaa86097d38d276eb98b3805d6280708c4e0a1e6f6aed9380c51fec9" +checksum = "04e5f643aebb3c117fa77e268557e3bf19e250769062820fcaf3ff33ea560adf" dependencies = [ - "arrayvec", - "bytemuck", - "libc", - "mozjpeg-sys", - "rgb", -] - -[[package]] -name = "mozjpeg-sys" -version = "2.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f0dc668bf9bf888c88e2fb1ab16a406d2c380f1d082b20d51dd540ab2aa70c1" -dependencies = [ - "cc", - "dunce", - "libc", - "nasm-rs", -] - -[[package]] -name = "nasm-rs" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34f676553b60ccbb76f41f9ae8f2428dac3f259ff8f1c2468a174778d06a1af9" -dependencies = [ - "jobserver", - "log", + "num-traits", + "pxfm", ] [[package]] @@ -455,41 +245,42 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "95be4d57809897b5a7539fc15a7dfe0e84141bc3dfaa2e9b1b27caa90acf61ab" [[package]] -name = "pkg-config" -version = "0.3.32" +name = "png" +version = "0.18.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" +checksum = "60769b8b31b2a9f263dae2776c37b1b28ae246943cf719eb6946a1db05128a61" +dependencies = [ + "bitflags", + "crc32fast", + "fdeflate", + "flate2", + "miniz_oxide", +] [[package]] name = "proc-macro2" -version = "1.0.101" +version = "1.0.106" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89ae43fd86e4158d6db51ad8e2b80f313af9cc74f5c0e03ccb87de09998732de" +checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" dependencies = [ "unicode-ident", ] [[package]] -name = "quick-error" -version = "2.0.1" +name = "pxfm" +version = "0.1.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a993555f31e5a609f617c12db6250dedcac1b0a85076912c436e6fc9b2c8e6a3" +checksum = "b5a041e753da8b807c9255f28de81879c78c876392ff2469cde94799b2896b9d" [[package]] name = "quote" -version = "1.0.40" +version = "1.0.45" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" +checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924" dependencies = [ "proc-macro2", ] -[[package]] -name = "r-efi" -version = "5.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" - [[package]] name = "rayon" version = "1.11.0" @@ -510,32 +301,32 @@ dependencies = [ "crossbeam-utils", ] -[[package]] -name = "rexif" -version = "0.7.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be932047c168919c8d5af065b16fa7d4bd24835ffa256bf0cf1ff463f91c15df" - [[package]] name = "rgb" -version = "0.8.52" +version = "0.8.53" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c6a884d2998352bb4daf0183589aec883f16a6da1f4dde84d8e2e9a5409a1ce" +checksum = "47b34b781b31e5d73e9fbc8689c70551fd1ade9a19e3e28cfec8580a79290cc4" dependencies = [ "bytemuck", ] [[package]] -name = "shlex" -version = "1.3.0" +name = "safe_unaligned_simd" +version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" +checksum = "f9c874fb993656d484725d58fbfddb39dfcf98ebcadc1a1d09568aa6b39bafba" + +[[package]] +name = "simd-adler32" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e320a6c5ad31d271ad523dcf3ad13e2767ad8b1cb8f047f75a8aeaf8da139da2" [[package]] name = "syn" -version = "2.0.106" +version = "2.0.117" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ede7c438028d4436d71104916910f5bb611972c5cfd7f89b8300a8186e6fada6" +checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" dependencies = [ "proc-macro2", "quote", @@ -543,59 +334,64 @@ dependencies = [ ] [[package]] -name = "unicode-ident" -version = "1.0.19" +name = "thiserror" +version = "2.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f63a545481291138910575129486daeaf8ac54aee4387fe7906919f7830c7d9d" +checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4" +dependencies = [ + "thiserror-impl", +] [[package]] -name = "unicode-width" -version = "0.2.1" +name = "thiserror-impl" +version = "2.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a1a07cc7db3810833284e8d372ccdc6da29741639ecc70c9ec107df0fa6154c" +checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] [[package]] -name = "vcpkg" -version = "0.2.15" +name = "unicode-ident" +version = "1.0.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" +checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" [[package]] -name = "wasi" -version = "0.14.7+wasi-0.2.4" +name = "unicode-width" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "883478de20367e224c0090af9cf5f9fa85bed63a95c1abf3afc5c083ebc06e8c" -dependencies = [ - "wasip2", -] +checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254" [[package]] -name = "wasip2" -version = "1.0.1+wasi-0.2.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0562428422c63773dad2c345a1882263bbf4d65cf3f42e90921f787ef5ad58e7" +name = "zenbitmaps" +version = "0.1.0" +source = "git+https://github.com/imazen/zenbitmaps.git?rev=818992c#818992c0e56b5b5d544880964d796037cbc49eba" dependencies = [ - "wit-bindgen", + "enough", + "rgb", + "thiserror", ] [[package]] -name = "wit-bindgen" -version = "0.46.0" +name = "zlib-rs" +version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f17a85883d4e6d00e8a97c586de764dabcc06133f7f1d55dce5cdc070ad7fe59" +checksum = "3be3d40e40a133f9c916ee3f9f4fa2d9d63435b5fbe1bfc6d9dae0aa0ada1513" [[package]] -name = "yuv" -version = "0.1.9" +name = "zune-core" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8cbe2d856acbe6d86c0fa6f458b73e962834061ca2f7f94c6e4633afc9efd4d4" -dependencies = [ - "num-traits", - "rgb", -] +checksum = "cb8a0807f7c01457d0379ba880ba6322660448ddebc890ce29bb64da71fb40f9" [[package]] -name = "zlib-rs" -version = "0.5.2" +name = "zune-jpeg" +version = "0.5.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f06ae92f42f5e5c42443fd094f245eb656abf56dd7cce9b8b263236565e00f2" +checksum = "410e9ecef634c709e3831c2cfdb8d9c32164fae1c67496d5b68fff728eec37fe" +dependencies = [ + "zune-core", +] diff --git a/Cargo.toml b/Cargo.toml index ffed2a4..3e51cb8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,8 +11,8 @@ name = "dssim" readme = "README.md" repository = "https://github.com/kornelski/dssim.git" version = "3.4.1" -edition = "2021" -rust-version = "1.72" +edition = "2024" +rust-version = "1.85" [[bin]] doctest = false @@ -25,20 +25,17 @@ imgref = "1.11.0" getopts = "0.2.24" rayon = { version = "1.10.0", optional = true } rgb = "0.8.52" -lodepng = { version = "3.12", default-features = false, features = ["rust_backend"] } -load_image = { version = "3.3", features = ["lcms2-static"] } +png = "0.18" +zune-jpeg = "0.5" +moxcms = "0.8" crossbeam-channel = "0.5.15" ordered-channel = { version = "1.2.0" } +zenbitmaps = { git = "https://github.com/imazen/zenbitmaps.git", rev = "818992c", features = ["rgb"] } [features] -default = ["threads", "dssim-core/default", "no-macos-vimage"] +default = ["threads", "dssim-core/default", "no-macos-vimage", "fma"] threads = ["dep:rayon", "dssim-core/threads"] -avif = ["load_image/avif"] -# Support comparing WebP files directly -webp = ["load_image/webp"] -webp-static = ["load_image/webp-static"] -# Decode JPEGs using libjpeg-turbo derivative (closer to the reference decoder) -mozjpeg = ["load_image/mozjpeg"] +fma = ["dssim-core/fma"] no-macos-vimage = ["dssim-core/no-macos-vimage"] [package.metadata.release] diff --git a/benches/compare.rs b/benches/compare.rs index b973e58..afdb7c8 100644 --- a/benches/compare.rs +++ b/benches/compare.rs @@ -1,25 +1,29 @@ #![feature(test)] extern crate test; -use dssim::{ToRGBAPLU, RGBAPLU}; +use dssim::{RGBAPLU, ToRGBAPLU}; use imgref::{Img, ImgVec}; use test::Bencher; fn load(path: &str) -> Result, lodepng::Error> { let image = lodepng::decode32_file(path)?; - Ok(Img::new(image.buffer.to_rgbaplu(), image.width, image.height)) + Ok(Img::new( + image.buffer.to_rgbaplu(), + image.width, + image.height, + )) } #[bench] fn compare(bench: &mut Bencher) { let attr = dssim::Dssim::new(); let other = &load("tests/test1-sm.png").unwrap(); - let orig = attr.create_image(&load("tests/test2-sm.png").unwrap()).unwrap(); + let orig = attr + .create_image(&load("tests/test2-sm.png").unwrap()) + .unwrap(); let modif = attr.create_image(other).unwrap(); - bench.iter(|| { - attr.compare(&orig, modif.clone()) - }); + bench.iter(|| attr.compare(&orig, modif.clone())); } #[bench] @@ -27,7 +31,5 @@ fn create_image(bench: &mut Bencher) { let attr = dssim::Dssim::new(); let img = &load("tests/test1-sm.png").unwrap(); - bench.iter(|| { - attr.create_image(img) - }); + bench.iter(|| attr.create_image(img)); } diff --git a/dssim-core/Cargo.toml b/dssim-core/Cargo.toml index 6edc493..874f5c0 100644 --- a/dssim-core/Cargo.toml +++ b/dssim-core/Cargo.toml @@ -11,7 +11,7 @@ name = "dssim-core" readme = "README.md" repository = "https://github.com/kornelski/dssim.git" version = "3.4.0" -edition = "2021" +edition = "2024" [lib] crate-type = ["lib", "staticlib"] @@ -19,6 +19,8 @@ crate-type = ["lib", "staticlib"] [dependencies] imgref = "1.11.0" itertools = "0.14" +archmage = { version = "0.8.6", optional = true } +magetypes = { version = "0.8.6", optional = true } rayon = { version = "1.11.0", optional = true } rgb = "0.8.52" @@ -28,6 +30,7 @@ lodepng = "3.12.1" [features] default = ["threads"] threads = ["dep:rayon"] +fma = ["dep:archmage", "dep:magetypes"] no-macos-vimage = [] # internal for cargo-c only capi = [] diff --git a/dssim-core/src/blur.rs b/dssim-core/src/blur.rs index 3f63618..69fa1d5 100644 --- a/dssim-core/src/blur.rs +++ b/dssim-core/src/blur.rs @@ -1,21 +1,23 @@ - +#[cfg(all(target_os = "macos", not(feature = "no-macos-vimage")))] const KERNEL: [f32; 9] = [ - 0.095332, 0.118095, 0.095332, - 0.118095, 0.146293, 0.118095, - 0.095332, 0.118095, 0.095332, + 0.095332, 0.118095, 0.095332, 0.118095, 0.146293, 0.118095, 0.095332, 0.118095, 0.095332, ]; +#[inline] +pub(crate) fn zeroed_f32_vec(len: usize) -> Vec { + vec![0.0f32; len] +} + #[cfg(all(target_os = "macos", not(feature = "no-macos-vimage")))] mod mac { use super::KERNEL; - use crate::ffi::vImageConvolve_PlanarF; - use crate::ffi::vImagePixelCount; use crate::ffi::vImage_Buffer; use crate::ffi::vImage_Flags::kvImageEdgeExtend; + use crate::ffi::vImageConvolve_PlanarF; + use crate::ffi::vImagePixelCount; use imgref::*; - use std::mem::MaybeUninit; - pub fn blur(src: ImgRef<'_, f32>, tmp: &mut [MaybeUninit]) -> ImgVec { + pub fn blur(src: ImgRef<'_, f32>, tmp: &mut [f32]) -> ImgVec { let width = src.width(); let height = src.height(); @@ -25,22 +27,19 @@ mod mac { rowBytes: src.stride() * std::mem::size_of::(), data: src.buf().as_ptr(), }; - let mut dst_vec = Vec::with_capacity(width * height); + let mut dst_vec = vec![0f32; width * height]; let mut dstbuf = vImage_Buffer { width: width as vImagePixelCount, height: height as vImagePixelCount, rowBytes: width * std::mem::size_of::(), - data: dst_vec.spare_capacity_mut().as_mut_ptr().cast(), + data: dst_vec.as_mut_ptr(), }; do_blur(&srcbuf, tmp, &mut dstbuf, width, height); - unsafe { - dst_vec.set_len(dst_vec.capacity()); - } ImgVec::new(dst_vec, width, height) } - pub fn blur_in_place(mut srcdst: ImgRefMut<'_, f32>, tmp: &mut [MaybeUninit]) { + pub fn blur_in_place(mut srcdst: ImgRefMut<'_, f32>, tmp: &mut [f32]) { let srcbuf = vImage_Buffer { width: srcdst.width() as vImagePixelCount, height: srcdst.height() as vImagePixelCount, @@ -57,7 +56,27 @@ mod mac { do_blur(&srcbuf, tmp, &mut dstbuf, srcdst.width(), srcdst.height()); } - fn do_blur(srcbuf: &vImage_Buffer<*const f32>, tmp: &mut [MaybeUninit], dstbuf: &mut vImage_Buffer<*mut f32>, width: usize, height: usize) { + /// Blur the element-wise product of two images. On macOS, falls back to + /// multiply then blur since vImage has no fused variant. + pub fn blur_mul(src1: ImgRef<'_, f32>, src2: ImgRef<'_, f32>, tmp: &mut [f32]) -> Vec { + let width = src1.width(); + let height = src1.height(); + let mut product: Vec = src1 + .pixels() + .zip(src2.pixels()) + .map(|(a, b)| a * b) + .collect(); + blur_in_place(ImgRefMut::new(&mut product, width, height), tmp); + product + } + + fn do_blur( + srcbuf: &vImage_Buffer<*const f32>, + tmp: &mut [f32], + dstbuf: &mut vImage_Buffer<*mut f32>, + width: usize, + height: usize, + ) { assert_eq!(tmp.len(), width * height); unsafe { @@ -65,18 +84,40 @@ mod mac { width: width as vImagePixelCount, height: height as vImagePixelCount, rowBytes: width * std::mem::size_of::(), - data: tmp.as_mut_ptr().cast::(), + data: tmp.as_mut_ptr(), }; - let res = vImageConvolve_PlanarF(srcbuf, &mut tmpwrbuf, std::ptr::null_mut(), 0, 0, KERNEL.as_ptr(), 3, 3, 0., kvImageEdgeExtend); + let res = vImageConvolve_PlanarF( + srcbuf, + &mut tmpwrbuf, + std::ptr::null_mut(), + 0, + 0, + KERNEL.as_ptr(), + 3, + 3, + 0., + kvImageEdgeExtend, + ); assert_eq!(0, res); let tmprbuf = vImage_Buffer { width: width as vImagePixelCount, height: height as vImagePixelCount, rowBytes: width * std::mem::size_of::(), - data: tmp.as_ptr().cast::(), + data: tmp.as_ptr(), }; - let res = vImageConvolve_PlanarF(&tmprbuf, dstbuf, std::ptr::null_mut(), 0, 0, KERNEL.as_ptr(), 3, 3, 0., kvImageEdgeExtend); + let res = vImageConvolve_PlanarF( + &tmprbuf, + dstbuf, + std::ptr::null_mut(), + 0, + 0, + KERNEL.as_ptr(), + 3, + 3, + 0., + kvImageEdgeExtend, + ); assert_eq!(0, res); } } @@ -84,101 +125,459 @@ mod mac { #[cfg(not(all(target_os = "macos", not(feature = "no-macos-vimage"))))] mod portable { - use super::KERNEL; + use super::zeroed_f32_vec; use imgref::*; - use std::cmp::min; - use std::mem::MaybeUninit; - #[inline] - unsafe fn do3f(prev: &[f32], curr: &[f32], next: &[f32], i: usize) -> f32 { - debug_assert!(i > 0); + // 1D kernel from separable decomposition of the 3×3 kernel. + // Symmetric: K1D = [K_SIDE, K_CENTER, K_SIDE]. + const K_SIDE: f32 = 0.308_758_86; + const K_CENTER: f32 = 0.382_482_8; + + // Fused double-blur 5-tap kernel: convolving K1D with itself. + // K5 = [K5_OUTER, K5_INNER, K5_MID, K5_INNER, K5_OUTER] + // This makes H→V→H→V equivalent to a single H5→V5 pass, halving memory traffic. + const K5_OUTER: f32 = K_SIDE * K_SIDE; + const K5_INNER: f32 = 2.0 * K_SIDE * K_CENTER; + const K5_MID: f32 = 2.0 * K_SIDE * K_SIDE + K_CENTER * K_CENTER; + + /// Horizontal 5-tap blur. Equivalent to two sequential 3-tap horizontal blurs. + #[inline(never)] + fn blur_h5(src: &[f32], dst: &mut [f32], width: usize, height: usize, src_stride: usize) { + let last = width - 1; + for y in 0..height { + let row = &src[y * src_stride..][..width]; + let out = &mut dst[y * width..][..width]; + + // Left edge pixels (0, 1): clamp negative indices to 0 + for i in 0..2.min(width) { + let m2 = row[0]; // i-2 and i-1 clamp to 0 + let m1 = if i >= 1 { row[i - 1] } else { row[0] }; + let p1 = if i < last { row[i + 1] } else { row[last] }; + let p2 = if i + 2 <= last { row[i + 2] } else { row[last] }; + out[i] = (m2 + p2) * K5_OUTER + (m1 + p1) * K5_INNER + row[i] * K5_MID; + } - let c0 = i - 1; - let c1 = i; - let c2 = i + 1; + // Inner pixels: 2 <= i <= width-3 + for i in 2..width.saturating_sub(2) { + out[i] = (row[i - 2] + row[i + 2]) * K5_OUTER + + (row[i - 1] + row[i + 1]) * K5_INNER + + row[i] * K5_MID; + } - (prev.get_unchecked(c0)*KERNEL[0] + prev.get_unchecked(c1)*KERNEL[1] + prev.get_unchecked(c2)*KERNEL[2]) + - (curr.get_unchecked(c0)*KERNEL[3] + curr.get_unchecked(c1)*KERNEL[4] + curr.get_unchecked(c2)*KERNEL[5]) + - (next.get_unchecked(c0)*KERNEL[6] + next.get_unchecked(c1)*KERNEL[7] + next.get_unchecked(c2)*KERNEL[8]) + // Right edge pixels: clamp beyond-end indices to last + for i in width.saturating_sub(2).max(2)..width { + let p1 = if i < last { row[i + 1] } else { row[last] }; + let p2 = if i + 2 <= last { row[i + 2] } else { row[last] }; + out[i] = + (row[i - 2] + p2) * K5_OUTER + (row[i - 1] + p1) * K5_INNER + row[i] * K5_MID; + } + } } - fn do3(prev: &[f32], curr: &[f32], next: &[f32], i: usize, width: usize) -> f32 { - let c0 = if i > 0 { i - 1 } else { 0 }; - let c1 = i; - let c2 = min(i + 1, width - 1); + /// Vertical 5-tap blur. Equivalent to two sequential 3-tap vertical blurs. + #[inline(never)] + fn blur_v5(src: &[f32], dst: &mut [f32], width: usize, height: usize, dst_stride: usize) { + let last_y = height - 1; - prev[c2].mul_add(KERNEL[2], prev[c0].mul_add(KERNEL[0], prev[c1] * KERNEL[1])) + - curr[c2].mul_add(KERNEL[5], curr[c0].mul_add(KERNEL[3], curr[c1] * KERNEL[4])) + - next[c2].mul_add(KERNEL[8], next[c0].mul_add(KERNEL[6], next[c1] * KERNEL[7])) + for y in 0..height { + let ym2 = y.saturating_sub(2); + let ym1 = y.saturating_sub(1); + let yp1 = (y + 1).min(last_y); + let yp2 = (y + 2).min(last_y); + + let rm2 = &src[ym2 * width..][..width]; + let rm1 = &src[ym1 * width..][..width]; + let rc = &src[y * width..][..width]; + let rp1 = &src[yp1 * width..][..width]; + let rp2 = &src[yp2 * width..][..width]; + + let out = &mut dst[y * dst_stride..][..width]; + + for x in 0..width { + out[x] = + (rm2[x] + rp2[x]) * K5_OUTER + (rm1[x] + rp1[x]) * K5_INNER + rc[x] * K5_MID; + } + } } - pub fn blur(src: ImgRef<'_, f32>, tmp: &mut [MaybeUninit]) -> ImgVec { - let width = src.width(); - let height = src.height(); - let tmp_dst = ImgRefMut::new(tmp, width, height); - let tmp_src = do_blur(src, tmp_dst); - let mut dst_vec = Vec::with_capacity(width * height); - do_blur(tmp_src.as_ref(), ImgRefMut::new(dst_vec.spare_capacity_mut(), width, height)); - unsafe { - dst_vec.set_len(dst_vec.capacity()); + /// Horizontal 5-tap blur with fused element-wise multiply. + /// Computes blur(src1 * src2) in a single H5 pass. + #[allow(clippy::too_many_arguments)] + #[inline(never)] + fn blur_h5_mul( + src1: &[f32], + src2: &[f32], + dst: &mut [f32], + width: usize, + height: usize, + stride1: usize, + stride2: usize, + ) { + let last = width - 1; + for y in 0..height { + let r1 = &src1[y * stride1..][..width]; + let r2 = &src2[y * stride2..][..width]; + let out = &mut dst[y * width..][..width]; + + // General clamped access for edge pixels + let clamp = |i: isize| i.max(0).min(last as isize) as usize; + let prod = |i: isize| r1[clamp(i)] * r2[clamp(i)]; + + // Left edge pixels + for i in 0..2.min(width) { + let ii = i as isize; + out[i] = (prod(ii - 2) + prod(ii + 2)) * K5_OUTER + + (prod(ii - 1) + prod(ii + 1)) * K5_INNER + + (r1[i] * r2[i]) * K5_MID; + } + + // Inner pixels + for i in 2..width.saturating_sub(2) { + let pm2 = r1[i - 2] * r2[i - 2]; + let pm1 = r1[i - 1] * r2[i - 1]; + let pc = r1[i] * r2[i]; + let pp1 = r1[i + 1] * r2[i + 1]; + let pp2 = r1[i + 2] * r2[i + 2]; + out[i] = (pm2 + pp2) * K5_OUTER + (pm1 + pp1) * K5_INNER + pc * K5_MID; + } + + // Right edge pixels + for i in width.saturating_sub(2).max(2)..width { + let ii = i as isize; + out[i] = (prod(ii - 2) + prod(ii + 2)) * K5_OUTER + + (prod(ii - 1) + prod(ii + 1)) * K5_INNER + + (r1[i] * r2[i]) * K5_MID; + } } - ImgVec::new(dst_vec, width, height) } - fn do_blur<'d>(src: ImgRef<'_, f32>, mut dst: ImgRefMut<'d, MaybeUninit>) -> ImgRefMut<'d, f32> { - assert_eq!(src.width(), dst.width()); - assert_eq!(src.height(), dst.height()); - assert!(src.width() > 0); - assert!(src.width() < 1 << 24); - assert!(src.height() > 0); - assert!(src.height() < 1 << 24); - debug_assert!(src.pixels().all(|p| p.is_finite())); + // ── AVX2+FMA SIMD path ────────────────────────────────────────────── + #[cfg(all(feature = "fma", target_arch = "x86_64"))] + mod simd { + use super::{K5_INNER, K5_MID, K5_OUTER}; + use archmage::prelude::*; + use magetypes::simd::f32x8; + + #[arcane] + pub fn blur_avx2( + t: X64V3Token, + buf: &[f32], + tmp: &mut [f32], + dst: &mut [f32], + w: usize, + h: usize, + stride: usize, + ) { + blur_h5_simd(t, buf, tmp, w, h, stride); + blur_v5_simd(t, tmp, dst, w, h, w); + } - let width = src.width(); - let height = src.height(); - let src_stride = src.stride(); - let dst_stride = dst.stride(); - let src = src.buf(); - let dst = dst.buf_mut(); - - let mut prev = &src[0..width]; - let mut curr = prev; - let mut next = prev; - for y in 0..height { - prev = curr; - curr = next; - let next_start = (y+1)*src_stride; - next = if y+1 < height {&src[next_start..next_start+width]} else {curr}; + #[arcane] + pub fn blur_in_place_avx2( + t: X64V3Token, + buf: &mut [f32], + tmp: &mut [f32], + w: usize, + h: usize, + stride: usize, + ) { + blur_h5_simd(t, buf, tmp, w, h, stride); + blur_v5_simd(t, tmp, buf, w, h, stride); + } - let dstrow = &mut dst[y*dst_stride..y*dst_stride+width]; + #[allow(clippy::too_many_arguments)] + #[arcane] + pub fn blur_mul_avx2( + t: X64V3Token, + src1: &[f32], + src2: &[f32], + tmp: &mut [f32], + dst: &mut [f32], + w: usize, + h: usize, + stride1: usize, + stride2: usize, + ) { + blur_h5_mul_simd(t, src1, src2, tmp, w, h, stride1, stride2); + blur_v5_simd(t, tmp, dst, w, h, w); + } - dstrow[0].write(do3(prev, curr, next, 0, width)); - for i in 1..width-1 { - unsafe { - dstrow[i].write(do3f(prev, curr, next, i)); + #[rite] + fn blur_h5_simd( + t: X64V3Token, + src: &[f32], + dst: &mut [f32], + width: usize, + height: usize, + src_stride: usize, + ) { + let vk_outer = f32x8::splat(t, K5_OUTER); + let vk_inner = f32x8::splat(t, K5_INNER); + let vk_mid = f32x8::splat(t, K5_MID); + let last = width - 1; + + for y in 0..height { + let row = &src[y * src_stride..][..width]; + let out = &mut dst[y * width..][..width]; + + // Left edge: scalar (2 pixels) + for i in 0..2.min(width) { + let ii = i as isize; + let get = |j: isize| row[j.max(0).min(last as isize) as usize]; + out[i] = (get(ii - 2) + get(ii + 2)) * K5_OUTER + + (get(ii - 1) + get(ii + 1)) * K5_INNER + + row[i] * K5_MID; + } + + // SIMD loop: needs row[i-2..i+10] valid + let mut i = 2; + while i + 10 <= width { + let far_left = f32x8::load(t, (&row[i - 2..i + 6]).try_into().unwrap()); + let left = f32x8::load(t, (&row[i - 1..i + 7]).try_into().unwrap()); + let center = f32x8::load(t, (&row[i..i + 8]).try_into().unwrap()); + let right = f32x8::load(t, (&row[i + 1..i + 9]).try_into().unwrap()); + let far_right = f32x8::load(t, (&row[i + 2..i + 10]).try_into().unwrap()); + let outer_sum = far_left + far_right; + let inner_sum = left + right; + let result = + outer_sum.mul_add(vk_outer, inner_sum.mul_add(vk_inner, center * vk_mid)); + result.store((&mut out[i..i + 8]).try_into().unwrap()); + i += 8; + } + + // Scalar tail + right edges + while i < width { + let ii = i as isize; + let get = |j: isize| row[j.max(0).min(last as isize) as usize]; + out[i] = (get(ii - 2) + get(ii + 2)) * K5_OUTER + + (get(ii - 1) + get(ii + 1)) * K5_INNER + + row[i] * K5_MID; + i += 1; } } - if width > 1 { - dstrow[width-1].write(do3(prev, curr, next, width-1, width)); + } + + #[rite] + fn blur_v5_simd( + t: X64V3Token, + src: &[f32], + dst: &mut [f32], + width: usize, + height: usize, + dst_stride: usize, + ) { + let vk_outer = f32x8::splat(t, K5_OUTER); + let vk_inner = f32x8::splat(t, K5_INNER); + let vk_mid = f32x8::splat(t, K5_MID); + let last_y = height - 1; + + for y in 0..height { + let ym2 = y.saturating_sub(2); + let ym1 = y.saturating_sub(1); + let yp1 = (y + 1).min(last_y); + let yp2 = (y + 2).min(last_y); + + let rm2 = &src[ym2 * width..][..width]; + let rm1 = &src[ym1 * width..][..width]; + let rc = &src[y * width..][..width]; + let rp1 = &src[yp1 * width..][..width]; + let rp2 = &src[yp2 * width..][..width]; + + let out = &mut dst[y * dst_stride..][..width]; + + let mut x = 0; + while x + 8 <= width { + let vm2 = f32x8::load(t, (&rm2[x..x + 8]).try_into().unwrap()); + let vm1 = f32x8::load(t, (&rm1[x..x + 8]).try_into().unwrap()); + let vc = f32x8::load(t, (&rc[x..x + 8]).try_into().unwrap()); + let vp1 = f32x8::load(t, (&rp1[x..x + 8]).try_into().unwrap()); + let vp2 = f32x8::load(t, (&rp2[x..x + 8]).try_into().unwrap()); + let outer = vm2 + vp2; + let inner = vm1 + vp1; + let result = outer.mul_add(vk_outer, inner.mul_add(vk_inner, vc * vk_mid)); + result.store((&mut out[x..x + 8]).try_into().unwrap()); + x += 8; + } + + // Scalar tail + while x < width { + out[x] = (rm2[x] + rp2[x]) * K5_OUTER + + (rm1[x] + rp1[x]) * K5_INNER + + rc[x] * K5_MID; + x += 1; + } } } - // assumes init after writing all the data - unsafe { - ImgRefMut::new_stride(std::slice::from_raw_parts_mut(dst.as_mut_ptr().cast(), dst.len()), width, height, dst_stride) + #[allow(clippy::too_many_arguments)] + #[rite] + fn blur_h5_mul_simd( + t: X64V3Token, + src1: &[f32], + src2: &[f32], + dst: &mut [f32], + width: usize, + height: usize, + stride1: usize, + stride2: usize, + ) { + let vk_outer = f32x8::splat(t, K5_OUTER); + let vk_inner = f32x8::splat(t, K5_INNER); + let vk_mid = f32x8::splat(t, K5_MID); + let last = width - 1; + + for y in 0..height { + let r1 = &src1[y * stride1..][..width]; + let r2 = &src2[y * stride2..][..width]; + let out = &mut dst[y * width..][..width]; + + // Left edge: scalar (2 pixels) + for i in 0..2.min(width) { + let ii = i as isize; + let cl = |j: isize| j.max(0).min(last as isize) as usize; + let p = |j: isize| r1[cl(j)] * r2[cl(j)]; + out[i] = (p(ii - 2) + p(ii + 2)) * K5_OUTER + + (p(ii - 1) + p(ii + 1)) * K5_INNER + + (r1[i] * r2[i]) * K5_MID; + } + + // SIMD loop: needs [i-2..i+10] valid from both sources + let mut i = 2; + while i + 10 <= width { + let l1m2 = f32x8::load(t, (&r1[i - 2..i + 6]).try_into().unwrap()); + let l2m2 = f32x8::load(t, (&r2[i - 2..i + 6]).try_into().unwrap()); + let l1m1 = f32x8::load(t, (&r1[i - 1..i + 7]).try_into().unwrap()); + let l2m1 = f32x8::load(t, (&r2[i - 1..i + 7]).try_into().unwrap()); + let l1c = f32x8::load(t, (&r1[i..i + 8]).try_into().unwrap()); + let l2c = f32x8::load(t, (&r2[i..i + 8]).try_into().unwrap()); + let l1p1 = f32x8::load(t, (&r1[i + 1..i + 9]).try_into().unwrap()); + let l2p1 = f32x8::load(t, (&r2[i + 1..i + 9]).try_into().unwrap()); + let l1p2 = f32x8::load(t, (&r1[i + 2..i + 10]).try_into().unwrap()); + let l2p2 = f32x8::load(t, (&r2[i + 2..i + 10]).try_into().unwrap()); + + let p_m2 = l1m2 * l2m2; + let p_m1 = l1m1 * l2m1; + let p_c = l1c * l2c; + let p_p1 = l1p1 * l2p1; + let p_p2 = l1p2 * l2p2; + + let outer = p_m2 + p_p2; + let inner = p_m1 + p_p1; + let result = outer.mul_add(vk_outer, inner.mul_add(vk_inner, p_c * vk_mid)); + result.store((&mut out[i..i + 8]).try_into().unwrap()); + i += 8; + } + + // Scalar tail + right edges + while i < width { + let ii = i as isize; + let cl = |j: isize| j.max(0).min(last as isize) as usize; + let p = |j: isize| r1[cl(j)] * r2[cl(j)]; + out[i] = (p(ii - 2) + p(ii + 2)) * K5_OUTER + + (p(ii - 1) + p(ii + 1)) * K5_INNER + + (r1[i] * r2[i]) * K5_MID; + i += 1; + } + } } } - pub fn blur_in_place(srcdst: ImgRefMut<'_, f32>, tmp: &mut [MaybeUninit]) { - let tmp_dst = ImgRefMut::new(tmp, srcdst.width(), srcdst.height()); - let tmp_src = do_blur(srcdst.as_ref(), tmp_dst); - do_blur(tmp_src.as_ref(), as_maybe_uninit(srcdst)); + pub fn blur(src: ImgRef<'_, f32>, tmp: &mut [f32]) -> ImgVec { + let width = src.width(); + let height = src.height(); + assert!(width > 0 && width < 1 << 24); + assert!(height > 0 && height < 1 << 24); + debug_assert!(src.pixels().all(|p| p.is_finite())); + + let pixels = width * height; + assert!(tmp.len() >= pixels); + let tmp = &mut tmp[..pixels]; + let mut dst = zeroed_f32_vec(pixels); + + #[cfg(all(feature = "fma", target_arch = "x86_64"))] + { + use archmage::SimdToken as _; + if let Some(token) = archmage::X64V3Token::summon() { + simd::blur_avx2(token, src.buf(), tmp, &mut dst, width, height, src.stride()); + return ImgVec::new(dst, width, height); + } + } + + blur_h5(src.buf(), tmp, width, height, src.stride()); + blur_v5(tmp, &mut dst, width, height, width); + + ImgVec::new(dst, width, height) + } + + pub fn blur_in_place(mut srcdst: ImgRefMut<'_, f32>, tmp: &mut [f32]) { + let width = srcdst.width(); + let height = srcdst.height(); + let stride = srcdst.stride(); + let pixels = width * height; + + assert!(tmp.len() >= pixels); + let tmp = &mut tmp[..pixels]; + let buf = srcdst.buf_mut(); + + #[cfg(all(feature = "fma", target_arch = "x86_64"))] + { + use archmage::SimdToken as _; + if let Some(token) = archmage::X64V3Token::summon() { + simd::blur_in_place_avx2(token, buf, tmp, width, height, stride); + return; + } + } + + blur_h5(buf, tmp, width, height, stride); + blur_v5(tmp, buf, width, height, stride); } - fn as_maybe_uninit(img: ImgRefMut<'_, f32>) -> ImgRefMut<'_, MaybeUninit> { - img.map_buf(|dst| unsafe { - std::slice::from_raw_parts_mut(dst.as_mut_ptr().cast::>(), dst.len()) - }) + /// Blur the element-wise product of two images: blur(src1 * src2). + /// Fuses the multiply into the horizontal pass, then does a single vertical pass. + pub fn blur_mul(src1: ImgRef<'_, f32>, src2: ImgRef<'_, f32>, tmp: &mut [f32]) -> Vec { + let width = src1.width(); + let height = src1.height(); + debug_assert_eq!(width, src2.width()); + debug_assert_eq!(height, src2.height()); + assert!(width > 0 && width < 1 << 24); + assert!(height > 0 && height < 1 << 24); + + let pixels = width * height; + assert!(tmp.len() >= pixels); + let tmp = &mut tmp[..pixels]; + let mut dst = zeroed_f32_vec(pixels); + + #[cfg(all(feature = "fma", target_arch = "x86_64"))] + { + use archmage::SimdToken as _; + if let Some(token) = archmage::X64V3Token::summon() { + simd::blur_mul_avx2( + token, + src1.buf(), + src2.buf(), + tmp, + &mut dst, + width, + height, + src1.stride(), + src2.stride(), + ); + return dst; + } + } + + blur_h5_mul( + src1.buf(), + src2.buf(), + tmp, + width, + height, + src1.stride(), + src2.stride(), + ); + blur_v5(tmp, &mut dst, width, height, width); + + dst } } @@ -196,9 +595,9 @@ fn blur_zero() { let src = vec![0.25]; let mut src2 = src.clone(); - let mut tmp = vec![-55.; 1]; tmp.clear(); - let dst = blur(ImgRef::new(&src[..], 1,1), tmp.spare_capacity_mut()); - blur_in_place(ImgRefMut::new(&mut src2[..], 1, 1), tmp.spare_capacity_mut()); + let mut tmp = vec![0.; 1]; + let dst = blur(ImgRef::new(&src[..], 1, 1), &mut tmp); + blur_in_place(ImgRefMut::new(&mut src2[..], 1, 1), &mut tmp); assert_eq!(&src2, dst.buf()); assert!((0.25 - dst.buf()[0]).abs() < 0.00001); @@ -206,40 +605,43 @@ fn blur_zero() { #[test] fn blur_one() { - blur_one_compare(Img::new(vec![ - 0.,0.,0.,0.,0., - 0.,0.,0.,0.,0., - 0.,0.,1.,0.,0., - 0.,0.,0.,0.,0., - 0.,0.,0.,0.,0., - ], 5, 5)); + blur_one_compare(Img::new( + vec![ + 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., + 0., 0., 0., + ], + 5, + 5, + )); } #[test] fn blur_one_stride() { - let nan = 1./0.; - blur_one_compare(Img::new_stride(vec![ - 0.,0.,0.,0.,0., nan, -11., - 0.,0.,0.,0.,0., 333., nan, - 0.,0.,1.,0.,0., nan, -11., - 0.,0.,0.,0.,0., 333., nan, - 0.,0.,0.,0.,0., nan, - ], 5, 5, 7)); + let nan = 1. / 0.; + blur_one_compare(Img::new_stride( + vec![ + 0., 0., 0., 0., 0., nan, -11., 0., 0., 0., 0., 0., 333., nan, 0., 0., 1., 0., 0., nan, + -11., 0., 0., 0., 0., 0., 333., nan, 0., 0., 0., 0., 0., nan, + ], + 5, + 5, + 7, + )); } #[cfg(test)] fn blur_one_compare(src: ImgVec) { let mut src2 = src.clone(); - let mut tmp = vec![-55.; 5*5]; tmp.clear(); - let dst = blur(src.as_ref(), tmp.spare_capacity_mut()); - blur_in_place(src2.as_mut(), tmp.spare_capacity_mut()); + let mut tmp = vec![0.; 5 * 5]; + let dst = blur(src.as_ref(), &mut tmp); + blur_in_place(src2.as_mut(), &mut tmp); assert_eq!(&src2.pixels().collect::>(), dst.buf()); - assert!((1./110. - dst.buf()[0]).abs() < 0.0001, "{dst:?}"); - assert!((1./110. - dst.buf()[5*5-1]).abs() < 0.0001, "{dst:?}"); - assert!((0.11354011 - dst.buf()[2*5+2]).abs() < 0.0001); + assert!((1. / 110. - dst.buf()[0]).abs() < 0.0001, "{dst:?}"); + assert!((1. / 110. - dst.buf()[5 * 5 - 1]).abs() < 0.0001, "{dst:?}"); + assert!((0.11354011 - dst.buf()[2 * 5 + 2]).abs() < 0.0001); } #[test] @@ -247,9 +649,9 @@ fn blur_1x1() { let src = vec![1.]; let mut src2 = src.clone(); - let mut tmp = vec![-999.; 1]; tmp.clear(); - let dst = blur(ImgRef::new(&src[..], 1,1), tmp.spare_capacity_mut()); - blur_in_place(ImgRefMut::new(&mut src2[..], 1,1), tmp.spare_capacity_mut()); + let mut tmp = vec![0.; 1]; + let dst = blur(ImgRef::new(&src[..], 1, 1), &mut tmp); + blur_in_place(ImgRefMut::new(&mut src2[..], 1, 1), &mut tmp); assert!((dst.buf()[0] - 1.).abs() < 0.00001); assert!((src2[0] - 1.).abs() < 0.00001); @@ -258,38 +660,61 @@ fn blur_1x1() { #[test] fn blur_two() { let src = vec![ - 0.,1.,1.,1., - 1.,1.,1.,1., - 1.,1.,1.,1., - 1.,1.,1.,1., + 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., ]; let mut src2 = src.clone(); - let mut tmp = vec![-55.; 4*4]; tmp.clear(); - let dst = blur(ImgRef::new(&src[..], 4,4), tmp.spare_capacity_mut()); - blur_in_place(ImgRefMut::new(&mut src2[..], 4,4), tmp.spare_capacity_mut()); + let mut tmp = vec![0.; 4 * 4]; + let dst = blur(ImgRef::new(&src[..], 4, 4), &mut tmp); + blur_in_place(ImgRefMut::new(&mut src2[..], 4, 4), &mut tmp); assert_eq!(&src2, dst.buf()); - let z00 = 0.*KERNEL[0] + 0.*KERNEL[1] + 1.*KERNEL[2] + - 0.*KERNEL[3] + 0.*KERNEL[4] + 1.*KERNEL[5] + - 1.*KERNEL[6] + 1.*KERNEL[7] + 1.*KERNEL[8]; - let z01 = 0.*KERNEL[0] + 1.*KERNEL[1] + 1.*KERNEL[2] + - 0.*KERNEL[3] + 1.*KERNEL[4] + 1.*KERNEL[5] + - 1.*KERNEL[6] + 1.*KERNEL[7] + 1.*KERNEL[8]; - - let z10 = 0.*KERNEL[0] + 0.*KERNEL[1] + 1.*KERNEL[2] + - 1.*KERNEL[3] + 1.*KERNEL[4] + 1.*KERNEL[5] + - 1.*KERNEL[6] + 1.*KERNEL[7] + 1.*KERNEL[8]; - let z11 = 0.*KERNEL[0] + 1.*KERNEL[1] + 1.*KERNEL[2] + - 1.*KERNEL[3] + 1.*KERNEL[4] + 1.*KERNEL[5] + - 1.*KERNEL[6] + 1.*KERNEL[7] + 1.*KERNEL[8]; - let exp = z00*KERNEL[0] + z00*KERNEL[1] + z01*KERNEL[2] + - z00*KERNEL[3] + z00*KERNEL[4] + z01*KERNEL[5] + - z10*KERNEL[6] + z10*KERNEL[7] + z11*KERNEL[8]; - + // All-1 corners should remain 1.0 (kernel is normalized) assert!((1. - dst.buf()[3]).abs() < 0.0001, "{}", dst.buf()[3]); - assert!((1. - dst.buf()[3 * 4]).abs() < 0.0001, "{}", dst.buf()[3 * 4]); - assert!((1. - dst.buf()[4 * 4 - 1]).abs() < 0.0001, "{}", dst.buf()[4 * 4 - 1]); - assert!((f64::from(exp) - f64::from(dst.buf()[0])).abs() < 0.0000001); + assert!( + (1. - dst.buf()[3 * 4]).abs() < 0.0001, + "{}", + dst.buf()[3 * 4] + ); + assert!( + (1. - dst.buf()[4 * 4 - 1]).abs() < 0.0001, + "{}", + dst.buf()[4 * 4 - 1] + ); + + // Reference 5-tap computation for corner [0][0] + let k_side: f32 = 0.308_758_86; + let k_center: f32 = 0.382_482_8; + let k5o = k_side * k_side; + let k5i = 2.0 * k_side * k_center; + let k5m = 2.0 * k_side * k_side + k_center * k_center; + let cl = |i: isize, max: usize| i.max(0).min(max as isize) as usize; + + // H5 pass on 4×4 + let mut h = [0.0f32; 16]; + for y in 0..4 { + for x in 0..4usize { + let xi = x as isize; + h[y * 4 + x] = (src[y * 4 + cl(xi - 2, 3)] + src[y * 4 + cl(xi + 2, 3)]) * k5o + + (src[y * 4 + cl(xi - 1, 3)] + src[y * 4 + cl(xi + 1, 3)]) * k5i + + src[y * 4 + x] * k5m; + } + } + // V5 pass + let mut exp_all = [0.0f32; 16]; + for y in 0..4usize { + for x in 0..4 { + let yi = y as isize; + exp_all[y * 4 + x] = (h[cl(yi - 2, 3) * 4 + x] + h[cl(yi + 2, 3) * 4 + x]) * k5o + + (h[cl(yi - 1, 3) * 4 + x] + h[cl(yi + 1, 3) * 4 + x]) * k5i + + h[y * 4 + x] * k5m; + } + } + let exp = exp_all[0]; + assert!( + (f64::from(exp) - f64::from(dst.buf()[0])).abs() < 0.0001, + "expected {exp}, got {}", + dst.buf()[0] + ); } diff --git a/dssim-core/src/c_api.rs b/dssim-core/src/c_api.rs index 64c0904..38c31de 100644 --- a/dssim-core/src/c_api.rs +++ b/dssim-core/src/c_api.rs @@ -4,19 +4,21 @@ use rgb::{RGB8, RGBA8}; pub type DssimImage = crate::DssimImage; /// Create new context for comparisons -#[no_mangle] +#[unsafe(no_mangle)] pub extern "C" fn dssim_new() -> *mut Dssim { let d = Box::new(crate::new()); Box::into_raw(d) } /// Free the context -#[no_mangle] +#[unsafe(no_mangle)] pub unsafe extern "C" fn dssim_free(d: *mut Dssim) { if d.is_null() { return; } - let _ = Box::from_raw(d); + unsafe { + let _ = Box::from_raw(d); + } } /// Take sRGB RGBA pixels (non-premultiplied, alpha last) and preprocess them into image format that can be compared. @@ -24,11 +26,16 @@ pub unsafe extern "C" fn dssim_free(d: *mut Dssim) { /// Pixels are copied. Returns NULL on error. /// /// Call `dssim_free_image` to free memory when the image is no longer needed. -#[no_mangle] -pub unsafe extern "C" fn dssim_create_image_rgba(dssim: &mut Dssim, pixels: *const u8, width: u32, height: u32) -> *mut DssimImage { +#[unsafe(no_mangle)] +pub unsafe extern "C" fn dssim_create_image_rgba( + dssim: &mut Dssim, + pixels: *const u8, + width: u32, + height: u32, +) -> *mut DssimImage { let width = width as usize; let height = height as usize; - let pixels = std::slice::from_raw_parts(pixels.cast::(), width * height); + let pixels = unsafe { std::slice::from_raw_parts(pixels.cast::(), width * height) }; match dssim.create_image_rgba(pixels, width, height) { Some(img) => Box::into_raw(Box::new(img)), None => std::ptr::null_mut(), @@ -40,11 +47,16 @@ pub unsafe extern "C" fn dssim_create_image_rgba(dssim: &mut Dssim, pixels: *con /// Pixels are copied. Returns NULL on error. /// /// Call `dssim_free_image` to free memory when the image is no longer needed. -#[no_mangle] -pub unsafe extern "C" fn dssim_create_image_rgb(dssim: &mut Dssim, pixels: *const u8, width: u32, height: u32) -> *mut DssimImage { +#[unsafe(no_mangle)] +pub unsafe extern "C" fn dssim_create_image_rgb( + dssim: &mut Dssim, + pixels: *const u8, + width: u32, + height: u32, +) -> *mut DssimImage { let width = width as usize; let height = height as usize; - let pixels = std::slice::from_raw_parts(pixels.cast::(), width * height); + let pixels = unsafe { std::slice::from_raw_parts(pixels.cast::(), width * height) }; match dssim.create_image_rgb(pixels, width, height) { Some(img) => Box::into_raw(Box::new(img)), None => std::ptr::null_mut(), @@ -52,12 +64,14 @@ pub unsafe extern "C" fn dssim_create_image_rgb(dssim: &mut Dssim, pixels: *cons } /// Free image data -#[no_mangle] +#[unsafe(no_mangle)] pub unsafe extern "C" fn dssim_free_image(img: *mut DssimImage) { if img.is_null() { return; } - let _ = Box::from_raw(img); + unsafe { + let _ = Box::from_raw(img); + } } /// Compare these two images. @@ -65,10 +79,14 @@ pub unsafe extern "C" fn dssim_free_image(img: *mut DssimImage) { /// `img1` can be reused for multiple comparisons. /// /// Don't forget to free the images and the DSSIM context when done. -#[no_mangle] -pub unsafe extern "C" fn dssim_compare(dssim: &mut Dssim, img1: *const DssimImage, img2: *const DssimImage) -> f64 { - let img1 = img1.as_ref().unwrap(); - let img2 = img2.as_ref().unwrap(); +#[unsafe(no_mangle)] +pub unsafe extern "C" fn dssim_compare( + dssim: &mut Dssim, + img1: *const DssimImage, + img2: *const DssimImage, +) -> f64 { + let img1 = unsafe { img1.as_ref().unwrap() }; + let img2 = unsafe { img2.as_ref().unwrap() }; let (val, _) = dssim.compare(img1, img2); val.into() } diff --git a/dssim-core/src/dssim.rs b/dssim-core/src/dssim.rs index 5905efa..4ef604e 100644 --- a/dssim-core/src/dssim.rs +++ b/dssim-core/src/dssim.rs @@ -21,23 +21,21 @@ use crate::blur; use crate::image::*; +#[cfg(not(feature = "threads"))] +use crate::lieon as rayon; use crate::linear::ToRGBAPLU; pub use crate::tolab::ToLABBitmap; pub use crate::val::Dssim as Val; use imgref::*; -use itertools::multizip; -#[cfg(not(feature = "threads"))] -use crate::lieon as rayon; use rayon::prelude::*; use rgb::{RGB, RGBA}; use std::borrow::Borrow; -use std::mem::MaybeUninit; use std::ops; use std::ops::Deref; use std::sync::Arc; trait Channable { - fn img1_img2_blur(&self, modified: &Self, tmp: &mut [MaybeUninit]) -> Vec; + fn img1_img2_blur(&self, modified: &Self, tmp: &mut [I]) -> Vec; } #[derive(Clone)] @@ -102,7 +100,11 @@ pub fn new() -> Dssim { impl DssimChan { pub fn new(bitmap: ImgVec, is_chroma: bool) -> Self { - debug_assert!(bitmap.pixels().all(|i| i.is_finite() && i >= 0.0 && i <= 1.0)); + debug_assert!( + bitmap + .pixels() + .all(|i| i.is_finite() && i >= 0.0 && i <= 1.0) + ); Self { width: bitmap.width(), @@ -116,7 +118,7 @@ impl DssimChan { } impl DssimChan { - fn preprocess(&mut self, tmp: &mut [MaybeUninit]) { + fn preprocess(&mut self, tmp: &mut [f32]) { let width = self.width; let height = self.height; assert!(width > 0); @@ -132,42 +134,15 @@ impl DssimChan { let (mu, _, _) = blur::blur(img.as_ref(), tmp).into_contiguous_buf(); self.mu = mu; - self.img_sq_blur = img.pixels().map(|i| { - debug_assert!(i <= 1.0 && i >= 0.0); - i * i - }).collect(); - blur::blur_in_place(ImgRefMut::new(&mut self.img_sq_blur[..], width, height), tmp); - } -} - -impl Channable for [DssimChan] { - fn img1_img2_blur(&self, modified: &Self, tmp32: &mut [MaybeUninit]) -> Vec { - - let blurred:Vec<_> = self.iter().zip(modified.iter()).map(|(o,m)|{ - o.img1_img2_blur(m, tmp32) - }).collect(); - - multizip((blurred[0].iter().copied(), blurred[1].iter().copied(), blurred[2].iter().copied())).map(|(l,a,b)| { - LAB {l,a,b} - }).collect() + self.img_sq_blur = blur::blur_mul(img.as_ref(), img.as_ref(), tmp); } } impl Channable for DssimChan { - fn img1_img2_blur(&self, modified: &Self, tmp32: &mut [MaybeUninit]) -> Vec { + fn img1_img2_blur(&self, modified: &Self, tmp32: &mut [f32]) -> Vec { + let src = self.img.as_ref().unwrap(); let modified_img = modified.img.as_ref().unwrap(); - - let mut out = self.img.as_ref().unwrap().pixels().zip(modified_img.pixels()).map(|(px1, px2)| { - debug_assert!(px1 <= 1.0 && px1 >= 0.0); - debug_assert!(px2 <= 1.0 && px2 >= 0.0); - px1 * px2 - }).collect::>(); - - let width = modified_img.width(); - let height = modified_img.height(); - debug_assert_eq!(out.len(), width * height); - blur::blur_in_place(ImgRefMut::new(&mut out, width, height), tmp32); - out + blur::blur_mul(src.as_ref(), modified_img.as_ref(), tmp32) } } @@ -195,7 +170,12 @@ impl Dssim { /// /// If you have a slice of `u8`, then see `rgb` crate's `as_rgba()`. #[must_use] - pub fn create_image_rgba(&self, bitmap: &[RGBA], width: usize, height: usize) -> Option> { + pub fn create_image_rgba( + &self, + bitmap: &[RGBA], + width: usize, + height: usize, + ) -> Option> { if width * height < bitmap.len() { return None; } @@ -207,7 +187,12 @@ impl Dssim { /// /// If you have a slice of `u8`, then see `rgb` crate's `as_rgb()`. #[must_use] - pub fn create_image_rgb(&self, bitmap: &[RGB], width: usize, height: usize) -> Option> { + pub fn create_image_rgb( + &self, + bitmap: &[RGB], + width: usize, + height: usize, + ) -> Option> { if width * height < bitmap.len() { return None; } @@ -239,42 +224,57 @@ impl Dssim { } #[inline(never)] - fn make_scales_recursive(scales_left: usize, image: MaybeArc<'_, InBitmap>, scales: &mut Vec>) - where + fn make_scales_recursive( + scales_left: usize, + image: MaybeArc<'_, InBitmap>, + scales: &mut Vec>, + ) where InBitmap: ToLABBitmap + Send + Sync + Downsample, OutBitmap: ToLABBitmap + Send + Sync + Downsample, { // Run to_lab and next downsampling in parallel - let (chan, _) = rayon::join({ - let image = image.clone(); - move || { - let lab = image.to_lab(); - drop(image); // Free larger RGB image ASAP - DssimChanScale { - chan: lab.into_par_iter().with_max_len(1).enumerate().map(|(n,l)| { - let w = l.width(); - let h = l.height(); - let mut ch = DssimChan::new(l, n > 0); - - let pixels = w * h; - let mut tmp = Vec::with_capacity(pixels); - ch.preprocess(&mut tmp.spare_capacity_mut()[..pixels]); - ch - }).collect(), + let (chan, _) = rayon::join( + { + let image = image.clone(); + move || { + let lab = image.to_lab(); + drop(image); // Free larger RGB image ASAP + DssimChanScale { + chan: lab + .into_par_iter() + .with_max_len(1) + .enumerate() + .map(|(n, l)| { + let w = l.width(); + let h = l.height(); + let mut ch = DssimChan::new(l, n > 0); + + let pixels = w * h; + let mut tmp = blur::zeroed_f32_vec(pixels); + ch.preprocess(&mut tmp); + ch + }) + .collect(), + } } - } - }, { - let scales = &mut *scales; - move || { - if scales_left > 0 { - let down = image.downsample(); - drop(image); - if let Some(downsampled) = down { - Self::make_scales_recursive(scales_left - 1, MaybeArc::Owned(Arc::new(downsampled)), scales); + }, + { + let scales = &mut *scales; + move || { + if scales_left > 0 { + let down = image.downsample(); + drop(image); + if let Some(downsampled) = down { + Self::make_scales_recursive( + scales_left - 1, + MaybeArc::Owned(Arc::new(downsampled)), + scales, + ); + } } } - } - }); + }, + ); scales.push(chan); } @@ -283,53 +283,85 @@ impl Dssim { /// The `SsimMap`s are returned only if you've enabled them first. /// /// `Val` is a fancy wrapper for `f64` - pub fn compare>>(&self, original_image: &DssimImage, modified_image: M) -> (Val, Vec) { + pub fn compare>>( + &self, + original_image: &DssimImage, + modified_image: M, + ) -> (Val, Vec) { self.compare_inner(original_image, modified_image.borrow()) } #[inline(never)] - fn compare_inner(&self, original_image: &DssimImage, modified_image: &DssimImage) -> (Val, Vec) { + fn compare_inner( + &self, + original_image: &DssimImage, + modified_image: &DssimImage, + ) -> (Val, Vec) { let scaled_images_iter = modified_image.scale.iter().zip(original_image.scale.iter()); - let combined_iter = self.scale_weights.iter().copied().zip(scaled_images_iter).enumerate(); - - let res: Vec<_> = combined_iter.par_bridge().map(|(n, (weight, (modified_image_scale, original_image_scale)))| { - let scale_width = original_image_scale.chan[0].width; - let scale_height = original_image_scale.chan[0].height; - let mut tmp = Vec::with_capacity(scale_width * scale_height); - let tmp = &mut tmp.spare_capacity_mut()[0 .. scale_width*scale_height]; - - let ssim_map = match original_image_scale.chan.len() { - 3 => { - let (original_lab, (img1_img2_blur, modified_lab)) = rayon::join( - || Self::lab_chan(original_image_scale), - || rayon::join( - || original_image_scale.chan.img1_img2_blur(&modified_image_scale.chan, tmp), - || Self::lab_chan(modified_image_scale))); - - Self::compare_scale(&original_lab, &modified_lab, &img1_img2_blur) + let combined_iter = self + .scale_weights + .iter() + .copied() + .zip(scaled_images_iter) + .enumerate(); + + let res: Vec<_> = combined_iter + .par_bridge() + .map( + |(n, (weight, (modified_image_scale, original_image_scale)))| { + let scale_width = original_image_scale.chan[0].width; + let scale_height = original_image_scale.chan[0].height; + let ssim_map = match original_image_scale.chan.len() { + 3 => { + let pixels = scale_width * scale_height; + let img1_img2_blur: Vec> = (0..3usize) + .into_par_iter() + .map(|c| { + let mut tmp = blur::zeroed_f32_vec(pixels); + original_image_scale.chan[c] + .img1_img2_blur(&modified_image_scale.chan[c], &mut tmp) + }) + .collect(); + Self::compare_scale_3ch( + original_image_scale, + modified_image_scale, + &img1_img2_blur, + ) + } + 1 => { + let mut tmp = blur::zeroed_f32_vec(scale_width * scale_height); + let img1_img2_blur = original_image_scale.chan[0] + .img1_img2_blur(&modified_image_scale.chan[0], &mut tmp); + Self::compare_scale( + &original_image_scale.chan[0], + &modified_image_scale.chan[0], + &img1_img2_blur, + ) + } + _ => panic!(), + }; + + let sum = ssim_map.pixels().fold(0., |sum, i| sum + f64::from(i)); + let len = (ssim_map.width() * ssim_map.height()) as f64; + let avg = (sum / len).max(0.0).powf((0.5_f64).powf(n as f64)); + let score = 1.0 + - (ssim_map + .pixels() + .fold(0., |sum, i| sum + (avg - f64::from(i)).abs()) + / len); + + let map = if self.save_maps_scales as usize > n { + Some(SsimMap { + map: ssim_map, + ssim: score, + }) + } else { + None + }; + (score, weight, map) }, - 1 => { - let img1_img2_blur = original_image_scale.chan[0].img1_img2_blur(&modified_image_scale.chan[0], tmp); - Self::compare_scale(&original_image_scale.chan[0], &modified_image_scale.chan[0], &img1_img2_blur) - }, - _ => panic!(), - }; - - let sum = ssim_map.pixels().fold(0., |sum, i| sum + f64::from(i)); - let len = (ssim_map.width()*ssim_map.height()) as f64; - let avg = (sum / len).max(0.0).powf((0.5_f64).powf(n as f64)); - let score = 1.0 - (ssim_map.pixels().fold(0., |sum, i| sum + (avg - f64::from(i)).abs()) / len); - - let map = if self.save_maps_scales as usize > n { - Some(SsimMap { - map: ssim_map, - ssim: score, - }) - } else { - None - }; - (score, weight, map) - }).collect(); + ) + .collect(); let mut ssim_sum = 0.0; let mut weight_sum = 0.0; @@ -345,31 +377,138 @@ impl Dssim { (to_dssim(ssim_sum / weight_sum).into(), ssim_maps) } - fn lab_chan(scale: &DssimChanScale) -> DssimChan { - let l = &scale.chan[0]; - let a = &scale.chan[1]; - let b = &scale.chan[2]; - assert_eq!(l.width, a.width); - assert_eq!(b.width, a.width); - DssimChan { - img_sq_blur: multizip((l.img_sq_blur.iter().copied(), a.img_sq_blur.iter().copied(), b.img_sq_blur.iter().copied())) - .map(|(l,a,b)|LAB {l,a,b}).collect(), - img: if let (Some(l),Some(a),Some(b)) = (&l.img, &a.img, &b.img) { - let buf = multizip((l.pixels(), a.pixels(), b.pixels())).map(|(l,a,b)|{ - debug_assert!(l.is_finite() && a.is_finite() && b.is_finite()); - LAB {l,a,b} - }).collect(); - Some(ImgVec::new(buf, l.width(), l.height())) - } else {None}, - mu: multizip((l.mu.iter().copied(), a.mu.iter().copied(), b.mu.iter().copied())).map(|(l,a,b)|LAB {l,a,b}).collect(), - is_chroma: false, - width: l.width, - height: l.height, + /// Specialized 3-channel comparison with manually unrolled channel loop. + /// Eliminates zip iterator overhead and branch mispredictions from + /// iterating over exactly 3 channels. + #[inline(never)] + fn compare_scale_3ch( + original: &DssimChanScale, + modified: &DssimChanScale, + img1_img2_blur: &[Vec], + ) -> ImgVec { + let width = original.chan[0].width; + let height = original.chan[0].height; + let pixels = width * height; + + // Extract all slice references up front to avoid repeated Vec/struct indexing + let (o0, o1, o2) = (&original.chan[0], &original.chan[1], &original.chan[2]); + let (m0, m1, m2) = (&modified.chan[0], &modified.chan[1], &modified.chan[2]); + + let o0_mu = &o0.mu[..pixels]; + let o1_mu = &o1.mu[..pixels]; + let o2_mu = &o2.mu[..pixels]; + let m0_mu = &m0.mu[..pixels]; + let m1_mu = &m1.mu[..pixels]; + let m2_mu = &m2.mu[..pixels]; + let o0_sq = &o0.img_sq_blur[..pixels]; + let o1_sq = &o1.img_sq_blur[..pixels]; + let o2_sq = &o2.img_sq_blur[..pixels]; + let m0_sq = &m0.img_sq_blur[..pixels]; + let m1_sq = &m1.img_sq_blur[..pixels]; + let m2_sq = &m2.img_sq_blur[..pixels]; + let i12_0 = &img1_img2_blur[0][..pixels]; + let i12_1 = &img1_img2_blur[1][..pixels]; + let i12_2 = &img1_img2_blur[2][..pixels]; + + #[cfg(all(feature = "fma", target_arch = "x86_64"))] + { + use archmage::SimdToken as _; + if let Some(token) = archmage::X64V3Token::summon() { + let mut map_out = blur::zeroed_f32_vec(pixels); + map_out + .par_chunks_mut(1024) + .enumerate() + .for_each(|(ci, chunk)| { + let off = ci * 1024; + let len = chunk.len(); + crate::ssim_simd::compare_3ch_avx2( + token, + [ + &o0_mu[off..off + len], + &o1_mu[off..off + len], + &o2_mu[off..off + len], + ], + [ + &m0_mu[off..off + len], + &m1_mu[off..off + len], + &m2_mu[off..off + len], + ], + [ + &o0_sq[off..off + len], + &o1_sq[off..off + len], + &o2_sq[off..off + len], + ], + [ + &m0_sq[off..off + len], + &m1_sq[off..off + len], + &m2_sq[off..off + len], + ], + [ + &i12_0[off..off + len], + &i12_1[off..off + len], + &i12_2[off..off + len], + ], + chunk, + ); + }); + return ImgVec::new(map_out, width, height); + } } + + let c1: f32 = 0.01 * 0.01; + let c2: f32 = 0.03 * 0.03; + let inv3: f32 = 1.0 / 3.0; + + let map_out: Vec = (0..pixels) + .into_par_iter() + .with_min_len(1 << 10) + .map(|i| { + // Channel 0 (L) + let mu1_0 = o0_mu[i]; + let mu2_0 = m0_mu[i]; + let mu1mu1_0 = mu1_0 * mu1_0; + let mu2mu2_0 = mu2_0 * mu2_0; + let mu1mu2_0 = mu1_0 * mu2_0; + + // Channel 1 (a) + let mu1_1 = o1_mu[i]; + let mu2_1 = m1_mu[i]; + let mu1mu1_1 = mu1_1 * mu1_1; + let mu2mu2_1 = mu2_1 * mu2_1; + let mu1mu2_1 = mu1_1 * mu2_1; + + // Channel 2 (b) + let mu1_2 = o2_mu[i]; + let mu2_2 = m2_mu[i]; + let mu1mu1_2 = mu1_2 * mu1_2; + let mu2mu2_2 = mu2_2 * mu2_2; + let mu1mu2_2 = mu1_2 * mu2_2; + + let mu1_sq = (mu1mu1_0 + mu1mu1_1 + mu1mu1_2) * inv3; + let mu2_sq = (mu2mu2_0 + mu2mu2_1 + mu2mu2_2) * inv3; + let mu1_mu2 = (mu1mu2_0 + mu1mu2_1 + mu1mu2_2) * inv3; + + let sigma1_sq = + ((o0_sq[i] - mu1mu1_0) + (o1_sq[i] - mu1mu1_1) + (o2_sq[i] - mu1mu1_2)) * inv3; + let sigma2_sq = + ((m0_sq[i] - mu2mu2_0) + (m1_sq[i] - mu2mu2_1) + (m2_sq[i] - mu2mu2_2)) * inv3; + let sigma12 = + ((i12_0[i] - mu1mu2_0) + (i12_1[i] - mu1mu2_1) + (i12_2[i] - mu1mu2_2)) * inv3; + + (2.0 * mu1_mu2 + c1) * (2.0 * sigma12 + c2) + / ((mu1_sq + mu2_sq + c1) * (sigma1_sq + sigma2_sq + c2)) + }) + .collect(); + + ImgVec::new(map_out, width, height) } #[inline(never)] - fn compare_scale(original: &DssimChan, modified: &DssimChan, img1_img2_blur: &[L]) -> ImgVec + fn compare_scale( + original: &DssimChan, + modified: &DssimChan, + img1_img2_blur: &[L], + ) -> ImgVec where L: Send + Sync + Clone + Copy + ops::Mul + ops::Sub + 'static, f32: From, @@ -388,30 +527,64 @@ impl Dssim { debug_assert_eq!(img1_img2_blur.len(), original.mu.len()); debug_assert_eq!(img1_img2_blur.len(), original.img_sq_blur.len()); - let mu_iter = original.mu.as_slice().par_iter().with_min_len(1<<10).cloned().zip_eq(modified.mu.as_slice().par_iter().with_min_len(1<<10).cloned()); - let sq_iter = original.img_sq_blur.as_slice().par_iter().with_min_len(1<<10).cloned().zip_eq(modified.img_sq_blur.as_slice().par_iter().with_min_len(1<<10).cloned()); - let map_out = img1_img2_blur.par_iter().with_min_len(1<<10).cloned().zip_eq(mu_iter).zip_eq(sq_iter) - .map(|((img1_img2_blur, (mu1, mu2)), (img1_sq_blur, img2_sq_blur))| { - let mu1mu1 = mu1 * mu1; - let mu1mu2 = mu1 * mu2; - let mu2mu2 = mu2 * mu2; - let mu1_sq: f32 = mu1mu1.into(); - let mu2_sq: f32 = mu2mu2.into(); - let mu1_mu2: f32 = mu1mu2.into(); - let sigma1_sq: f32 = (img1_sq_blur - mu1mu1).into(); - let sigma2_sq: f32 = (img2_sq_blur - mu2mu2).into(); - let sigma12: f32 = (img1_img2_blur - mu1mu2).into(); - - 2.0f32.mul_add(mu1_mu2, c1) * 2.0f32.mul_add(sigma12, c2) / - ((mu1_sq + mu2_sq + c1) * (sigma1_sq + sigma2_sq + c2)) - }).collect(); + let mu_iter = original + .mu + .as_slice() + .par_iter() + .with_min_len(1 << 10) + .cloned() + .zip_eq( + modified + .mu + .as_slice() + .par_iter() + .with_min_len(1 << 10) + .cloned(), + ); + let sq_iter = original + .img_sq_blur + .as_slice() + .par_iter() + .with_min_len(1 << 10) + .cloned() + .zip_eq( + modified + .img_sq_blur + .as_slice() + .par_iter() + .with_min_len(1 << 10) + .cloned(), + ); + let map_out = img1_img2_blur + .par_iter() + .with_min_len(1 << 10) + .cloned() + .zip_eq(mu_iter) + .zip_eq(sq_iter) + .map( + |((img1_img2_blur, (mu1, mu2)), (img1_sq_blur, img2_sq_blur))| { + let mu1mu1 = mu1 * mu1; + let mu1mu2 = mu1 * mu2; + let mu2mu2 = mu2 * mu2; + let mu1_sq: f32 = mu1mu1.into(); + let mu2_sq: f32 = mu2mu2.into(); + let mu1_mu2: f32 = mu1mu2.into(); + let sigma1_sq: f32 = (img1_sq_blur - mu1mu1).into(); + let sigma2_sq: f32 = (img2_sq_blur - mu2mu2).into(); + let sigma12: f32 = (img1_img2_blur - mu1mu2).into(); + + (2.0 * mu1_mu2 + c1) * (2.0 * sigma12 + c2) + / ((mu1_sq + mu2_sq + c1) * (sigma1_sq + sigma2_sq + c2)) + }, + ) + .collect(); ImgVec::new(map_out, width, height) } } fn to_dssim(ssim: f64) -> f64 { - 1.0 / ssim.max(std::f64::EPSILON) - 1.0 + 1.0 / ssim.max(f64::EPSILON) - 1.0 } #[test] @@ -425,27 +598,41 @@ fn png_compare() { let buf1 = &file1.buffer.to_rgbaplu()[..]; let buf2 = &file2.buffer.to_rgbaplu()[..]; - let img1 = d.create_image(&Img::new(buf1, file1.width, file1.height)).unwrap(); - let img2 = d.create_image(&Img::new(buf2, file2.width, file2.height)).unwrap(); + let img1 = d + .create_image(&Img::new(buf1, file1.width, file1.height)) + .unwrap(); + let img2 = d + .create_image(&Img::new(buf2, file2.width, file2.height)) + .unwrap(); let (res, _) = d.compare(&img1, img2); assert!((0.001 - res).abs() < 0.0005, "res is {res}"); - let img1b = d.create_image(&Img::new(buf1, file1.width, file1.height)).unwrap(); + let img1b = d + .create_image(&Img::new(buf1, file1.width, file1.height)) + .unwrap(); let (res, _) = d.compare(&img1, img1b); assert!(0.000000000000001 > res); assert!(res < 0.000000000000001); assert_eq!(res, res); - let sub_img1 = d.create_image(&Img::new(buf1, file1.width, file1.height).sub_image(2,3,44,33)).unwrap(); - let sub_img2 = d.create_image(&Img::new(buf2, file2.width, file2.height).sub_image(17,9,44,33)).unwrap(); + let sub_img1 = d + .create_image(&Img::new(buf1, file1.width, file1.height).sub_image(2, 3, 44, 33)) + .unwrap(); + let sub_img2 = d + .create_image(&Img::new(buf2, file2.width, file2.height).sub_image(17, 9, 44, 33)) + .unwrap(); // Test passing second image directly let (res, _) = d.compare(&sub_img1, sub_img2); assert!(res > 0.1); - let sub_img1 = d.create_image(&Img::new(buf1, file1.width, file1.height).sub_image(22,8,61,40)).unwrap(); - let sub_img2 = d.create_image(&Img::new(buf2, file2.width, file2.height).sub_image(22,8,61,40)).unwrap(); + let sub_img1 = d + .create_image(&Img::new(buf1, file1.width, file1.height).sub_image(22, 8, 61, 40)) + .unwrap(); + let sub_img2 = d + .create_image(&Img::new(buf2, file2.width, file2.height).sub_image(22, 8, 61, 40)) + .unwrap(); // Test passing second image as reference let (res, _) = d.compare(&sub_img1, sub_img2); assert!(res < 0.01); @@ -478,18 +665,18 @@ impl Deref for MaybeArc<'_, T> { #[test] fn poison() { - let a = RGBAPLU::new(1.,1.,1.,1.); - let b = RGBAPLU::new(0.,0.,0.,0.); - let n = 1./0.; - let n = RGBAPLU::new(n,n,n,n); - let buf = vec![ - b,a,a,b,n,n, - a,b,b,a,n,n, - b,a,a,b,n, - ]; + let a = RGBAPLU::new(1., 1., 1., 1.); + let b = RGBAPLU::new(0., 0., 0., 0.); + let n = 1. / 0.; + let n = RGBAPLU::new(n, n, n, n); + let buf = vec![b, a, a, b, n, n, a, b, b, a, n, n, b, a, a, b, n]; let img = ImgVec::new_stride(buf, 4, 3, 6); assert!(img.pixels().all(|p| p.r.is_finite() && p.a.is_finite())); - assert!(img.as_ref().pixels().all(|p| p.g.is_finite() && p.b.is_finite())); + assert!( + img.as_ref() + .pixels() + .all(|p| p.g.is_finite() && p.b.is_finite()) + ); let d = new(); let sub_img1 = d.create_image(&img.as_ref()).unwrap(); diff --git a/dssim-core/src/ffi.rs b/dssim-core/src/ffi.rs index bf0c837..863760a 100644 --- a/dssim-core/src/ffi.rs +++ b/dssim-core/src/ffi.rs @@ -1,6 +1,7 @@ #![allow(dead_code)] #![allow(non_camel_case_types)] #![allow(non_snake_case)] +#![allow(clippy::enum_variant_names)] use std::os::raw::c_ulong; @@ -8,17 +9,19 @@ pub type dssim_px_t = f32; #[link(name = "Accelerate", kind = "framework")] #[cfg(target_os = "macos")] -extern "C" { - pub fn vImageConvolve_PlanarF(src: *const vImage_Buffer<*const f32>, - dest: *mut vImage_Buffer<*mut f32>, - tempBuffer: *mut f32, - srcOffsetToROI_X: vImagePixelCount, - srcOffsetToROI_Y: vImagePixelCount, - kernel: *const f32, - kernel_height: u32, - kernel_width: u32, - backgroundColor: Pixel_F, - flags: vImage_Flags) -> vImage_Error; +unsafe extern "C" { + pub fn vImageConvolve_PlanarF( + src: *const vImage_Buffer<*const f32>, + dest: *mut vImage_Buffer<*mut f32>, + tempBuffer: *mut f32, + srcOffsetToROI_X: vImagePixelCount, + srcOffsetToROI_Y: vImagePixelCount, + kernel: *const f32, + kernel_height: u32, + kernel_width: u32, + backgroundColor: Pixel_F, + flags: vImage_Flags, + ) -> vImage_Error; } pub type vImagePixelCount = c_ulong; @@ -29,51 +32,50 @@ pub type Pixel_F = f32; pub enum vImage_Flags { kvImageNoFlags = 0, - /* Operate on red, green and blue channels only. Alpha is copied from source - to destination. For Interleaved formats only. */ + /* Operate on red, green and blue channels only. Alpha is copied from source + to destination. For Interleaved formats only. */ kvImageLeaveAlphaUnchanged = 1, - /* Copy edge pixels. Convolution Only. */ + /* Copy edge pixels. Convolution Only. */ kvImageCopyInPlace = 2, /* Use the background color for missing pixels. */ - kvImageBackgroundColorFill = 4, + kvImageBackgroundColorFill = 4, /* Use the nearest pixel for missing pixels. */ kvImageEdgeExtend = 8, /* Pass to turn off internal tiling and disable internal multithreading. Use this if - you want to do your own tiling, or to use the Min/Max filters in place. */ - kvImageDoNotTile = 16, + you want to do your own tiling, or to use the Min/Max filters in place. */ + kvImageDoNotTile = 16, /* Use a higher quality, slower resampling filter for Geometry operations - (shear, scale, rotate, affine transform, etc.) */ - kvImageHighQualityResampling = 32, + (shear, scale, rotate, affine transform, etc.) */ + kvImageHighQualityResampling = 32, - /* Use only the part of the kernel that overlaps the image. For integer kernels, - real_divisor = divisor * (sum of used kernel elements) / (sum of kernel elements). - This should preserve image brightness at the edges. Convolution only. */ - kvImageTruncateKernel = 64, + /* Use only the part of the kernel that overlaps the image. For integer kernels, + real_divisor = divisor * (sum of used kernel elements) / (sum of kernel elements). + This should preserve image brightness at the edges. Convolution only. */ + kvImageTruncateKernel = 64, /* The function will return the number of bytes required for the temp buffer. - If this value is negative, it is an error, per standard usage. */ - kvImageGetTempBufferSize = 128, + If this value is negative, it is an error, per standard usage. */ + kvImageGetTempBufferSize = 128, /* Some functions such as vImageConverter_CreateWithCGImageFormat have so many possible error conditions - that developers may need more help than a simple error code to diagnose problems. When this - flag is set and an error is encountered, an informative error message will be logged to the Apple - System Logger (ASL). The output should be visible in Console.app. */ - kvImagePrintDiagnosticsToConsole = 256, + that developers may need more help than a simple error code to diagnose problems. When this + flag is set and an error is encountered, an informative error message will be logged to the Apple + System Logger (ASL). The output should be visible in Console.app. */ + kvImagePrintDiagnosticsToConsole = 256, /* Pass this flag to prevent vImage from allocating additional storage. */ - kvImageNoAllocate = 512, + kvImageNoAllocate = 512, /* Use methods that are HDR-aware, capable of providing correct results for input images with pixel values - outside the otherwise limited (typically [-2,2]) range. This may be slower. */ - kvImageHDRContent = 1024 + outside the otherwise limited (typically [-2,2]) range. This may be slower. */ + kvImageHDRContent = 1024, } - #[repr(C)] pub struct vImage_Buffer { pub data: T, @@ -81,4 +83,3 @@ pub struct vImage_Buffer { pub width: vImagePixelCount, pub rowBytes: usize, } - diff --git a/dssim-core/src/image.rs b/dssim-core/src/image.rs index 2074448..7ead0f6 100644 --- a/dssim-core/src/image.rs +++ b/dssim-core/src/image.rs @@ -190,7 +190,10 @@ pub trait Downsample { fn downsample(&self) -> Option; } -impl Downsample for ImgVec where T: Average4 + Copy + Sync + Send { +impl Downsample for ImgVec +where + T: Average4 + Copy + Sync + Send, +{ type Output = ImgVec; fn downsample(&self) -> Option { @@ -198,7 +201,10 @@ impl Downsample for ImgVec where T: Average4 + Copy + Sync + Send { } } -impl Downsample for ImgRef<'_, T> where T: Average4 + Copy + Sync + Send { +impl Downsample for ImgRef<'_, T> +where + T: Average4 + Copy + Sync + Send, +{ type Output = ImgVec; fn downsample(&self) -> Option { @@ -214,13 +220,20 @@ impl Downsample for ImgRef<'_, T> where T: Average4 + Copy + Sync + Send { let half_width = width / 2; let mut scaled = Vec::with_capacity(half_width * half_height); - scaled.extend(self.buf().chunks(stride * 2).take(half_height).flat_map(|pair| { - let (top, bot) = pair.split_at(stride); - let top = &top[0..half_width * 2]; - let bot = &bot[0..half_width * 2]; - - top.chunks_exact(2).zip(bot.chunks_exact(2)).map(|(a, b)| Average4::average4(a[0], a[1], b[0], b[1])) - })); + scaled.extend( + self.buf() + .chunks(stride * 2) + .take(half_height) + .flat_map(|pair| { + let (top, bot) = pair.split_at(stride); + let top = &top[0..half_width * 2]; + let bot = &bot[0..half_width * 2]; + + top.chunks_exact(2) + .zip(bot.chunks_exact(2)) + .map(|(a, b)| Average4::average4(a[0], a[1], b[0], b[1])) + }), + ); assert_eq!(half_width * half_height, scaled.len()); Some(Img::new(scaled, half_width, half_height)) @@ -238,15 +251,21 @@ pub(crate) fn worst(input: ImgRef<'_, f32>) -> ImgVec { } let mut scaled = Vec::with_capacity(half_width * half_height); - scaled.extend(input.buf().chunks(stride * 2).take(half_height).flat_map(|pair| { - let (top, bot) = pair.split_at(stride); - let top = &top[0..half_width * 2]; - let bot = &bot[0..half_width * 2]; - - top.chunks_exact(2).zip(bot.chunks_exact(2)).map(|(a,b)| { - a[0].min(a[1]).min(b[0].min(b[1])) - }) - })); + scaled.extend( + input + .buf() + .chunks(stride * 2) + .take(half_height) + .flat_map(|pair| { + let (top, bot) = pair.split_at(stride); + let top = &top[0..half_width * 2]; + let bot = &bot[0..half_width * 2]; + + top.chunks_exact(2) + .zip(bot.chunks_exact(2)) + .map(|(a, b)| a[0].min(a[1]).min(b[0].min(b[1]))) + }), + ); assert_eq!(half_width * half_height, scaled.len()); Img::new(scaled, half_width, half_height) @@ -263,15 +282,22 @@ pub(crate) fn avgworst(input: ImgRef<'_, f32>) -> ImgVec { } let mut scaled = Vec::with_capacity(half_width * half_height); - scaled.extend(input.buf().chunks(stride * 2).take(half_height).flat_map(|pair| { - let (top, bot) = pair.split_at(stride); - let top = &top[0..half_width * 2]; - let bot = &bot[0..half_width * 2]; - - top.chunks_exact(2).zip(bot.chunks_exact(2)).map(|(a,b)| { - (a[0] + a[1] + b[0] + b[1]).mul_add(0.25, a[0].min(a[1]).min(b[0].min(b[1])))*0.5 - }) - })); + scaled.extend( + input + .buf() + .chunks(stride * 2) + .take(half_height) + .flat_map(|pair| { + let (top, bot) = pair.split_at(stride); + let top = &top[0..half_width * 2]; + let bot = &bot[0..half_width * 2]; + + top.chunks_exact(2).zip(bot.chunks_exact(2)).map(|(a, b)| { + (a[0] + a[1] + b[0] + b[1]).mul_add(0.25, a[0].min(a[1]).min(b[0].min(b[1]))) + * 0.5 + }) + }), + ); assert_eq!(half_width * half_height, scaled.len()); Img::new(scaled, half_width, half_height) @@ -288,15 +314,21 @@ pub(crate) fn avg(input: ImgRef<'_, f32>) -> ImgVec { } let mut scaled = Vec::with_capacity(half_width * half_height); - scaled.extend(input.buf().chunks(stride * 2).take(half_height).flat_map(|pair| { - let (top, bot) = pair.split_at(stride); - let top = &top[0..half_width * 2]; - let bot = &bot[0..half_width * 2]; - - top.chunks_exact(2).zip(bot.chunks_exact(2)).map(|(a,b)| { - (a[0] + a[1] + b[0] + b[1]) * 0.25 - }) - })); + scaled.extend( + input + .buf() + .chunks(stride * 2) + .take(half_height) + .flat_map(|pair| { + let (top, bot) = pair.split_at(stride); + let top = &top[0..half_width * 2]; + let bot = &bot[0..half_width * 2]; + + top.chunks_exact(2) + .zip(bot.chunks_exact(2)) + .map(|(a, b)| (a[0] + a[1] + b[0] + b[1]) * 0.25) + }), + ); assert_eq!(half_width * half_height, scaled.len()); Img::new(scaled, half_width, half_height) diff --git a/dssim-core/src/lib.rs b/dssim-core/src/lib.rs index 1288900..36847d8 100644 --- a/dssim-core/src/lib.rs +++ b/dssim-core/src/lib.rs @@ -2,18 +2,22 @@ //! support several pixel types. It also allows replacing some parts of the algorithm with different implementations //! (if you need higher accuracy or higher speed). #![doc(html_logo_url = "https://kornel.ski/dssim/logo.png")] +#![deny(unsafe_code)] #![allow(clippy::manual_range_contains)] #![allow(clippy::new_without_default)] mod blur; +#[allow(unsafe_code)] mod c_api; mod dssim; /// cbindgen:ignore +#[allow(unsafe_code)] mod ffi; mod image; #[cfg(not(feature = "threads"))] mod lieon; mod linear; +mod ssim_simd; mod tolab; mod val; diff --git a/dssim-core/src/lieon.rs b/dssim-core/src/lieon.rs index b3ee837..14ab8ad 100644 --- a/dssim-core/src/lieon.rs +++ b/dssim-core/src/lieon.rs @@ -10,13 +10,18 @@ pub mod prelude { } pub trait ParIterator: Sized { - fn with_min_len(self, _one: usize) -> Self { self } - fn with_max_len(self, _one: usize) -> Self { self } - fn par_bridge(self) -> Self { self } + fn with_min_len(self, _one: usize) -> Self { + self + } + fn with_max_len(self, _one: usize) -> Self { + self + } + fn par_bridge(self) -> Self { + self + } } -impl ParIterator for T { -} +impl ParIterator for T {} pub trait ParSliceLie { fn par_chunks(&self, n: usize) -> std::slice::Chunks<'_, T>; diff --git a/dssim-core/src/linear.rs b/dssim-core/src/linear.rs index d6d4ad1..72473d2 100644 --- a/dssim-core/src/linear.rs +++ b/dssim-core/src/linear.rs @@ -6,7 +6,7 @@ use rgb::*; #[doc(hidden)] pub trait GammaComponent { type Lut; - fn max_value() -> usize; + const COMPONENT_MAX: f32; fn to_linear(&self, lut: &Self::Lut) -> f32; fn make_lut() -> Self::Lut; } @@ -52,7 +52,7 @@ pub trait ToRGBAPLU { impl GammaComponent for u8 { type Lut = [f32; 256]; - fn max_value() -> usize { 255 } + const COMPONENT_MAX: f32 = 255.; #[inline(always)] fn to_linear(&self, lut: &Self::Lut) -> f32 { lut[*self as usize] @@ -62,7 +62,7 @@ impl GammaComponent for u8 { fn make_lut() -> Self::Lut { let mut out = [0.; 256]; for (i, o) in out.iter_mut().enumerate() { - *o = to_linear(i as f32 / f32::from(Self::max_value())); + *o = to_linear(i as f32 / Self::COMPONENT_MAX); } out } @@ -70,7 +70,7 @@ impl GammaComponent for u8 { impl GammaComponent for u16 { type Lut = [f32; 65536]; - fn max_value() -> usize { 65535 } + const COMPONENT_MAX: f32 = 65535.; #[inline(always)] fn to_linear(&self, lut: &Self::Lut) -> f32 { lut[*self as usize] @@ -80,18 +80,21 @@ impl GammaComponent for u16 { fn make_lut() -> Self::Lut { let mut out = [0.; 65536]; for (i, o) in out.iter_mut().enumerate() { - *o = to_linear(i as f32 / f32::from(Self::max_value())); + *o = to_linear(i as f32 / Self::COMPONENT_MAX); } out } } -impl GammaPixel for RGBA where M: Clone + Into + GammaComponent { +impl GammaPixel for RGBA +where + M: Clone + Into + GammaComponent, +{ type Component = M; type Output = RGBAPLU; #[inline] fn to_linear(&self, gamma_lut: &M::Lut) -> RGBAPLU { - let a_unit = self.a.clone().into() / M::max_value() as f32; + let a_unit = self.a.clone().into() / M::COMPONENT_MAX; RGBAPLU { r: self.r.to_linear(gamma_lut) * a_unit, g: self.g.to_linear(gamma_lut) * a_unit, @@ -101,13 +104,16 @@ impl GammaPixel for RGBA where M: Clone + Into + GammaComponent { } } -impl GammaPixel for BGRA where M: Clone + Into + GammaComponent { +impl GammaPixel for BGRA +where + M: Clone + Into + GammaComponent, +{ type Component = M; type Output = RGBAPLU; #[inline] fn to_linear(&self, gamma_lut: &M::Lut) -> RGBAPLU { - let a_unit = self.a.clone().into() / M::max_value() as f32; + let a_unit = self.a.clone().into() / M::COMPONENT_MAX; RGBAPLU { r: self.r.to_linear(gamma_lut) * a_unit, g: self.g.to_linear(gamma_lut) * a_unit, @@ -117,7 +123,10 @@ impl GammaPixel for BGRA where M: Clone + Into + GammaComponent { } } -impl GammaPixel for RGB where M: GammaComponent { +impl GammaPixel for RGB +where + M: GammaComponent, +{ type Component = M; type Output = RGBAPLU; #[inline] @@ -131,7 +140,10 @@ impl GammaPixel for RGB where M: GammaComponent { } } -impl GammaPixel for BGR where M: GammaComponent { +impl GammaPixel for BGR +where + M: GammaComponent, +{ type Component = M; type Output = RGBAPLU; @@ -146,12 +158,15 @@ impl GammaPixel for BGR where M: GammaComponent { } } -impl GammaPixel for GrayAlpha where M: Copy + Clone + Into + GammaComponent { +impl GammaPixel for GrayAlpha +where + M: Copy + Clone + Into + GammaComponent, +{ type Component = M; type Output = RGBAPLU; fn to_linear(&self, gamma_lut: &M::Lut) -> RGBAPLU { - let a_unit = self.value().into() / M::max_value() as f32; + let a_unit = self.value().into() / M::COMPONENT_MAX; let g = self.value().to_linear(gamma_lut); RGBAPLU { r: g * a_unit, @@ -162,7 +177,10 @@ impl GammaPixel for GrayAlpha where M: Copy + Clone + Into + GammaCom } } -impl GammaPixel for M where M: GammaComponent { +impl GammaPixel for M +where + M: GammaComponent, +{ type Component = M; type Output = f32; @@ -172,18 +190,29 @@ impl GammaPixel for M where M: GammaComponent { } } -impl GammaPixel for Gray where M: Copy + GammaComponent { +impl GammaPixel for Gray +where + M: Copy + GammaComponent, +{ type Component = M; type Output = RGBAPLU; #[inline(always)] fn to_linear(&self, gamma_lut: &M::Lut) -> RGBAPLU { let g = self.value().to_linear(gamma_lut); - RGBAPLU { r: g, g, b: g, a: 1.0 } + RGBAPLU { + r: g, + g, + b: g, + a: 1.0, + } } } -impl

ToRGBAPLU for [P] where P: GammaPixel { +impl

ToRGBAPLU for [P] +where + P: GammaPixel, +{ fn to_rgbaplu(&self) -> Vec { let gamma_lut = P::make_lut(); self.iter().map(|px| px.to_linear(&gamma_lut)).collect() @@ -191,6 +220,8 @@ impl

ToRGBAPLU for [P] where P: GammaPixel { fn to_rgblu(&self) -> Vec { let gamma_lut = P::make_lut(); - self.iter().map(|px| px.to_linear(&gamma_lut).rgb()).collect() + self.iter() + .map(|px| px.to_linear(&gamma_lut).rgb()) + .collect() } } diff --git a/dssim-core/src/ssim_simd.rs b/dssim-core/src/ssim_simd.rs new file mode 100644 index 0000000..f96eaa3 --- /dev/null +++ b/dssim-core/src/ssim_simd.rs @@ -0,0 +1,121 @@ +// AVX2+FMA SIMD for 3-channel SSIM comparison. +// Processes 8 pixels per iteration using 256-bit vectors. + +#[cfg(all(feature = "fma", target_arch = "x86_64"))] +mod simd { + use archmage::prelude::*; + use magetypes::simd::f32x8; + + #[arcane] + pub fn compare_3ch_avx2( + t: X64V3Token, + o_mu: [&[f32]; 3], + m_mu: [&[f32]; 3], + o_sq: [&[f32]; 3], + m_sq: [&[f32]; 3], + i12: [&[f32]; 3], + out: &mut [f32], + ) { + let len = out.len(); + let vc1 = f32x8::splat(t, 0.01 * 0.01); + let vc2 = f32x8::splat(t, 0.03 * 0.03); + let vinv3 = f32x8::splat(t, 1.0 / 3.0); + let vtwo = f32x8::splat(t, 2.0); + + let mut i = 0; + while i + 8 <= len { + // 15 loads: 6 mu + 6 sq + 3 i12 + let omu0 = f32x8::load(t, (&o_mu[0][i..i + 8]).try_into().unwrap()); + let omu1 = f32x8::load(t, (&o_mu[1][i..i + 8]).try_into().unwrap()); + let omu2 = f32x8::load(t, (&o_mu[2][i..i + 8]).try_into().unwrap()); + let mmu0 = f32x8::load(t, (&m_mu[0][i..i + 8]).try_into().unwrap()); + let mmu1 = f32x8::load(t, (&m_mu[1][i..i + 8]).try_into().unwrap()); + let mmu2 = f32x8::load(t, (&m_mu[2][i..i + 8]).try_into().unwrap()); + + let osq0 = f32x8::load(t, (&o_sq[0][i..i + 8]).try_into().unwrap()); + let osq1 = f32x8::load(t, (&o_sq[1][i..i + 8]).try_into().unwrap()); + let osq2 = f32x8::load(t, (&o_sq[2][i..i + 8]).try_into().unwrap()); + let msq0 = f32x8::load(t, (&m_sq[0][i..i + 8]).try_into().unwrap()); + let msq1 = f32x8::load(t, (&m_sq[1][i..i + 8]).try_into().unwrap()); + let msq2 = f32x8::load(t, (&m_sq[2][i..i + 8]).try_into().unwrap()); + + let iv0 = f32x8::load(t, (&i12[0][i..i + 8]).try_into().unwrap()); + let iv1 = f32x8::load(t, (&i12[1][i..i + 8]).try_into().unwrap()); + let iv2 = f32x8::load(t, (&i12[2][i..i + 8]).try_into().unwrap()); + + // Per-channel mu products (9 muls) + let mu1mu1_0 = omu0 * omu0; + let mu2mu2_0 = mmu0 * mmu0; + let mu1mu2_0 = omu0 * mmu0; + + let mu1mu1_1 = omu1 * omu1; + let mu2mu2_1 = mmu1 * mmu1; + let mu1mu2_1 = omu1 * mmu1; + + let mu1mu1_2 = omu2 * omu2; + let mu2mu2_2 = mmu2 * mmu2; + let mu1mu2_2 = omu2 * mmu2; + + // Average across channels (* inv3) + let mu1_sq = (mu1mu1_0 + mu1mu1_1 + mu1mu1_2) * vinv3; + let mu2_sq = (mu2mu2_0 + mu2mu2_1 + mu2mu2_2) * vinv3; + let mu1_mu2 = (mu1mu2_0 + mu1mu2_1 + mu1mu2_2) * vinv3; + + // sigma = sq_blur - mu^2, averaged across channels + let sigma1_sq = ((osq0 - mu1mu1_0) + (osq1 - mu1mu1_1) + (osq2 - mu1mu1_2)) * vinv3; + let sigma2_sq = ((msq0 - mu2mu2_0) + (msq1 - mu2mu2_1) + (msq2 - mu2mu2_2)) * vinv3; + let sigma12 = ((iv0 - mu1mu2_0) + (iv1 - mu1mu2_1) + (iv2 - mu1mu2_2)) * vinv3; + + // SSIM = (2*mu1_mu2 + c1)*(2*sigma12 + c2) / ((mu1_sq+mu2_sq+c1)*(sigma1_sq+sigma2_sq+c2)) + let num = vtwo.mul_add(mu1_mu2, vc1) * vtwo.mul_add(sigma12, vc2); + let den = (mu1_sq + mu2_sq + vc1) * (sigma1_sq + sigma2_sq + vc2); + let result = num / den; + + result.store((&mut out[i..i + 8]).try_into().unwrap()); + i += 8; + } + + // Scalar tail for remaining pixels + let c1: f32 = 0.01 * 0.01; + let c2: f32 = 0.03 * 0.03; + let inv3: f32 = 1.0 / 3.0; + for i in i..len { + let mu1_0 = o_mu[0][i]; + let mu2_0 = m_mu[0][i]; + let mu1mu1_0 = mu1_0 * mu1_0; + let mu2mu2_0 = mu2_0 * mu2_0; + let mu1mu2_0 = mu1_0 * mu2_0; + + let mu1_1 = o_mu[1][i]; + let mu2_1 = m_mu[1][i]; + let mu1mu1_1 = mu1_1 * mu1_1; + let mu2mu2_1 = mu2_1 * mu2_1; + let mu1mu2_1 = mu1_1 * mu2_1; + + let mu1_2 = o_mu[2][i]; + let mu2_2 = m_mu[2][i]; + let mu1mu1_2 = mu1_2 * mu1_2; + let mu2mu2_2 = mu2_2 * mu2_2; + let mu1mu2_2 = mu1_2 * mu2_2; + + let mu1_sq = (mu1mu1_0 + mu1mu1_1 + mu1mu1_2) * inv3; + let mu2_sq = (mu2mu2_0 + mu2mu2_1 + mu2mu2_2) * inv3; + let mu1_mu2 = (mu1mu2_0 + mu1mu2_1 + mu1mu2_2) * inv3; + + let sigma1_sq = + ((o_sq[0][i] - mu1mu1_0) + (o_sq[1][i] - mu1mu1_1) + (o_sq[2][i] - mu1mu1_2)) + * inv3; + let sigma2_sq = + ((m_sq[0][i] - mu2mu2_0) + (m_sq[1][i] - mu2mu2_1) + (m_sq[2][i] - mu2mu2_2)) + * inv3; + let sigma12 = + ((i12[0][i] - mu1mu2_0) + (i12[1][i] - mu1mu2_1) + (i12[2][i] - mu1mu2_2)) * inv3; + + out[i] = (2.0 * mu1_mu2 + c1) * (2.0 * sigma12 + c2) + / ((mu1_sq + mu2_sq + c1) * (sigma1_sq + sigma2_sq + c2)); + } + } +} + +#[cfg(all(feature = "fma", target_arch = "x86_64"))] +pub(crate) use simd::compare_3ch_avx2; diff --git a/dssim-core/src/tolab.rs b/dssim-core/src/tolab.rs index 69ea6fa..6809e58 100644 --- a/dssim-core/src/tolab.rs +++ b/dssim-core/src/tolab.rs @@ -1,12 +1,12 @@ #![allow(non_snake_case)] #![allow(non_upper_case_globals)] -use crate::image::ToRGB; use crate::image::RGBAPLU; use crate::image::RGBLU; -use imgref::*; +use crate::image::ToRGB; #[cfg(not(feature = "threads"))] use crate::lieon as rayon; +use imgref::*; use rayon::prelude::*; const D65x: f32 = 0.9505; @@ -20,7 +20,7 @@ pub(crate) trait ToLAB { #[inline(always)] fn fma_matrix(r: f32, rx: f32, g: f32, gx: f32, b: f32, bx: f32) -> f32 { - b.mul_add(bx, g.mul_add(gx, r * rx)) + r * rx + g * gx + b * bx } const EPSILON: f32 = 216. / 24389.; @@ -28,18 +28,51 @@ const K: f32 = 24389. / (27. * 116.); // http://www.brucelindbloom.com/LContinui impl ToLAB for RGBLU { fn to_lab(&self) -> (f32, f32, f32) { - let fx = fma_matrix(self.r, 0.4124 / D65x, self.g, 0.3576 / D65x, self.b, 0.1805 / D65x); - let fy = fma_matrix(self.r, 0.2126 / D65y, self.g, 0.7152 / D65y, self.b, 0.0722 / D65y); - let fz = fma_matrix(self.r, 0.0193 / D65z, self.g, 0.1192 / D65z, self.b, 0.9505 / D65z); + let fx = fma_matrix( + self.r, + 0.4124 / D65x, + self.g, + 0.3576 / D65x, + self.b, + 0.1805 / D65x, + ); + let fy = fma_matrix( + self.r, + 0.2126 / D65y, + self.g, + 0.7152 / D65y, + self.b, + 0.0722 / D65y, + ); + let fz = fma_matrix( + self.r, + 0.0193 / D65z, + self.g, + 0.1192 / D65z, + self.b, + 0.9505 / D65z, + ); - let X = if fx > EPSILON { cbrt_poly(fx) - 16. / 116. } else { K * fx }; - let Y = if fy > EPSILON { cbrt_poly(fy) - 16. / 116. } else { K * fy }; - let Z = if fz > EPSILON { cbrt_poly(fz) - 16. / 116. } else { K * fz }; + let X = if fx > EPSILON { + cbrt_poly(fx) - 16. / 116. + } else { + K * fx + }; + let Y = if fy > EPSILON { + cbrt_poly(fy) - 16. / 116. + } else { + K * fy + }; + let Z = if fz > EPSILON { + cbrt_poly(fz) - 16. / 116. + } else { + K * fz + }; let lab = ( (Y * 1.05f32), // 1.05 instead of 1.16 to boost color importance without pushing colors outside of 1.0 range - (500.0 / 220.0f32).mul_add(X - Y, 86.2 / 220.0f32), /* 86 is a fudge to make the value positive */ - (200.0 / 220.0f32).mul_add(Y - Z, 107.9 / 220.0f32), /* 107 is a fudge to make the value positive */ + (500.0 / 220.0) * (X - Y) + (86.2 / 220.0), /* 86 is a fudge to make the value positive */ + (200.0 / 220.0) * (Y - Z) + (107.9 / 220.0), /* 107 is a fudge to make the value positive */ ); debug_assert!(lab.0 <= 1.0 && lab.1 <= 1.0 && lab.2 <= 1.0); lab @@ -53,15 +86,208 @@ fn cbrt_poly(x: f32) -> f32 { let y = (poly[2] * x + poly[1]) * x + poly[0]; // 2x Halley's Method - let y3 = y*y*y; + let y3 = y * y * y; let y = y * (y3 + 2. * x) / (2. * y3 + x); - let y3 = y*y*y; + let y3 = y * y * y; let y = y * (y3 + 2. * x) / (2. * y3 + x); debug_assert!(y < 1.001); debug_assert!(x < 216. / 24389. || y >= 16. / 116.); y } +// ── AVX2+FMA SIMD path ────────────────────────────────────────────── +#[cfg(all(feature = "fma", target_arch = "x86_64"))] +mod simd { + use super::{EPSILON, GBitmap, K, RGBLU, ToLAB, cbrt_poly}; + #[cfg(not(feature = "threads"))] + use crate::lieon::prelude::*; + use archmage::prelude::*; + use imgref::*; + use magetypes::simd::f32x8; + #[cfg(feature = "threads")] + use rayon::prelude::*; + + #[rite] + fn cbrt_poly_x8(t: X64V3Token, x: f32x8) -> f32x8 { + let two = f32x8::splat(t, 2.0); + let half_neg = f32x8::splat(t, -0.5); + let c151 = f32x8::splat(t, 1.51); + let c02 = f32x8::splat(t, 0.2); + + // Polynomial guess: y = (-0.5*x + 1.51)*x + 0.2 + let y = half_neg.mul_add(x, c151).mul_add(x, c02); + + // 1st Halley's: y = y*(y³+2x) / (2y³+x) + let y3 = y * y * y; + let y = y * two.mul_add(x, y3) / two.mul_add(y3, x); + + // 2nd Halley's + let y3 = y * y * y; + y * two.mul_add(x, y3) / two.mul_add(y3, x) + } + + #[rite] + fn to_lab_x8(t: X64V3Token, vr: f32x8, vg: f32x8, vb: f32x8) -> (f32x8, f32x8, f32x8) { + // Matrix constants (pre-divided by D65) + let m00 = f32x8::splat(t, 0.4124 / 0.9505); + let m01 = f32x8::splat(t, 0.3576 / 0.9505); + let m02 = f32x8::splat(t, 0.1805 / 0.9505); + let m10 = f32x8::splat(t, 0.2126); + let m11 = f32x8::splat(t, 0.7152); + let m12 = f32x8::splat(t, 0.0722); + let m20 = f32x8::splat(t, 0.0193 / 1.089); + let m21 = f32x8::splat(t, 0.1192 / 1.089); + let m22 = f32x8::splat(t, 0.9505 / 1.089); + + // RGB → XYZ/D65 (6 FMAs + 3 muls) + let fx = vr.mul_add(m00, vg.mul_add(m01, vb * m02)); + let fy = vr.mul_add(m10, vg.mul_add(m11, vb * m12)); + let fz = vr.mul_add(m20, vg.mul_add(m21, vb * m22)); + + // Conditional cbrt: if f > EPSILON { cbrt(f) - 16/116 } else { K*f } + let veps = f32x8::splat(t, EPSILON); + let vk = f32x8::splat(t, K); + let v16_116 = f32x8::splat(t, 16.0 / 116.0); + + let cbrt_x = cbrt_poly_x8(t, fx) - v16_116; + let linear_x = vk * fx; + let big_x = f32x8::blend(fx.simd_gt(veps), cbrt_x, linear_x); + + let cbrt_y = cbrt_poly_x8(t, fy) - v16_116; + let linear_y = vk * fy; + let big_y = f32x8::blend(fy.simd_gt(veps), cbrt_y, linear_y); + + let cbrt_z = cbrt_poly_x8(t, fz) - v16_116; + let linear_z = vk * fz; + let big_z = f32x8::blend(fz.simd_gt(veps), cbrt_z, linear_z); + + // LAB scaling + let l = big_y * f32x8::splat(t, 1.05); + let a = (big_x - big_y) * f32x8::splat(t, 500.0 / 220.0) + f32x8::splat(t, 86.2 / 220.0); + let b = (big_y - big_z) * f32x8::splat(t, 200.0 / 220.0) + f32x8::splat(t, 107.9 / 220.0); + + (l, a, b) + } + + #[arcane] + pub fn process_row_avx2( + t: X64V3Token, + in_row: &[RGBLU], + l_row: &mut [f32], + a_row: &mut [f32], + b_row: &mut [f32], + ) { + let width = in_row.len(); + let mut x = 0; + + // SIMD loop: 8 pixels at a time + while x + 8 <= width { + // AoS→SoA gather + let mut rs = [0f32; 8]; + let mut gs = [0f32; 8]; + let mut bs = [0f32; 8]; + for i in 0..8 { + rs[i] = in_row[x + i].r; + gs[i] = in_row[x + i].g; + bs[i] = in_row[x + i].b; + } + let vr = f32x8::from_array(t, rs); + let vg = f32x8::from_array(t, gs); + let vb = f32x8::from_array(t, bs); + + let (vl, va, vb_out) = to_lab_x8(t, vr, vg, vb); + + vl.store((&mut l_row[x..x + 8]).try_into().unwrap()); + va.store((&mut a_row[x..x + 8]).try_into().unwrap()); + vb_out.store((&mut b_row[x..x + 8]).try_into().unwrap()); + x += 8; + } + + // Scalar tail + for x in x..width { + let (l, a, b) = in_row[x].to_lab(); + l_row[x] = l; + a_row[x] = a; + b_row[x] = b; + } + } + + #[arcane] + pub fn process_gray_row_avx2(t: X64V3Token, in_row: &[f32], out_row: &mut [f32]) { + let width = in_row.len(); + let veps = f32x8::splat(t, EPSILON); + let vk_116 = f32x8::splat(t, K * 1.16); + let v16_116 = f32x8::splat(t, 16.0 / 116.0); + let v116 = f32x8::splat(t, 1.16); + + let mut x = 0; + while x + 8 <= width { + let fy = f32x8::load(t, (&in_row[x..x + 8]).try_into().unwrap()); + let cbrt_path = (cbrt_poly_x8(t, fy) - v16_116) * v116; + let linear_path = vk_116 * fy; + let result = f32x8::blend(fy.simd_gt(veps), cbrt_path, linear_path); + result.store((&mut out_row[x..x + 8]).try_into().unwrap()); + x += 8; + } + + // Scalar tail + for x in x..width { + let fy = in_row[x]; + out_row[x] = if fy > EPSILON { + (cbrt_poly(fy) - 16. / 116.) * 1.16 + } else { + (K * 1.16) * fy + }; + } + } + + pub fn rgb_to_lab_simd(token: X64V3Token, img: ImgRef<'_, RGBLU>) -> Vec { + let width = img.width(); + let height = img.height(); + assert!(width > 0); + let area = width * height; + + let mut out_l = vec![0f32; area]; + let mut out_a = vec![0f32; area]; + let mut out_b = vec![0f32; area]; + + out_l + .par_chunks_exact_mut(width) + .zip( + out_a + .par_chunks_exact_mut(width) + .zip(out_b.par_chunks_exact_mut(width)), + ) + .enumerate() + .for_each(|(y, (l_row, (a_row, b_row)))| { + let in_row = &img.rows().nth(y).unwrap()[0..width]; + process_row_avx2(token, in_row, l_row, a_row, b_row); + }); + + vec![ + Img::new(out_l, width, height), + Img::new(out_a, width, height), + Img::new(out_b, width, height), + ] + } + + pub fn gray_to_lab_simd(token: X64V3Token, img: &GBitmap) -> Vec { + let width = img.width(); + let height = img.height(); + let area = width * height; + let mut out = vec![0f32; area]; + + out.par_chunks_exact_mut(width) + .enumerate() + .for_each(|(y, out_row)| { + let in_row = &img.rows().nth(y).unwrap()[0..width]; + process_gray_row_avx2(token, in_row, out_row); + }); + + vec![Img::new(out, width, height)] + } +} + /// Convert image to L\*a\*b\* planar /// /// It should return 1 (gray) or 3 (color) planes. @@ -85,14 +311,28 @@ impl ToLABBitmap for ImgVec { impl ToLABBitmap for GBitmap { fn to_lab(&self) -> Vec { debug_assert!(self.width() > 0); + + #[cfg(all(feature = "fma", target_arch = "x86_64"))] + { + use archmage::SimdToken as _; + if let Some(token) = archmage::X64V3Token::summon() { + return simd::gray_to_lab_simd(token, self); + } + } + let f = |fy| { - if fy > EPSILON { (cbrt_poly(fy) - 16. / 116.) * 1.16 } else { (K * 1.16) * fy } + if fy > EPSILON { + (cbrt_poly(fy) - 16. / 116.) * 1.16 + } else { + (K * 1.16) * fy + } }; #[cfg(feature = "threads")] - let out = (0..self.height()).into_par_iter().flat_map_iter(|y| { - self[y].iter().map(|&fy| f(fy)) - }).collect(); + let out = (0..self.height()) + .into_par_iter() + .flat_map_iter(|y| self[y].iter().map(|&fy| f(fy))) + .collect(); #[cfg(not(feature = "threads"))] let out = self.pixels().map(f).collect(); @@ -103,39 +343,42 @@ impl ToLABBitmap for GBitmap { #[inline(never)] fn rgb_to_lab(img: ImgRef<'_, T>, cb: F) -> Vec - where F: Fn(T, usize) -> (f32, f32, f32) + Sync + Send + 'static +where + F: Fn(T, usize) -> (f32, f32, f32) + Sync + Send + 'static, { let width = img.width(); assert!(width > 0); let height = img.height(); let area = width * height; - let mut out_l = Vec::with_capacity(area); - let mut out_a = Vec::with_capacity(area); - let mut out_b = Vec::with_capacity(area); + let mut out_l = vec![0f32; area]; + let mut out_a = vec![0f32; area]; + let mut out_b = vec![0f32; area]; // For output width == stride - out_l.spare_capacity_mut().par_chunks_exact_mut(width).take(height).zip( - out_a.spare_capacity_mut().par_chunks_exact_mut(width).take(height).zip( - out_b.spare_capacity_mut().par_chunks_exact_mut(width).take(height)) - ).enumerate() - .for_each(|(y, (l_row, (a_row, b_row)))| { - let in_row = &img.rows().nth(y).unwrap()[0..width]; - let l_row = &mut l_row[0..width]; - let a_row = &mut a_row[0..width]; - let b_row = &mut b_row[0..width]; - for x in 0..width { - let n = (x+11) ^ (y+11); - let (l,a,b) = cb(in_row[x], n); - l_row[x].write(l); - a_row[x].write(a); - b_row[x].write(b); - } - }); - - unsafe { out_l.set_len(area) }; - unsafe { out_a.set_len(area) }; - unsafe { out_b.set_len(area) }; + out_l + .par_chunks_exact_mut(width) + .take(height) + .zip( + out_a + .par_chunks_exact_mut(width) + .take(height) + .zip(out_b.par_chunks_exact_mut(width).take(height)), + ) + .enumerate() + .for_each(|(y, (l_row, (a_row, b_row)))| { + let in_row = &img.rows().nth(y).unwrap()[0..width]; + let l_row = &mut l_row[0..width]; + let a_row = &mut a_row[0..width]; + let b_row = &mut b_row[0..width]; + for x in 0..width { + let n = (x + 11) ^ (y + 11); + let (l, a, b) = cb(in_row[x], n); + l_row[x] = l; + a_row[x] = a; + b_row[x] = b; + } + }); vec![ Img::new(out_l, width, height), @@ -147,18 +390,21 @@ fn rgb_to_lab(img: ImgRef<'_, T>, cb: F) -> impl ToLABBitmap for ImgRef<'_, RGBAPLU> { #[inline] fn to_lab(&self) -> Vec { - rgb_to_lab(*self, |px, n|{ - px.to_rgb(n).to_lab() - }) + rgb_to_lab(*self, |px, n| px.to_rgb(n).to_lab()) } } impl ToLABBitmap for ImgRef<'_, RGBLU> { #[inline] fn to_lab(&self) -> Vec { - rgb_to_lab(*self, |px, _n|{ - px.to_lab() - }) + #[cfg(all(feature = "fma", target_arch = "x86_64"))] + { + use archmage::SimdToken as _; + if let Some(token) = archmage::X64V3Token::summon() { + return simd::rgb_to_lab_simd(token, *self); + } + } + rgb_to_lab(*self, |px, _n| px.to_lab()) } } @@ -172,9 +418,13 @@ fn cbrts1() { let actual = a * a * a; let expected = x; let absdiff = (expected as f64 - actual as f64).abs(); - assert!(absdiff < 0.0002, "{expected} - {actual} = {} @ {x}", expected - actual); + assert!( + absdiff < 0.0002, + "{expected} - {actual} = {} @ {x}", + expected - actual + ); if i % 400 == 0 { - println!("{:+0.3}", (expected - actual)*255.); + println!("{:+0.3}", (expected - actual) * 255.); } totaldiff += absdiff; maxdiff = maxdiff.max(absdiff); @@ -194,7 +444,11 @@ fn cbrts2() { let absdiff = (expected - actual).abs(); totaldiff += absdiff; maxdiff = maxdiff.max(absdiff); - assert!(absdiff < 0.0000005, "{expected} - {actual} = {} @ {x}", expected - actual); + assert!( + absdiff < 0.0000005, + "{expected} - {actual} = {} @ {x}", + expected - actual + ); } println!("2={totaldiff:0.6}; {maxdiff:0.8}"); assert!(totaldiff < 0.0025, "{totaldiff}"); diff --git a/dssim-core/src/val.rs b/dssim-core/src/val.rs index f6165df..adca3c8 100644 --- a/dssim-core/src/val.rs +++ b/dssim-core/src/val.rs @@ -128,10 +128,18 @@ impl PartialOrd for Dssim { self.0.partial_cmp(other) } - fn lt(&self, other: &f64) -> bool { self.0.lt(other) } - fn le(&self, other: &f64) -> bool { self.0.le(other) } - fn gt(&self, other: &f64) -> bool { self.0.gt(other) } - fn ge(&self, other: &f64) -> bool { self.0.ge(other) } + fn lt(&self, other: &f64) -> bool { + self.0.lt(other) + } + fn le(&self, other: &f64) -> bool { + self.0.le(other) + } + fn gt(&self, other: &f64) -> bool { + self.0.gt(other) + } + fn ge(&self, other: &f64) -> bool { + self.0.ge(other) + } } impl PartialOrd for f64 { @@ -139,8 +147,16 @@ impl PartialOrd for f64 { self.partial_cmp(&other.0) } - fn lt(&self, other: &Dssim) -> bool { self.lt(&other.0) } - fn le(&self, other: &Dssim) -> bool { self.le(&other.0) } - fn gt(&self, other: &Dssim) -> bool { self.gt(&other.0) } - fn ge(&self, other: &Dssim) -> bool { self.ge(&other.0) } + fn lt(&self, other: &Dssim) -> bool { + self.lt(&other.0) + } + fn le(&self, other: &Dssim) -> bool { + self.le(&other.0) + } + fn gt(&self, other: &Dssim) -> bool { + self.gt(&other.0) + } + fn ge(&self, other: &Dssim) -> bool { + self.ge(&other.0) + } } diff --git a/src/lib.rs b/src/lib.rs index aefef03..d6ad9f3 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -4,25 +4,32 @@ pub use dssim_core::*; use imgref::Img; -use load_image::ImageData; use std::path::Path; -fn load(attr: &Dssim, path: &Path) -> Result, load_image::Error> { - let img = load_image::load_path(path)?; - Ok(match img.bitmap { - ImageData::RGB8(ref bitmap) => attr.create_image(&Img::new(bitmap.to_rgblu(), img.width, img.height)), - ImageData::RGB16(ref bitmap) => attr.create_image(&Img::new(bitmap.to_rgblu(), img.width, img.height)), - ImageData::RGBA8(ref bitmap) => attr.create_image(&Img::new(bitmap.to_rgbaplu(), img.width, img.height)), - ImageData::RGBA16(ref bitmap) => attr.create_image(&Img::new(bitmap.to_rgbaplu(), img.width, img.height)), - ImageData::GRAY8(ref bitmap) => attr.create_image(&Img::new(bitmap.to_rgblu(), img.width, img.height)), - ImageData::GRAY16(ref bitmap) => attr.create_image(&Img::new(bitmap.to_rgblu(), img.width, img.height)), - ImageData::GRAYA8(ref bitmap) => attr.create_image(&Img::new(bitmap.to_rgbaplu(), img.width, img.height)), - ImageData::GRAYA16(ref bitmap) => attr.create_image(&Img::new(bitmap.to_rgbaplu(), img.width, img.height)), - }.expect("infallible")) +pub mod load; + +use load::PixelData; + +fn load_impl(attr: &Dssim, path: &Path) -> Result, load::LoadError> { + let (width, height, pixels) = load::load_path(path)?; + Ok(match pixels { + PixelData::Rgb8(ref bm) => attr.create_image(&Img::new(bm.to_rgblu(), width, height)), + PixelData::Rgb16(ref bm) => attr.create_image(&Img::new(bm.to_rgblu(), width, height)), + PixelData::Rgba8(ref bm) => attr.create_image(&Img::new(bm.to_rgbaplu(), width, height)), + PixelData::Rgba16(ref bm) => attr.create_image(&Img::new(bm.to_rgbaplu(), width, height)), + PixelData::Gray8(ref bm) => attr.create_image(&Img::new(bm.to_rgblu(), width, height)), + PixelData::Gray16(ref bm) => attr.create_image(&Img::new(bm.to_rgblu(), width, height)), + PixelData::GrayA8(ref bm) => attr.create_image(&Img::new(bm.to_rgbaplu(), width, height)), + PixelData::GrayA16(ref bm) => attr.create_image(&Img::new(bm.to_rgbaplu(), width, height)), + } + .expect("infallible")) } /// Load PNG or JPEG image from the given path. Applies color profiles and converts to `sRGB`. #[inline] -pub fn load_image(attr: &Dssim, path: impl AsRef) -> Result, load_image::Error> { - load(attr, path.as_ref()) +pub fn load_image( + attr: &Dssim, + path: impl AsRef, +) -> Result, load::LoadError> { + load_impl(attr, path.as_ref()) } diff --git a/src/load.rs b/src/load.rs new file mode 100644 index 0000000..8b02a82 --- /dev/null +++ b/src/load.rs @@ -0,0 +1,475 @@ +use rgb::{ComponentBytes, FromSlice, Gray, GrayAlpha, RGB, RGBA}; +use std::io::Cursor; +use std::path::Path; + +#[derive(Debug)] +pub enum LoadError { + Io(std::io::Error), + Png(png::DecodingError), + Jpeg(zune_jpeg::errors::DecodeErrors), + ColorProfile(moxcms::CmsError), + Pnm(zenbitmaps::BitmapError), + UnsupportedFormat, +} + +impl std::fmt::Display for LoadError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::Io(e) => write!(f, "I/O error: {e}"), + Self::Png(e) => write!(f, "PNG error: {e}"), + Self::Jpeg(e) => write!(f, "JPEG error: {e}"), + Self::ColorProfile(e) => write!(f, "Color profile error: {e}"), + Self::Pnm(e) => write!(f, "PNM error: {e}"), + Self::UnsupportedFormat => { + write!(f, "Unsupported image format (expected PNG, JPEG, or PNM)") + } + } + } +} + +impl std::error::Error for LoadError { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + match self { + Self::Io(e) => Some(e), + Self::Png(e) => Some(e), + Self::Jpeg(e) => Some(e), + Self::ColorProfile(e) => Some(e), + Self::Pnm(e) => Some(e), + Self::UnsupportedFormat => None, + } + } +} + +impl From for LoadError { + fn from(e: std::io::Error) -> Self { + Self::Io(e) + } +} + +impl From for LoadError { + fn from(e: png::DecodingError) -> Self { + Self::Png(e) + } +} + +impl From for LoadError { + fn from(e: zune_jpeg::errors::DecodeErrors) -> Self { + Self::Jpeg(e) + } +} + +impl From for LoadError { + fn from(e: moxcms::CmsError) -> Self { + Self::ColorProfile(e) + } +} + +impl From for LoadError { + fn from(e: zenbitmaps::BitmapError) -> Self { + Self::Pnm(e) + } +} + +pub enum PixelData { + Rgb8(Vec>), + Rgba8(Vec>), + Gray8(Vec>), + GrayA8(Vec>), + Rgb16(Vec>), + Rgba16(Vec>), + Gray16(Vec>), + GrayA16(Vec>), +} + +fn is_graya_opaque_u8(pixels: &[GrayAlpha]) -> bool { + pixels.iter().all(|p| p.a == 255) +} + +fn is_graya_opaque_u16(pixels: &[GrayAlpha]) -> bool { + pixels.iter().all(|p| p.a == 65535) +} + +fn is_rgba_opaque_u8(pixels: &[RGBA]) -> bool { + pixels.iter().all(|p| p.a == 255) +} + +fn is_rgba_opaque_u16(pixels: &[RGBA]) -> bool { + pixels.iter().all(|p| p.a == 65535) +} + +pub fn load_path(path: &Path) -> Result<(usize, usize, PixelData), LoadError> { + let data = std::fs::read(path)?; + if data.starts_with(b"\x89PNG") { + load_png(&data) + } else if data.starts_with(&[0xFF, 0xD8]) { + load_jpeg(&data) + } else if data.len() >= 2 + && data[0] == b'P' + && matches!(data[1], b'5' | b'6' | b'7' | b'f' | b'F') + { + load_pnm(&data) + } else { + Err(LoadError::UnsupportedFormat) + } +} + +fn load_png(data: &[u8]) -> Result<(usize, usize, PixelData), LoadError> { + let mut decoder = png::Decoder::new(Cursor::new(data)); + decoder.set_transformations(png::Transformations::EXPAND); + let mut reader = decoder.read_info()?; + + let info = reader.info(); + let icc_profile = info.icc_profile.as_ref().map(|c| c.to_vec()); + let has_srgb_chunk = info.srgb.is_some(); + + let (color_type, bit_depth) = reader.output_color_type(); + let buf_size = reader.output_buffer_size().unwrap_or(0); + let width = info.width as usize; + let height = info.height as usize; + + let mut buf = vec![0u8; buf_size]; + reader.next_frame(&mut buf)?; + + let is_16bit = bit_depth == png::BitDepth::Sixteen; + + // Apply ICC profile if present and no sRGB chunk + let needs_icc = !has_srgb_chunk && icc_profile.is_some(); + let icc_data = icc_profile.as_deref(); + + match (color_type, is_16bit) { + (png::ColorType::Grayscale, false) => { + let mut pixels: Vec> = buf[..width * height].as_gray().to_vec(); + if needs_icc { + apply_icc_8bit( + pixels.as_mut_slice().as_bytes_mut(), + icc_data.unwrap(), + moxcms::Layout::Gray, + )?; + } + Ok((width, height, PixelData::Gray8(pixels))) + } + (png::ColorType::Grayscale, true) => { + let mut pixels = bytes_to_gray16_be(&buf, width * height); + if needs_icc { + apply_icc_16bit( + pixels.as_mut_slice().as_bytes_mut(), + icc_data.unwrap(), + moxcms::Layout::Gray, + )?; + } + Ok((width, height, PixelData::Gray16(pixels))) + } + (png::ColorType::GrayscaleAlpha, false) => { + let mut pixels: Vec> = buf[..width * height * 2].as_gray_alpha().to_vec(); + if is_graya_opaque_u8(&pixels) { + // Strip alpha for fully-opaque images (matches load_image behavior) + let mut gray: Vec> = pixels.iter().map(|p| Gray::new(p.v)).collect(); + if needs_icc { + apply_icc_8bit( + gray.as_mut_slice().as_bytes_mut(), + icc_data.unwrap(), + moxcms::Layout::Gray, + )?; + } + Ok((width, height, PixelData::Gray8(gray))) + } else { + if needs_icc { + apply_icc_8bit( + pixels.as_mut_slice().as_bytes_mut(), + icc_data.unwrap(), + moxcms::Layout::GrayAlpha, + )?; + } + Ok((width, height, PixelData::GrayA8(pixels))) + } + } + (png::ColorType::GrayscaleAlpha, true) => { + let mut pixels = bytes_to_graya16_be(&buf, width * height); + if is_graya_opaque_u16(&pixels) { + let mut gray: Vec> = pixels.iter().map(|p| Gray::new(p.v)).collect(); + if needs_icc { + apply_icc_16bit( + gray.as_mut_slice().as_bytes_mut(), + icc_data.unwrap(), + moxcms::Layout::Gray, + )?; + } + Ok((width, height, PixelData::Gray16(gray))) + } else { + if needs_icc { + apply_icc_16bit( + pixels.as_mut_slice().as_bytes_mut(), + icc_data.unwrap(), + moxcms::Layout::GrayAlpha, + )?; + } + Ok((width, height, PixelData::GrayA16(pixels))) + } + } + (png::ColorType::Rgb, false) => { + let mut pixels: Vec> = buf[..width * height * 3].as_rgb().to_vec(); + if needs_icc { + apply_icc_8bit( + pixels.as_mut_slice().as_bytes_mut(), + icc_data.unwrap(), + moxcms::Layout::Rgb, + )?; + } + Ok((width, height, PixelData::Rgb8(pixels))) + } + (png::ColorType::Rgb, true) => { + let mut pixels = bytes_to_rgb16_be(&buf, width * height); + if needs_icc { + apply_icc_16bit( + pixels.as_mut_slice().as_bytes_mut(), + icc_data.unwrap(), + moxcms::Layout::Rgb, + )?; + } + Ok((width, height, PixelData::Rgb16(pixels))) + } + (png::ColorType::Rgba, false) => { + let mut pixels: Vec> = buf[..width * height * 4].as_rgba().to_vec(); + if is_rgba_opaque_u8(&pixels) { + let mut rgb: Vec> = + pixels.iter().map(|p| RGB::new(p.r, p.g, p.b)).collect(); + if needs_icc { + apply_icc_8bit( + rgb.as_mut_slice().as_bytes_mut(), + icc_data.unwrap(), + moxcms::Layout::Rgb, + )?; + } + Ok((width, height, PixelData::Rgb8(rgb))) + } else { + if needs_icc { + apply_icc_8bit( + pixels.as_mut_slice().as_bytes_mut(), + icc_data.unwrap(), + moxcms::Layout::Rgba, + )?; + } + Ok((width, height, PixelData::Rgba8(pixels))) + } + } + (png::ColorType::Rgba, true) => { + let mut pixels = bytes_to_rgba16_be(&buf, width * height); + if is_rgba_opaque_u16(&pixels) { + let mut rgb: Vec> = + pixels.iter().map(|p| RGB::new(p.r, p.g, p.b)).collect(); + if needs_icc { + apply_icc_16bit( + rgb.as_mut_slice().as_bytes_mut(), + icc_data.unwrap(), + moxcms::Layout::Rgb, + )?; + } + Ok((width, height, PixelData::Rgb16(rgb))) + } else { + if needs_icc { + apply_icc_16bit( + pixels.as_mut_slice().as_bytes_mut(), + icc_data.unwrap(), + moxcms::Layout::Rgba, + )?; + } + Ok((width, height, PixelData::Rgba16(pixels))) + } + } + _ => Err(LoadError::UnsupportedFormat), + } +} + +fn load_jpeg(data: &[u8]) -> Result<(usize, usize, PixelData), LoadError> { + use zune_jpeg::zune_core::bytestream::ZCursor; + use zune_jpeg::zune_core::colorspace::ColorSpace; + use zune_jpeg::zune_core::options::DecoderOptions; + + let mut decoder = zune_jpeg::JpegDecoder::new(ZCursor::new(data)); + decoder.decode_headers()?; + + let icc_profile = decoder.icc_profile(); + let (width, height) = decoder.dimensions().ok_or(LoadError::UnsupportedFormat)?; + let input_cs = decoder + .input_colorspace() + .ok_or(LoadError::UnsupportedFormat)?; + + // For grayscale JPEGs, decode as Luma to match the GRAY ICC profile. + // zune-jpeg defaults to RGB output even for 1-component images. + if input_cs == ColorSpace::Luma { + decoder.set_options(DecoderOptions::default().jpeg_set_out_colorspace(ColorSpace::Luma)); + } + + let mut pixels = decoder.decode()?; + let output_cs = decoder + .output_colorspace() + .ok_or(LoadError::UnsupportedFormat)?; + + match output_cs { + ColorSpace::Luma => { + if let Some(ref icc_data) = icc_profile { + apply_icc_8bit(&mut pixels, icc_data, moxcms::Layout::Gray)?; + } + let gray: Vec> = pixels.into_iter().map(Gray::new).collect(); + Ok((width, height, PixelData::Gray8(gray))) + } + ColorSpace::RGB => { + if let Some(ref icc_data) = icc_profile { + apply_icc_8bit(&mut pixels, icc_data, moxcms::Layout::Rgb)?; + } + let rgb: Vec> = pixels + .chunks_exact(3) + .map(|c| RGB::new(c[0], c[1], c[2])) + .collect(); + Ok((width, height, PixelData::Rgb8(rgb))) + } + _ => Err(LoadError::UnsupportedFormat), + } +} + +fn load_pnm(data: &[u8]) -> Result<(usize, usize, PixelData), LoadError> { + let decoded = zenbitmaps::decode(data, zenbitmaps::Unstoppable)?; + let w = decoded.width as usize; + let h = decoded.height as usize; + let pixels = decoded.pixels(); + + match decoded.layout { + zenbitmaps::PixelLayout::Gray8 => Ok((w, h, PixelData::Gray8(pixels.as_gray().to_vec()))), + zenbitmaps::PixelLayout::Rgb8 => Ok((w, h, PixelData::Rgb8(pixels.as_rgb().to_vec()))), + zenbitmaps::PixelLayout::Rgba8 => { + let rgba = pixels.as_rgba(); + if is_rgba_opaque_u8(rgba) { + Ok(( + w, + h, + PixelData::Rgb8(rgba.iter().map(|p| RGB::new(p.r, p.g, p.b)).collect()), + )) + } else { + Ok((w, h, PixelData::Rgba8(rgba.to_vec()))) + } + } + zenbitmaps::PixelLayout::Gray16 => { + // zenbitmaps provides native-endian u16 + let gray: Vec> = pixels + .chunks_exact(2) + .map(|c| Gray::new(u16::from_ne_bytes([c[0], c[1]]))) + .collect(); + Ok((w, h, PixelData::Gray16(gray))) + } + _ => Err(LoadError::UnsupportedFormat), + } +} + +/// ICC profile colorspace is at bytes 16-19 +fn icc_is_gray(icc_data: &[u8]) -> bool { + icc_data.len() >= 20 && &icc_data[16..20] == b"GRAY" +} + +/// Create the appropriate sRGB-equivalent destination profile. +/// Gray ICC profiles need a gray destination; RGB profiles need sRGB. +fn srgb_destination(is_gray: bool) -> moxcms::ColorProfile { + if is_gray { + // Build a gray profile with the exact sRGB parametric TRC. + // new_gray_with_gamma(2.2) is a poor approximation — sRGB uses a + // piecewise curve: f(x) = (a*x+b)^g for x >= d, f(x) = e*x for x < d + let mut profile = moxcms::ColorProfile::new_gray_with_gamma(2.2); + profile.gray_trc = Some(moxcms::ToneReprCurve::Parametric(vec![ + 2.4, + 1. / 1.055, + 0.055 / 1.055, + 1. / 12.92, + 0.04045, + ])); + profile + } else { + moxcms::ColorProfile::new_srgb() + } +} + +fn apply_icc_8bit( + buf: &mut [u8], + icc_data: &[u8], + layout: moxcms::Layout, +) -> Result<(), LoadError> { + let src = moxcms::ColorProfile::new_from_slice(icc_data)?; + let dst = srgb_destination(icc_is_gray(icc_data)); + let transform = + src.create_transform_8bit(layout, &dst, layout, moxcms::TransformOptions::default())?; + let mut out = vec![0u8; buf.len()]; + transform.transform(buf, &mut out)?; + buf.copy_from_slice(&out); + Ok(()) +} + +fn apply_icc_16bit( + buf: &mut [u8], + icc_data: &[u8], + layout: moxcms::Layout, +) -> Result<(), LoadError> { + let src = moxcms::ColorProfile::new_from_slice(icc_data)?; + let dst = srgb_destination(icc_is_gray(icc_data)); + let transform = + src.create_transform_16bit(layout, &dst, layout, moxcms::TransformOptions::default())?; + let pixel_count = buf.len() / 2; + let src_u16: Vec = buf + .chunks_exact(2) + .map(|c| u16::from_ne_bytes([c[0], c[1]])) + .collect(); + let mut dst_u16 = vec![0u16; pixel_count]; + transform.transform(&src_u16, &mut dst_u16)?; + for (chunk, &val) in buf.chunks_exact_mut(2).zip(dst_u16.iter()) { + let bytes = val.to_ne_bytes(); + chunk[0] = bytes[0]; + chunk[1] = bytes[1]; + } + Ok(()) +} + +// PNG stores 16-bit values in big-endian byte order. +// Convert to native-endian typed pixels. + +fn bytes_to_gray16_be(buf: &[u8], count: usize) -> Vec> { + buf[..count * 2] + .chunks_exact(2) + .map(|c| Gray::new(u16::from_be_bytes([c[0], c[1]]))) + .collect() +} + +fn bytes_to_graya16_be(buf: &[u8], count: usize) -> Vec> { + buf[..count * 4] + .chunks_exact(4) + .map(|c| { + GrayAlpha::new( + u16::from_be_bytes([c[0], c[1]]), + u16::from_be_bytes([c[2], c[3]]), + ) + }) + .collect() +} + +fn bytes_to_rgb16_be(buf: &[u8], count: usize) -> Vec> { + buf[..count * 6] + .chunks_exact(6) + .map(|c| { + RGB::new( + u16::from_be_bytes([c[0], c[1]]), + u16::from_be_bytes([c[2], c[3]]), + u16::from_be_bytes([c[4], c[5]]), + ) + }) + .collect() +} + +fn bytes_to_rgba16_be(buf: &[u8], count: usize) -> Vec> { + buf[..count * 8] + .chunks_exact(8) + .map(|c| { + RGBA::new( + u16::from_be_bytes([c[0], c[1]]), + u16::from_be_bytes([c[2], c[3]]), + u16::from_be_bytes([c[4], c[5]]), + u16::from_be_bytes([c[6], c[7]]), + ) + }) + .collect() +} diff --git a/src/main.rs b/src/main.rs index f7929fc..e636c60 100644 --- a/src/main.rs +++ b/src/main.rs @@ -25,20 +25,27 @@ use std::path::PathBuf; use std::process::ExitCode; fn usage(argv0: &str) { - eprintln!("\ - Usage: {argv0} original.png modified.png [modified.png...]\ - \n or: {argv0} -o difference.png original.png modified.png\n\n\ + eprintln!( + "\ + Usage: {argv0} original modified [modified...]\ + \n or: {argv0} -o difference.png original modified\n\n\ Compares first image against subsequent images, and outputs\n\ 1/SSIM-1 difference for each of them in order (0 = identical).\n\n\ Images must have identical size, but may have different gamma & depth.\n\ - \nVersion {} https://kornel.ski/dssim\n", env!("CARGO_PKG_VERSION")); + \nVersion {} https://kornel.ski/dssim\n", + env!("CARGO_PKG_VERSION") + ); } #[inline(always)] fn to_byte(i: f32) -> u8 { - if i <= 0.0 {0} - else if i >= 255.0/256.0 {255} - else {(i * 256.0) as u8} + if i <= 0.0 { + 0 + } else if i >= 255.0 / 256.0 { + 255 + } else { + (i * 256.0) as u8 + } } fn main() -> ExitCode { @@ -87,11 +94,17 @@ fn run() -> Result<(), Box> { std::thread::scope(|scope| { let decode_thread = || { let images_send = images_send; // ensure it's moved, and attr isn't - filenames_recv.into_iter().try_for_each(|(i, file): (usize, PathBuf)| { - dssim::load_image(&attr, &file) - .map_err(|e| format!("Can't load {}, because: {e}", file.display())) - .and_then(|image| images_send.send(i, (file, image)).map_err(|_| "Aborted".into())) - }) + filenames_recv + .into_iter() + .try_for_each(|(i, file): (usize, PathBuf)| { + dssim::load_image(&attr, &file) + .map_err(|e| format!("Can't load {}, because: {e}", file.display())) + .and_then(|image| { + images_send + .send(i, (file, image)) + .map_err(|_| "Aborted".into()) + }) + }) }; let threads = [ @@ -100,16 +113,26 @@ fn run() -> Result<(), Box> { ]; let result = (|| { - files.into_iter().map(PathBuf::from).enumerate() + files + .into_iter() + .map(PathBuf::from) + .enumerate() .try_for_each(move |f| filenames_send.send(f))?; let (file1, original) = images_recv.next().ok_or("Can't load any images")?; for (file2, modified) in images_recv { if original.width() != modified.width() || original.height() != modified.height() { - return Err(format!("Image {} has a different size ({}x{}) than {} ({}x{})\n", - file2.display(), modified.width(), modified.height(), - file1.display(), original.width(), original.height()).into()); + return Err(format!( + "Image {} has a different size ({}x{}) than {} ({}x{})\n", + file2.display(), + modified.width(), + modified.height(), + file1.display(), + original.width(), + original.height() + ) + .into()); } let (dssim, ssim_maps) = attr.compare(&original, modified); @@ -123,36 +146,66 @@ fn run() -> Result<(), Box> { Ok(()) })(); - threads.into_iter().try_for_each(|t| t.join().map_err(|_| "thread panicked; this is a bug")?)?; + threads + .into_iter() + .try_for_each(|t| t.join().map_err(|_| "thread panicked; this is a bug")?)?; result }) } -fn write_ssim_maps(ssim_maps: Vec, map_output_file: &str) -> Result<(), Box> { +fn write_ssim_maps( + ssim_maps: Vec, + map_output_file: &str, +) -> Result<(), Box> { #[cfg(feature = "threads")] let ssim_maps_iter = ssim_maps.par_iter(); #[cfg(not(feature = "threads"))] let ssim_maps_iter = ssim_maps.iter(); ssim_maps_iter.enumerate().try_for_each(|(n, map_meta)| { let avgssim = map_meta.ssim as f32; - let out: Vec<_> = map_meta.map.pixels().map(|ssim|{ - let max = 1_f32 - ssim; - let maxsq = max * max; - rgb::RGBA8 { - r: to_byte(maxsq * 16.0), - g: to_byte(max * 3.0), - b: to_byte(max / ((1_f32 - avgssim) * 4_f32)), - a: 255, - } - }).collect(); - lodepng::encode32_file(format!("{map_output_file}-{n}.png"), &out, map_meta.map.width(), map_meta.map.height()) - .map_err(|e| { - format!("Can't write {map_output_file}: {e}") + let out: Vec<_> = map_meta + .map + .pixels() + .map(|ssim| { + let max = 1_f32 - ssim; + let maxsq = max * max; + rgb::RGBA8 { + r: to_byte(maxsq * 16.0), + g: to_byte(max * 3.0), + b: to_byte(max / ((1_f32 - avgssim) * 4_f32)), + a: 255, + } }) + .collect(); + write_png_rgba( + &format!("{map_output_file}-{n}.png"), + &out, + map_meta.map.width(), + map_meta.map.height(), + ) + .map_err(|e| format!("Can't write {map_output_file}: {e}")) })?; Ok(()) } +fn write_png_rgba( + path: &str, + data: &[rgb::RGBA8], + width: usize, + height: usize, +) -> Result<(), Box> { + use rgb::ComponentBytes; + + let file = std::io::BufWriter::new(std::fs::File::create(path)?); + let mut encoder = png::Encoder::new(file, width as u32, height as u32); + encoder.set_color(png::ColorType::Rgba); + encoder.set_depth(png::BitDepth::Eight); + let mut writer = encoder.write_header()?; + writer.write_image_data(data.as_bytes())?; + writer.finish()?; + Ok(()) +} + #[test] fn image_gray() { let attr = dssim::Dssim::new();