diff --git a/.gitignore b/.gitignore index 307a207..2594646 100644 --- a/.gitignore +++ b/.gitignore @@ -13,6 +13,10 @@ .DS_Store Thumbs.db +# AI Agents +.claude/ +.code/ + # Test data *.fastq *.fasta diff --git a/Cargo.lock b/Cargo.lock index f769706..d790c28 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -40,22 +40,22 @@ dependencies = [ [[package]] name = "anstyle-query" -version = "1.1.4" +version = "1.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e231f6134f61b71076a3eab506c379d4f36122f2af15a9ff04415ea4c3339e2" +checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" dependencies = [ - "windows-sys 0.60.2", + "windows-sys", ] [[package]] name = "anstyle-wincon" -version = "3.0.10" +version = "3.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3e0633414522a32ffaac8ac6cc8f748e090c5717661fddeea04219e2344f5f2a" +checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" dependencies = [ "anstyle", "once_cell_polyfill", - "windows-sys 0.60.2", + "windows-sys", ] [[package]] @@ -65,24 +65,25 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a23eb6b1614318a8071c9b2521f36b424b2c83db5eb3a0fead4a6c0809af6e61" [[package]] -name = "bgzip" -version = "0.3.1" +name = "bit-vec" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b64fd8980fb64af5951bc05de7772b598150a6f7eac42ec17f73e8489915f99b" -dependencies = [ - "flate2", - "log", - "rayon", - "thiserror", -] +checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7" [[package]] -name = "buffer-redux" -version = "1.0.2" +name = "bitflags" +version = "2.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "812e12b5285cc515a9c72a5c1d3b6d46a19dac5acfef5265968c166106e31dd3" + +[[package]] +name = "bstr" +version = "1.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4e8acf87c5b9f5897cd3ebb9a327f420e0cae9dd4e5c1d2e36f2c84c571a58f1" +checksum = "63044e1ae8e69f3b5a92c736ca6269b8d12fa7efe39bf34ddb06d102cf0e2cab" dependencies = [ "memchr", + "serde", ] [[package]] @@ -92,42 +93,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "46c5e41b57b8bba42a04676d81cb89e9ee8e859a1a66f80a5a72e1cb76b34d43" [[package]] -name = "bytecount" -version = "0.6.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "175812e0be2bccb6abe50bb8d566126198344f707e304f45c648fd8f2cc0365e" - -[[package]] -name = "bzip2" -version = "0.4.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bdb116a6ef3f6c3698828873ad02c3014b3c85cadb88496095628e3ef1e347f8" -dependencies = [ - "bzip2-sys", - "libc", -] - -[[package]] -name = "bzip2-sys" -version = "0.1.13+1.0.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "225bff33b2141874fe80d71e07d6eec4f85c5c216453dd96388240f96e1acc14" -dependencies = [ - "cc", - "pkg-config", -] - -[[package]] -name = "cc" -version = "1.2.42" +name = "bytes" +version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81bbf3b3619004ad9bd139f62a9ab5cfe467f307455a0d307b0cf58bf070feaa" -dependencies = [ - "find-msvc-tools", - "jobserver", - "libc", - "shlex", -] +checksum = "b35204fbdc0b3f4446b89fc1ac2cf84a8a68971995d0bf2e925ec7cd960f9cb3" [[package]] name = "cfg-if" @@ -137,9 +106,9 @@ checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" [[package]] name = "clap" -version = "4.5.50" +version = "4.5.52" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c2cfd7bf8a6017ddaa4e32ffe7403d547790db06bd171c1c53926faab501623" +checksum = "aa8120877db0e5c011242f96806ce3c94e0737ab8108532a76a3300a01db2ab8" dependencies = [ "clap_builder", "clap_derive", @@ -147,9 +116,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.50" +version = "4.5.52" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0a4c05b9e80c5ccd3a7ef080ad7b6ba7d6fc00a985b8b157197075677c82c7a0" +checksum = "02576b399397b659c26064fbc92a75fede9d18ffd5f80ca1cd74ddab167016e1" dependencies = [ "anstream", "anstyle", @@ -175,15 +144,6 @@ version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a1d728cc89cf3aee9ff92b05e62b19ee65a02b5702cff7d5a377e32c6ae29d8d" -[[package]] -name = "cmake" -version = "0.1.54" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7caa3f9de89ddbe2c607f4101924c5abec803763ae9534e4f4d7d8f84aa81f0" -dependencies = [ - "cc", -] - [[package]] name = "colorchoice" version = "1.0.4" @@ -200,7 +160,7 @@ dependencies = [ "libc", "once_cell", "unicode-width", - "windows-sys 0.61.2", + "windows-sys", ] [[package]] @@ -213,20 +173,10 @@ dependencies = [ ] [[package]] -name = "crossbeam-deque" -version = "0.8.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" -dependencies = [ - "crossbeam-epoch", - "crossbeam-utils", -] - -[[package]] -name = "crossbeam-epoch" -version = "0.9.18" +name = "crossbeam-channel" +version = "0.5.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +checksum = "82b8f8f868b36967f9606790d1903570de9ceaf870a7bf9fbbd3016d636a2cb2" dependencies = [ "crossbeam-utils", ] @@ -237,12 +187,6 @@ version = "0.8.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" -[[package]] -name = "either" -version = "1.15.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" - [[package]] name = "encode_unicode" version = "1.0.0" @@ -250,10 +194,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0" [[package]] -name = "find-msvc-tools" -version = "0.1.4" +name = "equivalent" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "52051878f80a721bb68ebfbc930e07b65ba72f2da88968ea5c06fd6ca3d3a127" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" [[package]] name = "flate2" @@ -262,7 +206,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bfe33edd8e85a12a67454e37f8c75e730830d83e313556ab9ebf9ee7fbeb3bfb" dependencies = [ "crc32fast", - "libz-ng-sys", + "libz-rs-sys", "miniz_oxide", ] @@ -278,12 +222,28 @@ dependencies = [ "wasip2", ] +[[package]] +name = "hashbrown" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5419bdc4f6a9207fbeba6d11b604d481addf78ecd10c11ad51e76c2f6482748d" + [[package]] name = "heck" version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" +[[package]] +name = "indexmap" +version = "2.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6717a8d2a5a929a1a2eb43a12812498ed141a0bcfb7e8f7844fbdbe4303bba9f" +dependencies = [ + "equivalent", + "hashbrown", +] + [[package]] name = "is_terminal_polyfill" version = "1.70.2" @@ -291,66 +251,86 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" [[package]] -name = "jobserver" -version = "0.1.34" +name = "js-sys" +version = "0.3.82" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33" +checksum = "b011eec8cc36da2aab2d5cff675ec18454fad408585853910a202391cf9f8e65" dependencies = [ - "getrandom", - "libc", + "once_cell", + "wasm-bindgen", ] [[package]] -name = "js-sys" -version = "0.3.81" +name = "lexical-core" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec48937a97411dcb524a265206ccd4c90bb711fca92b2792c407f268825b9305" +checksum = "7d8d125a277f807e55a77304455eb7b1cb52f2b18c143b60e766c120bd64a594" dependencies = [ - "once_cell", - "wasm-bindgen", + "lexical-parse-float", + "lexical-parse-integer", + "lexical-util", + "lexical-write-float", + "lexical-write-integer", ] [[package]] -name = "libc" -version = "0.2.177" +name = "lexical-parse-float" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2874a2af47a2325c2001a6e6fad9b16a53b802102b528163885171cf92b15976" +checksum = "52a9f232fbd6f550bc0137dcb5f99ab674071ac2d690ac69704593cb4abbea56" +dependencies = [ + "lexical-parse-integer", + "lexical-util", +] [[package]] -name = "liblzma" -version = "0.3.6" +name = "lexical-parse-integer" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a631d2b24be269775ba8f7789a6afa1ac228346a20c9e87dbbbe4975a79fd764" +checksum = "9a7a039f8fb9c19c996cd7b2fcce303c1b2874fe1aca544edc85c4a5f8489b34" dependencies = [ - "liblzma-sys", + "lexical-util", ] [[package]] -name = "liblzma-sys" -version = "0.3.13" +name = "lexical-util" +version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "efdadf1a99aceff34553de1461674ab6ac7e7f0843ae9875e339f4a14eb43475" +checksum = "2604dd126bb14f13fb5d1bd6a66155079cb9fa655b37f875b3a742c705dbed17" + +[[package]] +name = "lexical-write-float" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50c438c87c013188d415fbabbb1dceb44249ab81664efbd31b14ae55dabb6361" dependencies = [ - "cc", - "libc", - "pkg-config", + "lexical-util", + "lexical-write-integer", ] [[package]] -name = "libz-ng-sys" -version = "1.1.22" +name = "lexical-write-integer" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7118c2c2a3c7b6edc279a8b19507672b9c4d716f95e671172dfa4e23f9fd824" +checksum = "409851a618475d2d5796377cad353802345cba92c867d9fbcde9cf4eac4e14df" dependencies = [ - "cmake", - "libc", + "lexical-util", ] [[package]] -name = "log" -version = "0.4.28" +name = "libc" +version = "0.2.177" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34080505efa8e45a4b816c349525ebe327ceaa8559756f0356cba97ef3bf7432" +checksum = "2874a2af47a2325c2001a6e6fad9b16a53b802102b528163885171cf92b15976" + +[[package]] +name = "libz-rs-sys" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "840db8cf39d9ec4dd794376f38acc40d0fc65eec2a8f484f7fd375b84602becd" +dependencies = [ + "zlib-rs", +] [[package]] name = "memchr" @@ -369,18 +349,102 @@ dependencies = [ ] [[package]] -name = "needletail" -version = "0.6.3" +name = "noodles" +version = "0.102.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "443a31c94fda859fad991e1cc21978b49d42462d868e232970cd9525cc9875ee" +dependencies = [ + "noodles-bam", + "noodles-bgzf", + "noodles-fasta", + "noodles-fastq", + "noodles-sam", +] + +[[package]] +name = "noodles-bam" +version = "0.84.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b80af1d8e5b35c1f711e250008bbaf061f6d6c15063a67cc0f786d168a8c1a2" +dependencies = [ + "bstr", + "indexmap", + "memchr", + "noodles-bgzf", + "noodles-core", + "noodles-csi", + "noodles-sam", +] + +[[package]] +name = "noodles-bgzf" +version = "0.44.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6aa22e1ae8bce4ecf257e2475ef2046026caea08d66b1848d073fe7bc77e4351" +checksum = "7ab0c2585bad37cfc51a55f29c85449400ac51aaade935049d5d9fc5f8add255" dependencies = [ - "buffer-redux", - "bytecount", - "bzip2", + "bytes", + "crossbeam-channel", "flate2", - "liblzma", +] + +[[package]] +name = "noodles-core" +version = "0.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ebfc6396fb51059c98a35bb81b96422ac41ea1bc60b6c8e9c6b2550212e9493" +dependencies = [ + "bstr", +] + +[[package]] +name = "noodles-csi" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d09e31153abd7996f22a50d70f43af6c2ebf96a44ee250326ed15d4e183744c9" +dependencies = [ + "bit-vec", + "bstr", + "indexmap", + "noodles-bgzf", + "noodles-core", +] + +[[package]] +name = "noodles-fasta" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "195be83ea3d724d5e980e42afa841b5d1c6bae22c30c53d5875fcc8fb52f93a0" +dependencies = [ + "bstr", "memchr", - "zstd", + "noodles-bgzf", + "noodles-core", +] + +[[package]] +name = "noodles-fastq" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db9a66608bdbe2a2e1ece0b963a9c3bcbf922ca5600c3b6ba6cb96cf6f8d6458" +dependencies = [ + "bstr", + "memchr", +] + +[[package]] +name = "noodles-sam" +version = "0.80.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a45dc622c7d6bb4c9a971ead740e0fada24c2de1d5e58c31c7a7e385f1be6a04" +dependencies = [ + "bitflags", + "bstr", + "indexmap", + "lexical-core", + "memchr", + "noodles-bgzf", + "noodles-core", + "noodles-csi", ] [[package]] @@ -395,12 +459,6 @@ version = "1.70.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" -[[package]] -name = "pkg-config" -version = "0.3.32" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" - [[package]] name = "ppv-lite86" version = "0.2.21" @@ -421,9 +479,9 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.41" +version = "1.0.42" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce25767e7b499d1b604768e7cde645d14cc8584231ea6b295e9c9eb22c02e1d1" +checksum = "a338cc41d27e6cc6dce6cefc13a0729dfbb81c262b1f519331575dd80ef3067f" dependencies = [ "proc-macro2", ] @@ -463,35 +521,15 @@ dependencies = [ "getrandom", ] -[[package]] -name = "rayon" -version = "1.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "368f01d005bf8fd9b1206fb6fa653e6c4a81ceb1466406b81792d87c5677a58f" -dependencies = [ - "either", - "rayon-core", -] - -[[package]] -name = "rayon-core" -version = "1.13.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91" -dependencies = [ - "crossbeam-deque", - "crossbeam-utils", -] - [[package]] name = "readfaker" version = "0.1.0" dependencies = [ "anyhow", - "bgzip", "clap", "console", - "needletail", + "flate2", + "noodles", "rand", "uuid", ] @@ -503,10 +541,33 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" [[package]] -name = "shlex" -version = "1.3.0" +name = "serde" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", +] + +[[package]] +name = "serde_core" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.228" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] [[package]] name = "simd-adler32" @@ -522,40 +583,20 @@ checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" [[package]] name = "syn" -version = "2.0.108" +version = "2.0.110" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da58917d35242480a05c2897064da0a80589a2a0476c9a3f2fdc83b53502e917" +checksum = "a99801b5bd34ede4cf3fc688c5919368fea4e4814a4664359503e6015b280aea" dependencies = [ "proc-macro2", "quote", "unicode-ident", ] -[[package]] -name = "thiserror" -version = "1.0.69" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" -dependencies = [ - "thiserror-impl", -] - -[[package]] -name = "thiserror-impl" -version = "1.0.69" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - [[package]] name = "unicode-ident" -version = "1.0.20" +version = "1.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "462eeb75aeb73aea900253ce739c8e18a67423fadf006037cd3ff27e82748a06" +checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5" [[package]] name = "unicode-width" @@ -591,9 +632,9 @@ dependencies = [ [[package]] name = "wasm-bindgen" -version = "0.2.104" +version = "0.2.105" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1da10c01ae9f1ae40cbfac0bac3b1e724b320abfcf52229f80b547c0d250e2d" +checksum = "da95793dfc411fbbd93f5be7715b0578ec61fe87cb1a42b12eb625caa5c5ea60" dependencies = [ "cfg-if", "once_cell", @@ -602,25 +643,11 @@ dependencies = [ "wasm-bindgen-shared", ] -[[package]] -name = "wasm-bindgen-backend" -version = "0.2.104" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "671c9a5a66f49d8a47345ab942e2cb93c7d1d0339065d4f8139c486121b43b19" -dependencies = [ - "bumpalo", - "log", - "proc-macro2", - "quote", - "syn", - "wasm-bindgen-shared", -] - [[package]] name = "wasm-bindgen-macro" -version = "0.2.104" +version = "0.2.105" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ca60477e4c59f5f2986c50191cd972e3a50d8a95603bc9434501cf156a9a119" +checksum = "04264334509e04a7bf8690f2384ef5265f05143a4bff3889ab7a3269adab59c2" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -628,22 +655,22 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.104" +version = "0.2.105" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9f07d2f20d4da7b26400c9f4a0511e6e0345b040694e8a75bd41d578fa4421d7" +checksum = "420bc339d9f322e562942d52e115d57e950d12d88983a14c79b86859ee6c7ebc" dependencies = [ + "bumpalo", "proc-macro2", "quote", "syn", - "wasm-bindgen-backend", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" -version = "0.2.104" +version = "0.2.105" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bad67dc8b2a1a6e5448428adec4c3e84c43e561d8c9ee8a9e5aabeb193ec41d1" +checksum = "76f218a38c84bcb33c25ec7059b07847d465ce0e0a76b995e134a45adcb6af76" dependencies = [ "unicode-ident", ] @@ -654,15 +681,6 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" -[[package]] -name = "windows-sys" -version = "0.60.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb" -dependencies = [ - "windows-targets", -] - [[package]] name = "windows-sys" version = "0.61.2" @@ -672,71 +690,6 @@ dependencies = [ "windows-link", ] -[[package]] -name = "windows-targets" -version = "0.53.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4945f9f551b88e0d65f3db0bc25c33b8acea4d9e41163edf90dcd0b19f9069f3" -dependencies = [ - "windows-link", - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_gnullvm", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", -] - -[[package]] -name = "windows_aarch64_gnullvm" -version = "0.53.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a9d8416fa8b42f5c947f8482c43e7d89e73a173cead56d044f6a56104a6d1b53" - -[[package]] -name = "windows_aarch64_msvc" -version = "0.53.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b9d782e804c2f632e395708e99a94275910eb9100b2114651e04744e9b125006" - -[[package]] -name = "windows_i686_gnu" -version = "0.53.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "960e6da069d81e09becb0ca57a65220ddff016ff2d6af6a223cf372a506593a3" - -[[package]] -name = "windows_i686_gnullvm" -version = "0.53.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa7359d10048f68ab8b09fa71c3daccfb0e9b559aed648a8f95469c27057180c" - -[[package]] -name = "windows_i686_msvc" -version = "0.53.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e7ac75179f18232fe9c285163565a57ef8d3c89254a30685b57d83a38d326c2" - -[[package]] -name = "windows_x86_64_gnu" -version = "0.53.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c3842cdd74a865a8066ab39c8a7a473c0778a3f29370b5fd6b4b9aa7df4a499" - -[[package]] -name = "windows_x86_64_gnullvm" -version = "0.53.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ffa179e2d07eee8ad8f57493436566c7cc30ac536a3379fdf008f47f6bb7ae1" - -[[package]] -name = "windows_x86_64_msvc" -version = "0.53.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650" - [[package]] name = "wit-bindgen" version = "0.46.0" @@ -764,29 +717,7 @@ dependencies = [ ] [[package]] -name = "zstd" -version = "0.13.3" +name = "zlib-rs" +version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e91ee311a569c327171651566e07972200e76fcfe2242a4fa446149a3881c08a" -dependencies = [ - "zstd-safe", -] - -[[package]] -name = "zstd-safe" -version = "7.2.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f49c4d5f0abb602a93fb8736af2a4f4dd9512e36f7f570d66e65ff867ed3b9d" -dependencies = [ - "zstd-sys", -] - -[[package]] -name = "zstd-sys" -version = "2.0.16+zstd.1.5.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91e19ebc2adc8f83e43039e79776e3fda8ca919132d68a1fed6a5faca2683748" -dependencies = [ - "cc", - "pkg-config", -] +checksum = "2f06ae92f42f5e5c42443fd094f245eb656abf56dd7cce9b8b263236565e00f2" diff --git a/Cargo.toml b/Cargo.toml index 4f1cbac..a843ffc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,10 +5,10 @@ edition = "2024" [dependencies] anyhow = "1.0.100" -bgzip = { version = "0.3.1", features = ["zlib-ng"] } -clap = { version = "4.5.50", features = ["derive"] } +clap = { version = "4.5.52", features = ["derive"] } console = "0.16.1" -needletail = "0.6.3" +flate2 = {version = "1.1.5", features = ["zlib-rs"]} +noodles = { version = "0.102.0", features = ["fasta", "fastq", "bgzf", "bam", "sam"] } rand = "0.9.2" uuid = { version = "1.18.1", features = ["v4"] } diff --git a/README.md b/README.md index 3e6a94f..64f1320 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,10 @@ from real FASTQ data. ## Features - Creates empirical models for read length and quality scores (quality scores are grouped by length batches). -- Supports compressed input and output FASTQ files. +- Supports both FASTQ and BAM formats for input and output. +- Automatic compression detection for input files (gzip/BGZF). +- Multi-threaded BGZF compression for output files. +- Configurable error rates and indel extension probabilities. - Fast: can generate a million reads in under a minute. ## Motivation @@ -23,9 +26,7 @@ to begin with). ## Current Limitations / Planned Improvements -- Insertions and deletions are limited to one nucleotide length. Alteration type (substitution, insertion, deletion) ratios are fixed. - Only generates modified sequences, not chimeras, junk reads and other types of artifacts. -- No BAM files support. ## Installation @@ -35,22 +36,28 @@ platform. ## Usage ```bash -readfaker -r -i -o -n +readfaker -r -i -o -n ``` ### Required Arguments - `-r, --reference ` - Reference sequences to sample reads from -- `-i, --input ` - Input FASTQ file to extract quality and length models -- `-o, --output ` - Output FASTQ file for simulated reads +- `-i, --input ` - Input file to extract quality and length models (FASTQ or BAM) +- `-o, --output ` - Output file for simulated reads (FASTQ or BAM, detected by extension) ### Optional Arguments - `-n, --num-reads ` - Number of reads to generate (default: 100000) - `-s, --seed ` - Random seed for reproducibility +- `--compression-threads ` - Number of compression threads for output (default: 4) +- `--error-sub ` - Error substitution rate (default: 0.7) +- `--error-ins ` - Error insertion rate (default: 0.1) +- `--error-del ` - Error deletion rate (default: 0.2) +- `--error-ins-ext ` - Insertion extension probability using geometric distribution (default: 0.4) +- `--error-del-ext ` - Deletion extension probability using geometric distribution (default: 0.4) - `-v, --verbose` - Enable verbose output -### Example +### Examples ```bash # Generate 10000 reads with verbose output @@ -58,15 +65,21 @@ readfaker -r genome.fasta -i real_reads.fastq.gz -o simulated_reads.fastq.gz -n # Generate reproducible reads with a fixed seed readfaker -r genome.fasta -i real_reads.fastq -o simulated_reads.fastq -s 42 + +# Use BAM input and output with custom error rates +readfaker -r genome.fasta -i real_reads.bam -o simulated_reads.bam -n 50000 --error-sub 0.6 --error-ins 0.15 --error-del 0.25 + +# Adjust compression threads for better performance +readfaker -r genome.fasta -i real_reads.fastq.gz -o simulated_reads.fastq.gz -n 1000000 --compression-threads 8 ``` ## How It Works -1. **Model Extraction**: Reads an existing FASTQ file to build empirical models of read lengths and quality scores +1. **Model Extraction**: Reads an existing FASTQ or BAM file to build empirical models of read lengths and quality scores 2. **Reference Loading**: Parses reference genome sequences from FASTA format 3. **Read Generation**: Samples read lengths, selects random reference positions, applies quality profiles, and - introduces errors based on quality scores -4. **Output**: Writes FASTQ records with automatic BGZF compression for `.gz`, `.bgz`, or `.bgzf` files + introduces errors based on quality scores with configurable error rates and indel extension probabilities +4. **Output**: Writes FASTQ or BAM records with automatic multi-threaded BGZF compression for `.gz`, `.bgz`, `.bgzf`, or `.bam` files ## Building from Source diff --git a/src/cli.rs b/src/cli.rs index c67a8bd..55b1fd4 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -25,12 +25,12 @@ pub struct Cli { #[arg(short = 'r', long, value_name = "FASTA")] pub reference: PathBuf, - /// Input FASTQ file to extract quality and length models - #[arg(short = 'i', long, value_name = "FASTQ")] + /// Input file to extract quality and length models (FASTQ or BAM) + #[arg(short = 'i', long, value_name = "FILE")] pub input: PathBuf, - /// Output FASTQ file for simulated reads - #[arg(short = 'o', long, value_name = "FASTQ")] + /// Output file for simulated reads (FASTQ or BAM, detected by extension) + #[arg(short = 'o', long, value_name = "FILE")] pub output: PathBuf, /// Number of reads to generate @@ -41,6 +41,30 @@ pub struct Cli { #[arg(short = 's', long)] pub seed: Option, + /// Number of compression threads (default: 4, use 0 for auto-detection) + #[arg(long = "compression-threads", default_value = "4")] + pub compression_threads: usize, + + /// Error substitution rate (default: 0.7) + #[arg(long, value_name = "RATE")] + pub error_sub: Option, + + /// Error insertion rate (default: 0.1) + #[arg(long, value_name = "RATE")] + pub error_ins: Option, + + /// Error deletion rate (default: 0.2) + #[arg(long, value_name = "RATE")] + pub error_del: Option, + + /// Error insertion extension rate (default: 0.4) + #[arg(long, value_name = "RATE")] + pub error_ins_ext: Option, + + /// Error deletion extension rate (default: 0.4) + #[arg(long, value_name = "RATE")] + pub error_del_ext: Option, + /// Enable verbose output #[arg(short, long)] pub verbose: bool, @@ -60,7 +84,11 @@ pub mod fmt { /// Format a parameter with proper alignment (width must account for raw text length) pub fn param_aligned(text: &str, width: usize) -> String { - format!("{: String { diff --git a/src/generator.rs b/src/generator.rs index 155653d..625aaf5 100644 --- a/src/generator.rs +++ b/src/generator.rs @@ -1,9 +1,9 @@ -use crate::io::FastqRecord; use crate::io::fasta::FastaRecord; use crate::models::error::AlterationType; use crate::models::{ErrorModel, LengthModel, QualityModel}; use crate::utils::QUALITY_MAPPING; use anyhow::{Result, anyhow, bail}; +use noodles::fastq; use rand::prelude::IndexedRandom; use rand::rngs::StdRng; use rand::{Rng, SeedableRng}; @@ -33,7 +33,7 @@ const PHRED_OFFSET: u8 = 33; /// let mut quality_model = QualityModel::new(None, None, None); /// let mut rng = StdRng::seed_from_u64(42); /// quality_model.add_value(100, vec![b':'; 100], &mut rng); -/// let error_model = ErrorModel::new(None, None, None).unwrap(); +/// let error_model = ErrorModel::new(None, None, None, None, None).unwrap(); /// /// let mut generator = ReadGenerator::new( /// references, @@ -98,11 +98,11 @@ impl ReadGenerator { /// Automatically retries if the sampled length exceeds the reference sequence length. /// /// # Returns - /// A `FastqRecord` with simulated sequencing errors based on quality scores + /// A `fastq::Record` with simulated sequencing errors based on quality scores /// /// # Errors /// Returns an error if the length or quality models are empty - pub fn generate_read(&mut self) -> Result { + pub fn generate_read(&mut self) -> Result { loop { let length = self .length_model @@ -126,11 +126,12 @@ impl ReadGenerator { let (final_sequence, final_qualities) = self.apply_errors(sequence, qualities); - return Ok(FastqRecord { - id: format!("{}", Uuid::new_v4()), - sequence: final_sequence, - quality: final_qualities, - }); + let id = format!("{}", Uuid::new_v4()); + return Ok(fastq::Record::new( + fastq::record::Definition::new(id, ""), + final_sequence, + final_qualities, + )); } } @@ -234,7 +235,7 @@ mod tests { let mut length_model = LengthModel::new(); let mut quality_model = QualityModel::new(None, None, None); - let error_model = ErrorModel::new(None, None, None).unwrap(); + let error_model = ErrorModel::new(None, None, None, None, None).unwrap(); let mut rng = StdRng::seed_from_u64(42); length_model.add_value(10); @@ -264,13 +265,14 @@ mod tests { // Generate multiple reads to verify the generator can be reused for _ in 0..5 { let read = generator.generate_read().unwrap(); - assert_eq!(read.len(), 10); - assert!(read.quality.iter().all(|&q| q >= PHRED_OFFSET)); + assert_eq!(read.sequence().len(), 10); + assert!(read.quality_scores().iter().all(|&q| q >= PHRED_OFFSET)); // Verify the ID is a valid UUID + let name_str = std::str::from_utf8(read.name()).unwrap(); assert!( - Uuid::parse_str(&read.id).is_ok(), + Uuid::parse_str(name_str).is_ok(), "Expected valid UUID, got: {}", - read.id + name_str ); } } diff --git a/src/io/bam.rs b/src/io/bam.rs new file mode 100644 index 0000000..4b7556e --- /dev/null +++ b/src/io/bam.rs @@ -0,0 +1,255 @@ +//! BAM file reading and writing. + +use anyhow::{Context, Result}; +use noodles::bam; +use noodles::bgzf; +use noodles::sam; +use noodles::sam::alignment::RecordBuf; +use std::fs::File; +use std::path::{Path, PathBuf}; + +/// Reader for BAM files +pub struct BamReader; + +impl BamReader { + /// Open a BAM file and return a streaming iterator over records + /// + /// # Example + /// ```no_run + /// use readfaker::io::bam::BamReader; + /// use std::path::Path; + /// + /// let reader = BamReader::from_path(Path::new("input.bam"))?; + /// for record in reader { + /// let record = record?; + /// println!("Read: {}, length: {}", + /// record.name().map(|n| n.to_string()).unwrap_or_default(), + /// record.sequence().len()); + /// } + /// # Ok::<(), anyhow::Error>(()) + /// ``` + pub fn from_path(path: &Path) -> Result { + let mut reader = bam::io::reader::Builder::default() + .build_from_path(path) + .with_context(|| format!("Failed to open BAM file: {}", path.display()))?; + + let header = reader.read_header().context("Failed to read BAM header")?; + + Ok(BamReaderIterator { reader, header }) + } +} + +/// Iterator over BAM records +pub struct BamReaderIterator { + reader: bam::io::Reader>, + header: sam::Header, +} + +impl Iterator for BamReaderIterator { + type Item = Result; + + fn next(&mut self) -> Option { + self.reader.records().next().map(|result| { + result + .map_err(|e| anyhow::Error::new(e).context("Failed to read BAM record")) + .and_then(|record| { + RecordBuf::try_from_alignment_record(&self.header, &record) + .map_err(|e| anyhow::Error::new(e).context("Failed to parse BAM record")) + }) + }) + } +} + +/// Writer for BAM files +pub struct BamWriter { + writer: bam::io::Writer>, + header: sam::Header, +} + +impl BamWriter { + /// Creates a new BAM writer for the specified file path. + /// + /// Creates a minimal header for unaligned reads. + /// + /// # Arguments + /// * `path` - Path to the output BAM file + /// * `compression_threads` - Number of compression threads (0 = auto-detect) + pub fn new(path: &PathBuf, compression_threads: usize) -> Result { + // Create a minimal header for unaligned reads + let header = sam::Header::builder() + .add_comment("Generated by ReadFaker") + .build(); + + let file = File::create(path) + .with_context(|| format!("Failed to create BAM file: {}", path.display()))?; + + // Use specified threads or auto-detect CPU cores + let worker_count = if compression_threads == 0 { + std::thread::available_parallelism() + .map(|n| n.get()) + .unwrap_or(4) // Fallback to 4 threads + } else { + compression_threads + }; + + let bgzf_writer = bgzf::io::MultithreadedWriter::with_worker_count( + std::num::NonZero::new(worker_count).unwrap(), + file, + ); + + let mut writer = bam::io::Writer::from(bgzf_writer); + + writer + .write_header(&header) + .context("Failed to write BAM header")?; + + Ok(Self { writer, header }) + } + + /// Writes a single alignment record as an unaligned BAM record. + /// + /// # Arguments + /// * `name` - Read name/identifier + /// * `sequence` - Nucleotide sequence + /// * `quality_ascii` - Quality scores in Phred+33 ASCII encoding + pub fn write_record( + &mut self, + name: &str, + sequence: &[u8], + quality_ascii: &[u8], + ) -> Result<()> { + use noodles::sam::alignment::io::Write as AlignmentWrite; + + // Convert Phred+33 ASCII to raw Phred scores (0-93) for BAM format + let quality_phred: Vec = quality_ascii + .iter() + .map(|&q| q.saturating_sub(33)) + .collect(); + + // Create an unaligned SAM record + let record = RecordBuf::builder() + .set_name(name.as_bytes()) + .set_flags(sam::alignment::record::Flags::UNMAPPED) + .set_sequence(sequence.to_vec().into()) + .set_quality_scores(quality_phred.into()) + .build(); + + self.writer + .write_alignment_record(&self.header, &record) + .context("Failed to write BAM record") + } + + /// Flushes the internal buffer to ensure all data is written. + pub fn flush(&mut self) -> Result<()> { + use std::io::Write; + self.writer + .get_mut() + .flush() + .context("Failed to flush BAM writer") + } + + /// Finishes the writer, properly finalizing the BAM file. + /// + /// This writes the BGZF EOF marker and flushes all buffered data. + /// Should be called explicitly to ensure proper file finalization. + pub fn finish(self) -> Result<()> { + // Get the underlying BGZF writer and call finish to write EOF marker + self.writer + .into_inner() + .finish() + .map(|_| ()) + .map_err(|e| anyhow::anyhow!("Failed to finish BAM writer: {}", e)) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_bam_write_and_read() { + let temp_file = std::env::temp_dir().join("readfaker_test.bam"); + std::fs::remove_file(&temp_file).ok(); + + // Write test records with ASCII quality scores (Phred+33) + // 'I' = 73 (ASCII) = Phred 40 + // 'J' = 74 (ASCII) = Phred 41 + { + let mut writer = BamWriter::new(&temp_file, 4).unwrap(); + writer.write_record("read1", b"ACGT", b"IIII").unwrap(); + writer.write_record("read2", b"TGCA", b"JJJJ").unwrap(); + writer.finish().unwrap(); + } + + // Read back and verify + let records: Vec = BamReader::from_path(&temp_file) + .unwrap() + .collect::>>() + .unwrap(); + + assert_eq!(records.len(), 2); + assert_eq!( + records[0].name().map(|n| n.to_string()), + Some("read1".to_string()) + ); + assert_eq!(records[0].sequence().as_ref(), b"ACGT"); + + // Verify quality scores are stored as raw Phred (not ASCII) + // 'I' (ASCII 73) should be stored as raw Phred 40 + let qualities_read1: Vec = records[0].quality_scores().as_ref().to_vec(); + assert_eq!( + qualities_read1, + vec![40, 40, 40, 40], + "BAM should store raw Phred scores (40), not ASCII (73)" + ); + + assert_eq!( + records[1].name().map(|n| n.to_string()), + Some("read2".to_string()) + ); + + // 'J' (ASCII 74) should be stored as raw Phred 41 + let qualities_read2: Vec = records[1].quality_scores().as_ref().to_vec(); + assert_eq!( + qualities_read2, + vec![41, 41, 41, 41], + "BAM should store raw Phred scores (41), not ASCII (74)" + ); + + std::fs::remove_file(temp_file).ok(); + } + + #[test] + fn test_bam_eof_marker() { + use std::io::{Read, Seek}; + + let temp_file = std::env::temp_dir().join("readfaker_test_eof.bam"); + std::fs::remove_file(&temp_file).ok(); + + // Write a BAM file + { + let mut writer = BamWriter::new(&temp_file, 4).unwrap(); + writer.write_record("read1", b"ACGT", b"IIII").unwrap(); + writer.finish().unwrap(); + } + + // Verify BGZF EOF marker is present at the end + let mut file = File::open(&temp_file).unwrap(); + file.seek(std::io::SeekFrom::End(-28)).unwrap(); + let mut eof_block = [0u8; 28]; + file.read_exact(&mut eof_block).unwrap(); + + // BGZF EOF marker is a specific 28-byte block + let expected_eof = [ + 0x1f, 0x8b, 0x08, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x06, 0x00, 0x42, 0x43, + 0x02, 0x00, 0x1b, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + ]; + + assert_eq!( + eof_block, expected_eof, + "BAM file should end with BGZF EOF marker" + ); + + std::fs::remove_file(temp_file).ok(); + } +} diff --git a/src/io/fasta.rs b/src/io/fasta.rs index 839475c..ee36a9c 100644 --- a/src/io/fasta.rs +++ b/src/io/fasta.rs @@ -1,7 +1,9 @@ -//! FASTA file reading using needletail. +//! FASTA file reading. use anyhow::{Context, Result, bail}; -use needletail::{Sequence, parse_fastx_file}; +use noodles::fasta; +use std::fs::File; +use std::io::BufReader; use std::path::Path; /// Represents a FASTA sequence with its ID and nucleotide sequence. @@ -25,15 +27,16 @@ impl FastaReader { pub fn read(path: &Path) -> Result> { let mut records = Vec::new(); - let mut reader = parse_fastx_file(path) + let file = File::open(path) .with_context(|| format!("Failed to open FASTA file: {}", path.display()))?; + let mut reader = fasta::io::Reader::new(BufReader::new(file)); - while let Some(record) = reader.next() { - let record = record + for result in reader.records() { + let record = result .with_context(|| format!("Failed to parse FASTA record in {}", path.display()))?; - let id = String::from_utf8_lossy(record.id()).to_string(); - let sequence = record.normalize(false).to_vec(); + let id = String::from_utf8_lossy(record.name()).to_string(); + let sequence = record.sequence().as_ref().to_vec(); records.push(FastaRecord { id, sequence }); } diff --git a/src/io/fastq.rs b/src/io/fastq.rs index 696c653..175ca10 100644 --- a/src/io/fastq.rs +++ b/src/io/fastq.rs @@ -1,63 +1,23 @@ //! FASTQ file reading and writing. use anyhow::{Context, Result}; -use bgzip::Compression; -use bgzip::write::BGZFMultiThreadWriter; -use needletail::{Sequence, parse_fastx_file}; +use flate2::read::MultiGzDecoder; +use noodles::bgzf; +use noodles::fastq; use std::fs::File; -use std::io::{self, BufWriter, Write}; +use std::io::{BufRead, BufReader, BufWriter, Write}; use std::path::{Path, PathBuf}; -/// Represents a FASTQ record with sequence and quality scores. -#[derive(Debug, Clone)] -pub struct FastqRecord { - pub id: String, - pub sequence: Vec, - pub quality: Vec, -} - -impl FastqRecord { - /// Creates a new FASTQ record with validation. - /// - /// # Arguments - /// * `id` - Read identifier - /// * `sequence` - Nucleotide sequence - /// * `quality` - Quality scores (must match sequence length) - /// - /// # Returns - /// Validated FASTQ record or error if lengths don't match - pub fn new(id: String, sequence: Vec, quality: Vec) -> Result { - if sequence.len() != quality.len() { - anyhow::bail!( - "Sequence length ({}) does not match quality length ({})", - sequence.len(), - quality.len() - ); - } - Ok(Self { - id, - sequence, - quality, - }) - } - - /// Returns the length of the sequence. - pub fn len(&self) -> usize { - self.sequence.len() - } - - /// Checks whether the sequence is empty. - pub fn is_empty(&self) -> bool { - self.sequence.is_empty() - } -} - /// Reader for FASTQ files -pub struct FastqReader; +pub struct FastqReader { + reader: fastq::io::Reader>, +} impl FastqReader { /// Open a FASTQ file and return a streaming iterator over records /// + /// Automatically detects and handles both compressed (gzip/bgzf) and uncompressed FASTQ files. + /// /// # Example /// ```no_run /// use readfaker::io::fastq::FastqReader; @@ -66,56 +26,81 @@ impl FastqReader { /// let reader = FastqReader::from_path(Path::new("input.fastq"))?; /// for record in reader { /// let record = record?; - /// println!("Read: {}, length: {}", record.id, record.len()); + /// println!("Read: {}, length: {}", std::str::from_utf8(record.name()).unwrap(), record.sequence().len()); /// } /// # Ok::<(), anyhow::Error>(()) /// ``` - pub fn from_path(path: &Path) -> Result> + '_> { - let mut reader = parse_fastx_file(path) + pub fn from_path(path: &Path) -> Result { + let file = File::open(path) .with_context(|| format!("Failed to open FASTQ file: {}", path.display()))?; - Ok(std::iter::from_fn(move || { - reader.next().map(|result| { - result - .context("Failed to parse FASTQ record") - .and_then(|record| { - let id = String::from_utf8_lossy(record.id()).to_string(); - let sequence = record.normalize(false).to_vec(); - let quality = record - .qual() - .context("Missing quality scores in FASTQ record")? - .to_vec(); - - Ok(FastqRecord { - id, - sequence, - quality, - }) - }) - }) - })) + // Check if file is gzip-compressed by reading magic bytes + let mut buffered = BufReader::new(file); + let is_compressed = is_gzip_compressed(&mut buffered)?; + + let reader: Box = if is_compressed { + // Use MultiGzDecoder which handles both regular gzip and BGZF + Box::new(BufReader::new(MultiGzDecoder::new(buffered))) + } else { + Box::new(buffered) + }; + + Ok(Self { + reader: fastq::io::Reader::new(reader), + }) + } +} + +impl Iterator for FastqReader { + type Item = Result; + + fn next(&mut self) -> Option { + let mut record = fastq::Record::default(); + + match self.reader.read_record(&mut record) { + Ok(0) => None, + Ok(_) => Some(Ok(record)), + Err(e) => Some(Err( + anyhow::Error::new(e).context("Failed to parse FASTQ record") + )), + } } } /// Internal writer implementation supporting both uncompressed and BGZF-compressed output. enum FastqWriterInner { - Uncompressed(BufWriter), - Compressed(BGZFMultiThreadWriter), + Uncompressed(fastq::io::Writer>), + Compressed(fastq::io::Writer>), } -impl Write for FastqWriterInner { - fn write(&mut self, buf: &[u8]) -> io::Result { +impl FastqWriterInner { + fn write_record(&mut self, record: &fastq::Record) -> std::io::Result<()> { match self { - FastqWriterInner::Uncompressed(w) => w.write(buf), - FastqWriterInner::Compressed(w) => w.write(buf), + FastqWriterInner::Uncompressed(w) => w.write_record(record), + FastqWriterInner::Compressed(w) => w.write_record(record), } } - fn flush(&mut self) -> io::Result<()> { + fn flush_writer(&mut self) -> std::io::Result<()> { match self { - FastqWriterInner::Uncompressed(w) => w.flush(), - // Don't flush BGZFWriter - it handles finalization in its Drop implementation - FastqWriterInner::Compressed(_) => Ok(()), + FastqWriterInner::Uncompressed(w) => w.get_mut().flush(), + FastqWriterInner::Compressed(w) => w.get_mut().flush(), + } + } + + fn finish(self) -> Result<()> { + match self { + FastqWriterInner::Uncompressed(mut w) => w + .get_mut() + .flush() + .context("Failed to flush uncompressed writer"), + FastqWriterInner::Compressed(w) => { + // Get the underlying BGZF writer and finish it to shutdown threads and write EOF + w.into_inner() + .finish() + .map(|_| ()) // Discard the returned File handle + .map_err(|e| anyhow::anyhow!("Failed to finish BGZF writer: {}", e)) + } } } } @@ -131,20 +116,24 @@ impl Write for FastqWriterInner { /// /// # Example /// ```no_run -/// use readfaker::io::fastq::{FastqWriter, FastqRecord}; +/// use readfaker::io::fastq::FastqWriter; +/// use noodles::fastq; /// use std::path::PathBuf; /// /// // Uncompressed output -/// let mut writer = FastqWriter::new(&PathBuf::from("output.fastq"))?; +/// let mut writer = FastqWriter::new(&PathBuf::from("output.fastq"), 4)?; +/// +/// // Compressed output (BGZF) with auto-detected threads +/// let mut writer_gz = FastqWriter::new(&PathBuf::from("output.fastq.gz"), 0)?; /// -/// // Compressed output (BGZF) -/// let mut writer_gz = FastqWriter::new(&PathBuf::from("output.fastq.gz"))?; +/// // Compressed output with 4 threads +/// let mut writer_4t = FastqWriter::new(&PathBuf::from("output.fastq.gz"), 4)?; /// -/// let record = FastqRecord::new( -/// "read1".to_string(), -/// b"ACGT".to_vec(), -/// b"IIII".to_vec() -/// )?; +/// let record = fastq::Record::new( +/// fastq::record::Definition::new("read1", ""), +/// b"ACGT", +/// b"IIII", +/// ); /// writer.write_record(&record)?; /// writer.flush()?; // Explicitly flush to handle errors /// # Ok::<(), anyhow::Error>(()) @@ -162,37 +151,46 @@ impl FastqWriter { /// /// # Arguments /// * `path` - Path to the output FASTQ file - pub fn new(path: &PathBuf) -> Result { + /// * `compression_threads` - Number of compression threads (0 = auto-detect) + pub fn new(path: &PathBuf, compression_threads: usize) -> Result { let file = File::create(path) .with_context(|| format!("Failed to create FASTQ file: {}", path.display()))?; - let writer = if should_bgzf_compress(path) { - FastqWriterInner::Compressed(BGZFMultiThreadWriter::new(file, Compression::default())) + let writer = if should_compress(path) { + // Use specified threads or auto-detect CPU cores + let worker_count = if compression_threads == 0 { + std::thread::available_parallelism() + .map(|n| n.get()) + .unwrap_or(4) // Fallback to 4 threads + } else { + compression_threads + }; + + let bgzf_writer = bgzf::io::MultithreadedWriter::with_worker_count( + std::num::NonZero::new(worker_count).unwrap(), + file, + ); + + FastqWriterInner::Compressed(fastq::io::Writer::new(bgzf_writer)) } else { - FastqWriterInner::Uncompressed(BufWriter::new(file)) + FastqWriterInner::Uncompressed(fastq::io::Writer::new(BufWriter::new(file))) }; Ok(Self { writer }) } /// Writes a single FASTQ record. - pub fn write_record(&mut self, record: &FastqRecord) -> Result<()> { - (|| -> io::Result<()> { - writeln!(self.writer, "@{}", record.id)?; - self.writer.write_all(&record.sequence)?; - self.writer.write_all(b"\n+\n")?; - self.writer.write_all(&record.quality)?; - self.writer.write_all(b"\n")?; - Ok(()) - })() - .context("Failed to write FASTQ record") + pub fn write_record(&mut self, record: &fastq::Record) -> Result<()> { + self.writer + .write_record(record) + .context("Failed to write FASTQ record") } /// Writes multiple FASTQ records. /// /// # Arguments /// * `records` - Slice of FASTQ records to write - pub fn write_records(&mut self, records: &[FastqRecord]) -> Result<()> { + pub fn write_records(&mut self, records: &[fastq::Record]) -> Result<()> { for record in records { self.write_record(record)?; } @@ -203,44 +201,52 @@ impl FastqWriter { /// /// It's recommended to call this explicitly before the writer is dropped /// to ensure flush errors are properly handled. - /// - /// **Note:** For BGZF-compressed writers, this does not force immediate finalization. - /// The compressed stream is properly closed when the writer is dropped. pub fn flush(&mut self) -> Result<()> { - self.writer.flush().context("Failed to flush FASTQ writer") + self.writer + .flush_writer() + .context("Failed to flush FASTQ writer") + } + + /// Finishes the writer, properly shutting down compression threads if applicable. + /// + /// For compressed writers, this shuts down the thread pool and writes the final BGZF EOF block. + /// This should be called explicitly before the writer is dropped to ensure proper finalization. + pub fn finish(self) -> Result<()> { + self.writer.finish() } } -fn should_bgzf_compress(path: &Path) -> bool { +/// Helper function to check if a file is gzip-compressed +fn is_gzip_compressed(reader: &mut BufReader) -> Result { + let buffer = reader.fill_buf().context("Failed to read file header")?; + + // Check for gzip magic bytes (0x1f 0x8b) + Ok(buffer.len() >= 2 && buffer[0] == 0x1f && buffer[1] == 0x8b) +} + +/// Helper function to check if a file should be compressed based on its extension +fn should_compress(path: &Path) -> bool { path.extension() .and_then(|ext| ext.to_str()) - .is_some_and(|ext| { - ["gz", "bgz", "bgzf"] - .iter() - .any(|s| ext.eq_ignore_ascii_case(s)) - }) + .is_some_and(|ext| ["gz", "bgz"].iter().any(|s| ext.eq_ignore_ascii_case(s))) } #[cfg(test)] mod tests { use super::*; - #[test] - fn test_fastq_record_new() { - let record = FastqRecord::new("read1".to_string(), b"ACGT".to_vec(), b"IIII".to_vec()); - assert!(record.is_ok()); - assert_eq!(record.unwrap().len(), 4); - } - #[test] fn test_fastq_writer() { let temp_dir = std::env::temp_dir(); let temp_file = temp_dir.join("test.fastq"); { - let mut writer = FastqWriter::new(&temp_file).unwrap(); - let record = - FastqRecord::new("read1".to_string(), b"ACGT".to_vec(), b"IIII".to_vec()).unwrap(); + let mut writer = FastqWriter::new(&temp_file, 4).unwrap(); + let record = fastq::Record::new( + fastq::record::Definition::new("read1", ""), + b"ACGT", + b"IIII", + ); writer.write_record(&record).unwrap(); writer.flush().unwrap(); } @@ -258,15 +264,17 @@ mod tests { std::fs::remove_file(&temp_file).ok(); { - let mut writer = FastqWriter::new(&temp_file).unwrap(); - let record1 = - FastqRecord::new("read1".to_string(), b"ACGT".to_vec(), b"IIII".to_vec()).unwrap(); - let record2 = FastqRecord::new( - "read2".to_string(), - b"TGCATGCA".to_vec(), - b"IIIIIIII".to_vec(), - ) - .unwrap(); + let mut writer = FastqWriter::new(&temp_file, 4).unwrap(); + let record1 = fastq::Record::new( + fastq::record::Definition::new("read1", ""), + b"ACGT", + b"IIII", + ); + let record2 = fastq::Record::new( + fastq::record::Definition::new("read2", ""), + b"TGCATGCA", + b"IIIIIIII", + ); writer.write_record(&record1).unwrap(); writer.write_record(&record2).unwrap(); @@ -274,9 +282,10 @@ mod tests { } // Verify the file is actually gzip-compressed by checking magic bytes + use std::io::Read; let mut file = File::open(&temp_file).unwrap(); let mut magic_bytes = [0u8; 2]; - io::Read::read_exact(&mut file, &mut magic_bytes).unwrap(); + file.read_exact(&mut magic_bytes).unwrap(); assert_eq!( magic_bytes, [0x1f, 0x8b], @@ -284,24 +293,23 @@ mod tests { ); // Read back and verify content - let records: Vec = FastqReader::from_path(&temp_file) + let records: Vec = FastqReader::from_path(&temp_file) .unwrap() .collect::>>() .unwrap(); assert_eq!(records.len(), 2); - assert_eq!(records[0].id, "read1"); - assert_eq!(records[0].sequence, b"ACGT"); - assert_eq!(records[1].id, "read2"); + assert_eq!(records[0].name(), "read1"); + assert_eq!(records[0].sequence(), b"ACGT"); + assert_eq!(records[1].name(), "read2"); std::fs::remove_file(temp_file).ok(); } #[test] fn test_should_bgzf_compress_suffixes() { - assert!(should_bgzf_compress(Path::new("reads.fastq.gz"))); - assert!(should_bgzf_compress(Path::new("reads.fastq.bgz"))); - assert!(should_bgzf_compress(Path::new("reads.fastq.bgzf"))); - assert!(should_bgzf_compress(Path::new("reads.GZ"))); - assert!(!should_bgzf_compress(Path::new("reads.fastq"))); + assert!(should_compress(Path::new("reads.fastq.gz"))); + assert!(should_compress(Path::new("reads.fastq.bgz"))); + assert!(should_compress(Path::new("reads.GZ"))); + assert!(!should_compress(Path::new("reads.fastq"))); } } diff --git a/src/io/mod.rs b/src/io/mod.rs index 74fa236..f178943 100644 --- a/src/io/mod.rs +++ b/src/io/mod.rs @@ -1,10 +1,12 @@ //! I/O module for reading and writing sequence files. //! -//! Provides readers and writers for FASTA and FASTQ file formats. +//! Provides readers and writers for FASTA, FASTQ, and BAM file formats. +pub mod bam; pub mod fasta; pub mod fastq; // Re-export main types +pub use bam::{BamReader, BamWriter}; pub use fasta::FastaReader; -pub use fastq::{FastqRecord, FastqWriter}; +pub use fastq::FastqWriter; diff --git a/src/main.rs b/src/main.rs index 4d96f34..392bfb6 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,8 +1,8 @@ use anyhow::Result; use clap::Parser; -use readfaker::cli::{fmt, Cli}; +use readfaker::cli::{Cli, fmt}; use readfaker::generator::ReadGenerator; -use readfaker::io::{FastaReader, FastqWriter}; +use readfaker::io::{BamWriter, FastaReader, FastqWriter}; use readfaker::models::ErrorModel; use readfaker::utils::load_models; @@ -11,10 +11,26 @@ fn main() -> Result<()> { if cli.verbose { eprintln!("{}", fmt::header("ReadFaker Configuration")); - eprintln!("{}: {}", fmt::param_aligned("Reference", 16), cli.reference.display()); - eprintln!("{}: {}", fmt::param_aligned("Input", 16), cli.input.display()); - eprintln!("{}: {}", fmt::param_aligned("Output", 16), cli.output.display()); - eprintln!("{}: {}", fmt::param_aligned("Number of reads", 16), cli.num_reads); + eprintln!( + "{}: {}", + fmt::param_aligned("Reference", 16), + cli.reference.display() + ); + eprintln!( + "{}: {}", + fmt::param_aligned("Input", 16), + cli.input.display() + ); + eprintln!( + "{}: {}", + fmt::param_aligned("Output", 16), + cli.output.display() + ); + eprintln!( + "{}: {}", + fmt::param_aligned("Number of reads", 16), + cli.num_reads + ); if let Some(seed) = cli.seed { eprintln!("{}: {}", fmt::param_aligned("Random seed", 16), seed); } @@ -26,7 +42,44 @@ fn main() -> Result<()> { } let (length_model, quality_model) = load_models(&cli.input, cli.seed)?; - let error_model = ErrorModel::new(None, None, None)?; + let error_model = ErrorModel::new( + cli.error_sub, + cli.error_ins, + cli.error_del, + cli.error_ins_ext, + cli.error_del_ext, + )?; + + if cli.verbose { + eprintln!("Error Model Configuration:"); + eprintln!( + "{}: {:.2}", + fmt::param_aligned("Substitution rate", 20), + error_model.substitution_rate + ); + eprintln!( + "{}: {:.2}", + fmt::param_aligned("Insertion rate", 20), + error_model.insertion_rate + ); + eprintln!( + "{}: {:.2}", + fmt::param_aligned("Deletion rate", 20), + error_model.deletion_rate + ); + eprintln!( + "{}: {:.2}", + fmt::param_aligned("Ins. extension rate", 20), + error_model.insertion_extension_rate + ); + eprintln!( + "{}: {:.2}", + fmt::param_aligned("Del. extension rate", 20), + error_model.deletion_extension_rate + ); + eprintln!(); + } + let mut generator = ReadGenerator::new( FastaReader::read(&cli.reference)?, length_model, @@ -34,19 +87,48 @@ fn main() -> Result<()> { error_model, cli.seed, )?; - let mut writer = FastqWriter::new(&cli.output)?; + + // Detect output format based on extension + let output_ext = cli + .output + .extension() + .and_then(|s| s.to_str()) + .unwrap_or("") + .to_lowercase(); if cli.verbose { - eprintln!("{}", fmt::progress(format!("Generating {} reads...", cli.num_reads))); + eprintln!( + "{}", + fmt::progress(format!("Generating {} reads...", cli.num_reads)) + ); } - for _ in 0..cli.num_reads { - let read = generator.generate_read()?; - writer.write_record(&read)?; + match output_ext.to_lowercase().as_str() { + "bam" => { + let mut writer = BamWriter::new(&cli.output, cli.compression_threads)?; + for _ in 0..cli.num_reads { + let read = generator.generate_read()?; + let name = std::str::from_utf8(read.name()).expect("UUID should be valid UTF-8"); + writer.write_record(name, read.sequence(), read.quality_scores())?; + } + writer.finish()?; + } + _ => { + // Default to FASTQ for all other extensions + let mut writer = FastqWriter::new(&cli.output, cli.compression_threads)?; + for _ in 0..cli.num_reads { + let read = generator.generate_read()?; + writer.write_record(&read)?; + } + writer.finish()?; + } } if cli.verbose { - eprintln!("{}", fmt::success(format!("Output written to {}", cli.output.display()))); + eprintln!( + "{}", + fmt::success(format!("Output written to {}", cli.output.display())) + ); } Ok(()) diff --git a/src/models/error.rs b/src/models/error.rs index 8ac2732..082ac27 100644 --- a/src/models/error.rs +++ b/src/models/error.rs @@ -4,6 +4,9 @@ use rand::Rng; const SUBSTITUTION_DEFAULT_RATE: f64 = 0.7; const INSERTION_DEFAULT_RATE: f64 = 0.1; const DELETION_DEFAULT_RATE: f64 = 0.2; +const INSERTION_EXTENSION_DEFAULT_RATE: f64 = 0.4; +const DELETION_EXTENSION_DEFAULT_RATE: f64 = 0.4; +const EXTENSION_LIMIT: usize = 100; /// Type of sequencing error alteration applied to a nucleotide position. #[derive(Debug)] @@ -20,13 +23,16 @@ pub enum AlterationType { /// /// # Rate Constraints /// - All rates must be in the range [0.0, 1.0] -/// - The sum of all rates must be ≤ 1.0 +/// - The sum of substitution, insertion, and deletion rates must be ≤ 1.0 +/// - Extension rates define geometric distribution for indel lengths /// - If the sum < 1.0, some errors will result in no alteration #[derive(Debug)] pub struct ErrorModel { pub substitution_rate: f64, pub insertion_rate: f64, pub deletion_rate: f64, + pub insertion_extension_rate: f64, + pub deletion_extension_rate: f64, } impl ErrorModel { @@ -36,6 +42,8 @@ impl ErrorModel { /// * `substitution_rate` - Probability of substitution errors (default: 0.7) /// * `insertion_rate` - Probability of insertion errors (default: 0.1) /// * `deletion_rate` - Probability of deletion errors (default: 0.2) + /// * `insertion_extension_rate` - Probability of extending an insertion (default: 0.4) + /// * `deletion_extension_rate` - Probability of extending a deletion (default: 0.4) /// /// # Returns /// A validated `ErrorModel` instance @@ -43,26 +51,33 @@ impl ErrorModel { /// # Errors /// Returns an error if: /// - Any rate is outside [0.0, 1.0] - /// - The sum of all rates exceeds 1.0 + /// - The sum of error rates exceeds 1.0 /// /// # Example /// ``` /// use readfaker::models::ErrorModel; /// - /// // Use default rates (0.7, 0.1, 0.2) - /// let model = ErrorModel::new(None, None, None).unwrap(); + /// // Use default rates (0.7, 0.1, 0.2, 0.4, 0.4) + /// let model = ErrorModel::new(None, None, None, None, None).unwrap(); /// - /// // Custom rates - /// let model = ErrorModel::new(Some(0.5), Some(0.3), Some(0.1)).unwrap(); + /// // Custom rates with longer indels + /// let model = ErrorModel::new( + /// Some(0.5), Some(0.3), Some(0.1), + /// Some(0.25), Some(0.25) + /// ).unwrap(); /// ``` pub fn new( substitution_rate: Option, insertion_rate: Option, deletion_rate: Option, + insertion_extension_rate: Option, + deletion_extension_rate: Option, ) -> Result { let substitution = substitution_rate.unwrap_or(SUBSTITUTION_DEFAULT_RATE); let insertion = insertion_rate.unwrap_or(INSERTION_DEFAULT_RATE); let deletion = deletion_rate.unwrap_or(DELETION_DEFAULT_RATE); + let ins_ext = insertion_extension_rate.unwrap_or(INSERTION_EXTENSION_DEFAULT_RATE); + let del_ext = deletion_extension_rate.unwrap_or(DELETION_EXTENSION_DEFAULT_RATE); if !(0.0..=1.0).contains(&substitution) { bail!( @@ -82,6 +97,18 @@ impl ErrorModel { deletion ); } + if !(0.0..=1.0).contains(&ins_ext) { + bail!( + "Insertion extension rate must be between 0.0 and 1.0, got {}", + ins_ext + ); + } + if !(0.0..=1.0).contains(&del_ext) { + bail!( + "Deletion extension rate must be between 0.0 and 1.0, got {}", + del_ext + ); + } let sum = substitution + insertion + deletion; if sum > 1.0 { @@ -98,9 +125,22 @@ impl ErrorModel { substitution_rate: substitution, insertion_rate: insertion, deletion_rate: deletion, + insertion_extension_rate: ins_ext, + deletion_extension_rate: del_ext, }) } + /// Sample length from geometric distribution based on extension rate + fn sample_length(&self, rng: &mut impl Rng, extension_rate: f64) -> usize { + let mut length = 1; + // Simple geometric sampling: keep extending while random < rate + // If rate is 0.0, this loop never runs, returns 1 + while rng.random_range(0.0..1.0) < extension_rate && length <= EXTENSION_LIMIT { + length += 1; + } + length + } + /// Randomly determines which type of error alteration to apply. /// /// Uses the model's rate probabilities to select between substitution, insertion, @@ -113,19 +153,17 @@ impl ErrorModel { /// # Returns /// * `Some(AlterationType)` - The type of alteration to apply /// * `None` - No alteration (when random value exceeds the sum of all rates) - /// - /// # Current Implementation - /// Always returns count = 1 for insertions and deletions. Multi-nucleotide - /// operations will be supported in future versions. pub fn get_alteration_type(&self, rng: &mut impl Rng) -> Option { let r = rng.random_range(0.0..1.0); if r < self.substitution_rate { Some(AlterationType::Substitution) } else if r < self.substitution_rate + self.insertion_rate { - Some(AlterationType::Insertion(1)) + let len = self.sample_length(rng, self.insertion_extension_rate); + Some(AlterationType::Insertion(len)) } else if r < self.substitution_rate + self.insertion_rate + self.deletion_rate { - Some(AlterationType::Deletion(1)) + let len = self.sample_length(rng, self.deletion_extension_rate); + Some(AlterationType::Deletion(len)) } else { None } @@ -140,34 +178,35 @@ mod tests { #[test] fn test_default_values() { - let model = ErrorModel::new(None, None, None).unwrap(); + let model = ErrorModel::new(None, None, None, None, None).unwrap(); assert_eq!(model.substitution_rate, 0.7); assert_eq!(model.insertion_rate, 0.1); assert_eq!(model.deletion_rate, 0.2); + assert_eq!(model.insertion_extension_rate, 0.4); + assert_eq!(model.deletion_extension_rate, 0.4); } #[test] fn test_custom_valid_values() { - let model = ErrorModel::new(Some(0.5), Some(0.3), Some(0.2)).unwrap(); + let model = ErrorModel::new(Some(0.5), Some(0.3), Some(0.2), Some(0.5), Some(0.5)).unwrap(); assert_eq!(model.substitution_rate, 0.5); assert_eq!(model.insertion_rate, 0.3); assert_eq!(model.deletion_rate, 0.2); + assert_eq!(model.insertion_extension_rate, 0.5); + assert_eq!(model.deletion_extension_rate, 0.5); } #[test] fn test_sum_exceeds_one() { - let result = ErrorModel::new(Some(0.6), Some(0.3), Some(0.2)); + let result = ErrorModel::new(Some(0.6), Some(0.3), Some(0.2), None, None); assert!(result.is_err()); let err_msg = result.unwrap_err().to_string(); assert!(err_msg.contains("must sum to at most 1.0")); - assert!(err_msg.contains("substitution=0.6")); - assert!(err_msg.contains("insertion=0.3")); - assert!(err_msg.contains("deletion=0.2")); } #[test] fn test_get_alteration_type_substitution() { - let model = ErrorModel::new(Some(1.0), Some(0.0), Some(0.0)).unwrap(); + let model = ErrorModel::new(Some(1.0), Some(0.0), Some(0.0), None, None).unwrap(); let mut rng = StdRng::seed_from_u64(42); // With 100% substitution rate, should always return Substitution @@ -179,7 +218,7 @@ mod tests { #[test] fn test_get_alteration_type_insertion() { - let model = ErrorModel::new(Some(0.0), Some(1.0), Some(0.0)).unwrap(); + let model = ErrorModel::new(Some(0.0), Some(1.0), Some(0.0), None, None).unwrap(); let mut rng = StdRng::seed_from_u64(42); // With 100% insertion rate, should always return Insertion @@ -187,7 +226,7 @@ mod tests { let alteration = model.get_alteration_type(&mut rng); match alteration { Some(AlterationType::Insertion(count)) => { - assert_eq!(count, 1); + assert!(count >= 1); // Default extension is 0.4, so length >= 1 } _ => panic!("Expected Insertion, got {:?}", alteration), } @@ -196,7 +235,7 @@ mod tests { #[test] fn test_get_alteration_type_deletion() { - let model = ErrorModel::new(Some(0.0), Some(0.0), Some(1.0)).unwrap(); + let model = ErrorModel::new(Some(0.0), Some(0.0), Some(1.0), None, None).unwrap(); let mut rng = StdRng::seed_from_u64(42); // With 100% deletion rate, should always return Deletion @@ -204,7 +243,7 @@ mod tests { let alteration = model.get_alteration_type(&mut rng); match alteration { Some(AlterationType::Deletion(count)) => { - assert_eq!(count, 1); + assert!(count >= 1); // Default extension is 0.4, so length >= 1 } _ => panic!("Expected Deletion, got {:?}", alteration), } @@ -213,7 +252,7 @@ mod tests { #[test] fn test_get_alteration_type_mixed() { - let model = ErrorModel::new(Some(0.5), Some(0.3), Some(0.2)).unwrap(); + let model = ErrorModel::new(Some(0.5), Some(0.3), Some(0.2), None, None).unwrap(); let mut rng = StdRng::seed_from_u64(42); let mut substitution_count = 0; @@ -237,4 +276,25 @@ mod tests { assert!(deletion_count > 100); // Expect ~200 assert_eq!(none_count, 0); // Sum = 1.0, so no None } + + #[test] + fn test_indel_extension() { + // Set high extension rate to ensure we get length > 1 + let model = ErrorModel::new(Some(0.0), Some(0.5), Some(0.5), Some(0.8), Some(0.8)).unwrap(); + let mut rng = StdRng::seed_from_u64(42); + + let mut max_ins_len = 0; + let mut max_del_len = 0; + + for _ in 0..1000 { + match model.get_alteration_type(&mut rng) { + Some(AlterationType::Insertion(count)) => max_ins_len = max_ins_len.max(count), + Some(AlterationType::Deletion(count)) => max_del_len = max_del_len.max(count), + _ => {} + } + } + + assert!(max_ins_len > 1); + assert!(max_del_len > 1); + } } diff --git a/src/utils.rs b/src/utils.rs index 797936d..1c7a787 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -1,3 +1,4 @@ +use crate::io::bam::BamReader; use crate::io::fastq::FastqReader; use crate::models::{LengthModel, QualityModel}; use rand::SeedableRng; @@ -16,19 +17,20 @@ pub static QUALITY_MAPPING: LazyLock<[f32; 94]> = LazyLock::new(|| { mapping }); -/// Loads length and quality models from an existing FASTQ file. +/// Loads length and quality models from an existing FASTQ or BAM file. /// +/// Automatically detects the file format based on the extension (.fastq, .fq, .bam). /// Reads all records from the input file and builds empirical models /// for read lengths and quality scores. /// /// # Arguments -/// * `fastq_path` - Path to the FASTQ file to analyze +/// * `input_path` - Path to the FASTQ or BAM file to analyze /// * `seed` - Optional random seed for reproducibility (uses system entropy if None) /// /// # Returns /// Tuple of (LengthModel, QualityModel) built from the input file pub fn load_models( - fastq_path: &Path, + input_path: &Path, seed: Option, ) -> anyhow::Result<(LengthModel, QualityModel)> { let mut length_model = LengthModel::new(); @@ -39,12 +41,48 @@ pub fn load_models( None => StdRng::from_rng(&mut rand::rng()), }; - let reader = FastqReader::from_path(fastq_path)?; + // Detect file format based on file name (handles compound extensions like .fastq.gz) + let file_name = input_path + .file_name() + .and_then(|s| s.to_str()) + .unwrap_or(""); - for record in reader { - let record = record?; - length_model.add_value(record.len()); - quality_model.add_value(record.len(), record.quality, &mut rng); + let file_name_lower = file_name.to_lowercase(); + + // Strip compression extensions to get base extension + let base_name = file_name_lower + .strip_suffix(".gz") + .or_else(|| file_name_lower.strip_suffix(".bgz")) + .unwrap_or(&file_name_lower); + + if base_name.ends_with(".bam") { + let reader = BamReader::from_path(input_path)?; + for record in reader { + let record = record?; + let length = record.sequence().len(); + // Convert raw Phred scores (0-93) to Phred+33 ASCII encoding + let quality: Vec = record + .quality_scores() + .as_ref() + .iter() + .map(|&q| q.saturating_add(33)) + .collect(); + length_model.add_value(length); + quality_model.add_value(length, quality, &mut rng); + } + } else if base_name.ends_with(".fastq") || base_name.ends_with(".fq") { + let reader = FastqReader::from_path(input_path)?; + for record in reader { + let record = record?; + let length = record.sequence().len(); + let quality = record.quality_scores().to_vec(); + length_model.add_value(length); + quality_model.add_value(length, quality, &mut rng); + } + } else { + anyhow::bail!( + "Unsupported input file format. Expected .fastq, .fq, or .bam (optionally compressed with .gz, .bgz)" + ); } Ok((length_model, quality_model))