diff --git a/Cargo.lock b/Cargo.lock index f028a32..a02e019 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -138,12 +138,6 @@ version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" -[[package]] -name = "ascii" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d92bec98840b8f03a5ff5413de5293bfcd8bf96467cf5452609f939ec6f5de16" - [[package]] name = "askama" version = "0.12.1" @@ -170,7 +164,7 @@ dependencies = [ "proc-macro2 1.0.93", "quote", "serde", - "syn 2.0.96", + "syn", ] [[package]] @@ -207,7 +201,7 @@ checksum = "c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d" dependencies = [ "proc-macro2 1.0.93", "quote", - "syn 2.0.96", + "syn", ] [[package]] @@ -218,7 +212,7 @@ checksum = "3f934833b4b7233644e5848f235df3f57ed8c80f1528a26c3dfa13d2147fa056" dependencies = [ "proc-macro2 1.0.93", "quote", - "syn 2.0.96", + "syn", ] [[package]] @@ -361,6 +355,16 @@ version = "0.22.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" +[[package]] +name = "base64-simd" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "339abbe78e73178762e23bea9dfd08e697eb3f3301cd4be981c0f78ba5859195" +dependencies = [ + "outref", + "vsimd", +] + [[package]] name = "base64ct" version = "1.6.0" @@ -376,6 +380,26 @@ dependencies = [ "serde", ] +[[package]] +name = "bincode" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "36eaf5d7b090263e8150820482d5d93cd964a81e4019913c972f4edcc6edb740" +dependencies = [ + "bincode_derive", + "serde", + "unty", +] + +[[package]] +name = "bincode_derive" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf95709a440f45e986983918d0e8a1f30a9b1df04918fc828670606804ac3c09" 
+dependencies = [ + "virtue", +] + [[package]] name = "bindgen" version = "0.69.5" @@ -395,7 +419,7 @@ dependencies = [ "regex", "rustc-hash 1.1.0", "shlex", - "syn 2.0.96", + "syn", "which", ] @@ -446,7 +470,7 @@ checksum = "56791e4bd64c99fc361e01008f45c984baa93f12a0957d1b3c51dd2c6baab453" dependencies = [ "proc-macro2 1.0.93", "quote", - "syn 2.0.96", + "syn", ] [[package]] @@ -464,6 +488,26 @@ version = "3.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" +[[package]] +name = "bytemuck" +version = "1.23.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3995eaeebcdf32f91f980d360f78732ddc061097ab4e39991ae7a6ace9194677" +dependencies = [ + "bytemuck_derive", +] + +[[package]] +name = "bytemuck_derive" +version = "1.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f154e572231cb6ba2bd1176980827e3d5dc04cc183a75dea38109fbdd672d29" +dependencies = [ + "proc-macro2 1.0.93", + "quote", + "syn", +] + [[package]] name = "byteorder" version = "1.5.0" @@ -561,7 +605,7 @@ checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4" dependencies = [ "glob", "libc", - "libloading 0.8.6", + "libloading", ] [[package]] @@ -595,7 +639,7 @@ dependencies = [ "heck 0.5.0", "proc-macro2 1.0.93", "quote", - "syn 2.0.96", + "syn", ] [[package]] @@ -777,7 +821,7 @@ dependencies = [ "proc-macro2 1.0.93", "quote", "strsim", - "syn 2.0.96", + "syn", ] [[package]] @@ -788,7 +832,7 @@ checksum = "d336a2a514f6ccccaa3e09b02d41d35330c07ddf03a62165fcec10bb561c7806" dependencies = [ "darling_core", "quote", - "syn 2.0.96", + "syn", ] [[package]] @@ -801,7 +845,7 @@ dependencies = [ "hashbrown 0.14.5", "lock_api", "once_cell", - "parking_lot_core 0.9.10", + "parking_lot_core", ] [[package]] @@ -826,13 +870,12 @@ dependencies = [ [[package]] name = "dihardts_cstools" -version = "1.0.0" +version = "2.2.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "6cc0a9c4ec8b728b893851938b3a01033e3425e7130df3fd7010a79bc3e82dab" +checksum = "b6fb98a20f15d7d3af0321cf2d23bad6163a9add334cd0c622f97a952085e177" dependencies = [ "anyhow", "bitvec", - "hdf5", "murmur3", ] @@ -869,7 +912,7 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" dependencies = [ "proc-macro2 1.0.93", "quote", - "syn 2.0.96", + "syn", ] [[package]] @@ -979,9 +1022,9 @@ dependencies = [ "lazy_static", "memchr", "num_cpus", - "ordered-float", + "ordered-float 4.5.0", "smartstring", - "thiserror 2.0.11", + "thiserror 2.0.12", ] [[package]] @@ -993,7 +1036,7 @@ dependencies = [ "darling", "heck 0.5.0", "quote", - "syn 2.0.96", + "syn", ] [[package]] @@ -1007,7 +1050,7 @@ dependencies = [ "serde", "serde_json", "serde_yaml", - "thiserror 2.0.11", + "thiserror 2.0.12", ] [[package]] @@ -1055,6 +1098,15 @@ version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a0d2fde1f7b3d48b8395d5f2de76c18a528bd6a9cdde438df747bfcba3e05d6f" +[[package]] +name = "font-types" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02a596f5713680923a2080d86de50fe472fb290693cf0f701187a1c8b36996b7" +dependencies = [ + "bytemuck", +] + [[package]] name = "foreign-types" version = "0.3.2" @@ -1147,7 +1199,7 @@ checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2 1.0.93", "quote", - "syn 2.0.96", + "syn", ] [[package]] @@ -1282,62 +1334,6 @@ dependencies = [ "foldhash", ] -[[package]] -name = "hdf5" -version = "0.8.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bdcd9b131fd67bb827b386d0dc63d3e74196a14616ef800acf87ca5fef741a10" -dependencies = [ - "bitflags 1.3.2", - "cfg-if", - "hdf5-derive", - "hdf5-sys", - "hdf5-types", - "lazy_static", - "libc", - "ndarray", - "parking_lot 0.11.2", - "paste", -] - -[[package]] -name = 
"hdf5-derive" -version = "0.8.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a5a77ac6a41e6880594d506118c0b8bc665ec959fe4636e0c84809756d224820" -dependencies = [ - "proc-macro2 1.0.93", - "quote", - "syn 1.0.109", -] - -[[package]] -name = "hdf5-sys" -version = "0.8.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4842d5980dc311a7c8933c7b45534fdae84df5ae7939a0ae8e449a56d4beb3d2" -dependencies = [ - "libc", - "libloading 0.7.4", - "pkg-config", - "regex", - "serde", - "serde_derive", - "winreg 0.10.1", -] - -[[package]] -name = "hdf5-types" -version = "0.8.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b47268c0dfb499b1ffe5638b6e7694e7a87fe49fb92eca998a4346e5483e428f" -dependencies = [ - "ascii", - "cfg-if", - "hdf5-sys", - "libc", -] - [[package]] name = "heck" version = "0.4.1" @@ -1705,7 +1701,7 @@ checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6" dependencies = [ "proc-macro2 1.0.93", "quote", - "syn 2.0.96", + "syn", ] [[package]] @@ -1714,6 +1710,12 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" +[[package]] +name = "identity-hash" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dfdd7caa900436d8f13b2346fe10257e0c05c1f1f9e351f4f5d57c03bd5f45da" + [[package]] name = "idna" version = "1.0.3" @@ -1743,6 +1745,7 @@ checksum = "62f822373a4fe84d4bb149bf54e584a7f4abec90e072ed49cda0edea5b95471f" dependencies = [ "equivalent", "hashbrown 0.15.2", + "serde", ] [[package]] @@ -1780,15 +1783,6 @@ dependencies = [ "generic-array", ] -[[package]] -name = "instant" -version = "0.1.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0242819d153cba4b4b05a5a8f2a7e9bbf97b6055b2a002b395c96b5ff3c0222" -dependencies = [ - "cfg-if", -] - [[package]] name = 
"ipnet" version = "2.10.1" @@ -1891,16 +1885,6 @@ version = "0.2.169" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a" -[[package]] -name = "libloading" -version = "0.7.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b67380fd3b2fbe7527a606e18729d21c6f3951633d0500574c4dc22d2d638b9f" -dependencies = [ - "cfg-if", - "winapi", -] - [[package]] name = "libloading" version = "0.8.6" @@ -2023,11 +2007,13 @@ dependencies = [ "rand 0.8.5", "refinery", "reqwest", + "rustyms", "scylla", "serde", "serde_json", "serial_test", "sysinfo", + "thiserror 2.0.12", "tokio", "tokio-util", "tower 0.4.13", @@ -2078,9 +2064,9 @@ checksum = "0e7465ac9959cc2b1404e8e2367b43684a6d13790fe23056cc8c6c5a6b7bcb94" [[package]] name = "matrixmultiply" -version = "0.3.9" +version = "0.3.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9380b911e3e96d10c1f415da0876389aaf1b56759054eeb0de7df940c456ba1a" +checksum = "a06de3016e9fae57a36fd14dba131fccf49f74b40b7fbdb472f96e361ec71a08" dependencies = [ "autocfg", "rawpointer", @@ -2145,7 +2131,7 @@ dependencies = [ "hashbrown 0.15.2", "indexmap", "metrics", - "ordered-float", + "ordered-float 4.5.0", "quanta", "radix_trie", "rand 0.8.5", @@ -2201,6 +2187,35 @@ version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9252111cf132ba0929b6f8e030cac2a24b507f3a4d6db6fb2896f27b354c714b" +[[package]] +name = "mzdata" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c697aaf28d7028f10a4812acf50a01cf75251ebe6088bb6ba4991bfab0b6780" +dependencies = [ + "base64-simd", + "bitflags 2.8.0", + "bytemuck", + "chrono", + "flate2", + "identity-hash", + "indexmap", + "log", + "mzpeaks", + "num-traits", + "regex", + "thiserror 2.0.12", +] + +[[package]] +name = "mzpeaks" +version = "1.0.9" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "543be9eac70437bfc915b3339e6ae4f23dc034922f13eb2535dcc19e7e9e9481" +dependencies = [ + "num-traits", +] + [[package]] name = "native-tls" version = "0.2.12" @@ -2220,14 +2235,16 @@ dependencies = [ [[package]] name = "ndarray" -version = "0.15.6" +version = "0.16.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "adb12d4e967ec485a5f71c6311fe28158e9d6f4bc4a447b474184d0f91a8fa32" +checksum = "882ed72dce9365842bf196bdeedf5055305f11fc8c03dee7bb0194a6cad34841" dependencies = [ "matrixmultiply", "num-complex", "num-integer", "num-traits", + "portable-atomic", + "portable-atomic-util", "rawpointer", ] @@ -2269,6 +2286,17 @@ dependencies = [ "winapi", ] +[[package]] +name = "num-bigint" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9" +dependencies = [ + "num-integer", + "num-traits", + "serde", +] + [[package]] name = "num-complex" version = "0.4.6" @@ -2276,6 +2304,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495" dependencies = [ "num-traits", + "serde", ] [[package]] @@ -2293,6 +2322,17 @@ dependencies = [ "num-traits", ] +[[package]] +name = "num-rational" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824" +dependencies = [ + "num-integer", + "num-traits", + "serde", +] + [[package]] name = "num-traits" version = "0.2.19" @@ -2356,7 +2396,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" dependencies = [ "proc-macro2 1.0.93", "quote", - "syn 2.0.96", + "syn", ] [[package]] @@ -2387,21 +2427,27 @@ dependencies = [ ] [[package]] -name = "overload" -version = "0.1.1" +name = "ordered-float" +version = "5.0.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" +checksum = "e2c1f9f56e534ac6a9b8a4600bdf0f530fb393b5f393e7b4d03489c3cf0c3f01" +dependencies = [ + "num-traits", + "rand 0.8.5", + "serde", +] [[package]] -name = "parking_lot" -version = "0.11.2" +name = "outref" +version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d17b78036a60663b797adeaee46f5c9dfebb86948d1255007a1d6be0271ff99" -dependencies = [ - "instant", - "lock_api", - "parking_lot_core 0.8.6", -] +checksum = "1a80800c0488c3a21695ea981a54918fbb37abf04f4d0720c453632255e2ff0e" + +[[package]] +name = "overload" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" [[package]] name = "parking_lot" @@ -2410,21 +2456,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f1bf18183cf54e8d6059647fc3063646a1801cf30896933ec2311622cc4b9a27" dependencies = [ "lock_api", - "parking_lot_core 0.9.10", -] - -[[package]] -name = "parking_lot_core" -version = "0.8.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "60a2cfe6f0ad2bfc16aefa463b497d5c7a5ecd44a23efa72aa342d90177356dc" -dependencies = [ - "cfg-if", - "instant", - "libc", - "redox_syscall 0.2.16", - "smallvec", - "winapi", + "parking_lot_core", ] [[package]] @@ -2435,7 +2467,7 @@ checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8" dependencies = [ "cfg-if", "libc", - "redox_syscall 0.5.8", + "redox_syscall", "smallvec", "windows-targets 0.52.6", ] @@ -2482,7 +2514,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "198db74531d58c70a361c42201efde7e2591e976d518caf7662a47dc5720e7b6" dependencies = [ "memchr", - "thiserror 2.0.11", + "thiserror 2.0.12", "ucd-trie", ] @@ -2506,7 +2538,7 @@ dependencies = [ "pest_meta", 
"proc-macro2 1.0.93", "quote", - "syn 2.0.96", + "syn", ] [[package]] @@ -2574,6 +2606,15 @@ version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "280dc24453071f1b63954171985a0b0d30058d287960968b9b2aca264c8d4ee6" +[[package]] +name = "portable-atomic-util" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d8a2f0d8d040d7848a709caf78912debcc3f33ee4b3cac47d73d1e1069e83507" +dependencies = [ + "portable-atomic", +] + [[package]] name = "postgres-protocol" version = "0.6.7" @@ -2625,7 +2666,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6924ced06e1f7dfe3fa48d57b9f74f55d8915f5036121bef647ef4b204895fac" dependencies = [ "proc-macro2 1.0.93", - "syn 2.0.96", + "syn", +] + +[[package]] +name = "probability" +version = "0.20.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42746b805e424b759d46c22c65dc66ccca057a2db96e9db4fda6c337a287e485" +dependencies = [ + "random", + "special", ] [[package]] @@ -2693,7 +2744,7 @@ dependencies = [ "itertools 0.13.0", "proc-macro2 1.0.93", "quote", - "syn 2.0.96", + "syn", ] [[package]] @@ -2770,6 +2821,7 @@ dependencies = [ "libc", "rand_chacha 0.3.1", "rand_core 0.6.4", + "serde", ] [[package]] @@ -2810,6 +2862,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" dependencies = [ "getrandom 0.2.15", + "serde", ] [[package]] @@ -2839,6 +2892,12 @@ dependencies = [ "rand_core 0.6.4", ] +[[package]] +name = "random" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "474c42c904f04dfe2a595a02f71e1a0e5e92ffb5761cc9a4c02140b93b8dd504" + [[package]] name = "rapidfuzz" version = "0.5.0" @@ -2881,12 +2940,13 @@ dependencies = [ ] [[package]] -name = "redox_syscall" -version = "0.2.16" +name = "read-fonts" +version = "0.29.3" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a" +checksum = "04ca636dac446b5664bd16c069c00a9621806895b8bb02c2dc68542b23b8f25d" dependencies = [ - "bitflags 1.3.2", + "bytemuck", + "font-types", ] [[package]] @@ -2940,7 +3000,7 @@ dependencies = [ "quote", "refinery-core", "regex", - "syn 2.0.96", + "syn", ] [[package]] @@ -3026,7 +3086,7 @@ dependencies = [ "wasm-bindgen-futures", "wasm-streams", "web-sys", - "winreg 0.50.0", + "winreg", ] [[package]] @@ -3133,6 +3193,30 @@ version = "1.0.19" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f7c45b9784283f1b2e7fb61b42047c2fd678ef0960d4f6f1eba131594cc369d4" +[[package]] +name = "rustyms" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c4c4248bb25b2e7911154def35d8129e248c109ecd1ddf85f2ad3000a8ee823" +dependencies = [ + "bincode", + "flate2", + "itertools 0.14.0", + "mzdata", + "ndarray", + "ordered-float 5.0.0", + "probability", + "rand 0.9.0", + "rayon", + "regex", + "serde", + "similar", + "swash", + "thin-vec", + "uom", + "zeno", +] + [[package]] name = "ryu" version = "1.0.18" @@ -3196,7 +3280,7 @@ dependencies = [ "smallvec", "snap", "socket2", - "thiserror 2.0.11", + "thiserror 2.0.12", "tokio", "tracing", "uuid", @@ -3217,7 +3301,7 @@ dependencies = [ "scylla-macros", "snap", "stable_deref_trait", - "thiserror 2.0.11", + "thiserror 2.0.12", "tokio", "uuid", "yoke", @@ -3232,7 +3316,7 @@ dependencies = [ "darling", "proc-macro2 1.0.93", "quote", - "syn 2.0.96", + "syn", ] [[package]] @@ -3294,7 +3378,7 @@ checksum = "5a9bf7cf98d04a2b28aead066b7496853d4779c9cc183c440dbac457641e19a0" dependencies = [ "proc-macro2 1.0.93", "quote", - "syn 2.0.96", + "syn", ] [[package]] @@ -3362,7 +3446,7 @@ dependencies = [ "futures", "log", "once_cell", - "parking_lot 0.12.3", + "parking_lot", "scc", "serial_test_derive", ] @@ -3375,7 +3459,7 @@ checksum = 
"5d69265a08751de7844521fd15003ae0a888e035773ba05695c5c759a6f89eef" dependencies = [ "proc-macro2 1.0.93", "quote", - "syn 2.0.96", + "syn", ] [[package]] @@ -3424,6 +3508,12 @@ dependencies = [ "libc", ] +[[package]] +name = "similar" +version = "2.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbbb5d9659141646ae647b42fe094daf6c6192d1620870b449d9557f748b2daa" + [[package]] name = "siphasher" version = "1.0.1" @@ -3436,6 +3526,16 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c1e9a774a6c28142ac54bb25d25562e6bcf957493a184f15ad4eebccb23e410a" +[[package]] +name = "skrifa" +version = "0.31.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dbeb4ca4399663735553a09dd17ce7e49a0a0203f03b706b39628c4d913a8607" +dependencies = [ + "bytemuck", + "read-fonts", +] + [[package]] name = "slab" version = "0.4.9" @@ -3478,6 +3578,15 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "special" +version = "0.10.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b89cf0d71ae639fdd8097350bfac415a41aabf1d5ddd356295fdc95f09760382" +dependencies = [ + "libm", +] + [[package]] name = "stable_deref_trait" version = "1.2.0" @@ -3514,14 +3623,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" [[package]] -name = "syn" -version = "1.0.109" +name = "swash" +version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +checksum = "f745de914febc7c9ab4388dfaf94bbc87e69f57bb41133a9b0c84d4be49856f3" dependencies = [ - "proc-macro2 1.0.93", - "quote", - "unicode-ident", + "skrifa", + "yazi", + "zeno", ] [[package]] @@ -3555,7 +3664,7 @@ checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971" dependencies = [ "proc-macro2 
1.0.93", "quote", - "syn 2.0.96", + "syn", ] [[package]] @@ -3623,6 +3732,15 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "thin-vec" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "144f754d318415ac792f9d69fc87abbbfc043ce2ef041c60f16ad828f638717d" +dependencies = [ + "serde", +] + [[package]] name = "thiserror" version = "1.0.69" @@ -3634,11 +3752,11 @@ dependencies = [ [[package]] name = "thiserror" -version = "2.0.11" +version = "2.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d452f284b73e6d76dd36758a0c8684b1d5be31f92b89d07fd5822175732206fc" +checksum = "567b8a2dae586314f7be2a752ec7474332959c6460e02bde30d702a66d488708" dependencies = [ - "thiserror-impl 2.0.11", + "thiserror-impl 2.0.12", ] [[package]] @@ -3649,18 +3767,18 @@ checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" dependencies = [ "proc-macro2 1.0.93", "quote", - "syn 2.0.96", + "syn", ] [[package]] name = "thiserror-impl" -version = "2.0.11" +version = "2.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26afc1baea8a989337eeb52b6e72a039780ce45c3edfcc9c5b9d112feeb173c2" +checksum = "7f7cf42b4507d8ea322120659672cf1b9dbb93f8f2d4ecfd6e51350ff5b17a1d" dependencies = [ "proc-macro2 1.0.93", "quote", - "syn 2.0.96", + "syn", ] [[package]] @@ -3739,7 +3857,7 @@ dependencies = [ "bytes", "libc", "mio", - "parking_lot 0.12.3", + "parking_lot", "pin-project-lite", "signal-hook-registry", "socket2", @@ -3755,7 +3873,7 @@ checksum = "6e06d43f1345a3bcd39f6a56dbb7dcab2ba47e68e8ac134855e7e2bdbaf8cab8" dependencies = [ "proc-macro2 1.0.93", "quote", - "syn 2.0.96", + "syn", ] [[package]] @@ -3781,7 +3899,7 @@ dependencies = [ "futures-channel", "futures-util", "log", - "parking_lot 0.12.3", + "parking_lot", "percent-encoding", "phf", "pin-project-lite", @@ -3950,7 +4068,7 @@ checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d" 
dependencies = [ "proc-macro2 1.0.93", "quote", - "syn 2.0.96", + "syn", ] [[package]] @@ -4053,7 +4171,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "04659ddb06c87d233c566112c1c9c5b9e98256d9af50ec3bc9c8327f873a7568" dependencies = [ "quote", - "syn 2.0.96", + "syn", ] [[package]] @@ -4147,6 +4265,26 @@ version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" +[[package]] +name = "unty" +version = "0.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d49784317cd0d1ee7ec5c716dd598ec5b4483ea832a2dced265471cc0f690ae" + +[[package]] +name = "uom" +version = "0.36.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ffd36e5350a65d112584053ee91843955826bf9e56ec0d1351214e01f6d7cd9c" +dependencies = [ + "num-bigint", + "num-complex", + "num-rational", + "num-traits", + "serde", + "typenum", +] + [[package]] name = "url" version = "2.5.4" @@ -4209,6 +4347,18 @@ version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" +[[package]] +name = "virtue" +version = "0.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "051eb1abcf10076295e815102942cc58f9d5e3b4560e46e53c21e8ff6f3af7b1" + +[[package]] +name = "vsimd" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c3082ca00d5a5ef149bb8b555a72ae84c9c59f7250f013ac822ac2e49b19c64" + [[package]] name = "vt100" version = "0.15.2" @@ -4304,7 +4454,7 @@ dependencies = [ "log", "proc-macro2 1.0.93", "quote", - "syn 2.0.96", + "syn", "wasm-bindgen-shared", ] @@ -4339,7 +4489,7 @@ checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de" dependencies = [ "proc-macro2 1.0.93", "quote", - "syn 2.0.96", + "syn", "wasm-bindgen-backend", 
"wasm-bindgen-shared", ] @@ -4404,7 +4554,7 @@ version = "1.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "372d5b87f58ec45c384ba03563b03544dc5fadc3983e434b286913f5b4a9bb6d" dependencies = [ - "redox_syscall 0.5.8", + "redox_syscall", "wasite", "web-sys", ] @@ -4606,16 +4756,6 @@ dependencies = [ "memchr", ] -[[package]] -name = "winreg" -version = "0.10.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "80d0f4e272c85def139476380b12f9ac60926689dd2e01d4923222f40580869d" -dependencies = [ - "serde", - "winapi", -] - [[package]] name = "winreg" version = "0.50.0" @@ -4656,6 +4796,12 @@ dependencies = [ "tap", ] +[[package]] +name = "yazi" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e01738255b5a16e78bbb83e7fbba0a1e7dd506905cfc53f4622d89015a03fbb5" + [[package]] name = "yoke" version = "0.7.5" @@ -4676,10 +4822,16 @@ checksum = "2380878cad4ac9aac1e2435f3eb4020e8374b5f13c296cb75b4620ff8e229154" dependencies = [ "proc-macro2 1.0.93", "quote", - "syn 2.0.96", + "syn", "synstructure", ] +[[package]] +name = "zeno" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6df3dc4292935e51816d896edcd52aa30bc297907c26167fec31e2b0c6a32524" + [[package]] name = "zerocopy" version = "0.7.35" @@ -4707,7 +4859,7 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" dependencies = [ "proc-macro2 1.0.93", "quote", - "syn 2.0.96", + "syn", ] [[package]] @@ -4718,7 +4870,7 @@ checksum = "a996a8f63c5c4448cd959ac1bab0aaa3306ccfd060472f85943ee0750f0169be" dependencies = [ "proc-macro2 1.0.93", "quote", - "syn 2.0.96", + "syn", ] [[package]] @@ -4738,7 +4890,7 @@ checksum = "595eed982f7d355beb85837f651fa22e90b3c044842dc7f2c2842c086f295808" dependencies = [ "proc-macro2 1.0.93", "quote", - "syn 2.0.96", + "syn", "synstructure", ] @@ -4767,7 +4919,7 @@ checksum = 
"6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6" dependencies = [ "proc-macro2 1.0.93", "quote", - "syn 2.0.96", + "syn", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 024b523..72fd9c6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -46,7 +46,7 @@ chrono = "0.4.23" clap = { version = "4.3.0", features = ["derive"] } crossbeam-queue = "0.3.11" csv = "1.2.2" -dihardts_cstools = "~1.0.0" +dihardts_cstools = "~2.2.0" dihardts_omicstools = "~3.1.0" env_logger = "0.10.0" fallible-iterator = "0.3.0" @@ -69,10 +69,12 @@ prometheus-parse = "0.2.5" rand = "0.8.5" refinery = { version = "0.8.7", features = ["tokio-postgres"] } reqwest = { version = "0.11.22", features = ["json", "stream"] } +rustyms = "0.10.0" scylla = { version = "1.1.0", features = ["metrics"] } serde = "1.0.160" serde_json = "1.0.95" sysinfo = "0.28.2" +thiserror = "2.0.12" tokio = { version = "1.44.2", features = ["full"] } tokio-util = "0.7.14" tower = "0.4.13" diff --git a/src/bin.rs b/src/bin.rs index ded6186..0339632 100755 --- a/src/bin.rs +++ b/src/bin.rs @@ -400,10 +400,7 @@ async fn main() -> Result<()> { let log_folder = Path::new(&log_folder).to_path_buf(); - let num_partitions = match partitions.parse::() { - Ok(num_partitions) => Some(num_partitions), - Err(_) => None, - }; + let num_partitions = partitions.parse::().ok(); // Default partition limits (empty if created) let mut partition_limits: Vec = Vec::with_capacity(0); diff --git a/src/database/scylla/peptide_search.rs b/src/database/scylla/peptide_search.rs index 830363f..9934b96 100644 --- a/src/database/scylla/peptide_search.rs +++ b/src/database/scylla/peptide_search.rs @@ -1,32 +1,39 @@ -// std imports -use std::cmp::{max, min}; -use std::collections::HashMap; +use std::cmp::max; +use std::collections::{HashMap, HashSet}; +use std::fmt::Display; +use std::ops::Deref; use std::pin::Pin; use std::sync::Arc; -// 3rd party imports use anyhow::Result; use async_stream::try_stream; use 
dihardts_cstools::bloom_filter::BloomFilter; +use dihardts_omicstools::chemistry::amino_acid::{AminoAcid, CANONICAL_AMINO_ACIDS}; use dihardts_omicstools::proteomics::post_translational_modifications::PostTranslationalModification as PTM; use futures::{pin_mut, Stream, StreamExt}; +use itertools::Itertools; use scylla::value::CqlValue; use tokio::sync::mpsc::{unbounded_channel as channel, UnboundedSender as Sender}; use tracing::error; -// local imports -use crate::functions::post_translational_modification::get_ptm_conditions; +use crate::chemistry::amino_acid::INTERNAL_GLYCINE; +use crate::entities::configuration::Configuration; +use crate::entities::peptide::MatchingPeptide; +use crate::functions::post_translational_modification::PTMCollection; +use crate::mass::convert::{to_float as mass_to_float, to_int as mass_to_int}; use crate::tools::peptide_partitioner::get_mass_partition; -use crate::{ - database::scylla::peptide_table::PeptideTable, entities::peptide::Peptide, - functions::post_translational_modification::PTMCondition, -}; +use crate::{database::scylla::peptide_table::PeptideTable, entities::peptide::Peptide}; use super::client::Client; /// Trait to check conditions on peptides /// -pub trait FilterFunction: Send + Sync { +pub trait FilterFunction: Send + Sync + Display { + /// Returns true if the peptide matches the condition, false otherwise. 
+ /// + /// # Arguments + /// * `peptide` - The peptide to check + /// fn is_match(&mut self, peptide: &Peptide) -> Result; } @@ -40,6 +47,12 @@ impl FilterFunction for IsSwissProtFilterFunction { } } +impl Display for IsSwissProtFilterFunction { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "is SwissProt") + } +} + /// Filter peptides which are not in TrEMBL /// struct IsTrEMBLFilterFunction; @@ -50,6 +63,12 @@ impl FilterFunction for IsTrEMBLFilterFunction { } } +impl Display for IsTrEMBLFilterFunction { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "is TrEMBL") + } +} + /// Makes sure that no peptide is returned twice /// pub struct ThreadSafeDistinctFilterFunction { @@ -66,6 +85,12 @@ impl FilterFunction for ThreadSafeDistinctFilterFunction { } } +impl Display for ThreadSafeDistinctFilterFunction { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "distinct") + } +} + /// Filters peptides which are not in the given taxonomy IDs /// struct TaxonomyFilterFunction { @@ -83,6 +108,12 @@ impl FilterFunction for TaxonomyFilterFunction { } } +impl Display for TaxonomyFilterFunction { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "taxonomy in [{}]", self.taxonomy_ids.iter().join(", ")) + } +} + /// Filters peptides which are not in the given proteome IDs /// struct ProteomeFilterFunction { @@ -100,30 +131,163 @@ impl FilterFunction for ProteomeFilterFunction { } } -pub type FalliblePeptideStream = Pin> + Send>>; +impl Display for ProteomeFilterFunction { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "proteome in [{}]", self.proteome_ids.iter().join(", ")) + } +} + +/// Filters peptides which start with a specific amino acid +/// +struct StartsWithFilterFunction { + amino_acid: char, +} + +impl FilterFunction for StartsWithFilterFunction { + fn is_match(&mut self, peptide: &Peptide) -> Result 
{ + Ok(peptide.get_sequence().starts_with(self.amino_acid)) + } +} + +impl Display for StartsWithFilterFunction { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "starts with '{}'", self.amino_acid) + } +} + +/// Filters peptides which end with a specific amino acid +/// +struct EndsWithFilterFunction { + /// One letter code of the amino acid + amino_acid: char, +} + +impl FilterFunction for EndsWithFilterFunction { + fn is_match(&mut self, peptide: &Peptide) -> Result { + Ok(peptide.get_sequence().ends_with(self.amino_acid)) + } +} + +impl Display for EndsWithFilterFunction { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "ends with '{}'", self.amino_acid) + } +} + +/// Filters peptides which contain exactly `amount` occurrences of an amino acid +/// +struct EqualsNumberOfOccurrencesFilterFunction { + /// One letter code of the amino acid + amino_acid: char, + amount: i16, +} + +impl FilterFunction for EqualsNumberOfOccurrencesFilterFunction { + fn is_match(&mut self, peptide: &Peptide) -> Result { + let count = peptide.get_aa_count(self.amino_acid); + Ok(count == self.amount) + } +} + +impl Display for EqualsNumberOfOccurrencesFilterFunction { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "occurences of '{}' == {}", self.amino_acid, self.amount,) + } +} -pub type FalliblePeptideStream = Pin> + Send>>; -/// Maps PTM conditions to partitions +/// Filters peptides which contain at least `amount` occurrences of an amino acid +/// +struct GreaterOrEqualsNumberOfOccurrencesFilterFunction { + /// One letter code of the amino acid + amino_acid: char, + amount: i16, +} + +impl FilterFunction for GreaterOrEqualsNumberOfOccurrencesFilterFunction { + fn is_match(&mut self, peptide: &Peptide) -> Result { + let count = peptide.get_aa_count(self.amino_acid); + Ok(count >= self.amount) + } +} + +impl Display for GreaterOrEqualsNumberOfOccurrencesFilterFunction { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) ->
std::fmt::Result { + write!(f, "occurences of '{}' >= {}", self.amino_acid, self.amount,) + } +} + +/// Filters peptides which contain no occurrence of an amino acid +/// +struct NoOccurrencesFilterFunction { + /// One letter code of the amino acid + amino_acid: char, +} + +impl FilterFunction for NoOccurrencesFilterFunction { + fn is_match(&mut self, peptide: &Peptide) -> Result { + let count = peptide.get_aa_count(self.amino_acid); + Ok(count == 0) + } +} + +impl Display for NoOccurrencesFilterFunction { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "occurences of '{}' == 0", self.amino_acid,) + } +} + +pub type FalliblePeptideStream = Pin> + Send>>; + +/// Maps peptide conditions to partitions /// key is the partition, value is a vector of tuples with the lower and upper mass limit and the PTM condition /// -pub type PtmConditionMap = HashMap>; +pub type FinalizedPeptideConditionMap = HashMap>; +/// Defines the search for peptides in the database and provides some helper functions +/// #[allow(clippy::too_many_arguments)] pub trait Search { + /// Search for peptides in the database based on the given parameters.
+ /// + /// # Arguments + /// * `client` - The client to use for the query + /// * `configuration` - The configuration to use for the query + /// * `mass` - The mass to search for + /// * `lower_mass_tolerance_ppm` - The lower mass tolerance in ppm + /// * `upper_mass_tolerance_ppm` - The upper mass tolerance in ppm + /// * `max_variable_modifications` - The maximum number of variable modifications to apply + /// * `distinct` - Whether to return distinct peptides only + /// * `taxonomy_ids` - The taxonomy IDs to filter the peptides by + /// * `proteome_ids` - The proteome IDs to filter the peptides by + /// * `is_reviewed` - Whether to filter the peptides by SwissProt or TrEMBL + /// * `ptm_collection` - The PTM collection to use for the query + /// * `resolve_modifications` - Whether to resolve modifications and return the modified sequences as ProForma compliant strings + /// * `num_threads` - The number of concurrent searches + /// fn search( client: Arc, - partition_limits: Arc>, + configuration: Arc, mass: i64, lower_mass_tolerance_ppm: i64, upper_mass_tolerance_ppm: i64, - max_variable_modifications: i16, + max_variable_modifications: usize, distinct: bool, taxonomy_ids: Option>, proteome_ids: Option>, is_reviewed: Option, - ptms: &[PTM], + ptm_collection: &PTMCollection, + resolve_modifications: bool, num_threads: Option, ) -> impl std::future::Future> + Send; + /// Creates a vector of filter functions based on the given parameters.
+ /// + /// # Arguments + /// * `distinct` - Whether to return distinct peptides only + /// * `taxonomy_ids` - The taxonomy IDs to filter the peptides by + /// * `proteome_ids` - The proteome IDs to filter the peptides by + /// * `is_reviewed` - Whether to filter the peptides by SwissProt or TrEMBL + /// fn create_filter_pipeline( distinct: bool, taxonomy_ids: Option>>, @@ -152,28 +316,29 @@ pub trait Search { Ok(filter_pipeline) } - /// Query PTM condition which needs more work prior to the actual query than just querying the mass. + /// Query condition (e.g. PTMs) which needs more work prior to the actual query than just querying the mass. /// /// # Arguments + /// * `task_id` - The ID of the task, used for logging /// * `client` - The client to use for the query - /// * `partition_limits` - The partition limits - /// * `ptm_condition` - The PTM condition to query - /// * `lower_mass_tolerance_ppm` - The lower mass tolerance in ppm - /// * `upper_mass_tolerance_ppm` - The upper mass tolerance in ppm - /// * `filter_pipeline` - The filter pipeline + /// * `partition` - The partition to query + /// * `conditions` - The conditions peptides need to fulfill, e.g. PTMs + /// * `filter_pipeline` - Global filters to apply (e.g.
distinct, taxonomy, proteome, is_reviewed) + /// * `resolve_modifications` - Whether to resolve modifications and return the modified sequences as ProForma compliant strings /// * `peptide_sender` - The sender to send the peptides to the final stream /// fn search_with_ptm_conditions( task_id: usize, client: Arc, partition: usize, - conditions: Vec<(i64, i64, PTMCondition)>, + mut conditions: Vec<(i64, i64, FinalizedPeptideCondition)>, mut filter_pipeline: Vec>, - peptide_sender: Sender>, + resolve_modifications: bool, + peptide_sender: Sender>, ) -> impl std::future::Future> + Send { async move { let partition = CqlValue::BigInt(partition as i64); - for (lower_mass_limit, upper_mass_limit, ptm_condition) in conditions.iter() { + for (lower_mass_limit, upper_mass_limit, ptm_condition) in conditions.iter_mut() { let lower_mass_limit = CqlValue::BigInt(*lower_mass_limit); let upper_mass_limit = CqlValue::BigInt(*upper_mass_limit); @@ -210,7 +375,16 @@ pub trait Search { continue 'peptide_loop; } } - match peptide_sender.send(Ok(peptide)) { + + let additional_sequences = if resolve_modifications { + ptm_condition.modify_sequence(peptide.get_sequence()) + } else { + Vec::new() + }; + + match peptide_sender + .send(Ok(MatchingPeptide::new(peptide, additional_sequences))) + { Ok(_) => {} Err(err) => { error!("task {}: error sending peptide: {}", task_id, err); @@ -224,37 +398,36 @@ pub trait Search { } } - /// Query PTM condition which needs more work prior to the actual query than just querying the mass. 
+ /// Query without any conditions /// /// # Arguments /// * `client` - The client to use for the query - /// * `partition_limits` - The partition limits + /// * `configuration` - Configuration from the database /// * `mass` - The mass to query /// * `lower_mass_tolerance_ppm` - The lower mass tolerance in ppm /// * `upper_mass_tolerance_ppm` - The upper mass tolerance in ppm - /// * `ptm_condition` - The PTM condition to query - /// * `filter_pipeline` - The filter pipeline + /// * `filter_pipeline` - Global filters to apply (e.g. distinct, taxonomy, proteome, is_reviewed) /// * `peptide_sender` - The sender to send the peptides to the final stream /// fn search_without_ptm_condition( client: Arc, - partition_limits: Arc>, + configuration: Arc, mass: i64, lower_mass_tolerance_ppm: i64, upper_mass_tolerance_ppm: i64, mut filter_pipeline: Vec>, - peptide_sender: Sender>, + peptide_sender: Sender>, ) -> impl std::future::Future> + Send { async move { // Calculate mass range - let lower_mass_limit = mass - (mass / 1000000 * lower_mass_tolerance_ppm); - let upper_mass_limit = mass + (mass / 1000000 * upper_mass_tolerance_ppm); + let lower_mass_limit = mass - (mass / 1_000_000 * lower_mass_tolerance_ppm); + let upper_mass_limit = mass + (mass / 1_000_000 * upper_mass_tolerance_ppm); let peptide_stream = PeptideTable::select_by_mass_range( client.as_ref(), lower_mass_limit, upper_mass_limit, - partition_limits.as_ref(), + configuration.get_partition_limits(), ) .await?; pin_mut!(peptide_stream); @@ -265,54 +438,59 @@ pub trait Search { continue 'peptide_loop; } } - peptide_sender.send(Ok(peptide))?; + peptide_sender.send(Ok(MatchingPeptide::new(peptide, Vec::new())))?; } Ok(()) } } - /// Splitup and sort PTM condition by partition + /// Split up and sort peptide conditions by partition and finalize them.
/// /// # Arguments - /// * ptm_conditions - The PTM conditions to split and sort - /// * partition_limits - The partition limits + /// * peptide_conditions - The conditions peptides need to fulfill, e.g. PTMs + /// * partition_limits - The partition limits from configuration + /// * lower_mass_tolerance_ppm - The lower mass tolerance in ppm + /// * upper_mass_tolerance_ppm - The upper mass tolerance in ppm /// - fn split_and_sort_ptm_conditions( - ptm_conditions: Vec, + fn split_and_sort_peptide_conditions( + peptide_conditions: Vec, partition_limits: &[i64], lower_mass_tolerance_ppm: i64, upper_mass_tolerance_ppm: i64, - ) -> Result { - let mut sorted_ptm_conditions: PtmConditionMap = HashMap::new(); - for ptm_condition in ptm_conditions { + ) -> Result { + let mut sorted_peptide_conditions: FinalizedPeptideConditionMap = HashMap::new(); + for peptide_condition in peptide_conditions { // Calculate mass range based on ptm condition - let lower_mass_limit = ptm_condition.get_mass() - - (ptm_condition.get_mass() / 1000000 * lower_mass_tolerance_ppm); - let upper_mass_limit = ptm_condition.get_mass() - + (ptm_condition.get_mass() / 1000000 * upper_mass_tolerance_ppm); + let lower_mass_limit = peptide_condition.query_mass + - (peptide_condition.query_mass / 1_000_000 * lower_mass_tolerance_ppm); + let upper_mass_limit = peptide_condition.query_mass + + (peptide_condition.query_mass / 1_000_000 * upper_mass_tolerance_ppm); // Get partition let lower_partition_index = get_mass_partition(partition_limits, lower_mass_limit)?; let upper_partition_index = get_mass_partition(partition_limits, upper_mass_limit)?; if lower_partition_index == upper_partition_index { - sorted_ptm_conditions + sorted_peptide_conditions .entry(lower_partition_index) .or_default() - .push((lower_mass_limit, upper_mass_limit, ptm_condition)); + .push((lower_mass_limit, upper_mass_limit, peptide_condition.into())); } else { #[allow(clippy::needless_range_loop)] for partition in
lower_partition_index..=upper_partition_index { - sorted_ptm_conditions.entry(partition).or_default().push(( - lower_mass_limit, - upper_mass_limit, - ptm_condition.clone(), - )); + sorted_peptide_conditions + .entry(partition) + .or_default() + .push(( + lower_mass_limit, + upper_mass_limit, + peptide_condition.clone().into(), + )); } } } - Ok(sorted_ptm_conditions) + Ok(sorted_peptide_conditions) } } @@ -323,29 +501,74 @@ pub struct MultiTaskSearch; impl Search for MultiTaskSearch { async fn search( client: Arc, - partition_limits: Arc>, + configuration: Arc, mass: i64, lower_mass_tolerance_ppm: i64, upper_mass_tolerance_ppm: i64, - max_variable_modifications: i16, + max_variable_modifications: usize, distinct: bool, taxonomy_ids: Option>, proteome_ids: Option>, is_reviewed: Option, - ptms: &[PTM], + ptm_collection: &PTMCollection<'_>, + resolve_modifications: bool, _num_threads: Option, ) -> Result { let taxonomy_ids = taxonomy_ids.map(Arc::new); let proteome_ids = proteome_ids.map(Arc::new); - let sorted_ptm_conditions = Self::split_and_sort_ptm_conditions( - get_ptm_conditions(mass, max_variable_modifications, ptms)?, - partition_limits.as_ref(), + let min_mass = match configuration.get_min_peptide_length() { + Some(min_length) => INTERNAL_GLYCINE.get_mono_mass_int() * min_length as i64, + None => 0, + }; + + // Calulcate max mass as stated in PeptideCondition::from_ptm_collection() 2.3 + let largest_negative_static_ptm = ptm_collection + .get_static_ptms() + .iter() + .filter(|ptm| ptm.get_mass_delta().is_sign_negative()) + .fold(0_i64, |acc, ptm| { + acc.min(mass_to_int(*ptm.get_mass_delta())) + }) + .abs(); + + let largest_negative_variable_ptm = ptm_collection + .get_variable_ptms() + .iter() + .filter(|ptm| ptm.get_mass_delta().is_sign_negative()) + .fold(0_i64, |acc, ptm| { + acc.min(mass_to_int(*ptm.get_mass_delta())) + }) + .abs(); + + // Possible peptide length plus 30% "play" to account for errors + let amino_acid_average = mass_to_int( + 
CANONICAL_AMINO_ACIDS + .iter() + .map(|aa| aa.get_mono_mass()) + .sum::() + / CANONICAL_AMINO_ACIDS.len() as f64, + ); + let possible_peptide_length = ((mass / amino_acid_average) as f64 * 1.3) as i64; + + let max_mass = mass + + (largest_negative_static_ptm * possible_peptide_length) + + (largest_negative_variable_ptm * possible_peptide_length); + + let sorted_ptm_conditions = Self::split_and_sort_peptide_conditions( + PeptideCondition::from_ptm_collection( + ptm_collection, + mass, + min_mass, + max_mass, + max_variable_modifications, + ), + configuration.get_partition_limits(), lower_mass_tolerance_ppm, upper_mass_tolerance_ppm, )?; - let (peptide_sender, mut peptide_receiver) = channel::>(); + let (peptide_sender, mut peptide_receiver) = channel::>(); Ok(Box::pin(try_stream! { let mut tasks: Vec>> = Vec::with_capacity(max(sorted_ptm_conditions.len(), 1)); @@ -366,6 +589,7 @@ impl Search for MultiTaskSearch { partition, conditions, filter_pipeline, + resolve_modifications, peptide_sender.clone(), ) )); @@ -375,7 +599,7 @@ impl Search for MultiTaskSearch { tasks.push(tokio::task::spawn( Self::search_without_ptm_condition( client.clone(), - partition_limits.clone(), + configuration.clone(), mass, lower_mass_tolerance_ppm, upper_mass_tolerance_ppm, @@ -404,3 +628,919 @@ impl Search for MultiTaskSearch { } // See commit 9926c71adaf7fda760f4dae3be611c18e5cfc233 for other implementations of the Search trait + +/// Peptide condition which are not querieable and need to be checked "on the fly/demand" +/// +#[derive(Clone)] +pub struct PeptideCondition { + /// Mass to query + query_mass: i64, + /// Considered static PTMs + static_ptms: Vec, + /// Considered variable PTMs + variable_ptms: Vec, + /// N-terminal PTM + n_terminal_ptm: Option, + /// C-terminal PTM + c_terminal_ptm: Option, + /// N-terminal bond PTM + n_bond_ptm: Option, + /// C-terminal bond PTM + c_bond_ptm: Option, + /// Excluded amino acids + excluded_amino_acids: HashSet, +} + +impl PeptideCondition { + 
/// Creates a new PeptideCondition with no PTMs. + /// + /// Each PTM added afterwards subtracts its mass delta from the query mass. + /// + /// # Arguments + /// * `targeted_mass` - Mass of peptides to search for, used as the initial query mass + /// + pub fn new(targeted_mass: i64) -> Self { + Self { + query_mass: targeted_mass, + static_ptms: Vec::new(), + variable_ptms: Vec::new(), + n_terminal_ptm: None, + c_terminal_ptm: None, + n_bond_ptm: None, + c_bond_ptm: None, + excluded_amino_acids: HashSet::new(), + } + } + + /// Adds a static PTM and subtracts its mass delta from the query mass. + /// Returns `false` and leaves the condition unchanged if the delta exceeds the query mass. + pub fn add_static_ptm(&mut self, ptm: &PTM) -> bool { + let mass_delta_int = mass_to_int(*ptm.get_mass_delta()); + if mass_delta_int > self.query_mass { + return false; + } + + self.static_ptms.push(ptm.clone()); + self.query_mass -= mass_delta_int; + true + } + + pub fn add_variable_ptm(&mut self, ptm: &PTM) -> bool { + let mass_delta_int = mass_to_int(*ptm.get_mass_delta()); + if mass_delta_int > self.query_mass { + return false; + } + + self.variable_ptms.push(ptm.clone()); + self.query_mass -= mass_delta_int; + true + } + + pub fn set_n_terminal_ptm(&mut self, ptm: &PTM) -> bool { + let mass_delta_int = mass_to_int(*ptm.get_mass_delta()); + if self.n_terminal_ptm.is_some() || mass_delta_int > self.query_mass { + return false; + } + + self.n_terminal_ptm = Some(ptm.clone()); + self.query_mass -= mass_delta_int; + true + } + + pub fn set_c_terminal_ptm(&mut self, ptm: &PTM) -> bool { + let mass_delta_int = mass_to_int(*ptm.get_mass_delta()); + if self.c_terminal_ptm.is_some() || mass_delta_int > self.query_mass { + return false; + } + + self.c_terminal_ptm = Some(ptm.clone()); + self.query_mass -= mass_delta_int; + true + } + + pub fn set_n_bond_ptm(&mut self, ptm: &PTM) -> bool { + let mass_delta_int = mass_to_int(*ptm.get_mass_delta()); + if self.n_bond_ptm.is_some() || mass_delta_int > self.query_mass { + return false; + }
+ + self.n_bond_ptm = Some(ptm.clone()); + self.query_mass -= mass_delta_int; + true + } + + pub fn set_c_bond_ptm(&mut self, ptm: &PTM) -> bool { + let mass_delta_int = mass_to_int(*ptm.get_mass_delta()); + // if ptm is positive but larger than the remaining mass or smaller than the minimum mass, skip it + // a negative delta would increase the remaining mass, so we do not check for it + if self.c_bond_ptm.is_some() || mass_delta_int > self.query_mass { + return false; + } + + self.c_bond_ptm = Some(ptm.clone()); + self.query_mass -= mass_delta_int; + true + } + + pub fn add_excluded_amino_acid(&mut self, amino_acid: &dyn AminoAcid) { + self.excluded_amino_acids + .insert(*amino_acid.get_one_letter_code()); + } + + /// Applies the condition to the given amino acid sequence and returns every possible modified version of it + /// in ProForma format. + /// + /// # Arguments + /// * `sequence` - The amino acid sequence to apply the condition tos + /// + pub fn modify_sequence(&self, sequence: &str) -> Vec { + // Map for fast access to variable modifications by amino acid + let mut variable_modifications_map: HashMap> = HashMap::new(); + for ptm in self.variable_ptms.iter() { + variable_modifications_map + .entry(*ptm.get_amino_acid().get_one_letter_code()) + .and_modify(|mods| mods.push(ptm)) + .or_insert(vec![ptm]); + } + + // Results vector to store the modified sequences + let mut proforma_sequences: HashSet = HashSet::new(); + + // Prepare static modifications in ProForma format + let static_mods = self + .static_ptms + .iter() + .map(|ptm| { + format!( + "[{:+}]@{}", + ptm.get_mass_delta(), + ptm.get_amino_acid().get_one_letter_code(), + ) + }) + .collect::>() + .into_iter() + .join(","); + + let mut modded_peptide = String::with_capacity(sequence.len()); + + if !static_mods.is_empty() { + modded_peptide = format!("<{static_mods}>",); + } + + // Add n-bonf if present + if let Some(n_bond_ptm) = &self.n_bond_ptm { + modded_peptide.push_str(&format!("[{}]-", 
n_bond_ptm.get_mass_delta())); + } + + self.inner_modify_sequence( + sequence, + modded_peptide.clone(), + &variable_modifications_map, + 0, + 0, + &mut proforma_sequences, + ); + + // return results + proforma_sequences.into_iter().collect::>() + } + + /// Modifies the peptide sequence recursively by adding variable modifications at each necessary position. + /// Make sure the given peptide was checked against the condition before calling this function. + /// + /// # Arguments + /// * `peptide` - The original peptide sequence to modify + /// * `modified_peptide` - The current modified peptide sequence + /// * `variable_modifications_map` - A map of amino acids to their possible variable modifications + /// * `position` - The current position in the peptide sequence to modify + /// * `applied_vmods` - The number of variable modifications applied so far + /// * `max_variable_modifications` - The maximum number of variable modifications allowed + /// * `proforma_sequences` - A mutable vector to store the resulting proforma sequences + /// + #[allow(clippy::too_many_arguments)] + fn inner_modify_sequence( + &self, + peptide: &str, + mut modified_peptide: String, + variable_modifications_map: &HashMap>, + position: usize, + applied_vmods: usize, + proforma_sequences: &mut HashSet, + ) { + if position >= peptide.len() { + self.end_modify_sequence(modified_peptide, applied_vmods, proforma_sequences); + return; + } + + modified_peptide.push(peptide.chars().nth(position).unwrap()); + + // First check for n-terminal and c-terminal modifications which must be applied when present. 
+ if position == 0 && self.n_terminal_ptm.is_some() { + modified_peptide.push_str(&format!( + "[{:+}]", + self.n_terminal_ptm.as_ref().unwrap().get_mass_delta() + )); + self.inner_modify_sequence( + peptide, + modified_peptide, + variable_modifications_map, + position + 1, + applied_vmods, + proforma_sequences, + ); + } else if position == peptide.len() - 1 && self.c_terminal_ptm.is_some() { + modified_peptide.push_str(&format!( + "[{:+}]", + self.c_terminal_ptm.as_ref().unwrap().get_mass_delta() + )); + self.inner_modify_sequence( + peptide, + modified_peptide, + variable_modifications_map, + position + 1, + applied_vmods, + proforma_sequences, + ); + } else { + // # Next with unmodified amino acid + self.inner_modify_sequence( + peptide, + modified_peptide.clone(), + variable_modifications_map, + position + 1, + applied_vmods, + proforma_sequences, + ); + + if applied_vmods < self.variable_ptms.len() { + // # Next with modified amino acid + if let Some(modifications) = + variable_modifications_map.get(&peptide.chars().nth(position).unwrap()) + { + for modification in modifications.iter() { + let next_modified_peptide = + format!("{}[{:+}]", &modified_peptide, modification.get_mass_delta()); + self.inner_modify_sequence( + peptide, + next_modified_peptide, + variable_modifications_map, + position + 1, + applied_vmods + 1, + proforma_sequences, + ); + } + } + } + } + } + + /// Modifies the peptide sequence at the end by adding c-terminal to the proforma sequences. 
+ /// + /// # Arguments + /// * `modified_peptide` - The modified peptide sequence to add + /// * `applied_vmods` - The number of variable modifications applied to the peptide + /// * `proforma_sequences` - The vector of proforma sequences to add the modified peptide to + /// + fn end_modify_sequence( + &self, + mut modified_peptide: String, + applied_vmods: usize, + proforma_sequences: &mut HashSet, + ) { + if let Some(c_bond_ptm) = &self.c_bond_ptm { + modified_peptide.push_str(&format!("-[{}]", c_bond_ptm.get_mass_delta(),)); + } + // If the number of applied variable modifications not equals the number of variable PTMs, + // this condition is not fully applied + if applied_vmods == self.variable_ptms.len() { + proforma_sequences.insert(modified_peptide); + } + } + + /// Creates a vector of PeptideConditions from a PTMCollection. + /// + /// # Arguments + /// * `ptm_collection` - The PTMCollection to use + /// * `targeted_mass` - Mass of the unmodfied peptide to search for + /// * `min_mass` - Minimum mass of the peptides in the database, usually mass of Glycin times the minimum configured peptide length + /// * `max_mass` - Maximum mass of the peptides in the database. This value need to be chosen with care if modifications with negative mass delta are used. + /// Otherwise conditions with masses will be generated, way outside the database range, generating useless operations. + /// 1. If no modification with negative mass delta is used, `max_mass` equals the targeted mass. + /// 2. If static modifications with negative mass delta are used, multipe options are viable: + /// 1. Set it to the mass of Tryptophan times the configured maximum length of the peptides. + /// 2. A more conservative approach is to set it equals the targeted mass plus + /// the absolute value of the largest negative mass delta of the static modifications times the configured maximum length of the peptides. + /// 3. 
Instead of using the configured max length, divide the target mass by the average mass of an amino acid to get the likely length of the peptide and add + /// a certain amount of play (e.g. 30%) to the calculated length. Multiply this value with the absolute value of the largest negative mass delta of the static modifications. + /// 3. If variable modifications with negative mass delta are used, `max_mass` should equal the targeted mass plus + /// the absolute value of the largest negative mass delta of the variable modifications times the allowed number of variable modifications. + /// 4. If both static and variable modifications with negative mass delta are used, case 2 and 3 should be combined. + /// + /// * `max_variable_modifications` - The maximum number of variable modifications to apply + pub fn from_ptm_collection( + ptm_collection: &PTMCollection, + targeted_mass: i64, + min_mass: i64, + max_mass: i64, + max_variable_modifications: usize, + ) -> Vec { + if ptm_collection.is_empty() { + return Vec::new(); + } + + let mut resulting_conditions: Vec = Vec::new(); + + // Handle no modifications (which excludes all static modifications) + let mut condition = PeptideCondition::new(targeted_mass); + for static_ptm in ptm_collection.get_static_ptms() { + condition.add_excluded_amino_acid(static_ptm.get_amino_acid()); + } + resulting_conditions.push(condition); + + // static modifications + let condition = PeptideCondition::new(targeted_mass); + Self::calculate_peptide_conditions_for_static_modifications( + ptm_collection, + min_mass, + max_mass, + condition.clone(), + 0, + &mut resulting_conditions, + ); + + // variable modifications + let current_len = resulting_conditions.len(); + for i in 0..current_len { + let condition = resulting_conditions[i].clone(); + Self::calculate_peptide_conditions_for_variable_modifications( + ptm_collection, + min_mass, + max_mass, + max_variable_modifications, + condition, + 0, + &mut resulting_conditions, + ) + } + + // n terminal
modifications (PTMs are alternatives: start from a fresh clone of the base condition per PTM, otherwise only the first applicable PTM is ever used) + let current_len = resulting_conditions.len(); + for i in 0..current_len { + for modification in ptm_collection.get_n_terminal_ptms() { + let mut condition = resulting_conditions[i].clone(); + if condition.set_n_terminal_ptm(modification) { + resulting_conditions.push(condition); + } + } + } + + // c terminal modifications (fresh clone of the base condition per PTM) + let current_len = resulting_conditions.len(); + for i in 0..current_len { + for modification in ptm_collection.get_c_terminal_ptms() { + let mut condition = resulting_conditions[i].clone(); + if condition.set_c_terminal_ptm(modification) { + resulting_conditions.push(condition); + } + } + } + + // n bond modifications (fresh clone of the base condition per PTM) + let current_len = resulting_conditions.len(); + for i in 0..current_len { + for modification in ptm_collection.get_n_bond_ptms() { + let mut condition = resulting_conditions[i].clone(); + if condition.set_n_bond_ptm(modification) { + resulting_conditions.push(condition); + } + } + } + + // c bond modifications (fresh clone of the base condition per PTM) + let current_len = resulting_conditions.len(); + for i in 0..current_len { + for modification in ptm_collection.get_c_bond_ptms() { + let mut condition = resulting_conditions[i].clone(); + if condition.set_c_bond_ptm(modification) { + resulting_conditions.push(condition); + } + } + } + + resulting_conditions + } + + fn calculate_peptide_conditions_for_static_modifications( + ptm_collection: &PTMCollection, + min_mass: i64, + max_mass: i64, + mut condition: PeptideCondition, + modification_position: usize, + resulting_conditions: &mut Vec, + ) { + if modification_position >= ptm_collection.get_static_ptms().len() { + return; + } + + // # Without this static modification apply the next one + Self::calculate_peptide_conditions_for_static_modifications( + ptm_collection, + min_mass, + max_mass, + condition.clone(), + modification_position + 1, + resulting_conditions, + ); + + while condition.add_static_ptm(ptm_collection.get_static_ptms()[modification_position]) { + if
condition.query_mass < min_mass || condition.query_mass > max_mass { + break; + } + resulting_conditions.push(condition.clone()); + // Apply next static modification + Self::calculate_peptide_conditions_for_static_modifications( + ptm_collection, + min_mass, + max_mass, + condition.clone(), + modification_position + 1, + resulting_conditions, + ); + } + } + + fn calculate_peptide_conditions_for_variable_modifications( + ptm_collection: &PTMCollection, + min_mass: i64, + max_mass: i64, + max_variable_modifications: usize, + mut condition: PeptideCondition, + modification_position: usize, + resulting_conditions: &mut Vec, + ) { + if modification_position >= ptm_collection.get_variable_ptms().len() { + return; + } + + // # Without this variable modifications apply the next one + Self::calculate_peptide_conditions_for_variable_modifications( + ptm_collection, + min_mass, + max_mass, + max_variable_modifications, + condition.clone(), + modification_position + 1, + resulting_conditions, + ); + + // # Apply this modification until we run out of mass + while condition.add_variable_ptm(ptm_collection.get_variable_ptms()[modification_position]) + { + if condition.variable_ptms.len() > max_variable_modifications + || condition.query_mass < min_mass + || condition.query_mass > max_mass + { + break; + } + resulting_conditions.push(condition.clone()); + // Apply next variable modification + Self::calculate_peptide_conditions_for_variable_modifications( + ptm_collection, + min_mass, + max_mass, + max_variable_modifications, + condition.clone(), + modification_position + 1, + resulting_conditions, + ); + } + } +} + +impl Display for PeptideCondition { + /// Formats the PeptideCondition for display. + /// Renders static, bond, terminal and variable PTMs in pseudo ProForma format, + /// followed by the query mass in Da.
+ /// + /// # Arguments + /// * `f` - The formatter to write to + /// + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let static_mods = self + .static_ptms + .iter() + .map(|ptm| { + format!( + "[{}]@{}", + ptm.get_mass_delta(), + ptm.get_amino_acid().get_one_letter_code() + ) + }) + .join(", "); + let variable_mods = self + .variable_ptms + .iter() + .map(|ptm| { + format!( + "v[{}]@{}", + ptm.get_mass_delta(), + ptm.get_amino_acid().get_one_letter_code() + ) + }) + .join(", "); + let n_bind_mod = match &self.n_bond_ptm { + Some(ptm) => format!("[{}]-", ptm.get_mass_delta()), + None => String::new(), + }; + let c_bind_mod = match &self.c_bond_ptm { + Some(ptm) => format!("-[{}]", ptm.get_mass_delta()), + None => String::new(), + }; + let n_terminal_mod = match &self.n_terminal_ptm { + Some(ptm) => format!( + "nterm{}@{}", + ptm.get_mass_delta(), + ptm.get_amino_acid().get_one_letter_code() + ), + None => String::new(), + }; + let c_terminal_mod = match &self.c_terminal_ptm { + Some(ptm) => format!( + "cterm{}@{}", + ptm.get_mass_delta(), + ptm.get_amino_acid().get_one_letter_code() + ), + None => String::new(), + }; + + write!( + f, + "PeptideCondition: '<{static_mods}>{n_bind_mod}{n_terminal_mod}{variable_mods}{c_terminal_mod}{c_bind_mod}' @ {} Da", + mass_to_float(self.query_mass), + ) + } +} + +pub struct FinalizedPeptideCondition { + inner_peptide_condition: PeptideCondition, + /// Filter functions the peptide has to pass before it is returned + filter_functions: Vec>, +} + +impl FinalizedPeptideCondition { + /// Finalizes the PeptideCondition by calculating the filter functions based on the given modifications.
+ /// + /// # Arguments + /// * `peptide_condition` - The PeptideCondition to finalize + /// + fn get_filter_functions(peptide_condition: &PeptideCondition) -> Vec> { + let mut filter_functions: Vec> = Vec::with_capacity( + peptide_condition.static_ptms.len() + + peptide_condition.variable_ptms.len() + + peptide_condition.excluded_amino_acids.len() + + 2, // N-terminal and C-terminal PTM + ); + + for excluded_aa in peptide_condition.excluded_amino_acids.iter() { + filter_functions.push(Box::new(NoOccurrencesFilterFunction { + amino_acid: *excluded_aa, + })); + } + + let mut statically_modified_amino_acid_counts: HashMap = HashMap::new(); + for ptm in peptide_condition.static_ptms.iter() { + statically_modified_amino_acid_counts + .entry(*ptm.get_amino_acid().get_one_letter_code()) + .and_modify(|count| *count += 1) + .or_insert(1); + } + + for (amino_acid, amount) in statically_modified_amino_acid_counts + .into_iter() + .sorted_by(|x, y| x.0.cmp(&y.0)) + { + filter_functions.push(Box::new(EqualsNumberOfOccurrencesFilterFunction { + amino_acid, + amount, + })); + } + + let mut variable_modified_amino_acid_counts: HashMap = HashMap::new(); + for ptm in peptide_condition.variable_ptms.iter() { + variable_modified_amino_acid_counts + .entry(*ptm.get_amino_acid().get_one_letter_code()) + .and_modify(|count| *count += 1) + .or_insert(1); + } + + if let Some(ptm) = &peptide_condition.n_terminal_ptm { + // N-terminal PTM is counted like a variable modification of its amino acid + variable_modified_amino_acid_counts + .entry(*ptm.get_amino_acid().get_one_letter_code()) + .and_modify(|count| *count += 1) + .or_insert(1); + + filter_functions.push(Box::new(StartsWithFilterFunction { + amino_acid: *ptm.get_amino_acid().get_one_letter_code(), + })); + } + + if let Some(ptm) = &peptide_condition.c_terminal_ptm { + // C-terminal PTM is counted like a variable modification of its amino acid + variable_modified_amino_acid_counts + .entry(*ptm.get_amino_acid().get_one_letter_code()) + .and_modify(|count| *count += 1) +
.or_insert(1); + + filter_functions.push(Box::new(EndsWithFilterFunction { + amino_acid: *ptm.get_amino_acid().get_one_letter_code(), + })); + } + + for (amino_acid, amount) in variable_modified_amino_acid_counts + .into_iter() + .sorted_by(|x, y| x.0.cmp(&y.0)) + { + filter_functions.push(Box::new(GreaterOrEqualsNumberOfOccurrencesFilterFunction { + amino_acid, + amount, + })); + } + + filter_functions + } + + pub fn check_peptide(&mut self, peptide: &Peptide) -> bool { + // Check if the peptide passes all filter functions + for filter in self.filter_functions.iter_mut() { + if !filter.is_match(peptide).unwrap_or(false) { + return false; + } + } + + true + } +} + +// Make the inner peptide condition readable +impl Deref for FinalizedPeptideCondition { + type Target = PeptideCondition; + + fn deref(&self) -> &Self::Target { + &self.inner_peptide_condition + } +} + +impl From for FinalizedPeptideCondition { + fn from(peptide_condition: PeptideCondition) -> Self { + let filter_functions = FinalizedPeptideCondition::get_filter_functions(&peptide_condition); + Self { + inner_peptide_condition: peptide_condition, + filter_functions, + } + } +} + +impl Display for FinalizedPeptideCondition { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let filter_descriptions = self + .filter_functions + .iter() + .map(|filter| format!("{filter}")) + .join(" && "); + write!( + f, + "FinalizedPeptideCondition: {filter_descriptions} @ {}", + mass_to_float(self.query_mass) + ) + } +} + +#[cfg(test)] +mod tests { + use dihardts_omicstools::{ + chemistry::amino_acid::get_amino_acid_by_one_letter_code, + proteomics::post_translational_modifications::{ModificationType, Position}, + }; + + use super::*; + + #[tokio::test] + async fn test_peptide_condition_from_ptm_collection() { + let ptms = vec![ + PTM::new( + "carba of C", + get_amino_acid_by_one_letter_code('C').unwrap(), + 57.021464, + ModificationType::Static, + Position::Anywhere, + ), + PTM::new( + "oxi of M", 
+ get_amino_acid_by_one_letter_code('M').unwrap(), + 15.99491, + ModificationType::Variable, + Position::Anywhere, + ), + PTM::new( + "oxi of term M", + get_amino_acid_by_one_letter_code('M').unwrap(), + 16.99491, + ModificationType::Variable, + Position::Terminus(dihardts_omicstools::proteomics::peptide::Terminus::N), + ), + PTM::new( + "oxi of term K", + get_amino_acid_by_one_letter_code('K').unwrap(), + 20.3, + ModificationType::Variable, + Position::Terminus(dihardts_omicstools::proteomics::peptide::Terminus::C), + ), + PTM::new( + "something on N-bond", + get_amino_acid_by_one_letter_code('X').unwrap(), + 10.0, + ModificationType::Variable, + Position::Terminus(dihardts_omicstools::proteomics::peptide::Terminus::N), + ), + PTM::new( + "something on N-bond", + get_amino_acid_by_one_letter_code('X').unwrap(), + 40.3, + ModificationType::Variable, + Position::Terminus(dihardts_omicstools::proteomics::peptide::Terminus::C), + ), + ]; + let ptm_collection = PTMCollection::new(&ptms).unwrap(); + let mass: f64 = 839.403366202; // MFCQLAK + + let conditions = PeptideCondition::from_ptm_collection( + &ptm_collection, + mass_to_int(mass), + mass_to_int( + get_amino_acid_by_one_letter_code('G') + .unwrap() + .get_mono_mass() + * 6.0, + ), + mass_to_int(mass), + 2, + ); + + // Easiest way is to check the string representation of the conditions which gives basically a unique representation of the condition + let stringyfied_conditions = conditions + .into_iter() + .map(|condition| format!("{}", FinalizedPeptideCondition::from(condition))) + .collect::>(); + + let expected_conditions = + std::fs::read_to_string("test_files/finalized_peptide_condition.txt") + .unwrap() + .split("\n") + .map(|line| line.to_string()) + .collect::>(); + + assert_eq!(stringyfied_conditions.len(), expected_conditions.len()); + + for condition in stringyfied_conditions.iter() { + assert!( + expected_conditions.contains(condition), + "Condition not found: {condition}" + ); + } + } + + /// 
Consequently tests various types of PTMs to build conition for checking a sequence on a sequence. + /// + #[test] + fn test_condition_building_and_sequence_modification() { + let sequence = "MFCQLAKTCPVQLWVDMSTPPPGTRVR"; + let mass = 3060.516981066636; + + let peptide = Peptide::new( + 0, + mass_to_int(3060.516981066636), + sequence.to_string(), + 2, + Vec::new(), + false, + false, + Vec::new(), + Vec::new(), + Vec::new(), + Vec::new(), + ) + .unwrap(); + + let carbamidomethylation_c = PTM::new( + "carba of C", + get_amino_acid_by_one_letter_code('C').unwrap(), + 57.021464, + ModificationType::Static, + Position::Anywhere, + ); + + let oxidation_m = PTM::new( + "oxi of M", + get_amino_acid_by_one_letter_code('M').unwrap(), + 15.99491, + ModificationType::Variable, + Position::Anywhere, + ); + + let something_terminal_m = PTM::new( + "oxi of term M", + get_amino_acid_by_one_letter_code('M').unwrap(), + 16.99491, + ModificationType::Variable, + Position::Terminus(dihardts_omicstools::proteomics::peptide::Terminus::N), + ); + + let something_terminal_r = PTM::new( + "oxi of term R", + get_amino_acid_by_one_letter_code('R').unwrap(), + 20.3, + ModificationType::Variable, + Position::Terminus(dihardts_omicstools::proteomics::peptide::Terminus::C), + ); + + let something_bond_n = PTM::new( + "something on N-bond", + get_amino_acid_by_one_letter_code('X').unwrap(), + 10.0, + ModificationType::Variable, + Position::Terminus(dihardts_omicstools::proteomics::peptide::Terminus::N), + ); + + let something_bond_c = PTM::new( + "something on N-bond", + get_amino_acid_by_one_letter_code('X').unwrap(), + 40.3, + ModificationType::Variable, + Position::Terminus(dihardts_omicstools::proteomics::peptide::Terminus::C), + ); + + let mut condition = PeptideCondition::new(mass_to_int(mass)); + condition.add_static_ptm(&carbamidomethylation_c); + condition.add_static_ptm(&carbamidomethylation_c); + condition.add_variable_ptm(&oxidation_m); + + let mut finalized_condition: 
FinalizedPeptideCondition = condition.clone().into(); + + assert!(finalized_condition.check_peptide(&peptide)); + + let mut modified_sequences = condition.modify_sequence(sequence); + modified_sequences.sort(); + assert_eq!( + modified_sequences.as_slice(), + [ + "<[+57.021464]@C>MFCQLAKTCPVQLWVDM[+15.99491]STPPPGTRVR", + "<[+57.021464]@C>M[+15.99491]FCQLAKTCPVQLWVDMSTPPPGTRVR" + ] + ); + + condition.set_n_terminal_ptm(&something_terminal_m); + finalized_condition = condition.clone().into(); + assert!(finalized_condition.check_peptide(&peptide)); + + let mut modified_sequences = condition.modify_sequence(sequence); + modified_sequences.sort(); + assert_eq!( + modified_sequences.as_slice(), + ["<[+57.021464]@C>M[+16.99491]FCQLAKTCPVQLWVDM[+15.99491]STPPPGTRVR",] + ); + + condition.set_c_terminal_ptm(&something_terminal_r); + finalized_condition = condition.clone().into(); + assert!(finalized_condition.check_peptide(&peptide)); + + let mut modified_sequences = condition.modify_sequence(sequence); + modified_sequences.sort(); + assert_eq!( + modified_sequences.as_slice(), + ["<[+57.021464]@C>M[+16.99491]FCQLAKTCPVQLWVDM[+15.99491]STPPPGTRVR[+20.3]",] + ); + + condition.set_n_bond_ptm(&something_bond_n); + finalized_condition = condition.clone().into(); + assert!(finalized_condition.check_peptide(&peptide)); + + let mut modified_sequences = condition.modify_sequence(sequence); + modified_sequences.sort(); + assert_eq!( + modified_sequences.as_slice(), + ["<[+57.021464]@C>[10]-M[+16.99491]FCQLAKTCPVQLWVDM[+15.99491]STPPPGTRVR[+20.3]",] + ); + + condition.set_c_bond_ptm(&something_bond_c); + finalized_condition = condition.clone().into(); + assert!(finalized_condition.check_peptide(&peptide)); + + let mut modified_sequences = condition.modify_sequence(sequence); + modified_sequences.sort(); + assert_eq!( + modified_sequences.as_slice(), + ["<[+57.021464]@C>[10]-M[+16.99491]FCQLAKTCPVQLWVDM[+15.99491]STPPPGTRVR[+20.3]-[40.3]",] + ); + } +} diff --git 
a/src/database/scylla/peptide_table.rs b/src/database/scylla/peptide_table.rs index 31d5001..b30de0b 100755 --- a/src/database/scylla/peptide_table.rs +++ b/src/database/scylla/peptide_table.rs @@ -3,7 +3,6 @@ use std::sync::Arc; use anyhow::{bail, Result}; use async_stream::try_stream; -use dihardts_omicstools::proteomics::post_translational_modifications::PostTranslationalModification as PTM; use dihardts_omicstools::proteomics::proteases::protease::Protease; use fallible_iterator::FallibleIterator; use futures::future::join_all; @@ -22,6 +21,7 @@ use crate::entities::configuration::Configuration; use crate::entities::domain::Domain; use crate::entities::peptide::Peptide; use crate::entities::protein::Protein; +use crate::functions::post_translational_modification::PTMCollection; use crate::tools::omicstools::convert_to_internal_dummy_peptide; use crate::tools::peptide_partitioner::get_mass_partition; @@ -44,6 +44,7 @@ pub const SELECT_COLS: [&str; 12] = [ "domains", ]; +#[allow(dead_code)] // Used in the lazy_static! 
macro and exists in case of future differences between select and insert cols const INSERT_COLS: [&str; 12] = SELECT_COLS; const UPDATE_COLS: [&str; 9] = [ @@ -377,26 +378,26 @@ impl PeptideTable { /// * `taxonomy_id` - Optional: The taxonomy id to filter for /// * `proteome_id` - Optional: The proteome id to filter for /// * `is_reviewed` - Optional: If the peptides should be reviewed or unreviewed - /// * `ptms` - The PTMs to consider - /// * `matching_peptides` - A bloom filter to check if a peptide was already found + /// * `ptm_collection` - The PTMs to consider + /// * `resolve_modifications` - If the modifications should be resolved /// #[allow(clippy::too_many_arguments)] - pub async fn search( + pub async fn search<'a>( client: Arc, configuration: Arc, mass: i64, lower_mass_tolerance_ppm: i64, upper_mass_tolerance_ppm: i64, - max_variable_modifications: i16, + max_variable_modifications: usize, taxonomy_ids: Option>, proteome_ids: Option>, is_reviewed: Option, - ptms: &[PTM], + ptm_collection: &'a PTMCollection<'a>, + resolve_modifications: bool, ) -> Result { - let partition_limits = Arc::new(configuration.get_partition_limits().clone()); MultiTaskSearch::search( client, - partition_limits, + configuration, mass, lower_mass_tolerance_ppm, upper_mass_tolerance_ppm, @@ -405,7 +406,8 @@ impl PeptideTable { taxonomy_ids, proteome_ids, is_reviewed, - ptms, + ptm_collection, + resolve_modifications, None, ) .await diff --git a/src/entities/peptide.rs b/src/entities/peptide.rs index 229c0a2..5b8c401 100755 --- a/src/entities/peptide.rs +++ b/src/entities/peptide.rs @@ -4,6 +4,7 @@ use std::{ collections::HashMap, fmt::Display, hash::{Hash, Hasher}, + ops::Deref, }; // 3rd party imports @@ -229,6 +230,20 @@ impl Peptide { &self.domains } + /// Get occurrences of an amino acid in the peptide + /// + /// # Arguments + /// * `one_letter_code` - The one letter code of the amino acid + /// + pub fn get_aa_count(&self, one_letter_code: char) -> i16 { + let index = 
one_letter_code as usize % 65; + if index < self.aa_counts.len() { + self.aa_counts[index] + } else { + 0 + } + } + /// Returns the peptide metadata from the given proteins, format: /// (is_swiss_prot, is_trembl, taxonomy_ids, unique_taxonomy_ids, proteome_ids) /// @@ -388,6 +403,53 @@ impl Display for Peptide { } } +/// Peptide plus the option to add multiple sequences, e.g. modified sequences +/// +#[derive(Serialize)] +pub struct MatchingPeptide { + /// The databse peptide peptide itself + #[serde(flatten)] + inner_peptide: Peptide, + /// Additional sequences, e.g. modified sequences + additional_sequences: Vec, +} + +impl MatchingPeptide { + /// Creates a new MatchingPeptide with the given peptide and additional sequences. + /// + pub fn new(peptide: Peptide, additional_sequences: Vec) -> Self { + Self { + inner_peptide: peptide, + additional_sequences, + } + } + + /// Returns the peptide. + pub fn get_inner_peptide(&self) -> &Peptide { + &self.inner_peptide + } + + /// Returns the additional sequences. 
+ pub fn get_additional_sequences(&self) -> &[String] { + &self.additional_sequences + } +} + +impl Deref for MatchingPeptide { + type Target = Peptide; + + fn deref(&self) -> &Self::Target { + &self.inner_peptide + } +} + +#[allow(clippy::from_over_into)] // don't need the from implementation +impl Into<(Peptide, Vec)> for MatchingPeptide { + fn into(self) -> (Peptide, Vec) { + (self.inner_peptide, self.additional_sequences) + } +} + /// Peptide which can be serialized to a TSV file, where Vectors are comma separated lists /// #[derive(Clone, Debug, Deserialize, Serialize)] @@ -407,6 +469,8 @@ pub struct TsvPeptide { taxonomy_ids: String, unique_taxonomy_ids: String, proteome_ids: String, + #[serde(skip_serializing_if = "Option::is_none")] + additional_sequences: Option, // domains: String, } @@ -439,6 +503,16 @@ impl From for TsvPeptide { .collect::>() .join(","), proteome_ids: peptide.proteome_ids.join(","), + additional_sequences: None, // This will be set later if needed } } } + +impl From for TsvPeptide { + fn from(peptide: MatchingPeptide) -> Self { + let (peptide, additional_sequnces): (Peptide, Vec) = peptide.into(); + let mut peptide = TsvPeptide::from(peptide); + peptide.additional_sequences = Some(additional_sequnces.join(",")); + peptide + } +} diff --git a/src/functions/post_translational_modification.rs b/src/functions/post_translational_modification.rs index e9dc46a..9b1750b 100644 --- a/src/functions/post_translational_modification.rs +++ b/src/functions/post_translational_modification.rs @@ -1,486 +1,223 @@ // std imports -use std::cmp::min; -use std::collections::HashMap; -use std::fmt::Display; +use std::{collections::HashSet, fmt::Display}; // 3rd party imports -use anyhow::{bail, Result}; use dihardts_omicstools::proteomics::post_translational_modifications::PostTranslationalModification as PTM; - -// internal imports -use crate::chemistry::amino_acid::{get_internal_amino_acid_by_one_letter_code, InternalAminoAcid}; -use 
crate::entities::peptide::Peptide; -use crate::mass::convert::{to_float as mass_to_float, to_int as mass_to_int}; - -/// Validates a list of PTMs: -/// Makes sure that there are no static and variable PTMs for the same amino acid -/// -pub fn validate_ptm_vec(ptms: &[PTM]) -> Result<()> { - let static_ptms: Vec<&PTM> = ptms.iter().filter(|ptm| ptm.is_static()).collect(); - let variable_ptms: Vec<&PTM> = ptms.iter().filter(|ptm| ptm.is_variable()).collect(); - // Check if there are any static/variable PTMs for the same amino acid - let mut errors = String::new(); - for static_ptm in &static_ptms { - for variable_ptm in &variable_ptms { - if static_ptm.get_amino_acid().get_code() == variable_ptm.get_amino_acid().get_code() { - errors.push_str(&format!( - "Static PTM {} and variable PTM {} are supposed for the same amino acid {}.\n", - static_ptm.get_name(), - variable_ptm.get_name(), - static_ptm.get_amino_acid().get_code() - )); - } - } - } - if !errors.is_empty() { - bail!(errors); - } - Ok(()) +use thiserror::Error; + +/// Errors which might occur during PTM collection validation +#[derive(Debug, Error)] +pub enum PTMCollectionValidationError { + #[error("Amino acid {0} is statically modified twice or more.")] + StaticallyModifiedTwiceOrMore(String), + #[error("Amino acid {0} is already statically modified.")] + AlreadyStaticallyModified(String), } -/// Simple struct to store and check the occurrence of an amino acid +/// Collection of post-translational modifications (PTMs). +/// Rules: +/// * Static PTMs are applied to every occurence of the targeted amino acid +/// * Variable PTMs can be applied to any occurence of the targeted amino acid (usually limited to a maximum number of variable modifications). +/// Amino acids target cannot be the same as the static PTM. +/// * N-/C-terminal PTMs are applied to the first/last amino acid of the peptide. +/// Treated as variable modifications, although they are not counted for a variable modification limit. 
+/// Cannot be applied to an amino acid which is statically PTM. +/// * N-/C-bond PTMs are applied to the bond between peptides. Treated as variable modifications but not counted for variable modification limits. +/// Amino acid target is not relevant. /// -#[derive(Clone)] -pub enum AminoAcidOccurrence { - Equal(i16), - GreaterOrEqual(i16), +pub struct PTMCollection<'a> { + static_ptms: Vec<&'a PTM>, + variable_ptms: Vec<&'a PTM>, + n_terminal_ptms: Vec<&'a PTM>, + c_terminal_ptms: Vec<&'a PTM>, + n_bond_ptms: Vec<&'a PTM>, + c_bond_ptms: Vec<&'a PTM>, } -impl AminoAcidOccurrence { - /// Checks if the given count matches the occurrence +impl<'a> PTMCollection<'a> { + /// Creates a new PTMCollection from a slice of PTMs, by sorting them + /// and checks the validity of the collection. /// /// # Arguments - /// * `count` - The count to check + /// * `ptms` - A slice of PTMs to create the collection /// - pub fn check(&self, count: &i16) -> bool { - match self { - AminoAcidOccurrence::Equal(value) => count == value, - AminoAcidOccurrence::GreaterOrEqual(value) => count >= value, + pub fn new(ptms: &'a [PTM]) -> Result { + let mut static_ptms: Vec<&'a PTM> = Vec::new(); + let mut variable_ptms: Vec<&'a PTM> = Vec::new(); + let mut n_terminal_ptms: Vec<&'a PTM> = Vec::new(); + let mut c_terminal_ptms: Vec<&'a PTM> = Vec::new(); + let mut n_bond_ptms: Vec<&'a PTM> = Vec::new(); + let mut c_bond_ptms: Vec<&'a PTM> = Vec::new(); + + // Sort ptms + for ptm in ptms { + if ptm.is_static() && ptm.is_anywhere() { + static_ptms.push(ptm); + } else if ptm.is_variable() && ptm.is_anywhere() { + variable_ptms.push(ptm); + } else if ptm.is_n_terminus() { + n_terminal_ptms.push(ptm); + } else if ptm.is_c_terminus() { + c_terminal_ptms.push(ptm); + } else if ptm.is_n_bond() { + n_bond_ptms.push(ptm); + } else if ptm.is_c_bond() { + c_bond_ptms.push(ptm); + } } - } - /// Increments the occurrence by the given amount - /// - /// # Arguments - /// * `amount` - The amount to increment by 
- fn increment_by(&mut self, amount: i16) { - match self { - AminoAcidOccurrence::Equal(value) => *value += amount, - AminoAcidOccurrence::GreaterOrEqual(value) => *value += amount, + let mut static_modification_targets = HashSet::with_capacity(static_ptms.len()); + for ptm in static_ptms.iter() { + if !static_modification_targets.insert(ptm.get_amino_acid().get_code()) { + return Err(PTMCollectionValidationError::StaticallyModifiedTwiceOrMore( + ptm.get_name().to_string(), + )); + } } - } -} -impl Display for AminoAcidOccurrence { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - AminoAcidOccurrence::Equal(value) => write!(f, "== {}", value), - AminoAcidOccurrence::GreaterOrEqual(value) => write!(f, ">= {}", value), + for variable_ptm in variable_ptms.iter() { + if static_modification_targets.contains(variable_ptm.get_amino_acid().get_code()) { + return Err(PTMCollectionValidationError::AlreadyStaticallyModified( + variable_ptm.get_name().to_string(), + )); + } } - } -} - -/// PTMCondition to check peptides against a given mass and PTMs. -/// Each PTM condition can be used to query a range of peptides. 
-/// -#[derive(Clone)] -pub struct PTMCondition { - mass: i64, - amino_acid_occurrences: HashMap, - n_terminus_amino_acid: Option<&'static InternalAminoAcid>, - c_terminus_amino_acid: Option<&'static InternalAminoAcid>, -} -impl PTMCondition { - fn new(mass: f64, counters: &Vec) -> Result { - let mut amino_acid_occurrences: HashMap = HashMap::new(); - let mut mass_delta: f64 = 0.0; - - let mut n_terminus_amino_acid: Option<&'static InternalAminoAcid> = None; - let mut c_terminus_amino_acid: Option<&'static InternalAminoAcid> = None; - - for counter in counters.iter().filter(|ctr| !ctr.ptm.is_bond()) { - if counter.count == 0 { - continue; - } - amino_acid_occurrences - .entry(*counter.ptm.get_amino_acid().get_code() as usize % 65) - .or_insert(if counter.ptm.is_static() { - AminoAcidOccurrence::Equal(0) - } else { - AminoAcidOccurrence::GreaterOrEqual(0) - }) - .increment_by(counter.count); - mass_delta += counter.count as f64 * counter.ptm.get_mass_delta(); - if counter.ptm.is_n_terminus() { - n_terminus_amino_acid = Some(get_internal_amino_acid_by_one_letter_code( - *counter.ptm.get_amino_acid().get_code(), - )?); - } else if counter.ptm.is_c_terminus() { - c_terminus_amino_acid = Some(get_internal_amino_acid_by_one_letter_code( - *counter.ptm.get_amino_acid().get_code(), - )?); + for n_terminal_ptm in n_terminal_ptms.iter() { + if static_modification_targets.contains(n_terminal_ptm.get_amino_acid().get_code()) { + return Err(PTMCollectionValidationError::AlreadyStaticallyModified( + n_terminal_ptm.get_name().to_string(), + )); } } - // Bond modification are currently not supposed to be amino acid specific. So we do not need to adjust a counter. 
- for counter in counters.iter().filter(|ctr| ctr.ptm.is_bond()) { - mass_delta += counter.count as f64 * counter.ptm.get_mass_delta(); + for c_terminal_ptm in c_terminal_ptms.iter() { + if static_modification_targets.contains(c_terminal_ptm.get_amino_acid().get_code()) { + return Err(PTMCollectionValidationError::AlreadyStaticallyModified( + c_terminal_ptm.get_name().to_string(), + )); + } } - // convert back into the internal integer representation - let mass = mass_to_int(mass - mass_delta); - Ok(PTMCondition { - mass, - amino_acid_occurrences, - n_terminus_amino_acid, - c_terminus_amino_acid, + Ok(PTMCollection { + static_ptms, + variable_ptms, + n_terminal_ptms, + c_terminal_ptms, + n_bond_ptms, + c_bond_ptms, }) } - pub fn get_mass(&self) -> &i64 { - &self.mass + pub fn get_static_ptms(&self) -> &Vec<&'a PTM> { + &self.static_ptms } - pub fn get_amino_acid_occurrences(&self) -> &HashMap { - &self.amino_acid_occurrences + pub fn get_variable_ptms(&self) -> &Vec<&'a PTM> { + &self.variable_ptms } - pub fn get_n_terminus_amino_acid(&self) -> &Option<&'static InternalAminoAcid> { - &self.n_terminus_amino_acid + pub fn get_n_terminal_ptms(&self) -> &Vec<&'a PTM> { + &self.n_terminal_ptms } - pub fn get_c_terminus_amino_acid(&self) -> &Option<&'static InternalAminoAcid> { - &self.c_terminus_amino_acid + pub fn get_c_terminal_ptms(&self) -> &Vec<&'a PTM> { + &self.c_terminal_ptms } - pub fn check_peptide(&self, peptide: &Peptide) -> bool { - self.amino_acid_occurrences - .iter() - .all(|(amino_acid_idx, amino_acid_occurence)| { - amino_acid_occurence.check(&peptide.get_aa_counts()[*amino_acid_idx]) - }) - && match &self.n_terminus_amino_acid { - Some(amino_acid) => { - peptide.get_sequence().chars().next().unwrap() - == *amino_acid.get_one_letter_code() - } - None => true, - } - && match &self.c_terminus_amino_acid { - Some(amino_acid) => { - peptide.get_sequence().chars().last().unwrap() - == *amino_acid.get_one_letter_code() - } - None => true, - } + pub fn 
get_n_bond_ptms(&self) -> &Vec<&'a PTM> { + &self.n_bond_ptms } -} - -impl Display for PTMCondition { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let mut amino_acid_occurrences_str = String::new(); - for (amino_acid, count) in &self.amino_acid_occurrences { - amino_acid_occurrences_str.push_str(&format!( - "{}: {} ", - char::from_u32(*amino_acid as u32 + 65).unwrap_or_default(), - count - )); - } - let n_terminus_amino_acid_str = match &self.n_terminus_amino_acid { - Some(amino_acid) => format!("{}", amino_acid.get_one_letter_code()), - None => String::from("None"), - }; + pub fn get_c_bond_ptms(&self) -> &Vec<&'a PTM> { + &self.c_bond_ptms + } - let c_terminus_amino_acid_str = match &self.c_terminus_amino_acid { - Some(amino_acid) => format!("{}", amino_acid.get_one_letter_code()), - None => String::from("None"), - }; + pub fn len(&self) -> usize { + self.static_ptms.len() + + self.variable_ptms.len() + + self.n_terminal_ptms.len() + + self.c_terminal_ptms.len() + + self.n_bond_ptms.len() + + self.c_bond_ptms.len() + } - write!( - f, - "Mass: {}, Amino Acid Occurrences: {}, N-Terminus Amino Acid: {}, C-Terminus Amino Acid: {}", - self.mass, - amino_acid_occurrences_str, - n_terminus_amino_acid_str, - c_terminus_amino_acid_str - ) + pub fn is_empty(&self) -> bool { + self.len() == 0 } -} -/// A struct to count the usage of a PTM -/// -struct PTMCounter<'a> { - ptm: &'a PTM, - count: i16, -} + pub fn all(&self) -> Vec<&'a PTM> { + let mut all_ptms: Vec<&'a PTM> = Vec::with_capacity(self.len()); -impl PTMCounter<'_> { - pub fn new(ptm: &PTM) -> PTMCounter { - PTMCounter { ptm, count: 0 } - } -} + all_ptms.extend(self.static_ptms.iter()); + all_ptms.extend(self.variable_ptms.iter()); + all_ptms.extend(self.n_terminal_ptms.iter()); + all_ptms.extend(self.c_terminal_ptms.iter()); + all_ptms.extend(self.n_bond_ptms.iter()); + all_ptms.extend(self.c_bond_ptms.iter()); -/// Calculates the combinations of PTMs which can be applied for a given 
mass -/// and returns a vector of PTMConditions check if queried peptides are matching -/// the applied PTMs. -/// -/// # Arguments -/// * `mass` - The mass to calculate the combinations for -/// * `max_variable_modifications` - The maximum number of variable modifications -/// * `ptms` - The PTMs to use -/// -pub fn get_ptm_conditions( - mass: i64, - max_variable_modifications: i16, - ptms: &[PTM], -) -> Result> { - validate_ptm_vec(ptms)?; - // Create counter and a vector for the conditions - let mut counters = ptms.iter().map(|ptm| PTMCounter::new(ptm)).collect(); - let mut ptm_conditions: Vec = Vec::new(); - // As the PTM combinations are calculated on the PTM struct of the external create di_hardts_omicstools which works with float mass, - // we need to convert the mass from internal integer representation to float - let mass = mass_to_float(mass); - recursively_apply_ptms( - mass, - 0, - &mut counters, - mass, - max_variable_modifications, - false, - false, - false, - false, - &mut ptm_conditions, - )?; - Ok(ptm_conditions) + all_ptms + } } -/// Recursively calculates the combinations of PTMs -/// which can be applied for a given mass and returns -/// a vector of PTMConditions to check on a peptide it -/// it match this combination. 
-/// -/// # Arguments -/// * `mass` - The mass to calculate the combinations for -/// * `counter_idx` - The index of the current counter -/// * `counters` - The counters to use -/// * `remaining_mass` - The remaining mass to calculate the combinations for -/// * `free_variable_modifications` - The number of free variable modifications -/// * `is_n_terminus_used` - True if the n-terminus already has a PTM used -/// * `is_c_terminus_used` - True if the c-terminus already has a PTM used -/// * `is_n_bond_used` - True if the n-bond already has a PTM used -/// * `is_c_bond_used` - True if the c-bond already has a PTM used -/// * `ptm_conditions` - The vector to store the PTMConditions in -/// -#[allow(clippy::too_many_arguments)] -fn recursively_apply_ptms( - mass: f64, - counter_idx: usize, - counters: &mut Vec, - remaining_mass: f64, - free_variable_modifications: i16, - is_n_terminus_used: bool, - is_c_terminus_used: bool, - is_n_bond_used: bool, - is_c_bond_used: bool, - ptm_conditions: &mut Vec, -) -> Result<()> { - // Exit method if index is greater than number of counters - if counter_idx >= counters.len() { - return Ok(()); - } +impl Display for PTMCollection<'_> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let mut ptms: Vec = Vec::with_capacity(self.len()); - let ptm = counters[counter_idx].ptm; - - let mut mod_max_count = 0; - - if ptm.is_anywhere() { - // if PTM is anywhere - if ptm.is_static() { - // if PTM is static, it apply any time - mod_max_count = (remaining_mass / ptm.get_total_mono_mass()).floor() as i16; - } else { - // if PTM is not static, it is vairable. 
So it apply only if there is "space" for more variable modifications - mod_max_count = min( - (remaining_mass / ptm.get_total_mono_mass()).floor() as i16, - free_variable_modifications, - ); - } - } else if ptm.is_n_terminus() { - if ptm.is_static() { - // 0 if terminus is already in use, 1 if the terminus is free - // and the modification has to fit the mass - mod_max_count = min( - if is_n_terminus_used { 0 } else { 1 }, - (remaining_mass / ptm.get_total_mono_mass()).floor() as i16, - ); - } else { - // 0 if terminus is already in use or there is no more space for a variable modification left - // and the modification has to fit the remaining mass - mod_max_count = min( - if is_n_terminus_used || free_variable_modifications == 0 { - 0 - } else { - 1 - }, - (remaining_mass / ptm.get_total_mono_mass()).floor() as i16, - ); - } - } else if ptm.is_c_terminus() { - if ptm.is_static() { - // 0 if terminus is already in use, 1 if the terminus is free - // and the modification has to fit the mass - mod_max_count = min( - if is_c_terminus_used { 0 } else { 1 }, - (remaining_mass / ptm.get_total_mono_mass()).floor() as i16, - ); - } else { - // 0 if terminus is already in use or there is no more space for a variable modification left - // and the modification has to fit the remaining mass - mod_max_count = min( - if is_c_terminus_used || free_variable_modifications == 0 { - 0 - } else { - 1 - }, - (remaining_mass / ptm.get_total_mono_mass()).floor() as i16, - ); - } - } else if ptm.is_n_bond() { - if ptm.is_static() { - // 0 if bond is already in use, 1 if the bond is free - // and the modification has to fit the mass - mod_max_count = min( - if is_n_bond_used { 0 } else { 1 }, - (remaining_mass / ptm.get_total_mono_mass()).floor() as i16, - ); - } else { - // 0 if bond is already in use or there is no more space for a variable modification left - // and the modification has to fit the remaining mass - mod_max_count = min( - if is_n_bond_used || free_variable_modifications 
== 0 { - 0 - } else { - 1 - }, - (remaining_mass / ptm.get_total_mono_mass()).floor() as i16, - ); - } - } else if ptm.is_c_bond() { - if ptm.is_static() { - // 0 if bond is already in use, 1 if the bond is free - // and the modification has to fit the mass - mod_max_count = min( - if is_c_bond_used { 0 } else { 1 }, - (remaining_mass / ptm.get_total_mono_mass()).floor() as i16, - ); - } else { - // 0 if bond is already in use or there is no more space for a variable modification left - // and the modification has to fit the remaining mass - mod_max_count = min( - if is_c_bond_used || free_variable_modifications == 0 { - 0 - } else { - 1 - }, - (remaining_mass / ptm.get_total_mono_mass()).floor() as i16, - ); - } - } + ptms.extend(self.static_ptms.iter().map(|ptm| ptm_to_string(ptm))); - let mut is_mass_reached = false; + ptms.extend(self.variable_ptms.iter().map(|ptm| ptm_to_string(ptm))); - // Increase the counter for the current modification until maximum is reached - for count in 0..=mod_max_count { - // Reset all following modification counts to zero - #[allow(clippy::needless_range_loop)] - for i in (counter_idx + 1)..counters.len() { - counters[i].count = 0; - } + ptms.extend(self.n_terminal_ptms.iter().map(|ptm| ptm_to_string(ptm))); - // Calculate the remaining precursor mass for the following modifications - let next_remaining_mass = remaining_mass - (ptm.get_total_mono_mass() * count as f64); - - // Check if remaining precursor has space for more modifications - if next_remaining_mass > 0.0 { - // Assign the count for the current mod to the counter - counters[counter_idx].count = count; - - // calculate free_variable_modifications for next iteration - let next_free_variable_modifications = if ptm.is_static() { - free_variable_modifications - } else { - free_variable_modifications - count - }; - - // set next terminus/terminal residue used to last value - let mut next_is_n_terminus_used = is_n_terminus_used; - let mut next_is_c_terminus_used = 
is_c_terminus_used; - let mut next_is_n_bond_used = is_n_bond_used; - let mut next_is_c_bond_used = is_c_bond_used; - - // check if n-terminal is used - if ptm.is_n_terminus() && count > 0 { - next_is_n_terminus_used = true; - } else if ptm.is_c_terminus() && count > 0 { - // check if c-terminal is used - next_is_c_terminus_used = true; - } else if ptm.is_n_bond() && count > 0 { - // check if n-bond is used - next_is_n_bond_used = true; - } else if ptm.is_c_bond() && count > 0 { - // check if c-bond is used - next_is_c_bond_used = true; - } + ptms.extend(self.c_terminal_ptms.iter().map(|ptm| ptm_to_string(ptm))); - // start the next iteration - recursively_apply_ptms( - mass, - counter_idx + 1, - counters, - next_remaining_mass, - next_free_variable_modifications, - next_is_n_terminus_used, - next_is_c_terminus_used, - next_is_n_bond_used, - next_is_c_bond_used, - ptm_conditions, - )?; - } else { - is_mass_reached = true; - } + ptms.extend(self.n_terminal_ptms.iter().map(|ptm| ptm_to_string(ptm))); - // Add current counter state to matrix, if get current counter is last counter or precursor is reached - if counter_idx == counters.len() - 1 || is_mass_reached { - ptm_conditions.push(PTMCondition::new(mass, counters)?); - } + ptms.extend(self.c_terminal_ptms.iter().map(|ptm| ptm_to_string(ptm))); - // Stop iteration if precursor is reached - if is_mass_reached { - break; - } + write!( + f, + "PTMCollection (static: {}, variable: {}, n_terminal: {}, c_terminal: {}, n_bond: {}, c_bond: {}):\n\t{}", + self.static_ptms.len(), + self.variable_ptms.len(), + self.n_terminal_ptms.len(), + self.c_terminal_ptms.len(), + self.n_bond_ptms.len(), + self.c_bond_ptms.len(), + ptms.join("\n\t") + ) } +} - Ok(()) +/// Converts a PTM to a string representation. 
+/// +fn ptm_to_string(ptm: &PTM) -> String { + format!( + "{}, {}, {}, {}, {}", + ptm.get_name(), + ptm.get_amino_acid().get_code(), + ptm.get_mass_delta(), + ptm.get_mod_type(), + ptm.get_position() + ) } #[cfg(test)] mod test { // 3rd party imports use dihardts_omicstools::{ - chemistry::amino_acid::{CYSTEINE, METHIONINE}, - proteomics::peptide::Terminus, - proteomics::post_translational_modifications::{ - ModificationType, Position, PostTranslationalModification as PTM, + chemistry::amino_acid::{CYSTEINE, GLYCINE, METHIONINE}, + proteomics::{ + peptide::Terminus, + post_translational_modifications::{ + ModificationType, Position, PostTranslationalModification as PTM, + }, }, }; // internal imports use super::*; - use crate::mass::convert::to_int as mass_to_int; #[test] fn test_validate_ptm_vec() { @@ -501,13 +238,13 @@ mod test { ), PTM::new( "Imaginary", - &CYSTEINE, + &GLYCINE, 5.6, ModificationType::Static, Position::Terminus(Terminus::N), ), ]; - assert!(validate_ptm_vec(&valid_ptms).is_ok()); + assert!(PTMCollection::new(&valid_ptms).is_ok()); let invalid_ptms = vec![ PTM::new( @@ -532,37 +269,6 @@ mod test { Position::Terminus(Terminus::N), ), ]; - assert!(validate_ptm_vec(&invalid_ptms).is_err()); - } - - #[test] - fn test_get_ptm_conditions() { - let ptms = vec![ - PTM::new( - "Carbamidomethyl", - &CYSTEINE, - 57.021464, - ModificationType::Static, - Position::Anywhere, - ), - PTM::new( - "Oxidation", - &METHIONINE, - 15.994915, - ModificationType::Variable, - Position::Anywhere, - ), - PTM::new( - "Imaginary", - &CYSTEINE, - 5.6, - ModificationType::Static, - Position::Terminus(Terminus::N), - ), - ]; - - let ptm_conditions = get_ptm_conditions(mass_to_int(1000.0), 3, &ptms).unwrap(); - assert!(ptm_conditions.len() == 40); - // TODO: need to come up with serious tests + assert!(PTMCollection::new(&invalid_ptms).is_err()); } } diff --git a/src/web/peptide_controller.rs b/src/web/peptide_controller.rs index 7302050..8be8d72 100644 --- 
a/src/web/peptide_controller.rs +++ b/src/web/peptide_controller.rs @@ -25,6 +25,7 @@ use crate::database::scylla::peptide_table::PeptideTable; use crate::database::scylla::protein_table::ProteinTable; use crate::entities::peptide::TsvPeptide; use crate::entities::protein::Protein; +use crate::functions::post_translational_modification::PTMCollection; use crate::mass::convert::to_int as mass_to_int; use crate::tools::peptide_partitioner::get_mass_partition; use crate::web::app_state::AppState; @@ -244,6 +245,7 @@ pub struct SearchRequestBody { taxonomy_id: Option, proteome_id: Option, is_reviewed: Option, + resolve_modifications: Option, } /// Struct to deserialize the query parameters for peptide search @@ -270,7 +272,7 @@ pub struct SearchRequestQuery { /// * Method: `POST` /// * Headers: /// * `Content-Type`: `application/json` -/// * `Accept`: `application/json`, `text/tab-separated-values`, `text/plain` (optional, default: `application/json`, controls the output format) +/// * `Accept`: `application/json`, `text/tab-separated-values`, `text/plain`, `text/proforma` (optional, default: `application/json`, controls the output format) /// * Query: /// * `is_download`: `bool` (optional, default: `false`, if true set the Content-Disposition header to download the response instead of showing it in the browser) /// * Body: @@ -302,6 +304,8 @@ pub struct SearchRequestQuery { /// "proteome_id": "UP000000589", /// # Optional flag to search only reviewed proteins /// "is_reviewed": true +/// # Optional: If the PTMs in seqeunces should be resolved +/// "resolve_modifications": true /// } /// ``` /// Deserialized into [SearchRequestBody] @@ -315,7 +319,7 @@ pub struct SearchRequestQuery { /// ... /// ] /// ``` -/// Peptides are formatted as mentioned in the [`get_peptide`-endpoint](get_peptide). +/// Peptides are formatted as mentioned in the [`get_peptide`-endpoint](get_peptide) + attribute `additional_sequences` if `resolve_modifications` is true. 
/// /// ### `text/tsv` /// ```tsv @@ -329,6 +333,14 @@ pub struct SearchRequestQuery { /// sequence_1 /// sequence_2 /// ... +/// +/// ### `text/proforma` +/// Note: The output will only contain the mass shifts but not the modification ID. +/// +/// ```text +/// <57.021464@C>NCLETPSCKNGFLLDGFPR +/// <57.021464@C>NCLETPSCKNGFLLM[+15.994915]DGFPR +/// ... /// ``` /// pub async fn post_search( @@ -493,17 +505,29 @@ async fn search( let proteome_ids = payload.proteome_id.map(|proteome_id| vec![proteome_id]); + let ptm_collection = match PTMCollection::new(&payload.modifications) { + Ok(collection) => collection, + Err(err) => { + return Ok(( + StatusCode::UNPROCESSABLE_ENTITY, + HeaderMap::new(), + Body::from(format!("Error while validating PTMs: {:?}", err)), + )); + } + }; + let peptide_stream = match PeptideTable::search( app_state.get_db_client(), app_state.get_configuration(), mass_to_int(calculated_mass), payload.lower_mass_tolerance_ppm, payload.upper_mass_tolerance_ppm, - payload.max_variable_modifications, + payload.max_variable_modifications as usize, taxonomy_ids, proteome_ids, payload.is_reviewed, - &payload.modifications, + &ptm_collection, + payload.resolve_modifications.unwrap_or(false), ) .await { @@ -585,7 +609,15 @@ async fn search( yield Err(format!("!!! {:?}", err)); break; } - let peptide = TsvPeptide::from(peptide.unwrap()); + let peptide = match peptide { + Ok(peptide) => peptide, + Err(err) => { + error!("{:?}", err); + yield Err(format!("!!! {:?}", err)); + break; + } + }; + let peptide = TsvPeptide::from(peptide); let mut writer = csv::WriterBuilder::new().has_headers(has_headers).delimiter(b'\t').from_writer(vec![]); match writer.serialize(peptide) { Ok(_) => (), @@ -624,6 +656,31 @@ async fn search( yield Ok(delimiter); }), ), + "text/proforma" => ( + StatusCode::OK, + headers, + Body::from_stream(stream! 
{ + let mut delimiter = "".to_string(); + for await peptide in peptide_stream { + yield Ok(delimiter.to_owned()); + match peptide { + Ok(peptide) => { + if !peptide.get_additional_sequences().is_empty() { + yield Ok(peptide.get_additional_sequences().join("\n")); + } else { + yield Ok(peptide.get_sequence().to_owned()); + } + } + Err(err) => { + error!("{:?}", err); + yield Err(format!("!!! {:?}", err)); + break; + } + }; + delimiter = "\n".to_string(); + } + }), + ), _ => ( StatusCode::NOT_ACCEPTABLE, HeaderMap::new(), diff --git a/src/web/server.rs b/src/web/server.rs index de98789..19d7459 100644 --- a/src/web/server.rs +++ b/src/web/server.rs @@ -26,7 +26,7 @@ use crate::web::peptide_controller::{ }; use crate::web::protein_controller::{get_protein, search_protein}; use crate::web::taxonomy_controller::{get_sub_taxonomies, get_taxonomy, search_taxonomies}; -use crate::web::tools_controller::{digest, get_mass, get_proteases}; +use crate::web::tools_controller::{digest, get_mass, get_partition, get_proteases}; /// Starts the MaCPepDB web server on the given interface and port. 
/// @@ -103,6 +103,7 @@ pub async fn start( .route("/api/tools/digest", post(digest)) .route("/api/tools/mass/:sequence", get(get_mass)) .route("/api/tools/proteases", get(get_proteases)) + .route("/api/tools/partition", get(get_partition)) // taxonomy .route("/api/taxonomies/search", post(search_taxonomies)) .route("/api/taxonomies/:id/sub", get(get_sub_taxonomies)) diff --git a/src/web/tools_controller.rs b/src/web/tools_controller.rs index e9abbab..6672e0d 100644 --- a/src/web/tools_controller.rs +++ b/src/web/tools_controller.rs @@ -3,23 +3,26 @@ use std::collections::HashSet; use std::sync::Arc; // 3rd party imports -use axum::extract::{Path, State}; +use axum::extract::{Path, Query, State}; use axum::Json; use dihardts_omicstools::proteomics::proteases::functions::{ get_by_name as get_protease_by_name, ALL as AVAILABLE_PROTEASES, }; use fallible_iterator::FallibleIterator; use futures::TryStreamExt; +use http::StatusCode; +use rustyms::CompoundPeptidoformIon; use scylla::value::CqlValue; use serde::Deserialize; use serde_json::{json, Value as JsonValue}; // internal imports -use crate::chemistry::amino_acid::calc_sequence_mass_int; use crate::database::scylla::peptide_table::PeptideTable; use crate::entities::peptide::Peptide; use crate::mass::convert::to_float as mass_to_float; +use crate::mass::convert::to_int as mass_to_int; use crate::tools::omicstools::convert_to_internal_dummy_peptide; +use crate::tools::peptide_partitioner::get_mass_partition; use crate::web::web_error::WebError; use super::app_state::AppState; @@ -188,10 +191,13 @@ pub async fn digest( } } -/// Calculates the mass of the given sequence +/// Calculates the mass of the given sequence. +/// Thanks to rustyms this is ProForma 2.1 compliant, but only the minimum mass is returned: +/// if the sequence produces a mass range, the maximum value is not returned. 
/// /// # Arguments -/// * `sequence` - The sequence to calculate the mass for, extracted from URL path +/// * `sequence` - ProForma sequence of the peptide, e.g. `<[+57.021464]@C>MFCQLAKTCPVQLWVDSTPPPGTRVR` +/// (URL-encoded: `%3C%5B%2B57.021464%5D%40C%3EMFCQLAKTCPVQLWVDSTPPPGTRVR`) /// /// # API /// ## Request @@ -201,15 +207,26 @@ pub async fn digest( /// ## Response /// ```json /// { -/// "mass": 2006.981002959 +/// "mass": 3043.519423982504 /// } /// ``` /// pub async fn get_mass(Path(sequence): Path) -> Result, WebError> { - let mass = calc_sequence_mass_int(&sequence)?; + let peptide = CompoundPeptidoformIon::pro_forma(&sequence, None) + .map_err(|err| WebError::new(StatusCode::UNPROCESSABLE_ENTITY, format!("{}", err)))?; + + let mass = match peptide.formulas().mass_bounds().into_option() { + Some((min, _)) => min.monoisotopic_mass().value, + None => { + return Err(WebError::new( + StatusCode::UNPROCESSABLE_ENTITY, + "Could not calculate mass for the given sequence".to_string(), + )); + } + }; Ok(Json(json!({ - "mass": mass_to_float(mass), + "mass": mass, }))) } @@ -236,3 +253,41 @@ pub async fn get_proteases() -> Result, WebError> { protease_names.sort(); Ok(Json(json!(protease_names))) } + +/// Query parameters for the partition endpoint +/// +#[derive(Deserialize)] +pub struct GetPartitionQuery { + /// Mass to get the partition for + mass: f64, +} + +/// Returns the mass partition the given mass falls into +/// +/// # API +/// ## Request +/// * Path: `/api/tools/partition?mass=2929.4765` +/// * Method: `GET` +/// +/// ## Response +/// Index of the partition containing the mass and that partition's mass limit +/// ```json +/// { +/// "partition": 2, +/// "partition_limit": 3000.0 +/// } +/// ``` +/// +/// +pub async fn get_partition( + State(app_state): State>, + Query(query_payload): Query, +) -> Result, WebError> { + let partition_limits = app_state.get_configuration_as_ref().get_partition_limits(); + let partition_index = get_mass_partition(partition_limits, 
mass_to_int(query_payload.mass))?; + + Ok(Json(json!({ 
+ "partition": partition_index, + "partition_limit": mass_to_float(partition_limits[partition_index]), + }))) +} diff --git a/test_files/finalized_peptide_condition.txt b/test_files/finalized_peptide_condition.txt new file mode 100644 index 0000000..d80fd4c --- /dev/null +++ b/test_files/finalized_peptide_condition.txt @@ -0,0 +1,108 @@ +FinalizedPeptideCondition: occurences of 'C' == 0 @ 839.403366202 +FinalizedPeptideCondition: occurences of 'C' == 1 @ 782.381902202 +FinalizedPeptideCondition: occurences of 'C' == 2 @ 725.360438202 +FinalizedPeptideCondition: occurences of 'C' == 3 @ 668.338974202 +FinalizedPeptideCondition: occurences of 'C' == 4 @ 611.317510202 +FinalizedPeptideCondition: occurences of 'C' == 5 @ 554.296046202 +FinalizedPeptideCondition: occurences of 'C' == 6 @ 497.274582202 +FinalizedPeptideCondition: occurences of 'C' == 7 @ 440.253118202 +FinalizedPeptideCondition: occurences of 'C' == 8 @ 383.231654202 +FinalizedPeptideCondition: occurences of 'C' == 0 && occurences of 'M' >= 1 @ 823.408456202 +FinalizedPeptideCondition: occurences of 'C' == 0 && occurences of 'M' >= 2 @ 807.413546202 +FinalizedPeptideCondition: occurences of 'C' == 1 && occurences of 'M' >= 1 @ 766.386992202 +FinalizedPeptideCondition: occurences of 'C' == 1 && occurences of 'M' >= 2 @ 750.392082202 +FinalizedPeptideCondition: occurences of 'C' == 2 && occurences of 'M' >= 1 @ 709.365528202 +FinalizedPeptideCondition: occurences of 'C' == 2 && occurences of 'M' >= 2 @ 693.370618202 +FinalizedPeptideCondition: occurences of 'C' == 3 && occurences of 'M' >= 1 @ 652.344064202 +FinalizedPeptideCondition: occurences of 'C' == 3 && occurences of 'M' >= 2 @ 636.349154202 +FinalizedPeptideCondition: occurences of 'C' == 4 && occurences of 'M' >= 1 @ 595.322600202 +FinalizedPeptideCondition: occurences of 'C' == 4 && occurences of 'M' >= 2 @ 579.327690202 +FinalizedPeptideCondition: occurences of 'C' == 5 && occurences of 'M' >= 1 @ 538.301136202 +FinalizedPeptideCondition: 
occurences of 'C' == 5 && occurences of 'M' >= 2 @ 522.306226202 +FinalizedPeptideCondition: occurences of 'C' == 6 && occurences of 'M' >= 1 @ 481.279672202 +FinalizedPeptideCondition: occurences of 'C' == 6 && occurences of 'M' >= 2 @ 465.284762202 +FinalizedPeptideCondition: occurences of 'C' == 7 && occurences of 'M' >= 1 @ 424.258208202 +FinalizedPeptideCondition: occurences of 'C' == 7 && occurences of 'M' >= 2 @ 408.263298202 +FinalizedPeptideCondition: occurences of 'C' == 8 && occurences of 'M' >= 1 @ 367.236744202 +FinalizedPeptideCondition: occurences of 'C' == 8 && occurences of 'M' >= 2 @ 351.241834202 +FinalizedPeptideCondition: occurences of 'C' == 0 && starts with 'M' && occurences of 'M' >= 1 @ 822.408456202 +FinalizedPeptideCondition: occurences of 'C' == 1 && starts with 'M' && occurences of 'M' >= 1 @ 765.386992202 +FinalizedPeptideCondition: occurences of 'C' == 2 && starts with 'M' && occurences of 'M' >= 1 @ 708.365528202 +FinalizedPeptideCondition: occurences of 'C' == 3 && starts with 'M' && occurences of 'M' >= 1 @ 651.344064202 +FinalizedPeptideCondition: occurences of 'C' == 4 && starts with 'M' && occurences of 'M' >= 1 @ 594.322600202 +FinalizedPeptideCondition: occurences of 'C' == 5 && starts with 'M' && occurences of 'M' >= 1 @ 537.301136202 +FinalizedPeptideCondition: occurences of 'C' == 6 && starts with 'M' && occurences of 'M' >= 1 @ 480.279672202 +FinalizedPeptideCondition: occurences of 'C' == 7 && starts with 'M' && occurences of 'M' >= 1 @ 423.258208202 +FinalizedPeptideCondition: occurences of 'C' == 8 && starts with 'M' && occurences of 'M' >= 1 @ 366.236744202 +FinalizedPeptideCondition: occurences of 'C' == 0 && starts with 'M' && occurences of 'M' >= 2 @ 806.413546202 +FinalizedPeptideCondition: occurences of 'C' == 0 && starts with 'M' && occurences of 'M' >= 3 @ 790.418636202 +FinalizedPeptideCondition: occurences of 'C' == 1 && starts with 'M' && occurences of 'M' >= 2 @ 749.392082202 +FinalizedPeptideCondition: 
occurences of 'C' == 1 && starts with 'M' && occurences of 'M' >= 3 @ 733.397172202 +FinalizedPeptideCondition: occurences of 'C' == 2 && starts with 'M' && occurences of 'M' >= 2 @ 692.370618202 +FinalizedPeptideCondition: occurences of 'C' == 2 && starts with 'M' && occurences of 'M' >= 3 @ 676.375708202 +FinalizedPeptideCondition: occurences of 'C' == 3 && starts with 'M' && occurences of 'M' >= 2 @ 635.349154202 +FinalizedPeptideCondition: occurences of 'C' == 3 && starts with 'M' && occurences of 'M' >= 3 @ 619.354244202 +FinalizedPeptideCondition: occurences of 'C' == 4 && starts with 'M' && occurences of 'M' >= 2 @ 578.327690202 +FinalizedPeptideCondition: occurences of 'C' == 4 && starts with 'M' && occurences of 'M' >= 3 @ 562.332780202 +FinalizedPeptideCondition: occurences of 'C' == 5 && starts with 'M' && occurences of 'M' >= 2 @ 521.306226202 +FinalizedPeptideCondition: occurences of 'C' == 5 && starts with 'M' && occurences of 'M' >= 3 @ 505.311316202 +FinalizedPeptideCondition: occurences of 'C' == 6 && starts with 'M' && occurences of 'M' >= 2 @ 464.284762202 +FinalizedPeptideCondition: occurences of 'C' == 6 && starts with 'M' && occurences of 'M' >= 3 @ 448.289852202 +FinalizedPeptideCondition: occurences of 'C' == 7 && starts with 'M' && occurences of 'M' >= 2 @ 407.263298202 +FinalizedPeptideCondition: occurences of 'C' == 7 && starts with 'M' && occurences of 'M' >= 3 @ 391.268388202 +FinalizedPeptideCondition: occurences of 'C' == 8 && starts with 'M' && occurences of 'M' >= 2 @ 350.241834202 +FinalizedPeptideCondition: occurences of 'C' == 8 && starts with 'M' && occurences of 'M' >= 3 @ 334.246924202 +FinalizedPeptideCondition: occurences of 'C' == 0 && ends with 'K' && occurences of 'K' >= 1 @ 819.103366202 +FinalizedPeptideCondition: occurences of 'C' == 1 && ends with 'K' && occurences of 'K' >= 1 @ 762.081902202 +FinalizedPeptideCondition: occurences of 'C' == 2 && ends with 'K' && occurences of 'K' >= 1 @ 705.060438202 
+FinalizedPeptideCondition: occurences of 'C' == 3 && ends with 'K' && occurences of 'K' >= 1 @ 648.038974202 +FinalizedPeptideCondition: occurences of 'C' == 4 && ends with 'K' && occurences of 'K' >= 1 @ 591.017510202 +FinalizedPeptideCondition: occurences of 'C' == 5 && ends with 'K' && occurences of 'K' >= 1 @ 533.996046202 +FinalizedPeptideCondition: occurences of 'C' == 6 && ends with 'K' && occurences of 'K' >= 1 @ 476.974582202 +FinalizedPeptideCondition: occurences of 'C' == 7 && ends with 'K' && occurences of 'K' >= 1 @ 419.953118202 +FinalizedPeptideCondition: occurences of 'C' == 8 && ends with 'K' && occurences of 'K' >= 1 @ 362.931654202 +FinalizedPeptideCondition: occurences of 'C' == 0 && ends with 'K' && occurences of 'K' >= 1 && occurences of 'M' >= 1 @ 803.108456202 +FinalizedPeptideCondition: occurences of 'C' == 0 && ends with 'K' && occurences of 'K' >= 1 && occurences of 'M' >= 2 @ 787.113546202 +FinalizedPeptideCondition: occurences of 'C' == 1 && ends with 'K' && occurences of 'K' >= 1 && occurences of 'M' >= 1 @ 746.086992202 +FinalizedPeptideCondition: occurences of 'C' == 1 && ends with 'K' && occurences of 'K' >= 1 && occurences of 'M' >= 2 @ 730.092082202 +FinalizedPeptideCondition: occurences of 'C' == 2 && ends with 'K' && occurences of 'K' >= 1 && occurences of 'M' >= 1 @ 689.065528202 +FinalizedPeptideCondition: occurences of 'C' == 2 && ends with 'K' && occurences of 'K' >= 1 && occurences of 'M' >= 2 @ 673.070618202 +FinalizedPeptideCondition: occurences of 'C' == 3 && ends with 'K' && occurences of 'K' >= 1 && occurences of 'M' >= 1 @ 632.044064202 +FinalizedPeptideCondition: occurences of 'C' == 3 && ends with 'K' && occurences of 'K' >= 1 && occurences of 'M' >= 2 @ 616.049154202 +FinalizedPeptideCondition: occurences of 'C' == 4 && ends with 'K' && occurences of 'K' >= 1 && occurences of 'M' >= 1 @ 575.022600202 +FinalizedPeptideCondition: occurences of 'C' == 4 && ends with 'K' && occurences of 'K' >= 1 && occurences of 'M' 
>= 2 @ 559.027690202 +FinalizedPeptideCondition: occurences of 'C' == 5 && ends with 'K' && occurences of 'K' >= 1 && occurences of 'M' >= 1 @ 518.001136202 +FinalizedPeptideCondition: occurences of 'C' == 5 && ends with 'K' && occurences of 'K' >= 1 && occurences of 'M' >= 2 @ 502.006226202 +FinalizedPeptideCondition: occurences of 'C' == 6 && ends with 'K' && occurences of 'K' >= 1 && occurences of 'M' >= 1 @ 460.979672202 +FinalizedPeptideCondition: occurences of 'C' == 6 && ends with 'K' && occurences of 'K' >= 1 && occurences of 'M' >= 2 @ 444.984762202 +FinalizedPeptideCondition: occurences of 'C' == 7 && ends with 'K' && occurences of 'K' >= 1 && occurences of 'M' >= 1 @ 403.958208202 +FinalizedPeptideCondition: occurences of 'C' == 7 && ends with 'K' && occurences of 'K' >= 1 && occurences of 'M' >= 2 @ 387.963298202 +FinalizedPeptideCondition: occurences of 'C' == 8 && ends with 'K' && occurences of 'K' >= 1 && occurences of 'M' >= 1 @ 346.936744202 +FinalizedPeptideCondition: occurences of 'C' == 8 && ends with 'K' && occurences of 'K' >= 1 && occurences of 'M' >= 2 @ 330.941834202 +FinalizedPeptideCondition: occurences of 'C' == 0 && starts with 'M' && ends with 'K' && occurences of 'K' >= 1 && occurences of 'M' >= 1 @ 802.108456202 +FinalizedPeptideCondition: occurences of 'C' == 1 && starts with 'M' && ends with 'K' && occurences of 'K' >= 1 && occurences of 'M' >= 1 @ 745.086992202 +FinalizedPeptideCondition: occurences of 'C' == 2 && starts with 'M' && ends with 'K' && occurences of 'K' >= 1 && occurences of 'M' >= 1 @ 688.065528202 +FinalizedPeptideCondition: occurences of 'C' == 3 && starts with 'M' && ends with 'K' && occurences of 'K' >= 1 && occurences of 'M' >= 1 @ 631.044064202 +FinalizedPeptideCondition: occurences of 'C' == 4 && starts with 'M' && ends with 'K' && occurences of 'K' >= 1 && occurences of 'M' >= 1 @ 574.022600202 +FinalizedPeptideCondition: occurences of 'C' == 5 && starts with 'M' && ends with 'K' && occurences of 'K' >= 1 && 
occurences of 'M' >= 1 @ 517.001136202 +FinalizedPeptideCondition: occurences of 'C' == 6 && starts with 'M' && ends with 'K' && occurences of 'K' >= 1 && occurences of 'M' >= 1 @ 459.979672202 +FinalizedPeptideCondition: occurences of 'C' == 7 && starts with 'M' && ends with 'K' && occurences of 'K' >= 1 && occurences of 'M' >= 1 @ 402.958208202 +FinalizedPeptideCondition: occurences of 'C' == 8 && starts with 'M' && ends with 'K' && occurences of 'K' >= 1 && occurences of 'M' >= 1 @ 345.936744202 +FinalizedPeptideCondition: occurences of 'C' == 0 && starts with 'M' && ends with 'K' && occurences of 'K' >= 1 && occurences of 'M' >= 2 @ 786.113546202 +FinalizedPeptideCondition: occurences of 'C' == 0 && starts with 'M' && ends with 'K' && occurences of 'K' >= 1 && occurences of 'M' >= 3 @ 770.118636202 +FinalizedPeptideCondition: occurences of 'C' == 1 && starts with 'M' && ends with 'K' && occurences of 'K' >= 1 && occurences of 'M' >= 2 @ 729.092082202 +FinalizedPeptideCondition: occurences of 'C' == 1 && starts with 'M' && ends with 'K' && occurences of 'K' >= 1 && occurences of 'M' >= 3 @ 713.097172202 +FinalizedPeptideCondition: occurences of 'C' == 2 && starts with 'M' && ends with 'K' && occurences of 'K' >= 1 && occurences of 'M' >= 2 @ 672.070618202 +FinalizedPeptideCondition: occurences of 'C' == 2 && starts with 'M' && ends with 'K' && occurences of 'K' >= 1 && occurences of 'M' >= 3 @ 656.075708202 +FinalizedPeptideCondition: occurences of 'C' == 3 && starts with 'M' && ends with 'K' && occurences of 'K' >= 1 && occurences of 'M' >= 2 @ 615.049154202 +FinalizedPeptideCondition: occurences of 'C' == 3 && starts with 'M' && ends with 'K' && occurences of 'K' >= 1 && occurences of 'M' >= 3 @ 599.054244202 +FinalizedPeptideCondition: occurences of 'C' == 4 && starts with 'M' && ends with 'K' && occurences of 'K' >= 1 && occurences of 'M' >= 2 @ 558.027690202 +FinalizedPeptideCondition: occurences of 'C' == 4 && starts with 'M' && ends with 'K' && occurences 
of 'K' >= 1 && occurences of 'M' >= 3 @ 542.032780202 +FinalizedPeptideCondition: occurences of 'C' == 5 && starts with 'M' && ends with 'K' && occurences of 'K' >= 1 && occurences of 'M' >= 2 @ 501.006226202 +FinalizedPeptideCondition: occurences of 'C' == 5 && starts with 'M' && ends with 'K' && occurences of 'K' >= 1 && occurences of 'M' >= 3 @ 485.011316202 +FinalizedPeptideCondition: occurences of 'C' == 6 && starts with 'M' && ends with 'K' && occurences of 'K' >= 1 && occurences of 'M' >= 2 @ 443.984762202 +FinalizedPeptideCondition: occurences of 'C' == 6 && starts with 'M' && ends with 'K' && occurences of 'K' >= 1 && occurences of 'M' >= 3 @ 427.989852202 +FinalizedPeptideCondition: occurences of 'C' == 7 && starts with 'M' && ends with 'K' && occurences of 'K' >= 1 && occurences of 'M' >= 2 @ 386.963298202 +FinalizedPeptideCondition: occurences of 'C' == 7 && starts with 'M' && ends with 'K' && occurences of 'K' >= 1 && occurences of 'M' >= 3 @ 370.968388202 +FinalizedPeptideCondition: occurences of 'C' == 8 && starts with 'M' && ends with 'K' && occurences of 'K' >= 1 && occurences of 'M' >= 2 @ 329.941834202 +FinalizedPeptideCondition: occurences of 'C' == 8 && starts with 'M' && ends with 'K' && occurences of 'K' >= 1 && occurences of 'M' >= 3 @ 313.946924202 \ No newline at end of file