From 48e9a4ad41815e670d2e944286124bcd06ff6891 Mon Sep 17 00:00:00 2001 From: Propfend Date: Wed, 25 Feb 2026 18:50:24 +0100 Subject: [PATCH 1/2] Add embedding file to the appdir. Load the embeddings from disk. Add mcp `semantic_search` tool to get text from user, convert into embedding and give response. --- Cargo.lock | 697 +++++++++++++++++- Cargo.toml | 3 + Makefile | 3 + poet/Cargo.toml | 3 + poet/src/cmd/make/app_dir.rs | 79 ++ poet/src/cmd/make/embeddings.rs | 114 +++ poet/src/cmd/make/mod.rs | 1 + poet/src/cmd/mod.rs | 1 + poet/src/cmd/serve/mod.rs | 27 + poet/src/generate_embedding/mod.rs | 1 + .../paddler_embedding_client.rs | 76 ++ poet/src/lib.rs | 4 + poet/src/main.rs | 4 + poet/src/mcp/jsonrpc/request/initialize.rs | 2 + poet/src/semantic_search_index.rs | 108 +++ poet/src/semantic_search_index_holder.rs | 25 + poet/src/semantic_search_tool.rs | 117 +++ 17 files changed, 1252 insertions(+), 13 deletions(-) create mode 100644 poet/src/cmd/make/embeddings.rs create mode 100644 poet/src/generate_embedding/mod.rs create mode 100644 poet/src/generate_embedding/paddler_embedding_client.rs create mode 100644 poet/src/semantic_search_index.rs create mode 100644 poet/src/semantic_search_index_holder.rs create mode 100644 poet/src/semantic_search_tool.rs diff --git a/Cargo.lock b/Cargo.lock index 9e4d024..c4de16e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -12,7 +12,7 @@ dependencies = [ "actix-rt", "actix_derive", "bitflags 2.10.0", - "bytes", + "bytes 1.11.0", "crossbeam-channel", "futures-core", "futures-sink", @@ -34,7 +34,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5f7b0a21988c1bf877cf4759ef5ddaac04c1c9fe808c9142ecb78ba97d97a28a" dependencies = [ "bitflags 2.10.0", - "bytes", + "bytes 1.11.0", "futures-core", "futures-sink", "memchr", @@ -70,7 +70,7 @@ dependencies = [ "actix-utils", "actix-web", "bitflags 2.10.0", - "bytes", + "bytes 1.11.0", "derive_more", "futures-core", "http-range", @@ -95,14 +95,14 @@ dependencies 
= [ "base64", "bitflags 2.10.0", "brotli", - "bytes", + "bytes 1.11.0", "bytestring", "derive_more", "encoding_rs", "flate2", "foldhash", "futures-core", - "h2", + "h2 0.3.27", "http 0.2.12", "httparse", "httpdate", @@ -209,7 +209,7 @@ dependencies = [ "actix-service", "actix-utils", "actix-web-codegen", - "bytes", + "bytes 1.11.0", "bytestring", "cfg-if", "cookie", @@ -263,7 +263,7 @@ dependencies = [ "actix-web-lab-derive", "ahash", "arc-swap", - "bytes", + "bytes 1.11.0", "bytestring", "csv", "derive_more", @@ -542,6 +542,12 @@ dependencies = [ "syn", ] +[[package]] +name = "atomic-waker" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" + [[package]] name = "autocfg" version = "1.5.0" @@ -697,6 +703,16 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" +[[package]] +name = "bytes" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "206fdffcfa2df7cbe15601ef46c813fce0965eb3286db6b56c583b814b51c81c" +dependencies = [ + "byteorder", + "iovec", +] + [[package]] name = "bytes" version = "1.11.0" @@ -709,7 +725,7 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "113b4343b5f6617e7ad401ced8de3cc8b012e73a594347c307b90db3e9271289" dependencies = [ - "bytes", + "bytes 1.11.0", ] [[package]] @@ -839,6 +855,26 @@ dependencies = [ "version_check", ] +[[package]] +name = "core-foundation" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "core-foundation" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"b2a6cd9ae233e7f62ba4e9353e81a88df7fc8a5987b8d445b4d90c879bd156f6" +dependencies = [ + "core-foundation-sys", + "libc", +] + [[package]] name = "core-foundation-sys" version = "0.8.7" @@ -994,6 +1030,12 @@ dependencies = [ "parking_lot_core", ] +[[package]] +name = "data-encoding" +version = "2.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7a1e2f27636f116493b8b860f5546edb47c8d8f8ea73e1d2a20be88e28d1fea" + [[package]] name = "deranged" version = "0.5.5" @@ -1202,6 +1244,21 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" +[[package]] +name = "foreign-types" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" +dependencies = [ + "foreign-types-shared", +] + +[[package]] +name = "foreign-types-shared" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" + [[package]] name = "form_urlencoded" version = "1.2.2" @@ -1240,12 +1297,33 @@ dependencies = [ "libc", ] +[[package]] +name = "futures" +version = "0.1.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a471a38ef8ed83cd6e40aa59c1ffe17db6855c18e3604d9c4ed8c08ebc28678" + +[[package]] +name = "futures-channel" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10" +dependencies = [ + "futures-core", +] + [[package]] name = "futures-core" version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e" +[[package]] +name = "futures-io" +version = "0.3.32" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "cecba35d7ad927e23624b22ad55235f2239cfa44fd10428eecbeba6d6a717718" + [[package]] name = "futures-macro" version = "0.3.31" @@ -1276,11 +1354,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81" dependencies = [ "futures-core", + "futures-io", "futures-macro", + "futures-sink", "futures-task", + "memchr", "pin-project-lite", "pin-utils", "slab", + "tokio-io", ] [[package]] @@ -1328,7 +1410,7 @@ version = "0.3.27" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0beca50380b1fc32983fc1cb4587bfa4bb9e78fc259aad4a0032d2080309222d" dependencies = [ - "bytes", + "bytes 1.11.0", "fnv", "futures-core", "futures-sink", @@ -1341,6 +1423,25 @@ dependencies = [ "tracing", ] +[[package]] +name = "h2" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f44da3a8150a6703ed5d34e164b875fd14c2cdab9af1252a9a1020bde2bdc54" +dependencies = [ + "atomic-waker", + "bytes 1.11.0", + "fnv", + "futures-core", + "futures-sink", + "http 1.4.0", + "indexmap", + "slab", + "tokio", + "tokio-util", + "tracing", +] + [[package]] name = "hashbrown" version = "0.14.5" @@ -1382,7 +1483,7 @@ version = "0.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "601cbb57e577e2f5ef5be8e7b83f0f63994f25aa94d673e54a92d5c516d101f1" dependencies = [ - "bytes", + "bytes 1.11.0", "fnv", "itoa", ] @@ -1393,10 +1494,33 @@ version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3ba2a386d7f85a81f119ad7498ebe444d2e22c2af0b86b069416ace48b3311a" dependencies = [ - "bytes", + "bytes 1.11.0", "itoa", ] +[[package]] +name = "http-body" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184" +dependencies = [ + "bytes 
1.11.0", + "http 1.4.0", +] + +[[package]] +name = "http-body-util" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b021d93e26becf5dc7e1b75b1bed1fd93124b374ceb73f43d4d4eafec896a64a" +dependencies = [ + "bytes 1.11.0", + "futures-core", + "http 1.4.0", + "http-body", + "pin-project-lite", +] + [[package]] name = "http-range" version = "0.1.5" @@ -1425,6 +1549,85 @@ version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" +[[package]] +name = "hyper" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2ab2d4f250c3d7b1c9fcdff1cece94ea4e2dfbec68614f7b87cb205f24ca9d11" +dependencies = [ + "atomic-waker", + "bytes 1.11.0", + "futures-channel", + "futures-core", + "h2 0.4.13", + "http 1.4.0", + "http-body", + "httparse", + "itoa", + "pin-project-lite", + "pin-utils", + "smallvec", + "tokio", + "want", +] + +[[package]] +name = "hyper-rustls" +version = "0.27.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3c93eb611681b207e1fe55d5a71ecf91572ec8a6705cdb6857f7d8d5242cf58" +dependencies = [ + "http 1.4.0", + "hyper", + "hyper-util", + "rustls", + "rustls-pki-types", + "tokio", + "tokio-rustls", + "tower-service", +] + +[[package]] +name = "hyper-tls" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70206fc6890eaca9fde8a0bf71caa2ddfc9fe045ac9e5c70df101a7dbde866e0" +dependencies = [ + "bytes 1.11.0", + "http-body-util", + "hyper", + "hyper-util", + "native-tls", + "tokio", + "tokio-native-tls", + "tower-service", +] + +[[package]] +name = "hyper-util" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96547c2556ec9d12fb1578c4eaf448b04993e7fb79cbaad930a656880a6bdfa0" +dependencies = [ + "base64", + "bytes 1.11.0", + "futures-channel", + "futures-util", 
+ "http 1.4.0", + "http-body", + "hyper", + "ipnet", + "libc", + "percent-encoding", + "pin-project-lite", + "socket2 0.6.1", + "system-configuration", + "tokio", + "tower-service", + "tracing", + "windows-registry", +] + [[package]] name = "hyperloglogplus" version = "0.4.1" @@ -1620,6 +1823,31 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "iovec" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2b3ea6ff95e175473f8ffe6a7eb7c00d054240321b84c57051175fe3c1e075e" +dependencies = [ + "libc", +] + +[[package]] +name = "ipnet" +version = "2.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "469fb0b9cefa57e3ef31275ee7cacb78f2fdca44e4765491884a2b119d4eb130" + +[[package]] +name = "iri-string" +version = "0.7.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c91338f0783edbd6195decb37bae672fd3b165faffb89bf7b9e6942f8b1a731a" +dependencies = [ + "memchr", + "serde", +] + [[package]] name = "is_terminal_polyfill" version = "1.70.2" @@ -1886,6 +2114,23 @@ dependencies = [ "rand 0.8.5", ] +[[package]] +name = "native-tls" +version = "0.2.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "465500e14ea162429d264d44189adc38b199b62b1c21eea9f69e4b73cb03bbf2" +dependencies = [ + "libc", + "log", + "openssl", + "openssl-probe", + "openssl-sys", + "schannel", + "security-framework", + "security-framework-sys", + "tempfile", +] + [[package]] name = "nix" version = "0.30.1" @@ -2046,6 +2291,50 @@ dependencies = [ "pkg-config", ] +[[package]] +name = "openssl" +version = "0.10.75" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08838db121398ad17ab8531ce9de97b244589089e290a384c900cb9ff7434328" +dependencies = [ + "bitflags 2.10.0", + "cfg-if", + "foreign-types", + "libc", + "once_cell", + "openssl-macros", + "openssl-sys", +] + +[[package]] +name = "openssl-macros" +version = "0.1.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "openssl-probe" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c87def4c32ab89d880effc9e097653c8da5d6ef28e6b539d313baaacfbafcbe" + +[[package]] +name = "openssl-sys" +version = "0.9.111" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "82cab2d520aa75e3c58898289429321eb788c3106963d0dc886ec7a5f4adc321" +dependencies = [ + "cc", + "libc", + "pkg-config", + "vcpkg", +] + [[package]] name = "ownedbytes" version = "0.9.0" @@ -2055,6 +2344,35 @@ dependencies = [ "stable_deref_trait", ] +[[package]] +name = "paddler_client" +version = "2.2.0" +dependencies = [ + "anyhow", + "dashmap", + "futures-util", + "log", + "nanoid", + "paddler_types", + "reqwest", + "serde", + "serde_json", + "thiserror", + "tokio", + "tokio-stream", + "tokio-tungstenite", + "url", +] + +[[package]] +name = "paddler_types" +version = "2.2.0" +dependencies = [ + "anyhow", + "serde", + "serde_json", +] + [[package]] name = "parking_lot" version = "0.12.5" @@ -2143,6 +2461,7 @@ dependencies = [ "async-stream", "async-trait", "base64", + "bincode", "blake3", "chrono", "clap", @@ -2164,6 +2483,8 @@ dependencies = [ "nom 8.0.0", "notify", "notify-debouncer-full", + "paddler_client", + "paddler_types", "petgraph", "rayon", "rhai", @@ -2419,6 +2740,49 @@ version = "0.8.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58" +[[package]] +name = "reqwest" +version = "0.12.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eddd3ca559203180a307f12d114c268abf583f59b03cb906fd0b3ff8646c1147" +dependencies = [ + "base64", + "bytes 1.11.0", + "encoding_rs", + "futures-core", + "futures-util", + "h2 
0.4.13", + "http 1.4.0", + "http-body", + "http-body-util", + "hyper", + "hyper-rustls", + "hyper-tls", + "hyper-util", + "js-sys", + "log", + "mime", + "native-tls", + "percent-encoding", + "pin-project-lite", + "rustls-pki-types", + "serde", + "serde_json", + "serde_urlencoded", + "sync_wrapper", + "tokio", + "tokio-native-tls", + "tokio-util", + "tower", + "tower-http", + "tower-service", + "url", + "wasm-bindgen", + "wasm-bindgen-futures", + "wasm-streams", + "web-sys", +] + [[package]] name = "rhai" version = "1.23.6" @@ -2461,6 +2825,20 @@ dependencies = [ "tokio", ] +[[package]] +name = "ring" +version = "0.17.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7" +dependencies = [ + "cc", + "cfg-if", + "getrandom 0.2.16", + "libc", + "untrusted", + "windows-sys 0.52.0", +] + [[package]] name = "rust-stemmers" version = "1.2.0" @@ -2496,6 +2874,39 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "rustls" +version = "0.23.36" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c665f33d38cea657d9614f766881e4d510e0eda4239891eea56b4cadcf01801b" +dependencies = [ + "once_cell", + "rustls-pki-types", + "rustls-webpki", + "subtle", + "zeroize", +] + +[[package]] +name = "rustls-pki-types" +version = "1.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be040f8b0a225e40375822a563fa9524378b9d63112f53e19ffff34df5d33fdd" +dependencies = [ + "zeroize", +] + +[[package]] +name = "rustls-webpki" +version = "0.103.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7df23109aa6c1567d1c575b9952556388da57401e4ace1d15f79eedad0d8f53" +dependencies = [ + "ring", + "rustls-pki-types", + "untrusted", +] + [[package]] name = "rustversion" version = "1.0.22" @@ -2517,6 +2928,15 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "schannel" +version = "0.1.28" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "891d81b926048e76efe18581bf793546b4c0eaf8448d72be8de2bbee5fd166e1" +dependencies = [ + "windows-sys 0.61.2", +] + [[package]] name = "schemars" version = "1.1.0" @@ -2548,6 +2968,29 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" +[[package]] +name = "security-framework" +version = "3.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d17b898a6d6948c3a8ee4372c17cb384f90d2e6e912ef00895b14fd7ab54ec38" +dependencies = [ + "bitflags 2.10.0", + "core-foundation 0.10.1", + "core-foundation-sys", + "libc", + "security-framework-sys", +] + +[[package]] +name = "security-framework-sys" +version = "2.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ce2691df843ecc5d231c0b14ece2acc3efb62c0a398c7e1d875f3983ce020e3" +dependencies = [ + "core-foundation-sys", + "libc", +] + [[package]] name = "serde" version = "1.0.228" @@ -2779,6 +3222,12 @@ version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" +[[package]] +name = "subtle" +version = "2.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" + [[package]] name = "syn" version = "2.0.111" @@ -2790,6 +3239,15 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "sync_wrapper" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bf256ce5efdfa370213c1dabab5935a12e49f2c58d15e9eac2870d3b4f27263" +dependencies = [ + "futures-core", +] + [[package]] name = "synstructure" version = "0.13.2" @@ -2822,6 +3280,27 @@ dependencies = [ "yaml-rust", ] +[[package]] +name = "system-configuration" +version = "0.7.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "a13f3d0daba03132c0aa9767f98351b3488edc2c100cda2d2ec2b04f3d8d3c8b" +dependencies = [ + "bitflags 2.10.0", + "core-foundation 0.9.4", + "system-configuration-sys", +] + +[[package]] +name = "system-configuration-sys" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e1d1b10ced5ca923a1fcb8d03e96b8d3268065d724548c0211415ff6ac6bac4" +dependencies = [ + "core-foundation-sys", + "libc", +] + [[package]] name = "tantivy" version = "0.25.0" @@ -3066,7 +3545,7 @@ version = "1.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ff360e02eab121e0bc37a2d3b4d4dc622e6eda3a8e5253d5435ecf5bd4c68408" dependencies = [ - "bytes", + "bytes 1.11.0", "libc", "mio", "parking_lot", @@ -3077,6 +3556,17 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "tokio-io" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57fc868aae093479e3131e3d165c93b1c7474109d13c90ec0dda2a1bbfff0674" +dependencies = [ + "bytes 0.4.12", + "futures", + "log", +] + [[package]] name = "tokio-macros" version = "2.6.0" @@ -3088,6 +3578,26 @@ dependencies = [ "syn", ] +[[package]] +name = "tokio-native-tls" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2" +dependencies = [ + "native-tls", + "tokio", +] + +[[package]] +name = "tokio-rustls" +version = "0.26.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1729aa945f29d91ba541258c8df89027d5792d85a8841fb65e8bf0f4ede4ef61" +dependencies = [ + "rustls", + "tokio", +] + [[package]] name = "tokio-stream" version = "0.1.17" @@ -3097,6 +3607,19 @@ dependencies = [ "futures-core", "pin-project-lite", "tokio", + "tokio-util", +] + +[[package]] +name = "tokio-tungstenite" +version = "0.28.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "d25a406cddcc431a75d3d9afc6a7c0f7428d4891dd973e4d54c56b46127bf857" +dependencies = [ + "futures-util", + "log", + "tokio", + "tungstenite", ] [[package]] @@ -3105,7 +3628,7 @@ version = "0.7.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2efa149fe76073d6e8fd97ef4f4eca7b67f599660115591483572e406e165594" dependencies = [ - "bytes", + "bytes 1.11.0", "futures-core", "futures-sink", "pin-project-lite", @@ -3151,6 +3674,51 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "df8b2b54733674ad286d16267dcfc7a71ed5c776e4ac7aa3c3e2561f7c637bf2" +[[package]] +name = "tower" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebe5ef63511595f1344e2d5cfa636d973292adc0eec1f0ad45fae9f0851ab1d4" +dependencies = [ + "futures-core", + "futures-util", + "pin-project-lite", + "sync_wrapper", + "tokio", + "tower-layer", + "tower-service", +] + +[[package]] +name = "tower-http" +version = "0.6.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4e6559d53cc268e5031cd8429d05415bc4cb4aefc4aa5d6cc35fbf5b924a1f8" +dependencies = [ + "bitflags 2.10.0", + "bytes 1.11.0", + "futures-util", + "http 1.4.0", + "http-body", + "iri-string", + "pin-project-lite", + "tower", + "tower-layer", + "tower-service", +] + +[[package]] +name = "tower-layer" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e" + +[[package]] +name = "tower-service" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3" + [[package]] name = "tracing" version = "0.1.41" @@ -3183,6 +3751,29 @@ dependencies = [ "once_cell", ] +[[package]] +name = "try-lock" +version = "0.2.5" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" + +[[package]] +name = "tungstenite" +version = "0.28.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8628dcc84e5a09eb3d8423d6cb682965dea9133204e8fb3efee74c2a0c259442" +dependencies = [ + "bytes 1.11.0", + "data-encoding", + "http 1.4.0", + "httparse", + "log", + "rand 0.9.2", + "sha1", + "thiserror", + "utf-8", +] + [[package]] name = "typenum" version = "1.19.0" @@ -3213,6 +3804,12 @@ version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" +[[package]] +name = "untrusted" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" + [[package]] name = "url" version = "2.5.7" @@ -3225,6 +3822,12 @@ dependencies = [ "serde", ] +[[package]] +name = "utf-8" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" + [[package]] name = "utf8-ranges" version = "1.0.5" @@ -3261,6 +3864,12 @@ version = "0.15.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4e8257fbc510f0a46eb602c10215901938b5c2a7d5e70fc11483b1d3c9b5b18c" +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + [[package]] name = "version_check" version = "0.9.5" @@ -3277,6 +3886,15 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "want" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa7760aed19e106de2c7c0b581b509f2f25d3dacaf737cb82ac61bc6d760b0e" +dependencies = [ + "try-lock", +] + [[package]] name = 
"wasi" version = "0.11.1+wasi-snapshot-preview1" @@ -3305,6 +3923,19 @@ dependencies = [ "wasm-bindgen-shared", ] +[[package]] +name = "wasm-bindgen-futures" +version = "0.4.55" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "551f88106c6d5e7ccc7cd9a16f312dd3b5d36ea8b4954304657d5dfba115d4a0" +dependencies = [ + "cfg-if", + "js-sys", + "once_cell", + "wasm-bindgen", + "web-sys", +] + [[package]] name = "wasm-bindgen-macro" version = "0.2.105" @@ -3337,6 +3968,29 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "wasm-streams" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "15053d8d85c7eccdbefef60f06769760a563c7f0a9d6902a13d35c7800b0ad65" +dependencies = [ + "futures-util", + "js-sys", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", +] + +[[package]] +name = "web-sys" +version = "0.3.82" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a1f95c0d03a47f4ae1f7a64643a6bb97465d9b740f0fa8f90ea33915c99a9a1" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + [[package]] name = "winapi" version = "0.3.9" @@ -3409,6 +4063,17 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" +[[package]] +name = "windows-registry" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02752bf7fbdcce7f2a27a742f798510f3e5ad88dbe84871e5168e2120c3d5720" +dependencies = [ + "windows-link", + "windows-result", + "windows-strings", +] + [[package]] name = "windows-result" version = "0.4.1" @@ -3692,6 +4357,12 @@ dependencies = [ "synstructure", ] +[[package]] +name = "zeroize" +version = "1.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0" + [[package]] name = "zerotrie" version = "0.2.3" diff --git a/Cargo.toml 
b/Cargo.toml index 308ebbc..e939fe6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,6 +6,7 @@ members = [ ] [workspace.dependencies] +bincode = "1.3" actix = "0.13.5" actix-cors = "0.7.1" actix-files = "0.6.6" @@ -53,5 +54,7 @@ tokio = { version = "1.45.1", features = ["full"] } tokio-stream = "0.1.17" tokio-util = "0.7.16" toml = "0.9.5" +paddler_client = { path = "../paddler/paddler_client" } +paddler_types = { path = "../paddler/paddler_types", default-features = false } url = "2.5.6" uuid = { version = "1.18.1", features = ["rng", "serde", "v4"] } diff --git a/Makefile b/Makefile index 2c09b02..a502984 100644 --- a/Makefile +++ b/Makefile @@ -23,6 +23,9 @@ test_site.AppDir: --title "Test Site" \ --version "1.2.3" +target/debug/poet: + cargo build + test_site.AppDir/poet: target/debug/poet test_site.AppDir cp target/debug/poet test_site.AppDir/poet diff --git a/poet/Cargo.toml b/poet/Cargo.toml index 011b7d4..02b28d7 100644 --- a/poet/Cargo.toml +++ b/poet/Cargo.toml @@ -14,6 +14,7 @@ actix-web = { workspace = true } actix-web-lab = { workspace = true } actix-ws = { workspace = true } anyhow = { workspace = true } +bincode = { workspace = true } async-stream = { workspace = true } async-trait = { workspace = true } base64 = { workspace = true } @@ -36,6 +37,8 @@ markdown = { workspace = true } mime = { workspace = true } mime_guess = { workspace = true } nom = { workspace = true } +paddler_client = { workspace = true } +paddler_types = { workspace = true } notify = { workspace = true } notify-debouncer-full = { workspace = true } petgraph = { workspace = true } diff --git a/poet/src/cmd/make/app_dir.rs b/poet/src/cmd/make/app_dir.rs index a5e1cdd..6efc7b4 100644 --- a/poet/src/cmd/make/app_dir.rs +++ b/poet/src/cmd/make/app_dir.rs @@ -8,10 +8,12 @@ use async_trait::async_trait; use clap::Parser; use indoc::formatdoc; use log::info; +use log::warn; use tokio::fs; use crate::app_dir_desktop_entry::AppDirDesktopEntry; use 
crate::assert_valid_desktop_entry_string::assert_valid_desktop_entry_string; +use crate::cmd::EMBEDDINGS_FILENAME; use crate::cmd::builds_project::BuildsProject; use crate::cmd::handler::Handler; use crate::cmd::value_parser::validate_is_directory; @@ -27,6 +29,9 @@ const ICON: &str = r#" Result { + Ok(formatdoc! { + r#" + #!/usr/bin/env sh + + ADDR="" + PADDLER_ADDR="" + PUBLIC_PATH="" + + while [ $# -gt 0 ]; do + case $1 in + --addr) + ADDR="$2" + shift 2 + ;; + --paddler-addr) + PADDLER_ADDR="$2" + shift 2 + ;; + --public-path) + PUBLIC_PATH="$2" + shift 2 + ;; + *) + echo "Unknown argument: $1" + echo "Usage: $0 --addr ADDRESS --public-path PATH [--paddler-addr ADDRESS]" + exit 1 + ;; + esac + done + + if [ -z "$ADDR" ]; then + echo "Error: --addr is required" + echo "Usage: $0 --addr ADDRESS --public-path PATH [--paddler-addr ADDRESS]" + exit 1 + fi + + if [ -z "$PUBLIC_PATH" ]; then + echo "Error: --public-path is required" + echo "Usage: $0 --addr ADDRESS --public-path PATH [--paddler-addr ADDRESS]" + exit 1 + fi + + EXTRA_ARGS="" + + if [ -n "$PADDLER_ADDR" ]; then + EXTRA_ARGS="--embeddings-file $APPDIR/{embeddings_filename} --paddler-addr $PADDLER_ADDR" + fi + + exec $APPDIR/poet serve $APPDIR --addr "$ADDR" --app-name "{name}" --public-path "$PUBLIC_PATH" $EXTRA_ARGS + "#, + embeddings_filename = EMBEDDINGS_FILENAME, + name = self.name, + }) + } + fn render_desktop_file(&self) -> Result { Ok(AppDirDesktopEntry { name: assert_valid_desktop_entry_string(&self.name)?, @@ -132,6 +201,16 @@ impl Handler for AppDir { ) .await?; + if let Some(embeddings_file) = &self.embeddings_file { + info!("Copying embeddings to AppDir..."); + + fs::copy(embeddings_file, app_dir_path.join(EMBEDDINGS_FILENAME)).await?; + } else { + warn!( + "No embeddings file provided. The AppDir will not include semantic search capabilities. Use --embeddings-file to include embeddings." 
+ ); + } + info!("Copying assets to AppDir..."); let esbuild_metafile = read_esbuild_metafile_or_default(source_filesystem.clone()).await?; diff --git a/poet/src/cmd/make/embeddings.rs b/poet/src/cmd/make/embeddings.rs new file mode 100644 index 0000000..1e7739f --- /dev/null +++ b/poet/src/cmd/make/embeddings.rs @@ -0,0 +1,114 @@ +use std::collections::BTreeMap; +use std::fs; +use std::net::SocketAddr; +use std::path::PathBuf; +use std::str::FromStr; + +use anyhow::Result; +use async_trait::async_trait; +use clap::Parser; +use log::info; +use paddler_types::embedding::Embedding; +use paddler_types::embedding_input_document::EmbeddingInputDocument; +use url::Url; + +use crate::asset_path_renderer::AssetPathRenderer; +use crate::build_authors::build_authors; +use crate::build_project::build_project; +use crate::build_project::build_project_params::BuildProjectParams; +use crate::build_project::build_project_result_stub::BuildProjectResultStub; +use crate::build_timer::BuildTimer; +use crate::cmd::builds_project::BuildsProject; +use crate::cmd::handler::Handler; +use crate::cmd::value_parser::parse_socket_addr; +use crate::cmd::value_parser::validate_is_directory; +use crate::compile_shortcodes::compile_shortcodes; +use crate::find_text_content_in_mdast::find_text_content_in_mdast; +use crate::generate_embedding::paddler_embedding_client::PaddlerEmbeddingClient; +use crate::read_esbuild_metafile_or_default::read_esbuild_metafile_or_default; + +#[derive(Parser)] +pub struct Embeddings { + #[arg(long, value_parser = parse_socket_addr)] + paddler_addr: SocketAddr, + + #[arg(long)] + output_file: PathBuf, + + #[arg(value_parser = validate_is_directory)] + source_directory: PathBuf, +} + +impl BuildsProject for Embeddings { + fn source_directory(&self) -> PathBuf { + self.source_directory.clone() + } +} + +#[async_trait(?Send)] +impl Handler for Embeddings { + async fn handle(&self) -> Result<()> { + let source_filesystem = self.source_filesystem(); + let 
rhai_template_renderer = compile_shortcodes(source_filesystem.clone()).await?; + let authors = build_authors(source_filesystem.clone()).await?; + + let BuildProjectResultStub { + content_document_sources, + .. + } = build_project(BuildProjectParams { + asset_path_renderer: AssetPathRenderer { + base_path: String::new(), + }, + generate_sitemap: false, + authors, + esbuild_metafile: read_esbuild_metafile_or_default(source_filesystem.clone()).await?, + generated_page_base_path: String::new(), + is_watching: false, + rhai_template_renderer, + source_filesystem, + }) + .await?; + + let documents: Vec = content_document_sources + .iter() + .filter_map(|(basename, source)| { + let body = find_text_content_in_mdast(&source.mdast).ok()?; + let title = &source.reference.front_matter.title; + let description = &source.reference.front_matter.description; + + if body.is_empty() { + return None; + } + + Some(EmbeddingInputDocument { + id: basename.to_string(), + content: format!("{title}\n{description}"), + }) + }) + .collect(); + + info!("Generating embeddings for {} documents...", documents.len()); + + let _build_timer = BuildTimer::default(); + let inference_url: Url = Url::from_str(&format!("http://{}", self.paddler_addr))?; + let client = PaddlerEmbeddingClient::new(inference_url); + let results: Vec = client.generate_embeddings(documents).await?; + + let embeddings_map: BTreeMap> = results + .into_iter() + .map(|embedding| (embedding.source_document_id, embedding.embedding)) + .collect(); + info!( + "Saving {} embeddings to {}...", + embeddings_map.len(), + self.output_file.display() + ); + + let encoded = bincode::serialize(&embeddings_map)?; + fs::write(&self.output_file, encoded)?; + + info!("Done."); + + Ok(()) + } +} diff --git a/poet/src/cmd/make/mod.rs b/poet/src/cmd/make/mod.rs index 01dc2ea..ad6bde4 100644 --- a/poet/src/cmd/make/mod.rs +++ b/poet/src/cmd/make/mod.rs @@ -1,2 +1,3 @@ pub mod app_dir; +pub mod embeddings; pub mod static_pages; diff --git 
a/poet/src/cmd/mod.rs b/poet/src/cmd/mod.rs index a80676d..39c4623 100644 --- a/poet/src/cmd/mod.rs +++ b/poet/src/cmd/mod.rs @@ -9,4 +9,5 @@ mod service_manager; mod value_parser; pub mod watch; +const EMBEDDINGS_FILENAME: &str = "embeddings.bin"; const STATIC_FILES_PUBLIC_PATH: &str = "assets"; diff --git a/poet/src/cmd/serve/mod.rs b/poet/src/cmd/serve/mod.rs index 54c4263..1aca462 100644 --- a/poet/src/cmd/serve/mod.rs +++ b/poet/src/cmd/serve/mod.rs @@ -2,9 +2,13 @@ mod app_data; mod http_route; use std::net::SocketAddr; +use std::path::Path; use std::path::PathBuf; +use std::str::FromStr; use std::sync::Arc; +use url::Url; + use actix_files::Files; use actix_web::App; use actix_web::HttpServer; @@ -24,7 +28,9 @@ use crate::build_project::build_project_result::BuildProjectResult; use crate::build_project::build_project_result_holder::BuildProjectResultHolder; use crate::build_prompt_document_controller_collection::build_prompt_document_controller_collection; use crate::build_prompt_document_controller_collection::build_prompt_document_controller_collection_params::BuildPromptControllerCollectionParams; +use crate::generate_embedding::paddler_embedding_client::PaddlerEmbeddingClient; use crate::holder::Holder as _; +use crate::cmd::EMBEDDINGS_FILENAME; use crate::cmd::STATIC_FILES_PUBLIC_PATH; use crate::cmd::builds_project::BuildsProject; use crate::cmd::handler::Handler; @@ -47,6 +53,8 @@ use crate::search_index::SearchIndex; use crate::search_index_reader::SearchIndexReader; use crate::search_index_reader_holder::SearchIndexReaderHolder; use crate::search_tool::SearchTool; +use crate::semantic_search_index::SemanticSearchIndex; +use crate::semantic_search_tool::SemanticSearchTool; #[derive(Parser)] pub struct Serve { @@ -59,6 +67,9 @@ pub struct Serve { #[arg(long)] app_name: String, + #[arg(long, value_parser = parse_socket_addr)] + paddler_addr: Option, + #[arg(long)] public_path: String, @@ -184,6 +195,22 @@ impl Handler for Serve { 
search_index_reader_holder: search_index_reader_holder.clone(), }); + let embeddings_path = Path::new(EMBEDDINGS_FILENAME); + + if let Some(paddler_addr) = &self.paddler_addr { + let semantic_search_index = + Arc::new(SemanticSearchIndex::load_from_file(&embeddings_path)?); + let inference_url = Url::from_str(&format!("http://{paddler_addr}"))?; + let paddler_embeddings_client = Arc::new(PaddlerEmbeddingClient::new(inference_url)); + + tool_registry.register_owned(SemanticSearchTool { + mcp_resource_provider_content_documents: mcp_resource_provider_content_documents + .clone(), + paddler_embeddings_client, + semantic_search_index, + }); + } + let tool_registry_arc: Arc = Arc::new(tool_registry); HttpServer::new(move || { diff --git a/poet/src/generate_embedding/mod.rs b/poet/src/generate_embedding/mod.rs new file mode 100644 index 0000000..9f0abdd --- /dev/null +++ b/poet/src/generate_embedding/mod.rs @@ -0,0 +1 @@ +pub mod paddler_embedding_client; diff --git a/poet/src/generate_embedding/paddler_embedding_client.rs b/poet/src/generate_embedding/paddler_embedding_client.rs new file mode 100644 index 0000000..f2d6850 --- /dev/null +++ b/poet/src/generate_embedding/paddler_embedding_client.rs @@ -0,0 +1,76 @@ +use anyhow::Result; +use anyhow::anyhow; +use futures_util::StreamExt; +use paddler_client::PaddlerClient; +use paddler_types::embedding::Embedding; +use paddler_types::embedding_input_document::EmbeddingInputDocument; +use paddler_types::embedding_normalization_method::EmbeddingNormalizationMethod; +use paddler_types::embedding_result::EmbeddingResult; +use paddler_types::inference_client::Message as InferenceMessage; +use paddler_types::inference_client::Response as InferenceResponse; +use paddler_types::request_params::GenerateEmbeddingBatchParams; +use url::Url; + +pub struct PaddlerEmbeddingClient { + paddler_client: PaddlerClient, +} + +impl PaddlerEmbeddingClient { + pub fn new(inference_addr: Url) -> Self { + let paddler_client = 
PaddlerClient::new(inference_addr.clone(), inference_addr, 1); + + Self { paddler_client } + } + + pub async fn generate_embeddings( + &self, + documents: Vec, + ) -> Result> { + let params = GenerateEmbeddingBatchParams { + input_batch: documents, + normalization_method: EmbeddingNormalizationMethod::L2, + }; + + let mut stream = self + .paddler_client + .inference() + .generate_embedding_batch(¶ms) + .await + .map_err(|err| anyhow!("{err}"))?; + + let mut embeddings: Vec = Vec::new(); + + while let Some(message_result) = stream.next().await { + let message = message_result.map_err(|err| anyhow!("{err}"))?; + + match message { + InferenceMessage::Response(envelope) => match envelope.response { + InferenceResponse::Embedding(EmbeddingResult::Embedding(embedding)) => { + embeddings.push(embedding); + } + InferenceResponse::Embedding(EmbeddingResult::Done) => { + break; + } + InferenceResponse::Embedding(EmbeddingResult::Error(error)) => { + return Err(anyhow!("Embedding error: {error}")); + } + InferenceResponse::GeneratedToken(_) => {} + InferenceResponse::Timeout => { + return Err(anyhow!("Embedding request timed out")); + } + InferenceResponse::TooManyBufferedRequests => { + return Err(anyhow!("Too many buffered requests")); + } + }, + InferenceMessage::Error(error_envelope) => { + return Err(anyhow!( + "Paddler error: {}", + error_envelope.error.description + )); + } + } + } + + Ok(embeddings) + } +} diff --git a/poet/src/lib.rs b/poet/src/lib.rs index 8990cb1..f57b342 100644 --- a/poet/src/lib.rs +++ b/poet/src/lib.rs @@ -44,6 +44,7 @@ pub mod find_table_of_contents_in_mdast; pub mod find_text_content_in_mdast; pub mod flexible_datetime; pub mod generate_sitemap; +pub mod generate_embedding; pub mod holder; pub mod is_external_link; pub mod is_valid_desktop_entry_string; @@ -68,5 +69,8 @@ pub mod search_index_reader; pub mod search_index_reader_holder; pub mod search_index_schema; pub mod search_tool; +pub mod semantic_search_index; +pub mod 
semantic_search_index_holder; +pub mod semantic_search_tool; pub mod string_to_mdast; pub mod table_of_contents; diff --git a/poet/src/main.rs b/poet/src/main.rs index 3ae2d08..4230ac9 100644 --- a/poet/src/main.rs +++ b/poet/src/main.rs @@ -3,6 +3,7 @@ use clap::Parser; use clap::Subcommand; use poet::cmd::handler::Handler; use poet::cmd::make::app_dir::AppDir; +use poet::cmd::make::embeddings::Embeddings; use poet::cmd::make::static_pages::StaticPages; use poet::cmd::serve::Serve; use poet::cmd::watch::Watch; @@ -31,6 +32,8 @@ enum Commands { enum Make { /// Generates AppDir (packageable with AppImageKit) AppDir(AppDir), + /// Generates embeddings for all content documents + Embeddings(Embeddings), /// Generates static pages StaticPages(StaticPages), } @@ -39,6 +42,7 @@ fn get_handler() -> Option> { match Cli::parse().command { Some(Commands::Make { command }) => match command { Make::AppDir(handler) => Some(Box::new(handler)), + Make::Embeddings(handler) => Some(Box::new(handler)), Make::StaticPages(handler) => Some(Box::new(handler)), }, Some(Commands::Serve(handler)) => Some(Box::new(handler)), diff --git a/poet/src/mcp/jsonrpc/request/initialize.rs b/poet/src/mcp/jsonrpc/request/initialize.rs index 4886f6c..9d9d76c 100644 --- a/poet/src/mcp/jsonrpc/request/initialize.rs +++ b/poet/src/mcp/jsonrpc/request/initialize.rs @@ -28,6 +28,8 @@ pub struct ClientCapabilities { pub roots: Option, #[serde(skip_serializing_if = "Option::is_none")] pub sampling: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub tasks: Option, } #[derive(Debug, Deserialize, Serialize)] diff --git a/poet/src/semantic_search_index.rs b/poet/src/semantic_search_index.rs new file mode 100644 index 0000000..3cbb097 --- /dev/null +++ b/poet/src/semantic_search_index.rs @@ -0,0 +1,108 @@ +use std::collections::BTreeMap; +use std::fs; +use std::path::Path; + +use anyhow::Result; +use log::debug; + +pub struct SemanticSearchIndex { + embeddings: BTreeMap>, +} + +impl 
SemanticSearchIndex { + pub fn load_from_file(path: &Path) -> Result { + let bytes = fs::read(path)?; + let embeddings: BTreeMap> = bincode::deserialize(&bytes)?; + + Ok(Self { embeddings }) + } + + pub fn query(&self, embedding: &[f32], top_k: usize, min_score: f32) -> Vec<(String, f32)> { + let mut scored: Vec<(String, f32)> = self + .embeddings + .iter() + .map(|(basename, stored_embedding)| { + let score = cosine_similarity(embedding, stored_embedding); + + (basename.clone(), score) + }) + .inspect(|(basename, score)| { + debug!("Semantic search score for {basename}: {score:.6}"); + }) + .filter(|(_basename, score)| *score >= min_score) + .collect(); + + scored.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal)); + scored.truncate(top_k); + + scored + } +} + +fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 { + let dot: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum(); + let mag_a: f32 = a.iter().map(|x| x * x).sum::().sqrt(); + let mag_b: f32 = b.iter().map(|x| x * x).sum::().sqrt(); + + if mag_a == 0.0 || mag_b == 0.0 { + return 0.0; + } + + dot / (mag_a * mag_b) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_cosine_similarity_identical() { + let a = vec![1.0, 2.0, 3.0]; + + assert!((cosine_similarity(&a, &a) - 1.0).abs() < 1e-6); + } + + #[test] + fn test_cosine_similarity_orthogonal() { + let a = vec![1.0, 0.0]; + let b = vec![0.0, 1.0]; + + assert!(cosine_similarity(&a, &b).abs() < 1e-6); + } + + #[test] + fn test_cosine_similarity_zero_vector() { + let a = vec![1.0, 2.0]; + let b = vec![0.0, 0.0]; + + assert_eq!(cosine_similarity(&a, &b), 0.0); + } + + #[test] + fn test_query_returns_top_k() { + let mut embeddings = BTreeMap::new(); + embeddings.insert("doc_a".to_string(), vec![1.0, 0.0]); + embeddings.insert("doc_b".to_string(), vec![0.7, 0.7]); + embeddings.insert("doc_c".to_string(), vec![0.0, 1.0]); + + let index = SemanticSearchIndex { embeddings }; + let results = index.query(&[1.0, 0.0], 2, 0.0); 
+
+        assert_eq!(results.len(), 2);
+        assert_eq!(results[0].0, "doc_a");
+    }
+
+    #[test]
+    fn test_query_filters_by_min_score() {
+        let mut embeddings = BTreeMap::new();
+        embeddings.insert("doc_a".to_string(), vec![1.0, 0.0]);
+        embeddings.insert("doc_b".to_string(), vec![0.7, 0.7]);
+        embeddings.insert("doc_c".to_string(), vec![0.0, 1.0]);
+
+        let index = SemanticSearchIndex { embeddings };
+        let results = index.query(&[1.0, 0.0], 10, 0.8);
+
+        assert_eq!(results.len(), 1);
+        assert_eq!(results[0].0, "doc_a");
+    }
+}
diff --git a/poet/src/semantic_search_index_holder.rs b/poet/src/semantic_search_index_holder.rs
new file mode 100644
index 0000000..960ef9d
--- /dev/null
+++ b/poet/src/semantic_search_index_holder.rs
@@ -0,0 +1,26 @@
+use std::sync::Arc;
+
+use tokio::sync::Notify;
+use tokio::sync::RwLock;
+
+use crate::holder::Holder;
+use crate::semantic_search_index::SemanticSearchIndex;
+
+/// Shared, replaceable slot for the current [`SemanticSearchIndex`].
+#[derive(Clone, Default)]
+pub struct SemanticSearchIndexHolder {
+    semantic_search_index: Arc<RwLock<Option<Arc<SemanticSearchIndex>>>>,
+    pub update_notifier: Arc<Notify>,
+}
+
+impl Holder for SemanticSearchIndexHolder {
+    type Item = Arc<SemanticSearchIndex>;
+
+    fn rw_lock(&self) -> Arc<RwLock<Option<Self::Item>>> {
+        self.semantic_search_index.clone()
+    }
+
+    fn update_notifier(&self) -> Arc<Notify> {
+        self.update_notifier.clone()
+    }
+}
diff --git a/poet/src/semantic_search_tool.rs b/poet/src/semantic_search_tool.rs
new file mode 100644
index 0000000..314fddb
--- /dev/null
+++ b/poet/src/semantic_search_tool.rs
@@ -0,0 +1,124 @@
+use std::sync::Arc;
+
+use anyhow::Result;
+use async_trait::async_trait;
+use paddler_types::embedding_input_document::EmbeddingInputDocument;
+use schemars::JsonSchema;
+use serde::Deserialize;
+use serde::Serialize;
+
+use crate::generate_embedding::paddler_embedding_client::PaddlerEmbeddingClient;
+use crate::mcp::content_block::ContentBlock;
+use crate::mcp::content_block::resource_link::ResourceLink;
+use crate::mcp::jsonrpc::response::success::tool_call_result::ToolCallResult;
+use crate::mcp::jsonrpc::response::success::tool_call_result::success::Success;
+use crate::mcp::resource_provider::ResourceProvider as _;
+use crate::mcp::resource_provider_list_params::ResourceProviderListParams;
+use crate::mcp::tool_call_error_message::ToolCallErrorMessage;
+use crate::mcp::tool_provider::ToolProvider;
+use crate::mcp::tool_responder::ToolResponder;
+use crate::mcp_resource_provider_content_documents::McpResourceProviderContentDocuments;
+use crate::semantic_search_index::SemanticSearchIndex;
+
+/// Lowest similarity score a document needs to be returned.
+const MIN_SCORE: f32 = 0.0;
+/// Maximum number of documents returned for one query.
+const TOP_K: usize = 10;
+
+#[derive(Deserialize, JsonSchema, Serialize)]
+pub struct SemanticSearchToolInput {
+    pub query: String,
+}
+
+#[derive(Deserialize, JsonSchema, Serialize)]
+pub struct SemanticSearchToolOutput {}
+
+/// MCP tool that embeds the query text and links the most similar content documents.
+pub struct SemanticSearchTool {
+    pub mcp_resource_provider_content_documents: McpResourceProviderContentDocuments,
+    pub paddler_embeddings_client: Arc<PaddlerEmbeddingClient>,
+    pub semantic_search_index: Arc<SemanticSearchIndex>,
+}
+
+impl ToolProvider for SemanticSearchTool {
+    type Input = SemanticSearchToolInput;
+    type Output = SemanticSearchToolOutput;
+
+    fn name(&self) -> String {
+        "semantic_search".to_string()
+    }
+
+    fn description(&self) -> Option<String> {
+        Some("Search content using semantic similarity".to_string())
+    }
+}
+
+#[async_trait]
+impl ToolResponder for SemanticSearchTool {
+    /// Embeds `query`, ranks the indexed documents against it and returns a resource link for
+    /// every match that the content document provider still lists.
+    ///
+    /// Embedding failures are reported to the MCP client as a tool error message, not as `Err`.
+    async fn respond(
+        &self,
+        SemanticSearchToolInput { query }: SemanticSearchToolInput,
+    ) -> Result<ToolCallResult<SemanticSearchToolOutput>> {
+        let embeddings = self
+            .paddler_embeddings_client
+            .generate_embeddings(vec![EmbeddingInputDocument {
+                id: "query".to_string(),
+                content: query,
+            }])
+            .await;
+
+        let embeddings = match embeddings {
+            Ok(embeddings) => embeddings,
+            Err(err) => {
+                return Ok(
+                    ToolCallErrorMessage(&format!("Failed to generate embedding: {err}")).into(),
+                );
+            }
+        };
+
+        let Some(query_embedding) = embeddings.into_iter().next() else {
+            return Ok(ToolCallErrorMessage("Embedding service returned no results").into());
+        };
+
+        let results =
+            self.semantic_search_index
+                .query(&query_embedding.embedding, TOP_K, MIN_SCORE);
+
+        let resource_list = self
+            .mcp_resource_provider_content_documents
+            .list_resources(ResourceProviderListParams {
+                limit: usize::MAX,
+                offset: 0,
+            })
+            .await?;
+
+        // Results keep their ranking order; documents missing from the resource list are skipped.
+        let content: Vec<ContentBlock> = results
+            .iter()
+            .filter_map(|(basename, _score)| {
+                let resource = resource_list
+                    .iter()
+                    .find(|resource| resource.name == *basename)?;
+
+                Some(ContentBlock::ResourceLink(ResourceLink {
+                    description: Some(resource.description.clone()),
+                    mime_type: Some("text/markdown".to_string()),
+                    name: resource.title.clone(),
+                    title: Some(resource.title.clone()),
+                    uri: self
+                        .mcp_resource_provider_content_documents
+                        .resource_uri(basename),
+                }))
+            })
+            .collect();
+
+        Ok(ToolCallResult::Success(Success {
+            content,
+            structured_content: SemanticSearchToolOutput {},
+        }))
+    }
+}
From 19b0535aa6c0a914ecad78f304ad70b60e46bc30 Mon Sep 17 00:00:00 2001
From: Propfend
Date: Thu, 26 Feb 2026 17:05:19 +0100
Subject: [PATCH 2/2] write custom serializer to serialize the embeddings. Hide the embeddings behind a feature.
---
 Cargo.lock                                    |   9 +-
 Cargo.toml                                    |   2 +-
 embedding_codec/Cargo.toml                    |   9 +
 embedding_codec/src/lib.rs                    | 229 ++++++++++++++++++
 poet/Cargo.toml                               |   9 +-
 poet/src/build_content_document_sources.rs    |  96 +++++++++
 .../build_content_document_sources_result.rs  |  14 ++
 poet/src/build_project/mod.rs                 |  75 +------
 poet/src/cmd/make/app_dir.rs                  |  12 +-
 poet/src/cmd/make/embeddings.rs               |  46 ++--
 poet/src/cmd/make/mod.rs                      |   1 +
 poet/src/cmd/mod.rs                           |   1 +
 poet/src/cmd/serve/mod.rs                     |  47 +++--
 poet/src/lib.rs                               |   8 +-
 poet/src/main.rs                              |   3 +
 poet/src/search_tool.rs                       |   4 +
 poet/src/semantic_search_index.rs             |   6 +-
 17 files changed, 450 insertions(+), 121 deletions(-)
 create mode 100644 embedding_codec/Cargo.toml
 create mode 100644 embedding_codec/src/lib.rs
 create mode 100644 poet/src/build_content_document_sources.rs
 create mode 100644 poet/src/build_content_document_sources_result.rs

diff --git a/Cargo.lock b/Cargo.lock
index c4de16e..0e13d07 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1124,6 +1124,13 @@ version = "1.15.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719"
 
+[[package]]
+name = "embedding_codec"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+]
+
 [[package]]
 name = "encoding_rs"
 version = "0.8.35"
@@ -2461,12 +2468,12 @@ dependencies = [
  "async-stream",
  "async-trait",
  "base64",
- "bincode",
  "blake3",
  "chrono",
  "clap",
  "ctrlc",
  "dashmap",
+ "embedding_codec",
  "env_logger",
  "esbuild-metafile",
  "freedesktop_entry_parser",
diff --git a/Cargo.toml b/Cargo.toml
index e939fe6..642cb36 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,12 +1,12 @@
 [workspace]
 resolver = "3"
 members = [
+    "embedding_codec",
     "poet",
     "rhai_components",
 ]
 
 [workspace.dependencies]
-bincode = "1.3"
 actix = "0.13.5"
 actix-cors = "0.7.1"
 actix-files = "0.6.6"
diff --git a/embedding_codec/Cargo.toml b/embedding_codec/Cargo.toml
new file mode 100644
index 0000000..0cc7498
--- /dev/null
+++ b/embedding_codec/Cargo.toml
@@ -0,0 +1,9 @@
+[package]
+description = "Binary codec for embedding index files"
+license = "Apache-2.0"
+name = "embedding_codec"
+version = "0.1.0"
+edition = "2024"
+
+[dependencies]
+anyhow = { workspace = true }
diff --git a/embedding_codec/src/lib.rs b/embedding_codec/src/lib.rs
new file mode 100644
index 0000000..67aac18
--- /dev/null
+++ b/embedding_codec/src/lib.rs
@@ -0,0 +1,229 @@
+//! Binary codec for embedding index files.
+//!
+//! Every integer is a little-endian `u32`. The layout is:
+//!
+//! ```text
+//! version_length, version (UTF-8)
+//! entry_count
+//! entry_count times: key_length, key (UTF-8), embedding_length, embedding (f32, LE)
+//! ```
+
+use std::collections::BTreeMap;
+use std::str::from_utf8;
+
+use anyhow::Context as _;
+use anyhow::Result;
+use anyhow::bail;
+
+const EMBEDDING_CODEC_VERSION_STRING: &str =
+    concat!("embedding_codec_version ", env!("CARGO_PKG_VERSION"));
+const F32_BYTE_LENGTH: usize = std::mem::size_of::<f32>();
+const U32_BYTE_LENGTH: usize = std::mem::size_of::<u32>();
+
+/// Reads and writes the embedding index: a map from document key to embedding vector.
+pub struct EmbeddingCodec;
+
+impl EmbeddingCodec {
+    /// Encodes `embeddings` into the binary format read by [`Self::deserialize`].
+    ///
+    /// # Panics
+    ///
+    /// Panics if the entry count, a key length or an embedding length does not fit in `u32`,
+    /// because the format cannot represent it.
+    pub fn serialize(embeddings: &BTreeMap<String, Vec<f32>>) -> Vec<u8> {
+        let mut buffer = Vec::new();
+
+        write_length(&mut buffer, EMBEDDING_CODEC_VERSION_STRING.len());
+        buffer.extend_from_slice(EMBEDDING_CODEC_VERSION_STRING.as_bytes());
+        write_length(&mut buffer, embeddings.len());
+
+        for (key, embedding) in embeddings {
+            write_length(&mut buffer, key.len());
+            buffer.extend_from_slice(key.as_bytes());
+            write_length(&mut buffer, embedding.len());
+
+            for value in embedding {
+                buffer.extend_from_slice(&value.to_le_bytes());
+            }
+        }
+
+        buffer
+    }
+
+    /// Decodes data produced by [`Self::serialize`].
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the data is truncated, has trailing bytes, contains invalid UTF-8
+    /// or was written by a different codec version.
+    pub fn deserialize(bytes: &[u8]) -> Result<BTreeMap<String, Vec<f32>>> {
+        let mut offset = 0;
+
+        let version_length = read_length(bytes, &mut offset)?;
+        let version_bytes = take_bytes(bytes, &mut offset, version_length)
+            .context("unexpected end of data while reading version")?;
+        let file_version = from_utf8(version_bytes)?;
+
+        if file_version != EMBEDDING_CODEC_VERSION_STRING {
+            bail!(
+                "embedding codec version mismatch: file was written with {file_version}, current version is {EMBEDDING_CODEC_VERSION_STRING}"
+            );
+        }
+
+        let entry_count = read_length(bytes, &mut offset)?;
+        let mut embeddings = BTreeMap::new();
+
+        for _ in 0..entry_count {
+            let key_length = read_length(bytes, &mut offset)?;
+            let key_bytes = take_bytes(bytes, &mut offset, key_length)
+                .context("unexpected end of data while reading key")?;
+            let key = from_utf8(key_bytes)?.to_string();
+
+            let embedding_length = read_length(bytes, &mut offset)?;
+            // Checked arithmetic: a corrupt length must not wrap around on 32-bit targets
+            // and slip past the bounds check.
+            let embedding = embedding_length
+                .checked_mul(F32_BYTE_LENGTH)
+                .and_then(|byte_length| take_bytes(bytes, &mut offset, byte_length))
+                .context("unexpected end of data while reading embedding")?
+                .chunks_exact(F32_BYTE_LENGTH)
+                .map(|chunk| f32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]))
+                .collect();
+
+            embeddings.insert(key, embedding);
+        }
+
+        if offset != bytes.len() {
+            bail!(
+                "trailing data: {} bytes remaining after {} entries",
+                bytes.len() - offset,
+                entry_count
+            );
+        }
+
+        Ok(embeddings)
+    }
+}
+
+/// Appends `length` to `buffer` as a little-endian `u32`.
+fn write_length(buffer: &mut Vec<u8>, length: usize) {
+    let length = u32::try_from(length).expect("length must fit in a u32 length prefix");
+
+    buffer.extend_from_slice(&length.to_le_bytes());
+}
+
+/// Reads a little-endian `u32` length prefix and advances `offset` past it.
+fn read_length(bytes: &[u8], offset: &mut usize) -> Result<usize> {
+    Ok(usize::try_from(read_u32_le(bytes, offset)?)?)
+}
+
+/// Returns the next `length` bytes and advances `offset` past them.
+///
+/// Returns `None`, leaving `offset` untouched, when fewer than `length` bytes remain.
+fn take_bytes<'data>(
+    bytes: &'data [u8],
+    offset: &mut usize,
+    length: usize,
+) -> Option<&'data [u8]> {
+    let end = offset.checked_add(length)?;
+    let taken = bytes.get(*offset..end)?;
+
+    *offset = end;
+
+    Some(taken)
+}
+
+/// Reads a little-endian `u32` and advances `offset` past it.
+fn read_u32_le(bytes: &[u8], offset: &mut usize) -> Result<u32> {
+    let start = *offset;
+    let raw = take_bytes(bytes, offset, U32_BYTE_LENGTH)
+        .with_context(|| format!("unexpected end of data while reading u32 at offset {start}"))?;
+
+    Ok(u32::from_le_bytes([raw[0], raw[1], raw[2], raw[3]]))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_round_trip() {
+        let mut embeddings = BTreeMap::new();
+        embeddings.insert("doc_a".to_string(), vec![1.0, 2.0, 3.0]);
+        embeddings.insert("doc_b".to_string(), vec![0.5, -0.5]);
+        embeddings.insert("empty".to_string(), vec![]);
+
+        let bytes = EmbeddingCodec::serialize(&embeddings);
+        let result = EmbeddingCodec::deserialize(&bytes).unwrap();
+
+        assert_eq!(embeddings, result);
+    }
+
+    #[test]
+    fn test_empty_map() {
+        let embeddings = BTreeMap::new();
+        let bytes = EmbeddingCodec::serialize(&embeddings);
+        let result = EmbeddingCodec::deserialize(&bytes).unwrap();
+
+        assert_eq!(embeddings, result);
+    }
+
+    #[test]
+    fn test_truncated_data_is_rejected() {
+        let mut embeddings = BTreeMap::new();
+        embeddings.insert("key".to_string(), vec![1.0]);
+
+        let bytes = EmbeddingCodec::serialize(&embeddings);
+        let truncated = &bytes[..bytes.len() - 2];
+
+        assert!(EmbeddingCodec::deserialize(truncated).is_err());
+    }
+
+    #[test]
+    fn test_trailing_data_is_rejected() {
+        let mut embeddings = BTreeMap::new();
+        embeddings.insert("key".to_string(), vec![1.0]);
+
+        let mut bytes = EmbeddingCodec::serialize(&embeddings);
+        bytes.push(0xFF);
+
+        assert!(EmbeddingCodec::deserialize(&bytes).is_err());
+    }
+
+    #[test]
+    fn test_version_mismatch_is_rejected() {
+        let mut bytes = Vec::new();
+        let fake_version = b"99.99.99";
+
+        bytes.extend_from_slice(&(fake_version.len() as u32).to_le_bytes());
+        bytes.extend_from_slice(fake_version);
+        bytes.extend_from_slice(&0u32.to_le_bytes());
+
+        let error = EmbeddingCodec::deserialize(&bytes).unwrap_err();
+
+        assert!(error.to_string().contains("version mismatch"));
+    }
+
+    #[test]
+    fn test_oversized_embedding_length_is_rejected() {
+        let mut bytes = EmbeddingCodec::serialize(&BTreeMap::new());
+        let entry_count_offset = bytes.len() - U32_BYTE_LENGTH;
+
+        bytes[entry_count_offset..].copy_from_slice(&1u32.to_le_bytes());
+        bytes.extend_from_slice(&1u32.to_le_bytes());
+        bytes.push(b'k');
+        bytes.extend_from_slice(&u32::MAX.to_le_bytes());
+
+        assert!(EmbeddingCodec::deserialize(&bytes).is_err());
+    }
+
+    #[test]
+    fn test_serialized_data_starts_with_version() {
+        let embeddings = BTreeMap::new();
+        let bytes = EmbeddingCodec::serialize(&embeddings);
+        let version_length = u32::from_le_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]) as usize;
+        let version = from_utf8(&bytes[4..4 + version_length]).unwrap();
+
+        assert_eq!(version, EMBEDDING_CODEC_VERSION_STRING);
+    }
+}
diff --git a/poet/Cargo.toml b/poet/Cargo.toml
index 02b28d7..12a6cd1 100644
--- a/poet/Cargo.toml
+++ b/poet/Cargo.toml
@@ -5,6 +5,9 @@ name = "poet"
 version = "0.5.14"
 edition = "2024"
 
+[features]
+embeddings = ["dep:embedding_codec", "dep:paddler_client", "dep:paddler_types"]
+
 [dependencies]
 actix = { workspace = true }
 actix-cors = { workspace = true }
@@ -14,7 +17,6 @@ actix-web = { workspace = true }
 actix-web-lab = { workspace = true }
 actix-ws = { workspace =
true } anyhow = { workspace = true } -bincode = { workspace = true } async-stream = { workspace = true } async-trait = { workspace = true } base64 = { workspace = true } @@ -23,6 +25,7 @@ chrono = { workspace = true } clap = { workspace = true } ctrlc = { workspace = true } dashmap = { workspace = true } +embedding_codec = { path = "../embedding_codec", version = "0.1", optional = true } env_logger = { workspace = true } esbuild-metafile = { workspace = true } freedesktop_entry_parser = { workspace = true } @@ -37,8 +40,8 @@ markdown = { workspace = true } mime = { workspace = true } mime_guess = { workspace = true } nom = { workspace = true } -paddler_client = { workspace = true } -paddler_types = { workspace = true } +paddler_client = { workspace = true, optional = true } +paddler_types = { workspace = true, optional = true } notify = { workspace = true } notify-debouncer-full = { workspace = true } petgraph = { workspace = true } diff --git a/poet/src/build_content_document_sources.rs b/poet/src/build_content_document_sources.rs new file mode 100644 index 0000000..8c7bcaa --- /dev/null +++ b/poet/src/build_content_document_sources.rs @@ -0,0 +1,96 @@ +use std::collections::BTreeMap; +use std::collections::HashMap; +use std::path::PathBuf; +use std::sync::Arc; + +use anyhow::Result; +use anyhow::anyhow; + +use crate::build_content_document_sources_result::BuildContentDocumentSourcesResult; +use crate::content_document::ContentDocument; +use crate::content_document_basename::ContentDocumentBasename; +use crate::content_document_front_matter::ContentDocumentFrontMatter; +use crate::content_document_reference::ContentDocumentReference; +use crate::content_document_source::ContentDocumentSource; +use crate::document_error_collection::DocumentErrorCollection; +use crate::filesystem::Filesystem as _; +use crate::filesystem::storage::Storage; +use crate::find_front_matter_in_mdast::find_front_matter_in_mdast; +use crate::string_to_mdast::string_to_mdast; + +pub async fn 
build_content_document_sources( + source_filesystem: &Arc, + generated_page_base_path: &str, +) -> Result { + let error_collection: DocumentErrorCollection = Default::default(); + let mut content_document_basename_by_id: HashMap = + HashMap::new(); + let mut content_document_by_basename: HashMap< + ContentDocumentBasename, + ContentDocumentReference, + > = HashMap::new(); + let mut content_document_list: Vec = Vec::new(); + let mut content_document_sources: BTreeMap = + Default::default(); + + for file in source_filesystem.read_project_files().await? { + if file.kind.is_content() { + let mdast = string_to_mdast(&file.contents)?; + let front_matter: ContentDocumentFrontMatter = find_front_matter_in_mdast(&mdast)? + .ok_or_else(|| { + anyhow!("No front matter found in file: {:?}", file.relative_path) + })?; + + let basename_path = file.get_stem_path_relative_to(&PathBuf::from("content")); + let basename: ContentDocumentBasename = basename_path.clone().into(); + let content_document_reference = ContentDocumentReference { + basename_path, + front_matter: front_matter.clone(), + generated_page_base_path: generated_page_base_path.to_string(), + }; + + if let Some(id) = &front_matter.id { + if content_document_basename_by_id.contains_key(id) { + error_collection.register_error( + content_document_reference.basename().to_string(), + anyhow!("Duplicate document id: #{id} in '{basename}'"), + ); + } + + content_document_basename_by_id.insert(id.clone(), basename.clone()); + } + + content_document_by_basename + .insert(basename.clone(), content_document_reference.clone()); + content_document_list.push(ContentDocument { + mdast: mdast.clone(), + reference: content_document_reference.clone(), + }); + + if content_document_reference.front_matter.render { + let relative_path = format!("{basename}.md"); + + content_document_sources.insert( + basename, + ContentDocumentSource { + file_entry: file, + mdast, + reference: content_document_reference, + relative_path, + }, + ); + } + } 
+ } + + if !error_collection.is_empty() { + return Err(anyhow!("{error_collection}")); + } + + Ok(BuildContentDocumentSourcesResult { + content_document_basename_by_id, + content_document_by_basename, + content_document_list, + content_document_sources, + }) +} diff --git a/poet/src/build_content_document_sources_result.rs b/poet/src/build_content_document_sources_result.rs new file mode 100644 index 0000000..2682cba --- /dev/null +++ b/poet/src/build_content_document_sources_result.rs @@ -0,0 +1,14 @@ +use std::collections::BTreeMap; +use std::collections::HashMap; + +use crate::content_document::ContentDocument; +use crate::content_document_basename::ContentDocumentBasename; +use crate::content_document_reference::ContentDocumentReference; +use crate::content_document_source::ContentDocumentSource; + +pub struct BuildContentDocumentSourcesResult { + pub content_document_basename_by_id: HashMap, + pub content_document_by_basename: HashMap, + pub content_document_list: Vec, + pub content_document_sources: BTreeMap, +} diff --git a/poet/src/build_project/mod.rs b/poet/src/build_project/mod.rs index 4fbb4c6..066f7b9 100644 --- a/poet/src/build_project/mod.rs +++ b/poet/src/build_project/mod.rs @@ -4,11 +4,9 @@ pub mod build_project_result_holder; pub mod build_project_result_stub; mod content_document_rendering_context; -use std::collections::BTreeMap; use std::collections::HashMap; use std::collections::HashSet; use std::path::Path; -use std::path::PathBuf; use std::sync::Arc; use anyhow::Result; @@ -23,28 +21,25 @@ use syntect::parsing::SyntaxSet; use crate::asset_manager::AssetManager; use crate::author_resolve_result::AuthorResolveResult; +use crate::build_content_document_sources::build_content_document_sources; +use crate::build_content_document_sources_result::BuildContentDocumentSourcesResult; use crate::build_project::build_project_params::BuildProjectParams; use crate::build_project::build_project_result_stub::BuildProjectResultStub; use 
crate::build_project::content_document_rendering_context::ContentDocumentRenderingContext; use crate::build_timer::BuildTimer; use crate::content_document::ContentDocument; -use crate::content_document_basename::ContentDocumentBasename; use crate::content_document_collection::ContentDocumentCollection; use crate::content_document_collection_ranked::ContentDocumentCollectionRanked; use crate::content_document_component_context::ContentDocumentComponentContext; -use crate::content_document_front_matter::ContentDocumentFrontMatter; use crate::content_document_in_collection::ContentDocumentInCollection; use crate::content_document_linker::ContentDocumentLinker; use crate::content_document_reference::ContentDocumentReference; -use crate::content_document_source::ContentDocumentSource; use crate::document_error_collection::DocumentErrorCollection; use crate::eval_content_document_mdast::eval_content_document_mdast; use crate::filesystem::Filesystem as _; use crate::filesystem::memory::Memory; -use crate::find_front_matter_in_mdast::find_front_matter_in_mdast; use crate::find_table_of_contents_in_mdast::find_table_of_contents_in_mdast; use crate::generate_sitemap::create_sitemap; -use crate::string_to_mdast::string_to_mdast; fn render_document<'render>( ContentDocumentRenderingContext { @@ -130,69 +125,17 @@ pub async fn build_project( let memory_filesystem = Arc::new(Memory::default()); let syntax_set = SyntaxSet::load_defaults_newlines(); - let mut content_document_basename_by_id: HashMap = - HashMap::new(); - let mut content_document_by_basename: HashMap< - ContentDocumentBasename, - ContentDocumentReference, - > = HashMap::new(); + let BuildContentDocumentSourcesResult { + content_document_basename_by_id, + content_document_by_basename, + content_document_list, + content_document_sources, + } = build_content_document_sources(&source_filesystem, &generated_page_base_path).await?; + let mut content_document_collections: HashMap = HashMap::new(); let mut 
content_document_collections_ranked: HashMap = HashMap::new(); - let mut content_document_list: Vec = Vec::new(); - let mut content_document_sources: BTreeMap = - Default::default(); - - for file in source_filesystem.read_project_files().await? { - if file.kind.is_content() { - let mdast = string_to_mdast(&file.contents)?; - let front_matter: ContentDocumentFrontMatter = find_front_matter_in_mdast(&mdast)? - .ok_or_else(|| { - anyhow!("No front matter found in file: {:?}", file.relative_path) - })?; - - let basename_path = file.get_stem_path_relative_to(&PathBuf::from("content")); - let basename: ContentDocumentBasename = basename_path.clone().into(); - let content_document_reference = ContentDocumentReference { - basename_path, - front_matter: front_matter.clone(), - generated_page_base_path: generated_page_base_path.clone(), - }; - - if let Some(id) = &front_matter.id { - if content_document_basename_by_id.contains_key(id) { - error_collection.register_error( - content_document_reference.basename().to_string(), - anyhow!("Duplicate document id: #{id} in '{basename}'"), - ); - } - - content_document_basename_by_id.insert(id.clone(), basename.clone()); - } - - content_document_by_basename - .insert(basename.clone(), content_document_reference.clone()); - content_document_list.push(ContentDocument { - mdast: mdast.clone(), - reference: content_document_reference.clone(), - }); - - if content_document_reference.front_matter.render { - let relative_path = format!("{basename}.md"); - - content_document_sources.insert( - basename, - ContentDocumentSource { - file_entry: file, - mdast, - reference: content_document_reference, - relative_path, - }, - ); - } - } - } // Validate before/after/parent documents in collections for reference in content_document_by_basename.values() { diff --git a/poet/src/cmd/make/app_dir.rs b/poet/src/cmd/make/app_dir.rs index 6efc7b4..9098972 100644 --- a/poet/src/cmd/make/app_dir.rs +++ b/poet/src/cmd/make/app_dir.rs @@ -8,11 +8,13 @@ use 
async_trait::async_trait; use clap::Parser; use indoc::formatdoc; use log::info; +#[cfg(feature = "embeddings")] use log::warn; use tokio::fs; use crate::app_dir_desktop_entry::AppDirDesktopEntry; use crate::assert_valid_desktop_entry_string::assert_valid_desktop_entry_string; +#[cfg(feature = "embeddings")] use crate::cmd::EMBEDDINGS_FILENAME; use crate::cmd::builds_project::BuildsProject; use crate::cmd::handler::Handler; @@ -29,6 +31,7 @@ const ICON: &str = r#", @@ -54,11 +57,12 @@ impl AppDir { } fn render_app_run_file(&self) -> Result { + #[cfg(feature = "embeddings")] if self.embeddings_file.is_some() { - self.render_app_run_file_with_embeddings() - } else { - self.render_app_run_file_without_embeddings() + return self.render_app_run_file_with_embeddings(); } + + self.render_app_run_file_without_embeddings() } fn render_app_run_file_without_embeddings(&self) -> Result { @@ -105,6 +109,7 @@ impl AppDir { }) } + #[cfg(feature = "embeddings")] fn render_app_run_file_with_embeddings(&self) -> Result { Ok(formatdoc! 
{ r#" @@ -201,6 +206,7 @@ impl Handler for AppDir { ) .await?; + #[cfg(feature = "embeddings")] if let Some(embeddings_file) = &self.embeddings_file { info!("Copying embeddings to AppDir..."); diff --git a/poet/src/cmd/make/embeddings.rs b/poet/src/cmd/make/embeddings.rs index 1e7739f..759acad 100644 --- a/poet/src/cmd/make/embeddings.rs +++ b/poet/src/cmd/make/embeddings.rs @@ -1,31 +1,27 @@ use std::collections::BTreeMap; -use std::fs; use std::net::SocketAddr; use std::path::PathBuf; use std::str::FromStr; +use anyhow::Context as _; use anyhow::Result; use async_trait::async_trait; use clap::Parser; +use embedding_codec::EmbeddingCodec; use log::info; use paddler_types::embedding::Embedding; use paddler_types::embedding_input_document::EmbeddingInputDocument; +use tokio::fs; use url::Url; -use crate::asset_path_renderer::AssetPathRenderer; -use crate::build_authors::build_authors; -use crate::build_project::build_project; -use crate::build_project::build_project_params::BuildProjectParams; -use crate::build_project::build_project_result_stub::BuildProjectResultStub; +use crate::build_content_document_sources::build_content_document_sources; use crate::build_timer::BuildTimer; use crate::cmd::builds_project::BuildsProject; use crate::cmd::handler::Handler; use crate::cmd::value_parser::parse_socket_addr; use crate::cmd::value_parser::validate_is_directory; -use crate::compile_shortcodes::compile_shortcodes; use crate::find_text_content_in_mdast::find_text_content_in_mdast; use crate::generate_embedding::paddler_embedding_client::PaddlerEmbeddingClient; -use crate::read_esbuild_metafile_or_default::read_esbuild_metafile_or_default; #[derive(Parser)] pub struct Embeddings { @@ -49,25 +45,10 @@ impl BuildsProject for Embeddings { impl Handler for Embeddings { async fn handle(&self) -> Result<()> { let source_filesystem = self.source_filesystem(); - let rhai_template_renderer = compile_shortcodes(source_filesystem.clone()).await?; - let authors = 
build_authors(source_filesystem.clone()).await?; - - let BuildProjectResultStub { - content_document_sources, - .. - } = build_project(BuildProjectParams { - asset_path_renderer: AssetPathRenderer { - base_path: String::new(), - }, - generate_sitemap: false, - authors, - esbuild_metafile: read_esbuild_metafile_or_default(source_filesystem.clone()).await?, - generated_page_base_path: String::new(), - is_watching: false, - rhai_template_renderer, - source_filesystem, - }) - .await?; + + let content_document_sources = build_content_document_sources(&source_filesystem, "") + .await? + .content_document_sources; let documents: Vec = content_document_sources .iter() @@ -98,14 +79,21 @@ impl Handler for Embeddings { .into_iter() .map(|embedding| (embedding.source_document_id, embedding.embedding)) .collect(); + + let encoded = EmbeddingCodec::serialize(&embeddings_map); + info!( "Saving {} embeddings to {}...", embeddings_map.len(), self.output_file.display() ); - let encoded = bincode::serialize(&embeddings_map)?; - fs::write(&self.output_file, encoded)?; + fs::write(&self.output_file, encoded) + .await + .context(format!( + "Failed to write file: {}", + self.output_file.display() + ))?; info!("Done."); diff --git a/poet/src/cmd/make/mod.rs b/poet/src/cmd/make/mod.rs index ad6bde4..77418e3 100644 --- a/poet/src/cmd/make/mod.rs +++ b/poet/src/cmd/make/mod.rs @@ -1,3 +1,4 @@ pub mod app_dir; +#[cfg(feature = "embeddings")] pub mod embeddings; pub mod static_pages; diff --git a/poet/src/cmd/mod.rs b/poet/src/cmd/mod.rs index 39c4623..40a664d 100644 --- a/poet/src/cmd/mod.rs +++ b/poet/src/cmd/mod.rs @@ -9,5 +9,6 @@ mod service_manager; mod value_parser; pub mod watch; +#[cfg(feature = "embeddings")] const EMBEDDINGS_FILENAME: &str = "embeddings.bin"; const STATIC_FILES_PUBLIC_PATH: &str = "assets"; diff --git a/poet/src/cmd/serve/mod.rs b/poet/src/cmd/serve/mod.rs index 1aca462..8ef3039 100644 --- a/poet/src/cmd/serve/mod.rs +++ b/poet/src/cmd/serve/mod.rs @@ -2,11 +2,12 @@ 
mod app_data; mod http_route; use std::net::SocketAddr; -use std::path::Path; use std::path::PathBuf; +#[cfg(feature = "embeddings")] use std::str::FromStr; use std::sync::Arc; +#[cfg(feature = "embeddings")] use url::Url; use actix_files::Files; @@ -18,6 +19,8 @@ use indoc::formatdoc; use async_trait::async_trait; use clap::Parser; use log::info; +#[cfg(feature = "embeddings")] +use log::warn; use crate::app_dir_desktop_entry::AppDirDesktopEntry; use crate::asset_path_renderer::AssetPathRenderer; @@ -28,8 +31,10 @@ use crate::build_project::build_project_result::BuildProjectResult; use crate::build_project::build_project_result_holder::BuildProjectResultHolder; use crate::build_prompt_document_controller_collection::build_prompt_document_controller_collection; use crate::build_prompt_document_controller_collection::build_prompt_document_controller_collection_params::BuildPromptControllerCollectionParams; +#[cfg(feature = "embeddings")] use crate::generate_embedding::paddler_embedding_client::PaddlerEmbeddingClient; use crate::holder::Holder as _; +#[cfg(feature = "embeddings")] use crate::cmd::EMBEDDINGS_FILENAME; use crate::cmd::STATIC_FILES_PUBLIC_PATH; use crate::cmd::builds_project::BuildsProject; @@ -53,7 +58,9 @@ use crate::search_index::SearchIndex; use crate::search_index_reader::SearchIndexReader; use crate::search_index_reader_holder::SearchIndexReaderHolder; use crate::search_tool::SearchTool; +#[cfg(feature = "embeddings")] use crate::semantic_search_index::SemanticSearchIndex; +#[cfg(feature = "embeddings")] use crate::semantic_search_tool::SemanticSearchTool; #[derive(Parser)] @@ -67,6 +74,7 @@ pub struct Serve { #[arg(long)] app_name: String, + #[cfg(feature = "embeddings")] #[arg(long, value_parser = parse_socket_addr)] paddler_addr: Option, @@ -195,20 +203,29 @@ impl Handler for Serve { search_index_reader_holder: search_index_reader_holder.clone(), }); - let embeddings_path = Path::new(EMBEDDINGS_FILENAME); - - if let Some(paddler_addr) = 
&self.paddler_addr { - let semantic_search_index = - Arc::new(SemanticSearchIndex::load_from_file(&embeddings_path)?); - let inference_url = Url::from_str(&format!("http://{paddler_addr}"))?; - let paddler_embeddings_client = Arc::new(PaddlerEmbeddingClient::new(inference_url)); - - tool_registry.register_owned(SemanticSearchTool { - mcp_resource_provider_content_documents: mcp_resource_provider_content_documents - .clone(), - paddler_embeddings_client, - semantic_search_index, - }); + #[cfg(feature = "embeddings")] + { + let embeddings_path = PathBuf::from(EMBEDDINGS_FILENAME); + + match &self.paddler_addr { + Some(paddler_addr) => { + let semantic_search_index = + Arc::new(SemanticSearchIndex::load_from_file(&embeddings_path)?); + let inference_url = Url::from_str(&format!("http://{paddler_addr}"))?; + let paddler_embeddings_client = + Arc::new(PaddlerEmbeddingClient::new(inference_url)); + + tool_registry.register_owned(SemanticSearchTool { + mcp_resource_provider_content_documents: + mcp_resource_provider_content_documents.clone(), + paddler_embeddings_client, + semantic_search_index, + }); + } + None => { + warn!("Paddler address not provided, semantic search tool will be disabled"); + } + } } let tool_registry_arc: Arc = Arc::new(tool_registry); diff --git a/poet/src/lib.rs b/poet/src/lib.rs index f57b342..bba7d3d 100644 --- a/poet/src/lib.rs +++ b/poet/src/lib.rs @@ -9,6 +9,8 @@ pub mod author_collection; pub mod author_data; pub mod author_resolve_result; pub mod build_authors; +pub mod build_content_document_sources; +pub mod build_content_document_sources_result; pub mod build_project; pub mod build_prompt_document_controller; pub mod build_prompt_document_controller_collection; @@ -43,8 +45,9 @@ pub mod find_front_matter_in_mdast; pub mod find_table_of_contents_in_mdast; pub mod find_text_content_in_mdast; pub mod flexible_datetime; -pub mod generate_sitemap; +#[cfg(feature = "embeddings")] pub mod generate_embedding; +pub mod generate_sitemap; pub mod 
holder; pub mod is_external_link; pub mod is_valid_desktop_entry_string; @@ -69,8 +72,11 @@ pub mod search_index_reader; pub mod search_index_reader_holder; pub mod search_index_schema; pub mod search_tool; +#[cfg(feature = "embeddings")] pub mod semantic_search_index; +#[cfg(feature = "embeddings")] pub mod semantic_search_index_holder; +#[cfg(feature = "embeddings")] pub mod semantic_search_tool; pub mod string_to_mdast; pub mod table_of_contents; diff --git a/poet/src/main.rs b/poet/src/main.rs index 4230ac9..1b400ef 100644 --- a/poet/src/main.rs +++ b/poet/src/main.rs @@ -3,6 +3,7 @@ use clap::Parser; use clap::Subcommand; use poet::cmd::handler::Handler; use poet::cmd::make::app_dir::AppDir; +#[cfg(feature = "embeddings")] use poet::cmd::make::embeddings::Embeddings; use poet::cmd::make::static_pages::StaticPages; use poet::cmd::serve::Serve; @@ -33,6 +34,7 @@ enum Make { /// Generates AppDir (packageable with AppImageKit) AppDir(AppDir), /// Generates embeddings for all content documents + #[cfg(feature = "embeddings")] Embeddings(Embeddings), /// Generates static pages StaticPages(StaticPages), @@ -42,6 +44,7 @@ fn get_handler() -> Option> { match Cli::parse().command { Some(Commands::Make { command }) => match command { Make::AppDir(handler) => Some(Box::new(handler)), + #[cfg(feature = "embeddings")] Make::Embeddings(handler) => Some(Box::new(handler)), Make::StaticPages(handler) => Some(Box::new(handler)), }, diff --git a/poet/src/search_tool.rs b/poet/src/search_tool.rs index 674c7ee..b994b5d 100644 --- a/poet/src/search_tool.rs +++ b/poet/src/search_tool.rs @@ -41,6 +41,10 @@ impl ToolProvider for SearchTool { fn name(&self) -> String { "search".to_string() } + + fn description(&self) -> Option { + Some("Search content using key-word match".to_string()) + } } #[async_trait] diff --git a/poet/src/semantic_search_index.rs b/poet/src/semantic_search_index.rs index 3cbb097..fee9996 100644 --- a/poet/src/semantic_search_index.rs +++ 
b/poet/src/semantic_search_index.rs @@ -1,8 +1,10 @@ +use std::cmp::Ordering; use std::collections::BTreeMap; use std::fs; use std::path::Path; use anyhow::Result; +use embedding_codec::EmbeddingCodec; use log::debug; pub struct SemanticSearchIndex { @@ -12,7 +14,7 @@ pub struct SemanticSearchIndex { impl SemanticSearchIndex { pub fn load_from_file(path: &Path) -> Result { let bytes = fs::read(path)?; - let embeddings: BTreeMap> = bincode::deserialize(&bytes)?; + let embeddings = EmbeddingCodec::deserialize(&bytes)?; Ok(Self { embeddings }) } @@ -32,7 +34,7 @@ impl SemanticSearchIndex { .filter(|(_basename, score)| *score >= min_score) .collect(); - scored.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal)); + scored.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(Ordering::Equal)); scored.truncate(top_k); scored