-
Notifications
You must be signed in to change notification settings - Fork 2
Embeddings #30
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Embeddings #30
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,9 @@ | ||
| [package] | ||
| description = "Binary codec for embedding index files" | ||
| license = "Apache-2.0" | ||
| name = "embedding_codec" | ||
| version = "0.1.0" | ||
| edition = "2024" | ||
|
|
||
| [dependencies] | ||
| anyhow = { workspace = true } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,197 @@ | ||
| use std::collections::BTreeMap; | ||
| use std::str::from_utf8; | ||
|
|
||
| use anyhow::Result; | ||
| use anyhow::bail; | ||
| use anyhow::ensure; | ||
|
|
||
| const EMBEDDING_CODEC_VERSION_STRING: &str = | ||
| concat!("embedding_codec_version ", env!("CARGO_PKG_VERSION")); | ||
|
|
||
| pub struct EmbeddingCodec; | ||
|
|
||
| impl EmbeddingCodec { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It would be nice to support an arbitrary number of metadata parameters. Usually that is achieved by a field in the header that points to the place in the blob where the embeddings actually start. So we could have the blob organized somewhat like: The blob organization is just to give the idea. Being flexible with metadata can help us long-term, though. |
||
| pub fn serialize(embeddings: &BTreeMap<String, Vec<f32>>) -> Vec<u8> { | ||
| let mut buffer = Vec::new(); | ||
|
|
||
| buffer.extend_from_slice(&(EMBEDDING_CODEC_VERSION_STRING.len() as u32).to_le_bytes()); | ||
| buffer.extend_from_slice(EMBEDDING_CODEC_VERSION_STRING.as_bytes()); | ||
| buffer.extend_from_slice(&(embeddings.len() as u32).to_le_bytes()); | ||
|
|
||
| for (key, embedding) in embeddings { | ||
| let key_bytes = key.as_bytes(); | ||
|
|
||
| buffer.extend_from_slice(&(key_bytes.len() as u32).to_le_bytes()); | ||
| buffer.extend_from_slice(key_bytes); | ||
| buffer.extend_from_slice(&(embedding.len() as u32).to_le_bytes()); | ||
|
|
||
| for &value in embedding { | ||
| buffer.extend_from_slice(&value.to_le_bytes()); | ||
| } | ||
| } | ||
|
|
||
| buffer | ||
| } | ||
|
|
||
| pub fn deserialize(bytes: &[u8]) -> Result<BTreeMap<String, Vec<f32>>> { | ||
| let mut offset = 0; | ||
|
|
||
| let version_length = read_u32_le(bytes, &mut offset)? as usize; | ||
|
|
||
| ensure!( | ||
| offset + version_length <= bytes.len(), | ||
| "unexpected end of data while reading version" | ||
| ); | ||
|
|
||
| let file_version = from_utf8(&bytes[offset..offset + version_length])?; | ||
|
|
||
| if file_version != EMBEDDING_CODEC_VERSION_STRING { | ||
| bail!( | ||
| "embedding codec version mismatch: file was written with {file_version}, current version is {EMBEDDING_CODEC_VERSION_STRING}" | ||
| ); | ||
| } | ||
|
|
||
| offset += version_length; | ||
|
|
||
| let entry_count = read_u32_le(bytes, &mut offset)? as usize; | ||
| let mut embeddings = BTreeMap::new(); | ||
|
|
||
| for _ in 0..entry_count { | ||
| let key_length = read_u32_le(bytes, &mut offset)? as usize; | ||
|
|
||
| ensure!( | ||
| offset + key_length <= bytes.len(), | ||
| "unexpected end of data while reading key" | ||
| ); | ||
|
|
||
| let key = from_utf8(&bytes[offset..offset + key_length])?.to_string(); | ||
| offset += key_length; | ||
|
|
||
| let embedding_length = read_u32_le(bytes, &mut offset)? as usize; | ||
| let float_byte_length = embedding_length * 4; | ||
|
|
||
| ensure!( | ||
| offset + float_byte_length <= bytes.len(), | ||
| "unexpected end of data while reading embedding" | ||
| ); | ||
|
|
||
| let mut embedding = Vec::with_capacity(embedding_length); | ||
|
|
||
| for index in 0..embedding_length { | ||
| let start = offset + index * 4; | ||
| let value = f32::from_le_bytes([ | ||
| bytes[start], | ||
| bytes[start + 1], | ||
| bytes[start + 2], | ||
| bytes[start + 3], | ||
| ]); | ||
|
|
||
| embedding.push(value); | ||
| } | ||
|
|
||
| offset += float_byte_length; | ||
| embeddings.insert(key, embedding); | ||
| } | ||
|
|
||
| if offset != bytes.len() { | ||
| bail!( | ||
| "trailing data: {} bytes remaining after {} entries", | ||
| bytes.len() - offset, | ||
| entry_count | ||
| ); | ||
| } | ||
|
|
||
| Ok(embeddings) | ||
| } | ||
| } | ||
|
|
||
| fn read_u32_le(bytes: &[u8], offset: &mut usize) -> Result<u32> { | ||
| ensure!( | ||
| *offset + 4 <= bytes.len(), | ||
| "unexpected end of data while reading u32 at offset {}", | ||
| offset | ||
| ); | ||
|
|
||
| let value = u32::from_le_bytes([ | ||
| bytes[*offset], | ||
| bytes[*offset + 1], | ||
| bytes[*offset + 2], | ||
| bytes[*offset + 3], | ||
| ]); | ||
| *offset += 4; | ||
|
|
||
| Ok(value) | ||
| } | ||
|
|
||
| #[cfg(test)] | ||
| mod tests { | ||
| use super::*; | ||
|
|
||
| #[test] | ||
| fn test_round_trip() { | ||
| let mut embeddings = BTreeMap::new(); | ||
| embeddings.insert("doc_a".to_string(), vec![1.0, 2.0, 3.0]); | ||
| embeddings.insert("doc_b".to_string(), vec![0.5, -0.5]); | ||
| embeddings.insert("empty".to_string(), vec![]); | ||
|
|
||
| let bytes = EmbeddingCodec::serialize(&embeddings); | ||
| let result = EmbeddingCodec::deserialize(&bytes).unwrap(); | ||
|
|
||
| assert_eq!(embeddings, result); | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_empty_map() { | ||
| let embeddings = BTreeMap::new(); | ||
| let bytes = EmbeddingCodec::serialize(&embeddings); | ||
| let result = EmbeddingCodec::deserialize(&bytes).unwrap(); | ||
|
|
||
| assert_eq!(embeddings, result); | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_truncated_data_is_rejected() { | ||
| let mut embeddings = BTreeMap::new(); | ||
| embeddings.insert("key".to_string(), vec![1.0]); | ||
|
|
||
| let bytes = EmbeddingCodec::serialize(&embeddings); | ||
| let truncated = &bytes[..bytes.len() - 2]; | ||
|
|
||
| assert!(EmbeddingCodec::deserialize(truncated).is_err()); | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_trailing_data_is_rejected() { | ||
| let mut embeddings = BTreeMap::new(); | ||
| embeddings.insert("key".to_string(), vec![1.0]); | ||
|
|
||
| let mut bytes = EmbeddingCodec::serialize(&embeddings); | ||
| bytes.push(0xFF); | ||
|
|
||
| assert!(EmbeddingCodec::deserialize(&bytes).is_err()); | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_version_mismatch_is_rejected() { | ||
| let mut bytes = Vec::new(); | ||
| let fake_version = b"99.99.99"; | ||
|
|
||
| bytes.extend_from_slice(&(fake_version.len() as u32).to_le_bytes()); | ||
| bytes.extend_from_slice(fake_version); | ||
| bytes.extend_from_slice(&0u32.to_le_bytes()); | ||
|
|
||
| let error = EmbeddingCodec::deserialize(&bytes).unwrap_err(); | ||
|
|
||
| assert!(error.to_string().contains("version mismatch")); | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_serialized_data_starts_with_version() { | ||
| let embeddings = BTreeMap::new(); | ||
| let bytes = EmbeddingCodec::serialize(&embeddings); | ||
| let version_length = u32::from_le_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]) as usize; | ||
| let version = from_utf8(&bytes[4..4 + version_length]).unwrap(); | ||
|
|
||
| assert_eq!(version, EMBEDDING_CODEC_VERSION_STRING); | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,96 @@ | ||
| use std::collections::BTreeMap; | ||
| use std::collections::HashMap; | ||
| use std::path::PathBuf; | ||
| use std::sync::Arc; | ||
|
|
||
| use anyhow::Result; | ||
| use anyhow::anyhow; | ||
|
|
||
| use crate::build_content_document_sources_result::BuildContentDocumentSourcesResult; | ||
| use crate::content_document::ContentDocument; | ||
| use crate::content_document_basename::ContentDocumentBasename; | ||
| use crate::content_document_front_matter::ContentDocumentFrontMatter; | ||
| use crate::content_document_reference::ContentDocumentReference; | ||
| use crate::content_document_source::ContentDocumentSource; | ||
| use crate::document_error_collection::DocumentErrorCollection; | ||
| use crate::filesystem::Filesystem as _; | ||
| use crate::filesystem::storage::Storage; | ||
| use crate::find_front_matter_in_mdast::find_front_matter_in_mdast; | ||
| use crate::string_to_mdast::string_to_mdast; | ||
|
|
||
| pub async fn build_content_document_sources( | ||
| source_filesystem: &Arc<Storage>, | ||
| generated_page_base_path: &str, | ||
| ) -> Result<BuildContentDocumentSourcesResult> { | ||
| let error_collection: DocumentErrorCollection = Default::default(); | ||
| let mut content_document_basename_by_id: HashMap<String, ContentDocumentBasename> = | ||
| HashMap::new(); | ||
| let mut content_document_by_basename: HashMap< | ||
| ContentDocumentBasename, | ||
| ContentDocumentReference, | ||
| > = HashMap::new(); | ||
| let mut content_document_list: Vec<ContentDocument> = Vec::new(); | ||
| let mut content_document_sources: BTreeMap<ContentDocumentBasename, ContentDocumentSource> = | ||
| Default::default(); | ||
|
|
||
| for file in source_filesystem.read_project_files().await? { | ||
| if file.kind.is_content() { | ||
| let mdast = string_to_mdast(&file.contents)?; | ||
| let front_matter: ContentDocumentFrontMatter = find_front_matter_in_mdast(&mdast)? | ||
| .ok_or_else(|| { | ||
| anyhow!("No front matter found in file: {:?}", file.relative_path) | ||
| })?; | ||
|
|
||
| let basename_path = file.get_stem_path_relative_to(&PathBuf::from("content")); | ||
| let basename: ContentDocumentBasename = basename_path.clone().into(); | ||
| let content_document_reference = ContentDocumentReference { | ||
| basename_path, | ||
| front_matter: front_matter.clone(), | ||
| generated_page_base_path: generated_page_base_path.to_string(), | ||
| }; | ||
|
|
||
| if let Some(id) = &front_matter.id { | ||
| if content_document_basename_by_id.contains_key(id) { | ||
| error_collection.register_error( | ||
| content_document_reference.basename().to_string(), | ||
| anyhow!("Duplicate document id: #{id} in '{basename}'"), | ||
| ); | ||
| } | ||
|
|
||
| content_document_basename_by_id.insert(id.clone(), basename.clone()); | ||
| } | ||
|
|
||
| content_document_by_basename | ||
| .insert(basename.clone(), content_document_reference.clone()); | ||
| content_document_list.push(ContentDocument { | ||
| mdast: mdast.clone(), | ||
| reference: content_document_reference.clone(), | ||
| }); | ||
|
|
||
| if content_document_reference.front_matter.render { | ||
| let relative_path = format!("{basename}.md"); | ||
|
|
||
| content_document_sources.insert( | ||
| basename, | ||
| ContentDocumentSource { | ||
| file_entry: file, | ||
| mdast, | ||
| reference: content_document_reference, | ||
| relative_path, | ||
| }, | ||
| ); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| if !error_collection.is_empty() { | ||
| return Err(anyhow!("{error_collection}")); | ||
| } | ||
|
|
||
| Ok(BuildContentDocumentSourcesResult { | ||
| content_document_basename_by_id, | ||
| content_document_by_basename, | ||
| content_document_list, | ||
| content_document_sources, | ||
| }) | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,14 @@ | ||
| use std::collections::BTreeMap; | ||
| use std::collections::HashMap; | ||
|
|
||
| use crate::content_document::ContentDocument; | ||
| use crate::content_document_basename::ContentDocumentBasename; | ||
| use crate::content_document_reference::ContentDocumentReference; | ||
| use crate::content_document_source::ContentDocumentSource; | ||
|
|
||
| pub struct BuildContentDocumentSourcesResult { | ||
| pub content_document_basename_by_id: HashMap<String, ContentDocumentBasename>, | ||
| pub content_document_by_basename: HashMap<ContentDocumentBasename, ContentDocumentReference>, | ||
| pub content_document_list: Vec<ContentDocument>, | ||
| pub content_document_sources: BTreeMap<ContentDocumentBasename, ContentDocumentSource>, | ||
| } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The workspace dependencies reference external path dependencies (
../paddler/paddler_clientand../paddler/paddler_types) that are not part of this repository. This makes the build depend on code outside the repository, which can cause issues for users trying to build the project. Consider one of the following approaches: 1) Include the paddler dependencies as git submodules, 2) Publish these dependencies to crates.io and use version dependencies, or 3) Document the requirement for users to have the paddler repository cloned as a sibling directory.