|
4 | 4 | grid_search_params: |
5 | 5 | chunk_size: [10, 20, 50, 100, 250] |
6 | 6 | chunk_overlap: [0, 5, 10, 25, 50] |
7 | | - chunking_strategy: ["langchain", "raw", "semchunk", "nltk", "spacy"] |
8 | | - similarity_metrics: ["cosine", "euclidean", "manhattan", "dot_product", "chebyshev"] |
| 7 | + chunking_strategy: ["langchain", "raw", "semchunk", "nltk"] |
| 8 | + similarity_metrics: ["cosine", "euclidean", "dot_product"] |
9 | 9 | themes: |
10 | 10 | sports: ["ball", "team", "stadium", "game", "player", "match", "competition", "score", "victory", "defeat", "training"] |
11 | 11 | architecture: ["building", "structure", "design", "construction", "urbanism", "facade", "materials", "bridge", "tower", "window", "roof"] |
12 | 12 | cuisine: ["food", "restaurant", "recipe", "ingredient", "dish", "flavor", "meal", "kitchen", "chef", "taste", "menu"] |
13 | 13 |
|
14 | 14 | # Models to be tested in the grid search |
15 | 15 | models_to_test: |
| 16 | + - type: "sentence_transformers" |
| 17 | + name: "Qwen/Qwen3-Embedding-0.6B" |
| 18 | + dimensions: 1024 |
| 19 | + max_tokens: 32768 |
| 20 | + pooling_strategy: "average" |
16 | 21 | - type: "sentence_transformers" |
17 | 22 | name: "google/embeddinggemma-300m" |
18 | 23 | dimensions: 768 |
| 24 | + max_tokens: 2048 |
| 25 | + pooling_strategy: "average" |
19 | 26 | - type: "fastembed" |
20 | 27 | name: "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2" |
21 | 28 | dimensions: 384 |
22 | | - - type: "fastembed" |
23 | | - name: "sentence-transformers/paraphrase-multilingual-mpnet-base-v2" |
24 | | - dimensions: 768 |
| 29 | + max_tokens: 512 |
| 30 | + pooling_strategy: "average" |
25 | 31 | - type: "fastembed" |
26 | 32 | name: "jinaai/jina-embeddings-v3" |
27 | 33 | dimensions: 1024 |
| 34 | + max_tokens: 8192 |
| 35 | + pooling_strategy: "average" |
| 36 | + - type: "sentence_transformers" |
| 37 | + name: "jinaai/jina-embeddings-v5-text-small-retrieval" |
| 38 | + dimensions: 1024 |
| 39 | + max_tokens: 32768 |
| 40 | + pooling_strategy: "average" |
| 41 | + - type: "sentence_transformers" |
| 42 | + name: "KaLM-Embedding/KaLM-embedding-multilingual-mini-instruct-v2.5" |
| 43 | + dimensions: 896 |
| 44 | + max_tokens: 32768 |
| 45 | + pooling_strategy: "average" |
| 46 | + - type: "sentence_transformers" |
| 47 | + name: "voyageai/voyage-4-nano" |
| 48 | + dimensions: 1024 |
| 49 | + max_tokens: 32000 |
| 50 | + pooling_strategy: "average" |
28 | 51 |
|
29 | 52 | # General settings |
30 | | -similarity_threshold: 0.6 |
31 | 53 | output_dir: "reports" |
32 | 54 | generate_filtered_markdowns: false |
33 | 55 |
|
|
0 commit comments