Skip to content

Commit d92907e

Browse files
Refactor grid search parameters and model settings
1 parent 3ea264b commit d92907e

1 file changed

Lines changed: 28 additions & 6 deletions

File tree

configs/chicago.yml

Lines changed: 28 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -4,30 +4,52 @@
44
grid_search_params:
55
chunk_size: [10, 20, 50, 100, 250]
66
chunk_overlap: [0, 5, 10, 25, 50]
7-
chunking_strategy: ["langchain", "raw", "semchunk", "nltk", "spacy"]
8-
similarity_metrics: ["cosine", "euclidean", "manhattan", "dot_product", "chebyshev"]
7+
chunking_strategy: ["langchain", "raw", "semchunk", "nltk"]
8+
similarity_metrics: ["cosine", "euclidean", "dot_product"]
99
themes:
1010
sports: ["ball", "team", "stadium", "game", "player", "match", "competition", "score", "victory", "defeat", "training"]
1111
architecture: ["building", "structure", "design", "construction", "urbanism", "facade", "materials", "bridge", "tower", "window", "roof"]
1212
cuisine: ["food", "restaurant", "recipe", "ingredient", "dish", "flavor", "meal", "kitchen", "chef", "taste", "menu"]
1313

1414
# Models to be tested in the grid search
1515
models_to_test:
16+
- type: "sentence_transformers"
17+
name: "Qwen/Qwen3-Embedding-0.6B"
18+
dimensions: 1024
19+
max_tokens: 32768
20+
pooling_strategy: "average"
1621
- type: "sentence_transformers"
1722
name: "google/embeddinggemma-300m"
1823
dimensions: 768
24+
max_tokens: 2048
25+
pooling_strategy: "average"
1926
- type: "fastembed"
2027
name: "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
2128
dimensions: 384
22-
- type: "fastembed"
23-
name: "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
24-
dimensions: 768
29+
max_tokens: 512
30+
pooling_strategy: "average"
2531
- type: "fastembed"
2632
name: "jinaai/jina-embeddings-v3"
2733
dimensions: 1024
34+
max_tokens: 8192
35+
pooling_strategy: "average"
36+
- type: "sentence_transformers"
37+
name: "jinaai/jina-embeddings-v5-text-small-retrieval"
38+
dimensions: 1024
39+
max_tokens: 32768
40+
pooling_strategy: "average"
41+
- type: "sentence_transformers"
42+
name: "KaLM-Embedding/KaLM-embedding-multilingual-mini-instruct-v2.5"
43+
dimensions: 896
44+
max_tokens: 32768
45+
pooling_strategy: "average"
46+
- type: "sentence_transformers"
47+
name: "voyageai/voyage-4-nano"
48+
dimensions: 1024
49+
max_tokens: 32000
50+
pooling_strategy: "average"
2851

2952
# General settings
30-
similarity_threshold: 0.6
3153
output_dir: "reports"
3254
generate_filtered_markdowns: false
3355

0 commit comments

Comments (0)