-
Notifications
You must be signed in to change notification settings - Fork 27
Expand file tree
/
Copy pathinstall_stanza_models.py
More file actions
53 lines (41 loc) · 1.82 KB
/
install_stanza_models.py
File metadata and controls
53 lines (41 loc) · 1.82 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
from zeeguu.core.model.language import Language
import stanza
import os
from zeeguu.core.tokenization.stanza_tokenizer import STANZA_RESOURCE_DIR
def stanza_model_installation():
    """
    Ensure the Stanza models for every learnable language are on disk.

    For each code in Language.CODES_OF_LANGUAGES_THAT_CAN_BE_LEARNED the
    download is skipped when a "depparse" directory is already present
    under STANZA_RESOURCE_DIR/<stanza code>, so models kept in a
    persistent location (e.g. a Docker volume) are not fetched again on
    every container start.

    Processors requested from Stanza:
    - tokenize: basic tokenization
    - pos: part-of-speech tagging
    - lemma: lemmatization
    - depparse: dependency parsing, used for particle verb detection

    A failed download is reported on stdout and does not stop the
    remaining languages from being processed. Returns None.
    """
    # Language codes whose Stanza name differs from the one used here.
    code_overrides = {
        'no': 'nb',  # Norwegian -> Norwegian Bokmål
    }

    for learned_code in Language.CODES_OF_LANGUAGES_THAT_CAN_BE_LEARNED:
        stanza_code = code_overrides.get(learned_code, learned_code)
        language_dir = os.path.join(STANZA_RESOURCE_DIR, stanza_code)

        # The depparse directory is the marker for a complete install: a
        # language directory holding only the basic tokenize models is
        # not enough.
        if os.path.exists(os.path.join(language_dir, "depparse")):
            print(f"✓ Stanza models (including depparse) for {learned_code} already exist, skipping")
            continue

        # An existing language directory without depparse is reported as
        # an update; a missing one as a fresh download.
        print(
            f"⬇ Updating Stanza models for {learned_code} (adding lemma+depparse)..."
            if os.path.exists(language_dir)
            else f"⬇ Downloading Stanza models for {learned_code} ({stanza_code})..."
        )

        try:
            stanza.download(
                stanza_code,
                processors="tokenize,pos,lemma,depparse",
                model_dir=STANZA_RESOURCE_DIR,
            )
            print(f"✓ Stanza models for {learned_code} downloaded successfully")
        except Exception as err:
            # Best effort: report the failure and move on to the next language.
            print(f"✗ Failed to download models for {learned_code}: {err}")
# Run the installation only when this file is executed as a script, so that
# importing the module does not trigger any downloads.
if __name__ == "__main__":
    stanza_model_installation()