-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathlog_words.py
More file actions
executable file
·25 lines (19 loc) · 825 Bytes
/
log_words.py
File metadata and controls
executable file
·25 lines (19 loc) · 825 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
import os
import json
from transformers import AutoTokenizer
from config import LOG_DIR, BASIL_MODEL_NAME
# Tokenizer used to count tokens; loaded once and reused for every log file.
tokenizer = AutoTokenizer.from_pretrained(BASIL_MODEL_NAME)

# Only utterances from these speakers contribute to the counts; entries from
# any other speaker are ignored.
COUNTED_SPEAKERS = ("Tutor", "Sophie", "Basil")

# Collect every log file whose name looks like "log_2025*.jsonl" (i.e. all
# logs from 2025, not just today's). os.listdir() returns names in arbitrary
# order, so sort them to make the report order reproducible between runs.
log_fnames = sorted(
    f
    for f in os.listdir(LOG_DIR)
    if f.startswith("log_2025") and f.endswith(".jsonl")
)

# Print one summary line per log file: whitespace-separated word count and
# tokenizer token count of the combined "Speaker: text" transcript.
for fname in log_fnames:
    # Accumulate the formatted utterances and join once at the end instead of
    # repeatedly growing a string with +=.
    utterances = []
    with open(os.path.join(LOG_DIR, fname), "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                # Blank lines (e.g. a trailing empty line) are not valid JSON
                # and would make json.loads() raise; skip them.
                continue
            entry = json.loads(line)
            if entry["speaker"] in COUNTED_SPEAKERS:
                utterances.append(f"{entry['speaker']}: {entry['text']}\n")
    full_text = "".join(utterances)

    total_words = len(full_text.split())
    # NOTE(review): the token count is whatever the tokenizer returns for the
    # whole transcript; it presumably includes any special tokens the
    # tokenizer adds by default -- confirm if an exact content count matters.
    total_tokens = len(tokenizer(full_text)["input_ids"])
    print(f"{fname}: {total_words:,} words, {total_tokens:,} tokens")