-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata.py
More file actions
26 lines (18 loc) · 753 Bytes
/
data.py
File metadata and controls
26 lines (18 loc) · 753 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
"""Load Poe corpus - scraped from poemuseum.org via scrape.py."""
import os
# Corpus files are kept in a "data" folder located next to this module.
DATA_DIR = os.path.join(
    os.path.dirname(__file__),
    "data",
)
def load_corpus() -> str:
    """Load the pre-scraped corpus and return it as one string.

    The text is read from ``poe_corpus.txt`` inside ``DATA_DIR``. Run
    ``scrape.py`` first if the file does not exist.

    Returns:
        The entire contents of the corpus file.

    Raises:
        FileNotFoundError: If the corpus file is missing. A hint telling
            the user to run ``scrape.py`` is printed before raising.
    """
    corpus_path = os.path.join(DATA_DIR, "poe_corpus.txt")
    if not os.path.exists(corpus_path):
        print("Corpus not found. Run: python scrape.py")
        raise FileNotFoundError(corpus_path)
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding (which would garble or reject curly
    # quotes, dashes and accented characters on some systems).
    # NOTE(review): assumes scrape.py writes the corpus as UTF-8 - confirm.
    with open(corpus_path, "r", encoding="utf-8") as f:
        return f.read()
if __name__ == "__main__":
    # When run as a script, print a short summary of the loaded corpus.
    text = load_corpus()

    # Gather the statistics first, then report them in a fixed order.
    total_chars = len(text)
    distinct_chars = len(set(text))
    word_total = len(text.split())  # whitespace-delimited tokens

    print(f"Corpus size: {total_chars:,} chars")
    print(f"Unique characters: {distinct_chars}")
    print(f"Word count: {word_total:,}")