check_scraped_data.py — 77 lines (64 loc) · 2.88 KB
import json
import os
from transformers import BertTokenizer
def extract_topic(filename: str) -> str:
    """Return the topic field of a scraped-data filename.

    Filenames follow the pattern ``level_topic_problems.json``; the topic is
    the second underscore-separated field. An empty string is returned when
    the name has fewer than three fields.
    """
    fields = filename.split("_")
    return fields[1] if len(fields) >= 3 else ""
def main() -> None:
    """Summarize the scraped problem data and flag suspiciously short problems.

    Reads every ``*.json`` file under ``scraped_data/problems`` whose name
    contains a competition tag (amc8/amc10/amc12/aime), prints per-file and
    aggregate character/token statistics, compares the scraped problem count
    against the tick index under ``scraped_data/ticks``, and dumps problems
    tokenizing to <= 18 tokens into ``suspicious_problems.json``.
    """
    total_problems = 0
    total_chars = 0
    total_tokens = 0
    min_chars = float('inf')
    max_chars = float('-inf')
    min_tokens = float('inf')
    max_tokens = float('-inf')

    tokenizer = BertTokenizer.from_pretrained('models/topic_classifier_10000_epoch3_0815_00-15-27')

    problem_files = sorted(os.listdir("scraped_data/problems"), key=extract_topic)
    suspicious_problems = {}
    for file in problem_files:
        # Filter by extension BEFORE opening — the original opened the file
        # first, which wasted a handle and would fail on subdirectories.
        if not file.endswith(".json"):
            continue
        if not any(t in file for t in ('amc8', 'amc10', 'amc12', 'aime')):
            continue
        with open(f"scraped_data/problems/{file}", "r") as f:
            data = json.load(f)
        num_problems = len(data)
        print(f"{file:<40} {num_problems:>5} problems")
        total_problems += num_problems
        # `problem_id` instead of `id` — don't shadow the builtin.
        for problem_id, problem_data in data.items():
            problem_text = problem_data['problem'] + problem_data['answer_choices']
            char_len = len(problem_text)
            token_len = len(tokenizer.tokenize(problem_text))
            if token_len <= 18:
                # Very short sequences usually indicate a scraping failure.
                suspicious_problems[problem_id] = problem_data
            total_chars += char_len
            total_tokens += token_len
            min_chars = min(min_chars, char_len)
            max_chars = max(max_chars, char_len)
            min_tokens = min(min_tokens, token_len)
            max_tokens = max(max_tokens, token_len)

    # Every problem indexed on the wiki has a tick entry; use the tick total
    # as the denominator for scrape coverage.
    total_ticks = 0
    for file in sorted(os.listdir("scraped_data/ticks")):
        with open(f"scraped_data/ticks/{file}", "r") as f:
            total_ticks += len(json.load(f))

    # Guard the ratios so an empty scrape reports 0 instead of raising
    # ZeroDivisionError.
    coverage = 100 * total_problems / total_ticks if total_ticks else 0.0
    avg_tokens = total_tokens / total_problems if total_problems else 0.0

    print(f"...\n{'Total problems indexed on wiki:':<40} {total_ticks:>5}")
    print(f"{'Total problems scraped:':<40} {total_problems:>5} ({coverage:.2f}%)")
    print(f"{'Total characters:':<40} {total_chars:>8}")
    print(f"{'Total tokens:':<40} {total_tokens:>8}")
    print(f"{'Average sequence length:':<40} {avg_tokens:>8.2f} tokens")
    print(f"{'Shortest sequence length:':<40} {min_tokens:>8} tokens")
    print(f"{'':<40} {min_chars:>8} chars")
    print(f"{'Longest sequence length:':<40} {max_tokens:>8} tokens")
    print(f"{'':<40} {max_chars:>8} chars")
    print(f"{'Vocab size:':<40} {len(tokenizer.vocab):>8}")

    with open("suspicious_problems.json", "w") as f:
        json.dump(suspicious_problems, f, indent=4)
# Run the report only when executed as a script, not on import.
if __name__ == "__main__":
    main()