check_scraped_data.py — 77 lines (64 loc) · 2.88 KB
import json
import os
from transformers import BertTokenizer
def extract_topic(filename: str) -> str:
    """Return the topic field of a scraped-data filename.

    Filenames follow the pattern ``level_topic_problems.json``; the topic is
    the second underscore-separated field. An empty string is returned when
    the name has fewer than three fields.
    """
    fields = filename.split("_")
    return fields[1] if len(fields) >= 3 else ""
def main() -> None:
    """Summarize the scraped problem data and flag suspiciously short problems.

    Reads every ``*.json`` file under ``scraped_data/problems`` whose name
    contains a competition tag (amc8/amc10/amc12/aime), prints per-file and
    aggregate character/token statistics, compares the scraped problem count
    against the tick index under ``scraped_data/ticks``, and dumps problems
    tokenizing to <= 18 tokens into ``suspicious_problems.json``.
    """
    total_problems = 0
    total_chars = 0
    total_tokens = 0
    min_chars = float('inf')
    max_chars = float('-inf')
    min_tokens = float('inf')
    max_tokens = float('-inf')

    tokenizer = BertTokenizer.from_pretrained('models/topic_classifier_10000_epoch3_0815_00-15-27')

    problem_files = sorted(os.listdir("scraped_data/problems"), key=extract_topic)
    suspicious_problems = {}
    for file in problem_files:
        # Filter by extension BEFORE opening — the original opened the file
        # first, which wasted a handle and would fail on subdirectories.
        if not file.endswith(".json"):
            continue
        if not any(t in file for t in ('amc8', 'amc10', 'amc12', 'aime')):
            continue
        with open(f"scraped_data/problems/{file}", "r") as f:
            data = json.load(f)
        num_problems = len(data)
        print(f"{file:<40} {num_problems:>5} problems")
        total_problems += num_problems
        # `problem_id` instead of `id` — don't shadow the builtin.
        for problem_id, problem_data in data.items():
            problem_text = problem_data['problem'] + problem_data['answer_choices']
            char_len = len(problem_text)
            token_len = len(tokenizer.tokenize(problem_text))
            if token_len <= 18:
                # Very short sequences usually indicate a scraping failure.
                suspicious_problems[problem_id] = problem_data
            total_chars += char_len
            total_tokens += token_len
            min_chars = min(min_chars, char_len)
            max_chars = max(max_chars, char_len)
            min_tokens = min(min_tokens, token_len)
            max_tokens = max(max_tokens, token_len)

    # Every problem indexed on the wiki has a tick entry; use the tick total
    # as the denominator for scrape coverage.
    total_ticks = 0
    for file in sorted(os.listdir("scraped_data/ticks")):
        with open(f"scraped_data/ticks/{file}", "r") as f:
            total_ticks += len(json.load(f))

    # Guard the ratios so an empty scrape reports 0 instead of raising
    # ZeroDivisionError.
    coverage = 100 * total_problems / total_ticks if total_ticks else 0.0
    avg_tokens = total_tokens / total_problems if total_problems else 0.0

    print(f"...\n{'Total problems indexed on wiki:':<40} {total_ticks:>5}")
    print(f"{'Total problems scraped:':<40} {total_problems:>5} ({coverage:.2f}%)")
    print(f"{'Total characters:':<40} {total_chars:>8}")
    print(f"{'Total tokens:':<40} {total_tokens:>8}")
    print(f"{'Average sequence length:':<40} {avg_tokens:>8.2f} tokens")
    print(f"{'Shortest sequence length:':<40} {min_tokens:>8} tokens")
    print(f"{'':<40} {min_chars:>8} chars")
    print(f"{'Longest sequence length:':<40} {max_tokens:>8} tokens")
    print(f"{'':<40} {max_chars:>8} chars")
    print(f"{'Vocab size:':<40} {len(tokenizer.vocab):>8}")

    with open("suspicious_problems.json", "w") as f:
        json.dump(suspicious_problems, f, indent=4)
# Run the report only when executed as a script, not on import.
if __name__ == "__main__":
    main()