# cppquiz-questions/script.py at main · aqua4/cppquiz-questions · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import logging
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor, as_completed

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

# --- Scraper settings ---

# URL template for a single question page; the placeholder takes the question ID.
BASE_URL = "https://cppquiz.org/quiz/question/{}"

# First and last question IDs to probe (both ends are included by main()).
START_QUESTION, MAX_QUESTION = 1, 400

# Size of the thread pool, i.e. how many pages are fetched in parallel.
MAX_WORKERS = 10

# Headers sent with every page request; identifies the client as a desktop browser.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.124 Safari/537.36"
    ),
}

def get_question_difficulty(question_number):
    """
    Fetches a single question page and extracts its difficulty level.

    The difficulty is read from the ``alt`` attribute of the first ``<img>``
    tag whose ``src`` contains ``static.cppquiz.org/level-``.

    Args:
        question_number (int): The ID of the question to fetch.

    Returns:
        tuple: A tuple of (question_number, difficulty_level) if successful,
               otherwise None. None is returned when the request fails, when
               the response status is not 200, when no difficulty image is
               found, or when its ``alt`` text is not an integer.
    """
    log = logging.getLogger(__name__)
    url = BASE_URL.format(question_number)

    # Keep the try body limited to the one call that can raise network errors.
    try:
        response = requests.get(url, headers=HEADERS, timeout=10)
    except requests.exceptions.RequestException as e:
        # Best-effort scrape: a failed request just skips this question.
        # Logged at DEBUG so the default run stays as quiet as before.
        log.debug("Error fetching question %s: %s", question_number, e)
        return None

    # Anything other than 200 is treated as "question does not exist".
    if response.status_code != 200:
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the specific image tag by its 'src' attribute pattern.
    # The lambda guards against <img> tags that have no 'src' at all.
    img_tag = soup.find('img', src=lambda s: s and 'static.cppquiz.org/level-' in s)
    if img_tag is None:
        return None

    alt_text = img_tag.get('alt')
    if alt_text is None:
        return None

    # A non-numeric alt text must not raise out of the worker thread: the
    # exception would resurface from future.result() and abort the whole run.
    try:
        difficulty = int(alt_text)
    except (TypeError, ValueError):
        log.debug(
            "Question %s has a non-numeric difficulty label: %r",
            question_number,
            alt_text,
        )
        return None

    return (question_number, difficulty)

def main():
    """
    Main function to orchestrate the scraping and printing of results.

    Submits one get_question_difficulty() task per question ID in
    START_QUESTION..MAX_QUESTION (inclusive) to a thread pool, groups the
    questions whose difficulty could be determined by difficulty level, and
    prints the question URLs for each level in ascending order.

    A task that raises unexpectedly does not abort the run: the question is
    skipped and reported as a warning once scraping has finished.
    """
    print(f"Fetching questions {START_QUESTION}-{MAX_QUESTION} from cppquiz.org...")

    # Use a defaultdict to automatically create lists for new keys
    questions_by_difficulty = defaultdict(list)

    # question number -> exception raised by its worker task
    failures = {}

    question_range = range(START_QUESTION, MAX_QUESTION + 1)

    # Use ThreadPoolExecutor to fetch pages concurrently
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        # Map each future back to its question number so failures can be attributed
        future_to_num = {executor.submit(get_question_difficulty, num): num for num in question_range}

        # Use tqdm for a progress bar
        for future in tqdm(as_completed(future_to_num), total=len(question_range), desc="Scraping Questions"):
            try:
                result = future.result()
            except Exception as exc:
                # Top-level boundary: one bad page must not discard the
                # results already collected. Reported after the progress bar
                # is done so the bar output is not interleaved with warnings.
                failures[future_to_num[future]] = exc
                continue

            if result is not None:
                question_num, difficulty = result
                questions_by_difficulty[difficulty].append(question_num)

    print("\n--- Scraping Complete ---\n")

    if failures:
        log = logging.getLogger(__name__)
        for question_num in sorted(failures):
            log.warning(
                "Question %s could not be processed: %r",
                question_num,
                failures[question_num],
            )

    # Sort difficulty levels (keys) to ensure a consistent output order
    for difficulty in sorted(questions_by_difficulty):
        print(f"--- Difficulty Level {difficulty} ---")

        # as_completed() yields futures in completion order, not submission
        # order, so the question numbers must be sorted before printing.
        question_list = sorted(questions_by_difficulty[difficulty])

        for q_num in question_list:
            print(BASE_URL.format(q_num))
        print() # Add a blank line for readability

# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()