This repository was archived by the owner on Dec 21, 2025. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscript.py
More file actions
91 lines (71 loc) · 3.49 KB
/
script.py
File metadata and controls
91 lines (71 loc) · 3.49 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
from collections import defaultdict
from tqdm import tqdm
# --- Configuration ---

# Sent with every request so the scraper presents a browser-like User-Agent.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.124 Safari/537.36"
    ),
}

# URL template; the placeholder is filled with the question number.
BASE_URL = "https://cppquiz.org/quiz/question/{}"

# Inclusive range of question numbers to probe.
START_QUESTION = 1
MAX_QUESTION = 400

# Upper bound on the number of requests in flight at once.
MAX_WORKERS = 10
def get_question_difficulty(question_number):
    """Fetch a single question page and extract its difficulty level.

    Args:
        question_number (int): The ID of the question to fetch.

    Returns:
        tuple: ``(question_number, difficulty_level)`` when the page is
            returned with status 200 and contains a level image whose
            ``alt`` text parses as an integer; otherwise ``None`` (request
            failure, non-200 response, missing image, or non-numeric
            ``alt`` text).
    """
    url = BASE_URL.format(question_number)

    # Keep the try body to the one call that performs network I/O.
    try:
        response = requests.get(url, headers=HEADERS, timeout=10)
    except requests.exceptions.RequestException:
        # Deliberately best-effort: a question that cannot be fetched
        # (network error, timeout, ...) is skipped rather than reported.
        return None

    # Anything other than 200 is treated as "question does not exist".
    if response.status_code != 200:
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    # Locate the level image by a substring of its 'src'. The callable
    # guards against <img> tags that have no 'src' attribute (s is None).
    img_tag = soup.find('img', src=lambda s: s and 'static.cppquiz.org/level-' in s)
    if img_tag is None or 'alt' not in img_tag.attrs:
        return None

    # The difficulty is read from the image's alt text. If the markup ever
    # carries non-numeric text there, skip the question instead of letting
    # ValueError escape the worker thread and abort the caller.
    try:
        difficulty = int(img_tag['alt'])
    except ValueError:
        return None
    return (question_number, difficulty)
def main():
    """Scrape the configured question range and print URLs grouped by difficulty.

    Submits one ``get_question_difficulty`` call per question number in
    ``START_QUESTION..MAX_QUESTION`` (inclusive) to a thread pool, groups the
    successful results by difficulty level, and prints the question URLs for
    each level in ascending order. Questions whose worker raised an
    unexpected exception are skipped and listed at the end instead of
    aborting the whole run.
    """
    print(f"Fetching questions {START_QUESTION}-{MAX_QUESTION} from cppquiz.org...")

    # difficulty level -> list of question numbers found at that level
    questions_by_difficulty = defaultdict(list)
    # question numbers whose worker raised instead of returning a result
    failed_questions = []
    question_range = range(START_QUESTION, MAX_QUESTION + 1)

    # Fetch pages concurrently; the work is network-bound.
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        # Map each future back to its question number so that a failure
        # can be attributed to a specific question.
        future_to_num = {
            executor.submit(get_question_difficulty, num): num
            for num in question_range
        }

        # tqdm renders a progress bar as futures finish.
        for future in tqdm(as_completed(future_to_num), total=len(question_range), desc="Scraping Questions"):
            try:
                result = future.result()
            except Exception:
                # future.result() re-raises whatever the worker raised. One
                # bad page must not discard the results collected so far,
                # so record the question and keep going.
                failed_questions.append(future_to_num[future])
                continue
            if result:
                question_num, difficulty = result
                questions_by_difficulty[difficulty].append(question_num)

    print("\n--- Scraping Complete ---\n")

    # Sort difficulty levels so the output order is consistent.
    for difficulty in sorted(questions_by_difficulty):
        print(f"--- Difficulty Level {difficulty} ---")
        # as_completed yields futures in completion order, not submission
        # order, so the question numbers must be sorted before printing.
        for q_num in sorted(questions_by_difficulty[difficulty]):
            print(BASE_URL.format(q_num))
        print()  # Blank line between difficulty levels for readability

    if failed_questions:
        skipped = ", ".join(str(num) for num in sorted(failed_questions))
        print(f"Skipped {len(failed_questions)} question(s) due to unexpected errors: {skipped}")
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()