# generateLabeledData.py
# NOTE(review): scraped web-page chrome and copied line-number gutter removed;
# the Python module begins below.
import json
import os
import time
from datetime import datetime, timedelta
from typing import List, Optional
import pandas as pd
from source.Event.mapper import map_blog, map_rating_update, map_submission
from source.extractFeatures import extract_features
from source.User.mapper import map_user
from source.User.User import User
# Approximate length of one month (30 days) in seconds, used for label horizons.
MONTH_IN_SECONDS = 30 * 24 * 60 * 60
# Label horizon: the training target is the user's rating six months later.
SIX_MONTHS_IN_SECONDS = 6 * MONTH_IN_SECONDS
# Upper bound on how many users load_users() will read from users.json.
MAX_USERS = 1_000_000
def load_users() -> List[User]:
    """Load up to MAX_USERS users from data/users.json.

    Returns:
        A list of User objects produced by map_user, capped at MAX_USERS.
    """
    print("Loading users...")
    # Use a context manager: the original json.load(open(...)) never closed
    # the file descriptor.
    with open('data/users.json', encoding='utf-8') as f:
        users_data = json.load(f)
    # Slicing enforces the MAX_USERS cap without a manual counter/break.
    users = [map_user(user_data) for user_data in users_data[:MAX_USERS]]
    print(f"Loaded {len(users)} users")
    return users
def load_user_events(user: User) -> None:
    """Load all events (submissions, rating updates, blogs) for *user*.

    Reads data/users_submissions/<handle>.json, data/users_ratings/<handle>.json
    and data/users_blogs/<handle>.json, maps each record to an event object and
    appends it to the user via user.add_event. Mutates *user* in place.
    """
    handle = user.handle

    def _read_json(directory: str) -> list:
        # Context-managed open: the original json.load(open(...)) leaked
        # a file descriptor per call (three per user).
        path = os.path.join(directory, f'{handle}.json')
        with open(path, encoding='utf-8') as f:
            return json.load(f)

    # Submissions: skip records missing the fields the mapper requires
    # (unrated problems / entries without a participant type).
    for submission_data in _read_json('data/users_submissions'):
        if 'rating' not in submission_data['problem'] or 'participantType' not in submission_data['author']:
            continue
        user.add_event(map_submission(submission_data))

    # Rating updates: every record is mappable.
    for rating_data in _read_json('data/users_ratings'):
        user.add_event(map_rating_update(rating_data))

    # Blogs: every record is mappable.
    for blog_data in _read_json('data/users_blogs'):
        user.add_event(map_blog(blog_data))
def get_monthly_timestamps() -> List[int]:
    """Return Unix timestamps for the 1st of every month, Jan 2021 - Nov 2024.

    Timestamps are produced by naive (local-time) datetimes, matching the
    rest of the pipeline.
    """
    stamps: List[int] = []
    year, month = 2021, 1
    # Walk month-by-month with plain integer arithmetic until we pass Nov 2024.
    while (year, month) <= (2024, 11):
        stamps.append(int(datetime(year, month, 1).timestamp()))
        month += 1
        if month > 12:
            month = 1
            year += 1
    return stamps
def generate_labeled_data() -> pd.DataFrame:
    """Build the labeled dataset: one row per (user, month) snapshot.

    For each user and each monthly timestamp, extracts a feature dict and
    attaches the handle, the snapshot timestamp, and the label — the user's
    rating six months after the snapshot.
    """
    print("Starting labeled data generation...")
    users = load_users()
    timestamps = get_monthly_timestamps()

    rows = []
    for i, user in enumerate(users):
        # Lightweight progress report every 100 users.
        if i % 100 == 0:
            print(f"Processing user {i+1}/{len(users)}: {user.handle}")
        load_user_events(user)
        for snapshot_ts in timestamps:
            feature_row = extract_features(user, snapshot_ts)
            # extract_features signals "no usable data at this time" with None.
            if feature_row is None:
                continue
            feature_row['user_handle'] = user.handle
            feature_row['timestamp'] = snapshot_ts
            feature_row['rating_6m_after'] = user.get_rating_at_time(
                snapshot_ts + SIX_MONTHS_IN_SECONDS
            )
            rows.append(feature_row)

    df = pd.DataFrame(rows)
    print(f"Generated {len(df)} labeled samples")
    print(f"DataFrame shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")
    return df
def save_labeled_data(df: pd.DataFrame, filename: str = 'data/labeled_data.csv') -> None:
    """Save the labeled data to a CSV file (no index column).

    Args:
        df: The labeled samples to persist.
        filename: Destination CSV path.
    """
    # Fix: the original f-strings printed the literal text "(unknown)"
    # instead of interpolating the destination path.
    print(f"Saving labeled data to {filename}...")
    df.to_csv(filename, index=False)
    print(f"Saved {len(df)} samples to {filename}")
if __name__ == "__main__":
    # Script entry point: build the labeled dataset, then persist it as CSV.
    df = generate_labeled_data()
    save_labeled_data(df)