-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathInitialDatasetInsights.py
More file actions
262 lines (213 loc) · 9.48 KB
/
InitialDatasetInsights.py
File metadata and controls
262 lines (213 loc) · 9.48 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
import json
import pandas as pd
import numpy as np
from pathlib import Path
def read_jsonl_to_dataframe(file_path, max_rows=None):
    """
    Read a JSONL file and convert it to a pandas DataFrame.

    Blank lines are ignored. Lines that are not valid JSON are reported on
    stdout and skipped. Neither kind counts towards ``max_rows``.

    Args:
        file_path: Path to the JSONL file (opened as UTF-8 text).
        max_rows: Maximum number of records to load. ``None`` loads every
            record; ``0`` yields an empty DataFrame.

    Returns:
        pandas DataFrame built from the parsed records.
    """
    records = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            # Compare against None explicitly so max_rows=0 is honoured, and
            # count parsed records (not physical lines) so blank or invalid
            # lines do not eat into the requested row budget.
            if max_rows is not None and len(records) >= max_rows:
                break
            line = line.strip()
            if not line:
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError:
                print(f"Skipping invalid JSON at line {line_number}")
    return pd.DataFrame(records)
def load_all_training_chunks(data_dir, max_rows_per_file=None):
    """
    Load every ``cls-train-chunk-*.jsonl`` file in a directory into one DataFrame.

    Files are processed in sorted path order and progress is printed for
    each one.

    Args:
        data_dir: Directory that is searched (non-recursively) for chunk files.
        max_rows_per_file: Forwarded to ``read_jsonl_to_dataframe`` for each
            file; ``None`` means no per-file limit.

    Returns:
        The concatenated DataFrame with a fresh 0..n-1 index, or an empty
        DataFrame when no chunk file matches.
    """
    chunk_paths = sorted(Path(data_dir).glob("cls-train-chunk-*.jsonl"))
    if len(chunk_paths) == 0:
        print("No training files found.")
        return pd.DataFrame()

    frames = []
    for chunk_path in chunk_paths:
        print(f"Loading {chunk_path.name}...")
        frame = read_jsonl_to_dataframe(chunk_path, max_rows_per_file)
        print(f" Loaded {len(frame)} rows")
        frames.append(frame)

    # At least one file matched above, so there is always something to concat.
    merged = pd.concat(frames, ignore_index=True)
    print(f"Combined training data: {len(merged):,} rows")
    return merged
# Seed table: raw language code -> readable language name.
# discover_all_languages() copies this table and adds any codes it finds in
# the data that are not listed here.
# NOTE(review): ".cs" is the only key with a leading dot; presumably it
# mirrors the raw value used in the dataset. Confirm before normalising it.
BASE_LANGUAGE_MAP = {
    "py": "python",
    "java": "java",
    "js": "javascript",
    "cpp": "cpp",
    "go": "go",
    ".cs": "csharp",
    "rb": "ruby",
    "c": "c",
    "php": "php",
}
def discover_all_languages(df):
    """Discover all unique language values in the dataset and create a complete mapping.

    Prints every distinct value of ``df['lang']`` (missing values included)
    with its count and share of rows, then builds a code -> readable-name
    mapping that starts from ``BASE_LANGUAGE_MAP``:

    * a missing (NaN) value adds the entry ``'undefined' -> 'undefined'``;
    * a code that is not yet mapped is mapped to itself, minus one leading
      dot if it has one (a lone ``'.'`` becomes ``'unknown'``).

    Args:
        df: DataFrame expected to hold a ``lang`` column.

    Returns:
        A new dict. ``BASE_LANGUAGE_MAP`` itself is never returned, so
        callers may modify the result freely. When ``df`` has no ``lang``
        column the result is a plain copy of ``BASE_LANGUAGE_MAP``.
    """
    print(f"\n=== DISCOVERING ALL LANGUAGES ===")

    if 'lang' not in df.columns:
        print("No 'lang' column found")
        # Hand back a copy: returning the module-level dict would let a
        # caller's later mutation leak into every other user of the constant.
        return BASE_LANGUAGE_MAP.copy()

    # Get all unique language values (including NaN)
    all_langs = df['lang'].value_counts(dropna=False)
    print(f"Found {len(all_langs)} unique language values:")

    total_rows = len(df)
    for lang, count in all_langs.items():
        percentage = (count / total_rows) * 100
        if pd.isna(lang):
            print(f" NaN/Missing: {count:,} ({percentage:.1f}%)")
        else:
            print(f" '{lang}': {count:,} ({percentage:.1f}%)")

    # Create complete language mapping
    complete_map = BASE_LANGUAGE_MAP.copy()

    # Add any missing languages with automatic mapping
    for lang in all_langs.index:
        if pd.isna(lang):
            complete_map['undefined'] = 'undefined'  # For NaN values
        elif lang not in complete_map:
            # Auto-generate a readable name for unknown languages
            if lang.startswith('.'):
                # File extension like .cs, .py, etc.; a bare '.' has no name
                readable_name = lang[1:] or 'unknown'
            else:
                readable_name = lang
            complete_map[lang] = readable_name
            print(f" Added new language mapping: '{lang}' -> '{readable_name}'")

    print(f"\nComplete language mapping:")
    for code, name in sorted(complete_map.items()):
        print(f" {code} -> {name}")

    return complete_map
def clean_and_fill_missing_data(df, language_map):
    """Replace missing ``lang`` / ``proj`` values with placeholder strings.

    Missing ``lang`` entries become ``'undefined'`` and missing ``proj``
    entries become ``'unknown-project'``. A column that is absent is left
    alone. The frame is modified in place and progress is printed.

    Args:
        df: DataFrame to clean; its columns are overwritten in place.
        language_map: Accepted for interface compatibility; not used here.

    Returns:
        The same DataFrame object that was passed in.
    """
    print("\n=== CLEANING DATA ===")
    print(f"Original data: {len(df):,} rows")

    present_columns = df.columns

    # Language column: NaN -> 'undefined'
    lang_gaps = df['lang'].isna().sum() if 'lang' in present_columns else 0
    if lang_gaps > 0:
        print(f"Missing language data: {lang_gaps:,} rows")
        df['lang'] = df['lang'].fillna('undefined')
        print("Filled missing language data with 'undefined'")

    # Project column: NaN -> 'unknown-project'
    proj_gaps = df['proj'].isna().sum() if 'proj' in present_columns else 0
    if proj_gaps > 0:
        print(f"Missing project data: {proj_gaps:,} rows")
        df['proj'] = df['proj'].fillna('unknown-project')
        print("Filled missing project data with 'unknown-project'")

    return df
def analyze_all_data_including_undefined(df, language_map):
    """Print summary statistics for the dataset.

    Three sections are printed, each only when the columns it needs exist:
    the distribution of ``lang`` values, the distribution of ``y`` labels
    (``1`` is reported as "Review needed", anything else as "No review
    needed"), and a ``lang`` x ``y`` cross-tabulation followed by the share
    of ``y == 1`` rows per language.

    Args:
        df: DataFrame to summarise.
        language_map: Mapping from raw language code to readable name; codes
            that are not in it are shown as ``UNKNOWN(<code>)``.

    Returns:
        None. All results go to stdout.
    """
    row_total = len(df)
    print("\n=== ANALYSIS OF ALL DATA ===")
    print(f"Total samples: {row_total:,}")

    has_lang = 'lang' in df.columns
    has_label = 'y' in df.columns

    # Language distribution
    if has_lang:
        print("\nLanguage distribution:")
        for code, n_rows in df['lang'].value_counts().items():
            pretty = language_map.get(code, f"UNKNOWN({code})")
            share = (n_rows / row_total) * 100
            print(f" {code} ({pretty}): {n_rows:,} ({share:.1f}%)")

    # Label distribution
    if has_label:
        print("\nLabel distribution:")
        for label, n_rows in df['y'].value_counts().items():
            share = (n_rows / row_total) * 100
            if label == 1:
                meaning = "Review needed"
            else:
                meaning = "No review needed"
            print(f" {label} ({meaning}): {n_rows:,} ({share:.1f}%)")

    # The remaining sections need both columns.
    if not (has_lang and has_label):
        return

    # Language vs Label breakdown
    print("\nLanguage vs Label breakdown:")
    print(pd.crosstab(df['lang'], df['y'], margins=True))

    # Percentage breakdown by language
    print("\nPercentage needing review by language:")
    for code in sorted(df['lang'].unique()):
        subset = df[df['lang'] == code]
        subset_size = len(subset)
        if subset_size == 0:
            continue
        flagged = len(subset[subset['y'] == 1])
        share = (flagged / subset_size) * 100
        pretty = language_map.get(code, f"UNKNOWN({code})")
        print(f" {code} ({pretty}): {flagged:,}/{subset_size:,} ({share:.1f}%) need review")
def filter_by_language_code(df, lang_codes, include_undefined=False):
    """
    Filter a DataFrame down to the rows whose ``lang`` is one of the given codes.

    Args:
        df: pandas DataFrame.
        lang_codes: A single language code (string) or any iterable of codes
            (list, tuple, set, ...). The caller's object is never modified.
        include_undefined: When True, rows whose ``lang`` is ``'undefined'``
            are kept as well.

    Returns:
        The filtered DataFrame. If ``df`` has no ``lang`` column a message is
        printed and ``df`` is returned unchanged.
    """
    # Normalise to a fresh list: a bare string is one code, not a sequence of
    # characters, and copying lets us append without touching the caller's
    # object or failing on tuples/sets (tuple + list raises TypeError).
    if isinstance(lang_codes, str):
        codes = [lang_codes]
    else:
        codes = list(lang_codes)

    if include_undefined and 'undefined' not in codes:
        codes.append('undefined')

    if 'lang' not in df.columns:
        print("No 'lang' column found")
        return df

    filtered_df = df[df['lang'].isin(codes)]
    print(f"Filtered from {len(df):,} to {len(filtered_df):,} rows for languages: {codes}")
    return filtered_df
# Example usage: load every training chunk under ./data, report on it, and
# demonstrate filtering by language code.
if __name__ == "__main__":
    # Load FULL training data
    print("=== Loading FULL training data ===")
    print("Warning: This will load ~266k samples (~11GB). This may take a few minutes...")

    full_train_df = load_all_training_chunks("./data")

    if len(full_train_df) > 0:
        # STEP 1: Discover all languages in the dataset
        complete_language_map = discover_all_languages(full_train_df)

        # STEP 2: Clean and fill missing data
        full_train_df = clean_and_fill_missing_data(full_train_df, complete_language_map)

        # STEP 3: Analyze all data using complete mapping
        analyze_all_data_including_undefined(full_train_df, complete_language_map)

        # Project distribution (top 20). The column is optional everywhere
        # else in this file, so guard here too instead of raising KeyError.
        if 'proj' in full_train_df.columns:
            print(f"\nTop 20 projects by sample count:")
            proj_counts = full_train_df['proj'].value_counts().head(20)
            total = len(full_train_df)
            for proj, count in proj_counts.items():
                percentage = (count / total) * 100
                print(f" {proj}: {count:,} ({percentage:.1f}%)")
        else:
            print("No 'proj' column found; skipping project distribution")

        print(f"\nDataset loaded and cleaned successfully!")

        # Show final language mapping used
        print(f"\n=== FINAL LANGUAGE MAPPING USED ===")
        for code, name in sorted(complete_language_map.items()):
            print(f" {code} -> {name}")

        # Example filtering with complete language list
        print(f"\n=== Example Filtering ===")

        # Get all defined language codes (excluding undefined)
        defined_langs = [lang for lang in complete_language_map if lang != 'undefined']
        print(f"All defined languages: {defined_langs}")

        # Python only
        if 'py' in complete_language_map:
            python_data = filter_by_language_code(full_train_df, "py")

        # Undefined language only ('undefined' is only in the map when the
        # raw data contained missing language values)
        if 'undefined' in complete_language_map:
            undefined_data = filter_by_language_code(full_train_df, "undefined")

        # Save cleaned dataset with language mapping
        # full_train_df.to_pickle("cleaned_training_data.pkl")
        # with open("language_mapping.json", "w") as f:
        #     json.dump(complete_language_map, f, indent=2)
        # print("Saved cleaned dataset and language mapping!")
    else:
        print("No training data was loaded. Check your file paths and try again.")