-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcreatePythonDataset.py
More file actions
201 lines (160 loc) · 6.95 KB
/
createPythonDataset.py
File metadata and controls
201 lines (160 loc) · 6.95 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
import json
import pandas as pd
import numpy as np
from pathlib import Path
def read_jsonl_to_dataframe(file_path, max_rows=None):
    """Read a JSON Lines file into a pandas DataFrame.

    Blank lines are ignored. Lines that fail to parse as JSON are reported
    on stdout (with their 1-based line number) and skipped.

    Args:
        file_path: Path of the JSONL file to read (opened as UTF-8 text).
        max_rows: Optional cap on the number of input lines examined.
            ``None`` (the default) reads the whole file and ``0`` reads
            nothing. Blank and invalid lines count towards the cap, so the
            result may hold fewer than ``max_rows`` rows.

    Returns:
        A DataFrame built from the successfully parsed lines; it is empty
        when nothing was parsed.
    """
    records = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            # Compare against None so that max_rows=0 is a real limit
            # instead of being treated as "no limit".
            if max_rows is not None and i >= max_rows:
                break
            line = line.strip()
            if not line:
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError:
                print(f"Skipping invalid JSON at line {i+1}")
    return pd.DataFrame(records)
def load_all_training_chunks(data_dir):
    """Load every training chunk file and combine them into one DataFrame.

    Files matching ``cls-train-chunk-*.jsonl`` directly inside ``data_dir``
    are read in sorted path order and concatenated with a fresh 0..n-1
    index. Progress is printed to stdout.

    Args:
        data_dir: Directory that holds the chunk files.

    Returns:
        The concatenated DataFrame, or an empty DataFrame when no chunk
        file matches.
    """
    train_files = sorted(Path(data_dir).glob("cls-train-chunk-*.jsonl"))
    if not train_files:
        print("No training files found.")
        return pd.DataFrame()

    chunks = []
    for file_path in train_files:
        print(f"Loading {file_path.name}...")
        chunk = read_jsonl_to_dataframe(file_path)
        print(f" Loaded {len(chunk)} rows")
        chunks.append(chunk)

    # train_files is non-empty here, so chunks always has at least one
    # frame; no separate "nothing loaded" branch is needed.
    combined_df = pd.concat(chunks, ignore_index=True)
    print(f"Combined training data: {len(combined_df):,} rows")
    return combined_df
def load_test_and_val_data(data_dir):
    """Load the test and validation splits from ``data_dir``.

    Looks for ``cls-test.jsonl`` and ``cls-valid.jsonl`` (in that order).
    A missing file produces a warning on stdout and an empty DataFrame for
    that split.

    Args:
        data_dir: Directory expected to contain the two JSONL files.

    Returns:
        A ``(test_df, val_df)`` tuple of DataFrames.
    """
    base_dir = Path(data_dir)
    frames = []
    # Order matters: the test split first, then the validation split.
    for file_name in ("cls-test.jsonl", "cls-valid.jsonl"):
        split_path = base_dir / file_name
        if not split_path.exists():
            print(f"Warning: {split_path} not found")
            frames.append(pd.DataFrame())
            continue
        print(f"Loading {split_path.name}...")
        frame = read_jsonl_to_dataframe(split_path)
        print(f" Loaded {len(frame)} rows")
        frames.append(frame)

    test_df, val_df = frames
    return test_df, val_df
def clean_and_filter_data(df, dataset_name):
    """Fill missing metadata and keep only Python / undefined-language rows.

    A short report is printed to stdout: row counts before and after
    filtering, the language distribution and, when a ``y`` column exists,
    the label distribution.

    Args:
        df: DataFrame to filter. It is never modified. A missing ``lang``
            value counts as ``'undefined'``; if the ``lang`` column is
            absent altogether, every row is treated as ``'undefined'``.
        dataset_name: Label used in the report header.

    Returns:
        ``df`` itself when it has no rows. Otherwise a new DataFrame with
        the rows whose language is ``'py'`` or ``'undefined'``, with
        ``lang`` filled in and, if present, missing ``proj`` values
        replaced by ``'unknown-project'``. The original index is kept.
    """
    print(f"\n=== PROCESSING {dataset_name.upper()} ===")
    print(f"Original data: {len(df):,} rows")

    if len(df) == 0:
        return df

    # Work on a separate Series so the caller's frame is left untouched.
    if 'lang' in df.columns:
        languages = df['lang'].fillna('undefined')
    else:
        # No language column at all: no row has a known language.
        languages = pd.Series('undefined', index=df.index)

    # Filter for Python and undefined only
    target_languages = ['py', 'undefined']
    keep = languages.isin(target_languages)
    filtered_df = df[keep].copy()
    filtered_df['lang'] = languages[keep]

    # Fill missing project data on the copy only.
    if 'proj' in filtered_df.columns:
        filtered_df['proj'] = filtered_df['proj'].fillna('unknown-project')

    print(f"Filtered data: {len(filtered_df):,} rows")

    # Show distribution (skipped for an empty result, which also avoids a
    # division by zero in the percentages).
    total = len(filtered_df)
    if total > 0:
        print(f"Language distribution:")
        for lang_code, count in filtered_df['lang'].value_counts().items():
            percentage = (count / total) * 100
            readable_name = "Python" if lang_code == 'py' else "Undefined"
            print(f" {lang_code} ({readable_name}): {count:,} ({percentage:.1f}%)")

        if 'y' in filtered_df.columns:
            print(f"Label distribution:")
            for label, count in filtered_df['y'].value_counts().items():
                percentage = (count / total) * 100
                meaning = "Review needed" if label == 1 else "No review needed"
                print(f" {label} ({meaning}): {count:,} ({percentage:.1f}%)")

    return filtered_df
def save_dataset(df, filename, output_dir="./python_data"):
    """Save a dataset as JSON Lines and as a pickle.

    Writes ``<output_dir>/<filename>.jsonl`` (one JSON object per row) and
    ``<output_dir>/<filename>.pkl``. The output directory, including any
    missing parents, is created on demand.

    Args:
        df: DataFrame to save.
        filename: Base name of the output files, without extension.
        output_dir: Directory to write into.

    Returns:
        ``(jsonl_path, pickle_path)`` as ``Path`` objects, or ``None`` when
        ``df`` has no rows (nothing is written in that case).
    """
    if len(df) == 0:
        print(f"Warning: {filename} is empty, skipping save")
        return None

    output_path = Path(output_dir)
    # parents=True so a nested output_dir works on a fresh checkout.
    output_path.mkdir(parents=True, exist_ok=True)

    # Save as JSONL. Records are built per column rather than with
    # iterrows(), which coerces every row to a single dtype and would turn
    # integer values into floats in an all-numeric frame.
    # NOTE: json.dumps writes missing values as NaN, as the row-wise
    # json.dump did before.
    jsonl_path = output_path / f"{filename}.jsonl"
    with open(jsonl_path, 'w', encoding='utf-8') as f:
        f.writelines(
            json.dumps(record) + '\n'
            for record in df.to_dict(orient='records')
        )

    # Save as pickle for fast loading (only unpickle files you trust).
    pickle_path = output_path / f"{filename}.pkl"
    df.to_pickle(pickle_path)

    print(f"Saved {filename}: {len(df):,} rows")
    return jsonl_path, pickle_path
# Main execution
def main():
    """Build the filtered train/test/validation datasets and a summary file.

    Reads the source JSONL files from ``./data``, keeps the Python and
    undefined-language rows of each split, saves every non-empty split into
    ``./python_data`` and writes ``dataset_summary.txt`` next to them.
    """
    data_dir = "./data"
    output_dir = Path("./python_data")

    print("=== CREATING PYTHON + UNDEFINED DATASET ===")
    print("Using existing test/val files, keeping all training data")

    # Load training data (all chunks combined)
    print("\n=== LOADING TRAINING DATA ===")
    train_df = load_all_training_chunks(data_dir)

    # Load test and validation data
    print("\n=== LOADING TEST AND VALIDATION DATA ===")
    test_df, val_df = load_test_and_val_data(data_dir)

    # Process each dataset
    train_filtered = clean_and_filter_data(train_df, "training")
    test_filtered = clean_and_filter_data(test_df, "test")
    val_filtered = clean_and_filter_data(val_df, "validation")

    # Save filtered datasets (empty splits are skipped silently)
    print(f"\n=== SAVING FILTERED DATASETS ===")
    splits = (
        ("train", train_filtered),
        ("test", test_filtered),
        ("val", val_filtered),
    )
    for split_name, split_df in splits:
        if len(split_df) > 0:
            save_dataset(split_df, split_name, output_dir=output_dir)

    # Save summary. The directory is created here as well because no split
    # may have been saved above, in which case it would not exist yet.
    output_dir.mkdir(parents=True, exist_ok=True)
    summary_path = output_dir / "dataset_summary.txt"
    total_rows = len(train_filtered) + len(test_filtered) + len(val_filtered)
    with open(summary_path, 'w', encoding='utf-8') as f:
        f.write("=== PYTHON + UNDEFINED DATASET SUMMARY ===\n\n")
        f.write("Dataset sizes:\n")
        f.write(f" train.jsonl: {len(train_filtered):,} rows\n")
        f.write(f" test.jsonl: {len(test_filtered):,} rows\n")
        f.write(f" val.jsonl: {len(val_filtered):,} rows\n")
        f.write(f" Total: {total_rows:,} rows\n\n")
        f.write("Languages included: Python (py) and Undefined\n")
        f.write("Source files:\n")
        f.write(" - Training: cls-train-chunk-*.jsonl (all chunks combined)\n")
        f.write(" - Test: cls-test.jsonl\n")
        f.write(" - Validation: cls-valid.jsonl\n\n")

        if len(train_filtered) > 0:
            f.write("Available columns:\n")
            for col in train_filtered.columns:
                f.write(f" {col}\n")

        f.write(f"\nLabel meanings:\n")
        f.write(f" 0 = No review needed\n")
        f.write(f" 1 = Review needed\n")

    print(f"Saved summary: {summary_path}")

    print(f"\n=== COMPLETE ===")
    print(f"Filtered datasets ready:")
    print(f" train.jsonl: {len(train_filtered):,} rows")
    print(f" test.jsonl: {len(test_filtered):,} rows")
    print(f" val.jsonl: {len(val_filtered):,} rows")
    print(f"\nAll files saved in ./python_data/ directory")


if __name__ == "__main__":
    main()