-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocess_split.py
More file actions
36 lines (31 loc) · 1.36 KB
/
preprocess_split.py
File metadata and controls
36 lines (31 loc) · 1.36 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# %%
from glob import glob
from sklearn.model_selection import train_test_split
import os
import shutil
# Helpers for collecting data files and splitting them into train/test directories
def get_file_list(data_dir, file_extension="*.npz"):
    """Return the paths inside ``data_dir`` that match a glob pattern, sorted.

    Args:
        data_dir: Directory to search. Only its direct entries are matched,
            because ``glob`` is called without ``recursive=True``.
        file_extension: Glob pattern joined onto ``data_dir``. Despite the
            name it is a full pattern such as ``"*.npz"``, not a bare
            extension.

    Returns:
        A list of matching paths (each prefixed with ``data_dir``) in
        ascending lexicographic order. The list is empty when nothing matches.
    """
    # glob() yields matches in arbitrary, filesystem-dependent order. Sorting
    # gives a stable order, so a seeded split of the result is reproducible
    # across machines and runs.
    return sorted(glob(os.path.join(data_dir, file_extension)))
def split_dataset(file_list, train_path, test_path, test_size, random_state=42):
    """Randomly split files into train/test sets and copy them into place.

    Args:
        file_list: Paths of the files to split.
        train_path: Directory that receives copies of the training files.
            Created if it does not exist.
        test_path: Directory that receives copies of the testing files.
            Created if it does not exist.
        test_size: Forwarded unchanged to ``train_test_split`` as
            ``test_size``.
        random_state: Forwarded unchanged to ``train_test_split`` as
            ``random_state``; keeps the split repeatable for a given input
            order.

    Returns:
        The ``(train_files, test_files)`` pair produced by
        ``train_test_split``. These are the original source paths, not the
        locations of the copies.

    Raises:
        ValueError: If ``file_list`` is empty.
    """
    # Ensure output directories exist
    os.makedirs(train_path, exist_ok=True)
    os.makedirs(test_path, exist_ok=True)

    # Fail early with an actionable message instead of letting the splitter
    # complain about sizes. len() is used rather than truthiness so that
    # array-like inputs are handled as well as plain lists.
    if len(file_list) == 0:
        raise ValueError("file_list is empty; there are no files to split")

    # Split the dataset
    train_files, test_files = train_test_split(
        file_list, test_size=test_size, random_state=random_state
    )

    # Copy files to respective directories. Each file lands in the destination
    # under its base name.
    for destination, files in ((train_path, train_files), (test_path, test_files)):
        for file in files:
            shutil.copy2(file, destination)

    return train_files, test_files
if __name__ == "__main__":
    # Fraction of the files assigned to the test set (0.2 -> 80% train / 20% test).
    test_size = 0.2

    # Source directory with every data file, followed by the two destinations.
    data_dir = "./data/all"
    train_path = "./data/train"
    test_path = "./data/test"

    # Collect the candidate files and report how many were discovered.
    file_list = get_file_list(data_dir)
    print(f"found {len(file_list)} files.")

    # Perform the split (copying files into place) and summarise the outcome.
    train_files, test_files = split_dataset(
        file_list,
        train_path,
        test_path,
        test_size=test_size,
    )
    print(f"Training files: {len(train_files)}, Testing files: {len(test_files)}")
# %%