-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata.py
More file actions
63 lines (54 loc) · 3 KB
/
data.py
File metadata and controls
63 lines (54 loc) · 3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import os
import pandas as pd
import torch
from torch.nn.utils.rnn import pad_sequence
class PatientDataset(torch.utils.data.Dataset):
    """Per-patient dataset combining static tabular features with a time series.

    Each item is a ``(sample, label)`` pair where ``sample`` holds three
    tensors: ``'numerical'`` (float32 static features), ``'categorical'``
    (long-typed codes), and ``'time_series'`` (float32, variable length,
    loaded from a per-patient CSV). ``label`` is the patient's
    ``composite_outcome`` as an int.
    """

    def __init__(self, patient_ids, dataset_dir, numerical_cols, categorical_cols, numerical_norm='zscore',
                 ts_norm='zscore'):
        self.patient_ids = patient_ids
        self.numerical_cols = numerical_cols
        self.categorical_cols = categorical_cols
        self.data_dir = dataset_dir
        self.numerical_norm = numerical_norm
        self.ts_norm = ts_norm
        # All patients' static features live in a single normalization-specific CSV.
        self.df = pd.read_csv(os.path.join(self.data_dir, f'patients_{numerical_norm}.csv'))

    def __len__(self):
        return len(self.patient_ids)

    def __getitem__(self, idx):
        pid = self.patient_ids[idx]
        # Look up this patient's static-feature row by ID.
        row = self.df[self.df['ID'] == pid]
        static_num = row[self.numerical_cols].values[0].astype(float)
        static_cat = row[self.categorical_cols].values[0].astype(int)
        # The time series is read lazily from its own per-patient file.
        series = pd.read_csv(os.path.join(self.data_dir, f'{pid}_{self.ts_norm}.csv')).values
        sample = {
            'numerical': torch.tensor(static_num, dtype=torch.float32),
            'categorical': torch.tensor(static_cat, dtype=torch.long),
            'time_series': torch.tensor(series, dtype=torch.float32),
        }
        label = row['composite_outcome'].values[0].astype(int)
        return sample, label
def collate_batch(batch):
    """Collate ``(sample, label)`` pairs into padded, length-sorted batch tensors.

    Samples are reordered by descending time-series length (the layout
    expected by packed-sequence RNN utilities). Returns
    ``(num_batch, cat_batch, ts_batch, ts_lengths, labels)`` where
    ``ts_batch`` is zero-padded to the longest sequence and ``labels``
    is a plain list following the same ordering.
    """
    samples, labels = zip(*batch)
    numericals = torch.stack([s['numerical'] for s in samples])
    categoricals = torch.stack([s['categorical'] for s in samples])
    series = [s['time_series'] for s in samples]
    lengths = torch.tensor([len(ts) for ts in series])
    # Zero-pad every sequence up to the batch maximum: [B, max_len, feat].
    padded = pad_sequence(series, batch_first=True)
    # Sort descending by length and apply the same permutation everywhere.
    ts_lengths, order = lengths.sort(descending=True)
    sorted_labels = [labels[i] for i in order]
    return numericals[order], categoricals[order], padded[order], ts_lengths, sorted_labels
def load_splits(dataset_dir):
    """Load the five cross-validation splits from ``<dataset_dir>/exp1_5fold``.

    Each fold has tab-separated ``foldK_train.txt`` / ``foldK_test.txt``
    files with patient IDs in column 0 and labels in column 1. Returns
    two 5-element lists: ``splits_samples`` of ``(train_ids, test_ids)``
    arrays and ``splits_labels`` of ``(train_labels, test_labels)`` arrays.
    """
    work_dir = os.path.join(dataset_dir, 'exp1_5fold')
    splits_samples, splits_labels = [], []
    for fold in range(1, 6):
        train_df = pd.read_csv(os.path.join(work_dir, f'fold{fold}_train.txt'), header=None, sep='\t')
        test_df = pd.read_csv(os.path.join(work_dir, f'fold{fold}_test.txt'), header=None, sep='\t')
        splits_samples.append((train_df[0].values, test_df[0].values))
        splits_labels.append((train_df[1].values, test_df[1].values))
    return splits_samples, splits_labels