-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdataloader.py
More file actions
145 lines (130 loc) · 5.64 KB
/
dataloader.py
File metadata and controls
145 lines (130 loc) · 5.64 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
import csv
import string
import numpy as np
import tensorflow as tf
import conf
from sklearn.preprocessing import StandardScaler
# convert index list to string
def index2str(index_list):
    """
    Translate a sequence of label indices into their phoneme symbols.

    :param index_list: iterable of integer label indices
    :return: list with ``conf.PHONEMES[idx]`` for every index that is
             ``<= conf.PHONEME_SIZE``; larger indices are skipped (they do not
             terminate the conversion)
    """
    # NOTE(review): the bound is inclusive, which assumes conf.PHONEMES has an
    # entry at position conf.PHONEME_SIZE -- confirm against conf.
    return [conf.PHONEMES[idx] for idx in index_list if idx <= conf.PHONEME_SIZE]
# convert sentence to index list
def str2index(str_):
    """
    Encode a sentence as a list of label indices.

    The text is normalised first: runs of whitespace collapse to single spaces,
    ASCII punctuation (``string.punctuation``) is removed and the result is
    lower-cased. Each remaining character is then looked up in
    ``conf.PHONEMES_DICT``.

    :param str_: raw sentence text
    :return: list of indices, one per character found in ``conf.PHONEMES_DICT``;
             characters missing from the dictionary are dropped silently
    """
    collapsed = ' '.join(str_.split())
    strip_punctuation = str.maketrans('', '', string.punctuation)
    normalised = collapsed.translate(strip_punctuation).lower()

    indices = []
    for symbol in normalised:
        try:
            index = conf.PHONEMES_DICT[symbol]
        except KeyError:
            # out-of-vocabulary character: intentionally skipped
            continue
        else:
            indices.append(index)
    return indices
class DataLoader(object):
    """
    tf.data input pipeline over preprocessed MFCC features and integer labels.

    On construction every MFCC array listed in the meta CSV is loaded into
    memory and a repeating, shuffled, padded-batch dataset is built. Each batch
    is the tuple ``(sparse_label, label_text, x, seq_len, mfcc_file_list)``
    produced by :meth:`to_sparse_representation`.
    """

    # Label index used to pad label sequences inside a batch (the original
    # comments call it <EMP> / <Epsilon>). It is stripped again when the dense
    # label matrix is converted to a sparse tensor, so both places must agree.
    _PAD_LABEL = 27

    def __init__(self, batch_size=16, set_name='train'):
        """
        Load the data set described by ``<set_name>.csv`` and build the pipeline.

        :param batch_size: number of utterances per padded batch
        :param set_name: base name of the meta CSV under
                         ``conf.PREPROCESSED_DATA + 'preprocess/meta/'``
        """
        # load meta file: column 0 is the MFCC file stem, remaining columns are label indices
        label, mfcc_files, mfccs = [], [], []
        with open(conf.PREPROCESSED_DATA + 'preprocess/meta/%s.csv' % set_name) as csv_file:
            reader = csv.reader(csv_file, delimiter=',')
            for row in reader:
                mfcc_filename = conf.PREPROCESSED_DATA + 'preprocess/mfcc/' + row[0] + '.npy'
                # stored arrays are transposed on load; axis 0 is then used as the sequence axis
                mfccs.append(np.load(mfcc_filename, allow_pickle=False).T)
                mfcc_files.append(mfcc_filename)
                # label info: serialised to raw bytes so variable-length labels fit in one
                # string tensor. ``int`` is the dtype the removed ``np.int`` alias pointed to,
                # and ``tobytes`` replaces the deprecated ``tostring``.
                label.append(np.asarray(row[1:], dtype=int).tobytes())

        # Zero-mean / unit-variance normalisation (StandardScaler over all stacked frames)
        # was tried here and disabled because it "somehow causes 0 loss".
        # NOTE(review): the disabled code split the normalised frames back with
        # np.split(frames, lengths); np.split expects cumulative split offsets, not
        # per-utterance lengths -- check that before re-enabling.

        # to constant tensor
        label_t = tf.convert_to_tensor(label)

        # Generator used to feed the MFCCs into a Dataset: utterances have uneven
        # lengths, which from_tensor_slices could not handle.
        def gen():
            yield from mfccs

        # New pipeline: loads mfcc content, labels, and mfcc file name
        # NOTE(review): the feature width is hard-coded to 20 here while the padding
        # below uses conf.FEATURE_DIM -- presumably the same value; confirm.
        datasource_mfcc = tf.data.Dataset.from_generator(gen, output_types=tf.float32, output_shapes=[None, 20])
        datasource_labels = tf.data.Dataset.from_tensor_slices(label_t)
        datasource_sound_file = tf.data.Dataset.from_tensor_slices(mfcc_files)
        datasource = tf.data.Dataset.zip((datasource_labels, datasource_mfcc, datasource_sound_file))

        # Python processing function
        dataset = datasource.map(
            lambda x, y, z: tf.py_func(func=self._load_mfcc, inp=[x, y, z],
                                       Tout=[tf.int32, tf.string, tf.float32, tf.int32, tf.string]),
            num_parallel_calls=64)
        dataset = dataset.repeat()
        dataset = dataset.shuffle(1000)

        # Padding batch so the output has the same shape. E.g. MFCCs have different
        # sequence lengths; this pads dynamically to the longest item in the batch.
        dataset = dataset.padded_batch(batch_size,
                                       padded_shapes=([None], [], [None, conf.FEATURE_DIM], 1, []),
                                       padding_values=(self._PAD_LABEL, '', 0.0, 0, ''))

        # Sparse matrix conversion
        dataset = dataset.map(self.to_sparse_representation, num_parallel_calls=24)
        dataset = dataset.prefetch(100)

        # Iterator used for initialization
        self.dataset = dataset
        self.iterator = dataset.make_initializable_iterator()
        self.next_batch = self.iterator.get_next()

    def training_set(self):
        """
        :return: the ``get_next()`` op of the iterator, i.e. the batch tuple
                 ``(sparse_label, label_text, x, seq_len, mfcc_file_list)``
        """
        return self.next_batch

    def _load_mfcc(self, label, mfcc, mfcc_file):
        """
        Per-example processing function run through ``tf.py_func``.

        :param label: label indices serialised as raw bytes (see ``__init__``)
        :param mfcc: 2-D MFCC array whose axis 0 is the sequence axis
        :param mfcc_file: path of the MFCC file, passed through unchanged
        :return: ``(label as int32 array, original label bytes, mfcc as float32,
                 [sequence length as int32], mfcc_file)``
        """
        seq_len = np.int32(mfcc.shape[0])
        mfcc = mfcc.astype('float32')
        # decode bytes back to integers with the same dtype used for encoding;
        # frombuffer replaces the deprecated binary mode of np.fromstring and
        # astype makes the writable int32 copy
        label_encoded = np.frombuffer(label, dtype=int).astype('int32')
        # seq_len is wrapped in a list so it becomes a rank-1 tensor for padded_batch
        return label_encoded, label, mfcc, [seq_len], mfcc_file

    def _augment_speech(self, mfcc):
        """
        Randomly shift an MFCC array along axis 1 and zero the vacated columns.

        Cited in another paper that augmented mfcc are more resilient to noise.

        :param mfcc: 2-D MFCC array; it is not modified (``np.roll`` returns a copy)
        :return: shifted copy of ``mfcc``
        """
        # NOTE(review): randint's upper bound is exclusive, so r is in {-2, -1, 0, 1};
        # confirm whether a symmetric range was intended.
        r = np.random.randint(-2, 2)
        # shifting mfcc
        mfcc = np.roll(mfcc, r, axis=1)
        # zero padding: blank the columns that wrapped around
        if r > 0:
            mfcc[:, :r] = 0
        elif r < 0:
            mfcc[:, r:] = 0
        return mfcc

    def to_sparse_representation(self, labels, label_text, x, seq_len, mfcc_file_list):
        """
        Convert the dense, padded label matrix of a batch into a sparse tensor.

        :param labels: dense label matrix padded with ``_PAD_LABEL``
        :param label_text: passed through unchanged
        :param x: passed through unchanged
        :param seq_len: passed through unchanged
        :param mfcc_file_list: passed through unchanged
        :return: ``(sparse_label, label_text, x, seq_len, mfcc_file_list)``
        """
        # keep every position that is not padding
        indices = tf.where(tf.not_equal(labels, self._PAD_LABEL))
        sparse_label = tf.SparseTensor(indices=indices,
                                       values=tf.gather_nd(tf.cast(labels, tf.int32), indices),
                                       dense_shape=tf.cast(tf.shape(labels), tf.int64))
        return sparse_label, label_text, x, seq_len, mfcc_file_list