-
-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathtrain.py
More file actions
116 lines (80 loc) · 3.64 KB
/
train.py
File metadata and controls
116 lines (80 loc) · 3.64 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import os
import re
import pickle
import shutil
import pythainlp
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Name of the sub-directory under ./model/ that will receive the artifacts.
model_name = input('Model name: ')

# Intent records are read from the `intents_th` / `intents_en` columns and
# merged into a single list, Thai entries first.
data_th = pd.read_json('./data/intents_th.json')
data_en = pd.read_json('./data/intents_en.json')
data = [*data_th['intents_th'], *data_en['intents_en']]

training_sentences = []  # lower-cased pattern text, one entry per pattern
training_labels = []     # tag of the intent each pattern belongs to
labels = []              # distinct tags in first-seen order
responses = []           # `responses` value of every intent

# NOTE(review): `responses` is collected once per intent (not once per
# pattern) - confirm this matches the intended nesting.
for entry in data:
    lowered = [text.lower() for text in entry['patterns']]
    training_sentences.extend(lowered)
    training_labels.extend([entry['tag']] * len(lowered))
    responses.append(entry['responses'])
    if entry['tag'] not in labels:
        labels.append(entry['tag'])

# Size of the classifier's output layer.
num_classes = len(labels)
# Remove ASCII punctuation from every sentence, then tokenize it with
# pythainlp (whitespace tokens are dropped).
punctuation = re.compile(r'[\^!"#$%&\'()*+,-./:;<=>?@[\]^_`{|}~]')
wordlist = []
for seq in training_sentences:
    cleaned = punctuation.sub('', seq.lower())
    wordlist.append(pythainlp.word_tokenize(cleaned, keep_whitespace = False))

# Vocabulary: every distinct token produced above.
wordset = list({token for tokens in wordlist for token in tokens})

# Map intent tags to integer class ids (replaces the string labels in place).
label_encoder = LabelEncoder()
training_labels = label_encoder.fit_transform(training_labels)

# Map tokens to integer ids and bring every sentence to exactly 20 ids.
# NOTE(review): pad_sequences pads with 0 by default while the word encoder
# also assigns id 0 to a real token, so padding and that token share an id.
# Confirm with the inference code before changing; it must encode the same way.
word_label_encoder = LabelEncoder()
word_label_encoder.fit(wordset)
encoded_rows = [word_label_encoder.transform(tokens) for tokens in wordlist]
encoded_sentences = pad_sequences(encoded_rows, truncating = 'post', maxlen = 20)
# Sequence length fed to the network and width of the embedding / hidden layers.
max_length = 20
embedding_size = 32

# Embedding -> average over the sequence -> three ReLU layers -> softmax over
# the intent classes.
hidden_layers = [Dense(embedding_size, activation = 'relu') for _ in range(3)]
model = Sequential([
    Embedding(len(wordset), embedding_size, input_length = max_length),
    GlobalAveragePooling1D(),
    *hidden_layers,
    Dense(num_classes, activation = 'softmax'),
])

# Labels are integer class ids, hence the sparse variant of the loss.
model.compile(
    loss = 'sparse_categorical_crossentropy',
    optimizer = 'adam',
    metrics = ['accuracy'],
)
model.fit(encoded_sentences, np.array(training_labels), epochs = 800)
# Directory that receives every artifact produced by this run.
model_dir = f'./model/{model_name}'

# exist_ok=True: this runs only after the full training loop, so a reused
# model name must not raise FileExistsError and throw the trained model away.
os.makedirs(model_dir, exist_ok = True)
model.save(f'{model_dir}/chat_model')

# Persist both encoders so the same tag and token ids can be restored later.
encoders_to_save = (
    ('label_encoder.pickle', label_encoder),
    ('word_label_encoder.pickle', word_label_encoder),
)
for file_name, encoder in encoders_to_save:
    with open(f'{model_dir}/{file_name}', 'wb') as encoder_file:
        pickle.dump(encoder, encoder_file, protocol = pickle.HIGHEST_PROTOCOL)
def _extract_dict_responses(intents):
    """Move dict-valued ``responses`` out of each intent, in place.

    Args:
        intents: Iterable of intent dicts. Cells may also be float NaN,
            which is what ``pd.concat`` uses to pad the shorter column.

    Returns:
        A list with exactly one entry per input cell: the popped
        ``responses`` dict where the intent had one, otherwise ``None``.

    Raises:
        KeyError: If an intent lacks ``responses``, or has dict-valued
            ``responses`` but no ``date``.
    """
    extracted = []
    for intent in intents:
        # NaN padding is a float; there is no intent in this row.
        if isinstance(intent, float):
            extracted.append(None)
            continue
        if isinstance(intent['responses'], dict):
            popped = intent.pop('responses')
            # Normalize a single integer date to a one-element list.
            if isinstance(intent['date'], int):
                intent['date'] = [intent['date']]
            extracted.append(popped)
        else:
            # Always emit one entry per row so the new column stays aligned.
            extracted.append(None)
    return extracted


# Training patterns are not exported. Each language is handled on its own
# because the two lists may differ in length and zip() would stop at the
# shorter one, leaving `patterns` on the tail of the longer list.
for intent_column in (data_en['intents_en'], data_th['intents_th']):
    for intent in intent_column:
        del intent['patterns']

out = pd.concat([data_en, data_th], axis = 1)

# `fes_res_*` hold the dict-valued responses split out of each intent
# (presumably festival/date-specific replies - confirm against the consumer).
fes_res_en = _extract_dict_responses(out['intents_en'])
fes_res_th = _extract_dict_responses(out['intents_th'])
extra_columns = pd.DataFrame(
    {'fes_res_en': fes_res_en, 'fes_res_th': fes_res_th},
    index = out.index,
)
out = pd.concat([out, extra_columns], axis = 1)

out.to_parquet(f'./model/{model_name}/intents.parquet')
shutil.make_archive(f'model/{model_name}', 'zip', f'./model/{model_name}')