# Sentiment Analysis of Movie Reviews using Deep Learning
# --- standard library ---
import argparse
import csv
import os

# --- third-party ---
import h5py
import numpy as np
import pandas as pd

from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing import sequence
from keras.datasets import imdb

from azureml.dataprep.package import run
from azureml.logging import get_azureml_logger
# initialize the logger
run_logger = get_azureml_logger()
run_logger.log('amlrealworld.SentimentAnalysis.SentimentExtraction','true')
def read_reviews_from_csv(dataset):
'''
Reads the csv file containing reviews and sentiments.
@param
dataset = input dataset
@returns:
df: a dataframe containing the reviews and sentiments
'''
df = pd.read_csv(dataset, encoding='cp437', sep='|')
df = df.apply(lambda x: x.astype(str).str.lower())
return df
def train_model(dataset, ratio=.5):
'''
Main function to build the model. The funcion sets parameters for building the model.
@returns:
model: model built using the reviews
'''
# set parameters:
max_features = 5000
maxlen = 400
batch_size = 32
embedding_dims = 50
filters = 250
kernel_size = 3
hidden_dims = 250
epochs = 2
seed = 113
# get the reviews_list and labels_ist from the csv file
df = read_reviews_from_csv(dataset)
rows, columns = df.shape
reviews_list = []
labels_list = []
for i in range(0, rows):
try:
labels_list.append(int(float(df.iloc[i,1])))
reviews_list.append(df.iloc[i,0])
except UnicodeEncodeError:
pass
# get the corresponding vectors from the data set
reviews_list_vec = get_vectors_from_text(reviews_list)
# shuffle the data set
np.random.seed(seed)
np.random.shuffle(reviews_list_vec)
np.random.seed(seed)
np.random.shuffle(labels_list)
# split the data set into train and test data
x_train = reviews_list_vec[:int(len(reviews_list)*ratio)]
y_train = labels_list[:int(len(labels_list)*ratio)]
x_test = reviews_list_vec[int(len(reviews_list)*ratio):]
y_test = labels_list[int(len(labels_list)*ratio):]
print('Building model...')
model = Sequential()
# we start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions
model.add(Embedding(max_features,
embedding_dims,
input_length=maxlen))
model.add(Dropout(0.2))
# we add a Convolution1D, which will learn filters
# word group filters of size filter_length:
model.add(Conv1D(filters,
kernel_size,
padding='valid',
activation='relu',
strides=1))
model.add(GlobalMaxPooling1D())
model.add(Dense(hidden_dims))
model.add(Dropout(0.2))
model.add(Activation('relu'))
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy',
optimizer='adam',
metrics=['accuracy'])
model.fit(x_train, y_train,
batch_size=batch_size,
epochs=epochs,
validation_data=(x_test, y_test))
return model
def get_vectors_from_text(dataset_list, word_to_ind=None,
                          start_char=1,
                          index_from=3,
                          maxlen=400,
                          num_words=5000,
                          oov_char=2, skip_top=0):
    '''
    Gets the list vector mapped according to the word to indices dictionary.

    @param
    dataset_list = list of review texts in unicode format
    word_to_ind = word to indices dictionary; defaults to the Keras IMDB
                  word index, fetched lazily on first call. (The original
                  used ``imdb.get_word_index()`` as the default argument,
                  which ran the download/parse once at module import time.)
    hyperparameters: start_char-->sentence starting after this char.
                     index_from-->indices below this will not be encoded.
                     maxlen-->maximum length of the sequence to be considered.
                     num_words-->number of words to be considered according to
                                 the rank. Rank is given according to the
                                 frequency of occurrence.
                     oov_char-->out-of-vocabulary character.
                     skip_top-->number of top-ranked words to be skipped.
    @returns:
    x_train: padded array of index sequences, one row per review text
    '''
    # Lazy default: avoids evaluating imdb.get_word_index() as an import-time
    # side effect (network/disk hit and a shared mutable default).
    if word_to_ind is None:
        word_to_ind = imdb.get_word_index()
    x_train = []
    for review_string in dataset_list:
        # tokenize, then keep only tokens present in the dictionary
        tokens = text_to_word_sequence(review_string)
        x_train.append([word_to_ind[w] for w in tokens if w in word_to_ind])
    # add the start char and shift all indices up by index_from
    if start_char is not None:
        x_train = [[start_char] + [w + index_from for w in x] for x in x_train]
    elif index_from:
        x_train = [[w + index_from for w in x] for x in x_train]
    # truncate to maxlen (padding to exactly maxlen happens below)
    x_train = [x[:maxlen] for x in x_train]
    # if num_words is not given, infer it from the largest index present
    if not num_words:
        num_words = max([max(x) for x in x_train])
    # by convention, use 2 as OOV word
    # reserve 'index_from' (=3 by default) characters:
    # 0 (padding), 1 (start), 2 (OOV)
    if oov_char is not None:
        x_train = [[w if (skip_top <= w < num_words) else oov_char for w in x]
                   for x in x_train]
    else:
        x_train = [[w for w in x if (skip_top <= w < num_words)]
                   for x in x_train]
    # pad the sequences and return the vector form of the text
    return sequence.pad_sequences(x_train, maxlen=maxlen)
def predict_review(model, review_text):
    '''
    Score a single review text with the trained model.

    @param
    model: SequentialModel which we trained the data on.
    review_text: Review text to be predicted on
    @returns
    sentiment score on the review text.
    '''
    # vectorize the text, then reshape it into a 1-row batch for predict()
    vector = get_vectors_from_text([review_text])[0]
    batch = np.reshape(vector, (1, len(vector)))
    return model.predict(batch)[0][0]
# the dataset in the csv format
dataset = 'sampleReviews.txt'
review_text = 'i loved the movie'
# now train the model using the dataset
model = train_model(dataset)
print("Review Sentiment:", predict_review(model, review_text.lower()))
# model.save raises if the target directory does not exist, so create it
# first (idempotent thanks to exist_ok)
os.makedirs('./outputs', exist_ok=True)
model.save('./outputs/sentModel.h5')