-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathUsingEmoticons.py
More file actions
99 lines (84 loc) · 3.81 KB
/
UsingEmoticons.py
File metadata and controls
99 lines (84 loc) · 3.81 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
WQU Project
$This is an implementation of a tweet sentiment classifier. The accuracy on the test data containing positive and negative sentiment tweets is 84%.
Training and test data was downloaded from Univesity of Stanford data repository
%matplotlib inline
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, roc_curve
from sklearn.pipeline import Pipeline
<Read data>
columns = ['polarity', 'tweetid', 'date', 'query_name', 'user', 'text']
dftrain = pd.read_csv('stanford-sentiment-twitter-data/training.1600000.processed.noemoticon.csv',
header = None,
encoding ='ISO-8859-1')
dftest = pd.read_csv('stanford-sentiment-twitter-data/testdata.manual.2009.06.14.csv',
header = None,
encoding ='ISO-8859-1')
dftrain.columns = columns
dftest.columns = columns
Text pre-processing
class RegexPreprocess(object):
"""Create a preprocessing module for a tweet or data structure of tweets.
1) replace username, e.g., @crawles -> USERNAME
2) replace http links -> URL
3) replace repeated letters to two letters
"""
user_pat = '(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z]+[A-Za-z0-9]+)'
http_pat = '(https?:\/\/(?:www\.|(?!www))[^\s\.]+\.[^\s]{2,}|www\.[^\s]+\.[^\s]{2,})'
repeat_pat, repeat_repl = "(.)\\1\\1+",'\\1\\1'
def __init__(self):
pass
def transform(self, X):
is_pd_series = isinstance(X, pd.core.frame.Series)
if not is_pd_series:
pp_text = pd.Series(X)
else:
pp_text = X
pp_text = pp_text.str.replace(pat = self.user_pat, repl = 'USERNAME')
pp_text = pp_text.str.replace(pat = self.http_pat, repl = 'URL')
pp_text.str.replace(pat = self.repeat_pat, repl = self.repeat_repl)
return pp_text
def fit(self, X, y=None):
return self
Train and test model
In [4]:
sentiment_lr = Pipeline([('regex_preprocess', RegexPreprocess()),
('count_vect', CountVectorizer(min_df = 100,
ngram_range = (1,1),
stop_words = 'english')),
('lr', LogisticRegression())])
sentiment_lr.fit(dftrain.text, dftrain.polarity)
Out[4]:
Pipeline(steps=[('regex_preprocess', <__main__.RegexPreprocess object at 0x1354bc6a0>), ('count_vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
lowercase=True, max_df=1.0, max_features=None, min_df=10...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
verbose=0, warm_start=False))])
In [16]:
Xtest, ytest = dftest.text[dftest.polarity!=2], dftest.polarity[dftest.polarity!=2]
print(classification_report(ytest,sentiment_lr.predict(Xtest)))
precision recall f1-score support
0 0.86 0.81 0.83 177
4 0.82 0.87 0.85 182
avg / total 0.84 0.84 0.84 359
Export model for production
In [13]:
import dill
f = open('twitter_sentiment_model.pkl','wb')
r = RegexPreprocess()
dill.dump(sentiment_lr, f)
f.close()
In [15]:
# test
f = open('twitter_sentiment_model.pkl','rb')
cl = dill.load(f)
print(classification_report(ytest,cl.predict(Xtest)))
print(cl.predict_proba("Hello big beautiful world"))
f.close()
precision recall f1-score support
0 0.86 0.81 0.83 177
4 0.82 0.87 0.85 182
avg / total 0.84 0.84 0.84 359
[[ 0.07068138 0.92931862]]