-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfunctions.py
More file actions
116 lines (94 loc) · 3.84 KB
/
functions.py
File metadata and controls
116 lines (94 loc) · 3.84 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk import word_tokenize
import re
import os
import datetime
import pickle
import tweepy
import warnings
from os import path
from wordcloud import WordCloud
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from nltk.data import load
from sklearn.neural_network import MLPClassifier
# Install a process-wide "ignore" filter: no warnings are shown after this line runs.
warnings.filterwarnings("ignore")
# Tagset table loaded through nltk.data.load (presumably the Penn Treebank tagset,
# going by the resource name). Its keys are used below as the POS tag names.
tagdict = load('help/tagsets/upenn_tagset.pickle')
# Tag names in the table's own iteration order; cleanupdata emits one count
# feature per entry, in this order.
nlp_keys = tagdict.keys()
def cleanupdata(tweet, date):
    """Turn one tweet into a flat numeric feature vector.

    Args:
        tweet: Raw tweet text.
        date: Object exposing an ``hour`` attribute (e.g. a ``datetime``).

    Returns:
        A 1-D numpy array laid out as::

            [mentions, has_link, hashtags,
             num_words, num_chars, avg_word_len, num_stopwords,
             num_hashtags, num_numerics, num_uppercase,
             <one count per tag in nlp_keys, in nlp_keys order>,
             hour]

    Raises:
        AttributeError: if ``date`` has no ``hour`` attribute.
        KeyError: if the POS tagger emits a tag that is not in ``nlp_keys``.
    """
    url_pattern = r'https?://[A-Za-z0-9./]+'
    mention_pattern = r'@[A-Za-z0-9]+'
    hashtag_pattern = r'#[A-Za-z0-9./]+'

    def avg_word(sentence):
        """Return the mean length of the whitespace-separated words (0 if none)."""
        words = sentence.split()
        if not words:
            return 0
        return sum(len(word) for word in words) / len(words)

    # Counts taken straight from the raw tweet.
    mentions = len(re.findall(mention_pattern, tweet))
    # A 0.0/1.0 flag, not a count: 1.0 as soon as the tweet contains any link.
    links = float(re.search(url_pattern, tweet) is not None)
    hashtags = len(re.findall(hashtag_pattern, tweet))
    hour = date.hour

    # Drop links first, then every character that is not alphanumeric or a space.
    text = re.sub(r'[^a-zA-Z0-9 ]', '', re.sub(url_pattern, '', tweet))
    # NOTE(review): as written this removes *every* space, so num_words below is
    # always 1 and the trailing-space trim can never fire. It looks like it was
    # meant to collapse double spaces into one. Behaviour is kept as-is because
    # the saved model was presumably trained on these exact features; confirm
    # against the training pipeline before changing it.
    text = text.replace(' ', '')
    if text != '' and text[-1] == ' ':
        text = text[:-1]

    # A set makes the per-word stopword test O(1) instead of a list scan.
    stop = set(stopwords.words('english'))
    words = text.split()
    basic_features = [
        len(text.split(" ")),                        # num_words
        len(text),                                   # num_chars
        avg_word(text),                              # avg_word_len
        sum(1 for word in words if word in stop),    # num_stopwords
        hashtags,                                    # hashtag count (again)
        sum(1 for word in words if word.isdigit()),  # num_numerics
        sum(1 for word in words if word.isupper()),  # num_uppercase
    ]

    # Part-of-speech tag histogram; '#' is stripped before tokenising.
    nlp_dict = {key: 0 for key in nlp_keys}
    for _, tag in nltk.pos_tag(word_tokenize(tweet.replace('#', ''))):
        nlp_dict[tag] += 1
    # dict preserves insertion order, so the counts line up with nlp_keys.
    tag_counts = np.array(list(nlp_dict.values()), dtype=float)

    return np.concatenate([
        np.array([mentions, links, hashtags]),
        np.array(basic_features),
        tag_counts,
        [hour],
    ])
# Check whether the user wrote the tweet themselves (not a retweet, not a reply).
def from_creator(status):
    """Return True only for an original tweet.

    Args:
        status: Tweet object (presumably a tweepy ``Status``) exposing
            ``in_reply_to_status_id``, ``in_reply_to_screen_name`` and
            ``in_reply_to_user_id``; a ``retweeted_status`` attribute is
            present only on retweets.

    Returns:
        False if ``status`` has a ``retweeted_status`` attribute or if any of
        the three reply fields is not None; True otherwise.
    """
    if hasattr(status, 'retweeted_status'):
        return False
    # `or` short-circuits, so the fields are read in the same order as before
    # and later ones are skipped once a reply marker is found.
    is_reply = (
        status.in_reply_to_status_id is not None
        or status.in_reply_to_screen_name is not None
        or status.in_reply_to_user_id is not None
    )
    return not is_reply
# Get the saved model's prediction for a new tweet.
def get_prediction(text, date):
    """Classify one tweet with the model stored in ``mlpsave.sav``.

    Args:
        text: Raw tweet text, passed to ``cleanupdata``.
        date: Object with an ``hour`` attribute, passed to ``cleanupdata``.

    Returns:
        True if the model's prediction equals 1, False otherwise.

    Raises:
        OSError: if ``mlpsave.sav`` cannot be opened.
    """
    x_tweet = cleanupdata(text, date)
    # NOTE: unpickling can execute arbitrary code; only load a trusted model file.
    # The context manager closes the file handle, which was previously leaked.
    with open('mlpsave.sav', 'rb') as model_file:
        loaded_model = pickle.load(model_file)
    # predict() gets a one-row batch; the result is compared to 1 as a whole,
    # exactly as the original `if prediction == 1` did.
    prediction = loaded_model.predict([x_tweet])
    return bool(prediction == 1)