-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathWordcloud_Generator.py
More file actions
150 lines (122 loc) · 5.58 KB
/
Wordcloud_Generator.py
File metadata and controls
150 lines (122 loc) · 5.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
# -*- coding: utf-8 -*-
import sys, os, csv
import re #regex
import tweepy
import operator
import unicodedata #smileys
import datetime
import json
import pickle
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
from collections import defaultdict
from dateutil import parser
class Wordcloud_Generator:
    """Builds before/after word clouds from CSV tweet dumps.

    Tweets are split around a border date; words common to a configured
    exclusion list, emoji, and punctuation-bearing tokens are dropped.
    """

    def __init__(self, commonWords, reportFileName, tweetsFolder):
        # Matches tokens consisting purely of emoji code points so they can be skipped.
        self.emoji_pattern = re.compile("["
            u"\U0001F600-\U0001F64F"  # emoticons
            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
            u"\U0001F680-\U0001F6FF"  # transport & map symbols
            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
            "]+", flags=re.UNICODE)
        # Tokens containing any of these punctuation characters are discarded outright.
        self.regexp = re.compile(r'[!@#$:).;,?&]')
        # commonWords arrives as a JSON-encoded list of words to exclude.
        self.commonWords = json.loads(commonWords)
        # Dataset with sentiment-wise evaluated words using values within interval <-5,5>.
        # NOTE(review): the canonical dataset is usually named AFINN-111.txt (one F) —
        # confirm the local filename matches.
        self.sentiment_dict = {}
        with open('AFFINN-111.txt') as affin:  # 'with' ensures the handle is closed
            for line in affin:
                word, score = line.split('\t')
                self.sentiment_dict[word] = int(score)
        self.mypath = os.path.dirname(__file__)
        self.reportFileName = reportFileName
        self.tweetsFolder = tweetsFolder

    def createWordcloud(self, chartsFolder, maskFolder, maxCloudWords, borderDateString):
        """Generate before/after word clouds and patch placeholders in the report file.

        chartsFolder: output folder for the PNG charts.
        maskFolder: folder that may contain mask.png for a shaped cloud.
        maxCloudWords: maximum number of words per cloud (string or int).
        borderDateString: parseable date splitting tweets into before/after.
        """
        borderDate = parser.parse(borderDateString)
        # Accumulate words in lists and join once — avoids O(n^2) string concatenation.
        beforeTokens = []
        afterTokens = []
        # Read tweets from every file in the tweets folder.
        tweetFilesPath = os.path.join(self.mypath, self.tweetsFolder)
        tweetFiles = [f for f in os.listdir(tweetFilesPath) if os.path.isfile(os.path.join(tweetFilesPath, f))]
        for file in tweetFiles:
            with open(os.path.join(tweetFilesPath, file)) as csvFile:
                reader = csv.reader(csvFile, delimiter=';')
                next(reader)  # skip header row (reader.next() is Python-2-only)
                for row in reader:
                    tweet = row[4]  # assumes column 4 holds the tweet text — per CSV layout
                    # Posted date is row-invariant: parse once per row, not once per word.
                    postedDate = parser.parse(row[1].split(' ', 1)[0])
                    bucket = beforeTokens if postedDate < borderDate else afterTokens
                    for word in tweet.split():
                        if self.emoji_pattern.match(word) or self.isCommon(word.lower()):
                            continue
                        if self.regexp.search(word):
                            continue
                        bucket.append(word)
        words = ",".join(beforeTokens)
        wordsAfter = ",".join(afterTokens)
        # FIX: Wordcloud interprets word HELLO-WORLD as two separate words, therefore
        # I'm replacing "-" with empty string to make the word unique.
        words = words.replace("-", "")
        wordsAfter = wordsAfter.replace("-", "")
        # FIX: "/" interpreted as space as well, causing word STATUS to be popular.
        words = words.replace("/", "")
        wordsAfter = wordsAfter.replace("/", "")
        wordcloud = WordCloud(stopwords=STOPWORDS, max_words=int(maxCloudWords)).generate(words)
        wordcloud2 = WordCloud(stopwords=STOPWORDS, background_color='white', max_words=int(maxCloudWords)).generate(wordsAfter)
        self.plotWordcloud(wordcloud, wordcloud2, chartsFolder, 'CommonWords')
        # Now plot the same but without shared words.
        uniqueWords = self.getUniqueWords(words, wordsAfter)
        uniqueWordsAfter = self.getUniqueWords(wordsAfter, words)
        wordcloud = WordCloud(stopwords=STOPWORDS, max_words=int(maxCloudWords)).generate(uniqueWords)
        wordcloud2 = WordCloud(stopwords=STOPWORDS, background_color='white', max_words=int(maxCloudWords)).generate(uniqueWordsAfter)
        self.plotWordcloud(wordcloud, wordcloud2, chartsFolder, 'UniqueWords')
        # Replace placeholders in the report file.
        with open(self.reportFileName) as f:
            newText = f.read().replace('<EXCLUDED_WORDS_LIST>', ",".join(self.commonWords))
        newText = newText.replace('<BORDER_DATE>', borderDateString)
        with open(self.reportFileName, "w") as f:
            f.write(newText)
        # Generate shaped wordcloud if there's a mask.
        maskFolderPath = os.path.join(self.mypath, maskFolder)
        if any(File.endswith(".png") for File in os.listdir(maskFolderPath)):
            # FIX: join with "," so the last before-word and first after-word don't fuse.
            self.createShapedWordcloud(words + "," + wordsAfter, chartsFolder, maskFolder)

    def transform_mask_format(self, val):
        """Map mask value 0 (treated as 'draw here' by WordCloud) to 255 (masked out)."""
        if val == 0:
            return 255
        else:
            return val

    def createShapedWordcloud(self, allWords, chartsFolder, maskFolder):
        """Render a mask-shaped word cloud to <chartsFolder>/shapedWords.png.

        Expects <maskFolder>/mask.png; assumes the image loads as a 2-D
        (grayscale) array — TODO confirm, RGB masks would break the remap below.
        """
        wine_mask = np.array(Image.open(maskFolder + "/mask.png"))
        transformed_wine_mask = np.ndarray((wine_mask.shape[0], wine_mask.shape[1]), np.int32)
        for i in range(len(wine_mask)):
            transformed_wine_mask[i] = list(map(self.transform_mask_format, wine_mask[i]))
        wc = WordCloud(colormap="Reds", max_words=500, mask=transformed_wine_mask, stopwords=STOPWORDS, contour_width=1, contour_color='white')
        wc.generate(allWords)
        wc.to_file(chartsFolder + "/shapedWords.png")

    def getUniqueWords(self, commaSeparatedWords1, commaSeparatedWords2):
        """Compare two comma-separated strings; return only words unique to the first,
        as a comma-separated string (order is unspecified — set difference)."""
        wordset1 = set(commaSeparatedWords1.split(","))
        wordset2 = set(commaSeparatedWords2.split(","))
        return ','.join(wordset1 - wordset2)

    def isCommon(self, word):
        """Return True if the stripped word is in the configured exclusion list.

        Decodes bytes input for Python 2 compatibility; on Python 3 the old
        unconditional .decode('utf-8') crashed on str.
        """
        stripped = word.strip()
        if isinstance(stripped, bytes):
            stripped = stripped.decode('utf-8')
        return stripped in self.commonWords

    def plotWordcloud(self, before, after, chartsFolder, picName):
        """Save a two-panel (Before/After) figure to <chartsFolder>/<picName>.png."""
        plt.subplot(211)
        plt.title('Before')
        plt.axis('off')
        # interpolation needed because Matplotlib v2 doesn't do image interpolation
        # for imshow any more (https://github.com/amueller/word_cloud/issues/188)
        plt.imshow(before, interpolation="bilinear")
        plt.subplot(212)
        plt.title('After')
        plt.axis('off')
        plt.imshow(after, interpolation="bilinear")
        plt.savefig(chartsFolder + "/" + picName + ".png")
        plt.close()