# PublicFiles/word doc create.py at master - quantkeyvis/PublicFiles - GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# -*- coding: cp1252 -*-
#python -m pip install blahblah
import BeautifulSoup as bs
import numpy as np
import pandas as pd
import textmining as tm
#import Dcluster as dcl
import urllib
import sys
import re
import random
import numpy as np
import textmining as tm
import sys
import re
import random
import glob
import pickle
from nltk.corpus import stopwords


# English stop-word list from the NLTK corpus; every entry is removed from each
# document by the cleaning loop further down.
stop_words = stopwords.words("english")

# Stray non-ASCII characters (accented letters, typographic quotes and dashes,
# currency signs, zero-width/format characters, lone surrogate halves, ...)
# that the cleaning loop deletes when they occur as stand-alone tokens.
# NOTE(review): u'0xc3' is the four-character text "0xc3", not a single code
#   point -- possibly u'\xc3' was intended; confirm before changing.
# NOTE(review): '\xa0' is the only entry without the u prefix, so under
#   Python 2 it is a byte string while all other entries are unicode.
# NOTE(review): u'\u2013' and u'\xe1' are listed twice; the repeats are harmless.
stop_codec=[u'\xd2',u'\xe0',u'\u2011',u'\ude31',u'\ude0d',u'\ud83d',u'\u05d3',u'\u05de',u'\u05d7',u'\u202c',u'\xe4',u'\xf6',u'\u2013',u'\xf3',u'\xeb',u'\xad',u'\xf4'
            ,u'\u2014',u'\u0302',u'\u200b',u'\xa9',u'\xc9',u'\xed',u'\xfc',u'\xc1',u'\xd6',u'\U0001f9c0',u'\u2013',u'\xe8',u'\u20ac',u'\u2022',u'\u2026',u'\xe1'
            ,u'\xf1',u'\xfa',u'\xe7',u'\xa3',u'\xef','\xa0',u'\u2019',u'\xe9',u'\u201c',u'\u201d',u'0xc3',u'\u2018',u'\u0131',u'\u011f',u'\u015f',u'\u0159',u'\u0101'
            ,u'\u0161',u'\u0301',u'\xbf',u'\xe1',u'\ufffd',u'\u20b9']

# --- Stage 1: load the scraped articles and keep the usable ones -------------
# SECURITY: pickle.load can execute arbitrary code stored in the file; only
# load pickles that come from a trusted source.
# The with-block closes the file even when unpickling fails.
with open(r'articleslinkstitle2017-10-19.pickle', "rb") as pickle_loc:
    sites = pickle.load(pickle_loc)
# The original had this list as a bare (no-op) expression statement; it is
# presumably a record of the keys found in the pickle:
# [u'articles', u'Titles', u'site', u'Links', u'Time_Stamp']
docs = {}
print('got it')

# Articles shorter than this are skipped.
MIN_ARTICLE_LENGTH = 1284
# Only the first eighth of the articles is examined. `//` is integer division
# on every Python version (the original relied on Python 2's integer `/`).
n_looked_at = len(sites['articles']) // 8
for i in range(n_looked_at):
    article = sites['articles'][i]
    if len(article) < MIN_ARTICLE_LENGTH:
        continue
    # The index is appended to the link and the result is used as the document
    # key (presumably so that repeated links do not overwrite each other).
    sites['Links'][i] = sites['Links'][i] + str(i)
    docs[sites['Links'][i]] = article
# Report the number of articles examined. The original printed the last loop
# index (one less than the count) and failed with NameError when the loop
# never ran.
print('This many docs were looked at: '+str(n_looked_at)+' \n and this many were real: '+str(len(docs)))

print('\n')
# --- Stage 2: normalise every document and collect the vocabulary ------------
# Snapshot the keys as a real list: later stages index into it and delete from
# it, and the loop below rewrites docs[...] while walking the keys.
sites = list(docs.keys())

# Build the stop-word / stray-character patterns once instead of once per
# document. re.escape keeps an entry containing a regex metacharacter from
# being interpreted as pattern syntax.
stop_word_patterns = [re.compile('( |$|^)' + re.escape(w) + '( |$|^)')
                      for w in stop_words]
stop_codec_patterns = [re.compile('( |$|^)' + re.escape(c) + '( |$|^)')
                       for c in stop_codec]

# A set is bound before the loop, so an empty corpus yields an empty
# vocabulary instead of a NameError on doc_set.
vocabulary = set()
for index in sites:
    # Lower-case and turn every non-word character into a space. This already
    # covers newlines and tabs, so the original's separate "\n" and "\t"
    # substitutions were no-ops and are omitted.
    doc = re.sub(r'[^\w]', ' ', docs[index].lower())
    # Mask the digits 1-9 with '#', then shorten runs of '#'.
    # NOTE(review): '0' is not in the character class and is left as is --
    # confirm whether that is intended.
    doc = re.sub("[123456789]", "#", doc)
    doc = re.sub("###", "#", doc)
    doc = re.sub("##", "#", doc)
    # Stop words are replaced by a single space ...
    for pattern in stop_word_patterns:
        doc = pattern.sub(" ", doc)
    # ... stand-alone stray characters are removed together with the spaces
    # around them.
    for pattern in stop_codec_patterns:
        doc = pattern.sub("", doc)
    # NOTE(review): an isolated 't' is glued to its neighbours
    # ("x t y" -> "xty"); kept exactly as in the original -- confirm intent.
    doc = re.sub('( |$|^)t( |$|^)', "t", doc)
    # Collapse runs of spaces and trim the ends. The original called
    # doc.replace("  ", " ") without assigning the result, so the string was
    # never changed and split(" ") produced empty-string "words".
    doc = re.sub(' +', ' ', doc).strip(' ')
    docs[index] = doc
    vocabulary.update(doc.split(" "))
doc_set = list(vocabulary)
# --- Stage 3: corpus-wide word counts and rare-word pruning ------------------
# Count whole tokens, in one pass over the documents. The original used
# str.count(word), which counts substrings (the word "a" was credited with
# every letter "a" in the corpus) and rescanned every document once per
# vocabulary entry. Token counting agrees with how the term-document matrix
# is filled later (split on " ").
word_cnt = dict.fromkeys(doc_set, 0)
# The empty string comes from splitting on consecutive spaces; it is not a word.
word_cnt.pop('', None)
for j in sites:
    for token in docs[j].split(" "):
        if token in word_cnt:
            word_cnt[token] += 1

# The 20th percentile is only reported; the threshold that is applied is the
# number of documents, exactly as in the original (which computed the
# percentile and then overwrote it).
# NOTE(review): confirm which of the two cut-offs was meant to be used.
print('Word Cut-off is: '+str(np.percentile(np.array(list(word_cnt.values())),20)))
cut_off = len(sites)

# Words seen at most cut_off times are dropped from the vocabulary and from
# every document. Filtering tokens against a set replaces one regex pass per
# (word, document) pair and also removes a rare word that is immediately
# repeated, which the space-consuming regex of the original missed.
rare_words = set(w for w in word_cnt if word_cnt[w] <= cut_off)
for j in sites:
    docs[j] = ' '.join(t for t in docs[j].split(' ')
                       if t and t not in rare_words)
for w in rare_words:
    del word_cnt[w]
doc_set = list(word_cnt)
##rmv_wd=[]
##for i in doc_set:
##    if '\\' in i:
##        print(i)
##        rmv_wd.append(i)
##print('\n')
##decide=input("Remove this list of 'words'? (y/n)")
##if decide=='y':
##    doc_set=list(set(doc_set)-set(rmv_wd))
# --- Stage 4: build the term-document matrix and write it to disk ------------
# The frequency table is no longer needed; drop the reference.
word_cnt = None
print('doc splitted')

# docs_names takes over the list of document keys; `sites` is cleared.
docs_names = sites
sites = None
# Template row: every vocabulary word with a count of zero.
doc_dict = dict.fromkeys(doc_set, 0)
tdl = {}
print('index made')
print(len(docs_names))

# Consume the documents one at a time, removing each from `docs` as soon as it
# has been read, until none are left.
while docs:
    index = docs_names.pop(0)
    tokens = docs.pop(index).split(" ")
    row = doc_dict.copy()
    for word in tokens:
        # Tokens outside the vocabulary are ignored.
        if word in row:
            row[word] += 1
    tdl[index] = row
print('have tdm')

# One column per document, one row per vocabulary word.
data = pd.DataFrame(tdl)
tdl = None
data.to_csv(r'play_tdm.csv')
sys.exit()
6421