-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtf_idf_driver.py
More file actions
48 lines (40 loc) · 1.26 KB
/
tf_idf_driver.py
File metadata and controls
48 lines (40 loc) · 1.26 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
from tf_idf import calculate_tfidf
from ngram_articles import ngrams
import math
import sys
import json
import codecs
from textblob import TextBlob as tb
from collections import defaultdict
# Command-line driver: scores every word of every article against a corpus
# and writes the per-article scores to a JSON file.
#
# Usage: tf_idf_driver.py <articles_file> <output_json> <corpus_file> <ngram_size>
#   articles_file  UTF-8 text, read line by line (one article per line)
#   output_json    receives {"article<N>": {word: score, ...}, ...}
#   corpus_file    UTF-8 text, read line by line (one corpus article per line)
#   ngram_size     integer n handed to TextBlob.ngrams(n=...)
s = sys.argv
if len(s) < 5:
    sys.exit(
        "usage: {0} <articles_file> <output_json> <corpus_file> <ngram_size>".format(s[0])
    )
ngram = s[4]
# Convert once, up front, so a non-integer value fails before any file is touched.
ngram_size = int(ngram)

article_list = []
# `with` closes the handle even if TextBlob or the n-gram helper raises.
with codecs.open(s[1], 'r', encoding='utf-8') as text_file:
    for article in text_file:
        article_ngrams = tb(article).ngrams(n=ngram_size)
        # NOTE(review): presumably get_string_from_ngrams() flattens the n-grams
        # back into one string so each n-gram is later treated as a single
        # "word" -- confirm against ngram_articles.
        ngram_obj = ngrams(article_ngrams)
        article_list.append(tb(ngram_obj.get_string_from_ngrams()))

with codecs.open(s[3], 'r', encoding='utf-8') as corpus_article_list:
    corpus_articles = [tb(corpus_article) for corpus_article in corpus_article_list]

# Keys are "article1", "article2", ... in input order; values map each word of
# that article to the value returned by calculate_tfidf(...).tfidf().
tf_idf_scores = {}
for article_count, article in enumerate(article_list, start=1):
    scores = {}
    for word in article.words:
        tfidf_obj = calculate_tfidf(word, article, corpus_articles)
        scores[word] = tfidf_obj.tfidf()
    tf_idf_scores["article" + str(article_count)] = scores

# Open the output only after all scores exist, so a failure while reading or
# scoring does not truncate a previous result; the `with` block also makes
# sure the JSON is flushed and the handle closed.
with codecs.open(s[2], 'w', encoding='utf-8') as out_file:
    json.dump(tf_idf_scores, out_file, ensure_ascii=False)