AutoAbstract/test.py at master · wikty/AutoAbstract · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import json
from split_sentence import SimpleSplitSentence, SplitSentence
from split_word import SimpleSplitWord
from text_rank import TextRank
from extract_abstract import text_rank_extract_abstract
from get_data import read_data, get_df
from tf_idf import get_keywords
from similarity import article_similarity
from extract_word import SimpleExtractor

def test_split_sentence(raw):
	"""Return the result of ``SplitSentence(raw).split()``.

	A ``SimpleSplitSentence`` is also constructed from the same ``raw`` input,
	but only its constructor is exercised; its output is never requested and
	does not contribute to the return value.
	"""
	# Constructed first (and kept alive for the duration of the call) so that
	# both splitter classes are instantiated on the same input.
	_simple_splitter = SimpleSplitSentence(raw)
	full_splitter = SplitSentence(raw)
	pieces = full_splitter.split()
	return pieces

def test_split_word(raw):
	"""Return the result of ``SimpleSplitWord(raw).split()``."""
	return SimpleSplitWord(raw).split()

def test_text_rank(sentences):
	"""Build a ``TextRank`` over ``sentences`` and return its ``rank()`` result."""
	return TextRank(sentences).rank()

def test_extract_abstract_by_text_rank(sentences, k=5):
	"""Return ``text_rank_extract_abstract(sentences, k)``.

	``k`` defaults to 5 and is forwarded unchanged as the second positional
	argument (presumably the number of sentences to keep -- confirm against
	``extract_abstract``).
	"""
	abstract = text_rank_extract_abstract(sentences, k)
	return abstract

def test_article_similarity(art1, art2):
	"""Return ``article_similarity(art1, art2)`` for the two given articles."""
	result = article_similarity(art1, art2)
	return result

if __name__ == '__main__':
	# Earlier experiments, kept disabled for reference:
	#
	#   articles = read_data()
	#   sentences = test_split_sentence(articles[0])
	#   sentences = [test_split_word(sentence) for sentence in sentences]
	#   print(test_text_rank(sentences))
	#   print(test_extract_abstract_by_text_rank(sentences))
	#
	#   with open('input.txt', 'w', encoding='utf-8') as f:
	#       f.write(articles[0])
	#   with open('output.txt', 'w', encoding='utf-8') as f:
	#       f.write(test_extract_abstract_by_text_rank(sentences))
	#   print(get_keywords(sentences))
	#
	#   sentences1 = test_split_sentence(articles[1])
	#   sentences1 = [test_split_word(sentence) for sentence in sentences1]
	#   print(test_article_similarity(sentences, sentences1))

	# Current experiment: run SimpleExtractor on the first article and dump
	# each of its four outputs to test.txt as one JSON document per line.
	articles = read_data()
	with open('test.txt', 'w', encoding='utf-8') as out:
		extractor = SimpleExtractor(articles[0])
		stages = (
			extractor.get_suffix,
			extractor.get_reverse_suffix,
			extractor.get_term_freq,
			extractor.extract,
		)
		for position, stage in enumerate(stages):
			# Newline goes *between* documents only; the file does not end
			# with a trailing newline.
			if position:
				out.write('\n')
			# ensure_ascii=False keeps non-ASCII text readable in the file
			# instead of emitting \uXXXX escapes.
			out.write(json.dumps(stage(), ensure_ascii=False))