-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathclean_np.py
More file actions
72 lines (54 loc) · 2.03 KB
/
clean_np.py
File metadata and controls
72 lines (54 loc) · 2.03 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
This script takes an output from:
python3 nltk_cli/senna.py --np test.txt > test.np
and processes the noun phrases, filtering out phrases that
(i) do not have the last word tagged as NN,
(ii) have any token that is a stopword, or
(iii) begin or end with a punctuation character.
This is part of the Terminator software from
https://github.com/alvations/Terminator (Tan, 2015)
Usage:
python3 nltk_cli/clean_np.py test.np --output test.filtered.np
Reference:
Liling Tan. 2015. EXPERT Innovations in Terminology Extraction and
Ontology Induction. In Proceedings of the EXPERT Scientific
and Technological Workshop. Malaga, Spain.
"""
import io, sys
# NOTE(review): expanduser, ngrams and word_tokenize appear unused in this
# file — confirm against other Terminator scripts before removing.
from os.path import expanduser
from string import punctuation
from nltk.util import ngrams
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.tag import PerceptronTagger
# Module-level tagger and stopword list shared by simple_filter().
tagger = PerceptronTagger()
# Shorthand: pos_tag takes a pre-tokenized list of words and returns
# (word, tag) pairs.
pos_tag = tagger.tag
STOPWORDS = stopwords.words('english')  # a list, so membership tests are O(n)
def simple_filter(list_of_ngrams):
    """Return the phrases from *list_of_ngrams* that look like clean NPs.

    A phrase is kept only if all of the following hold:
      - it is not empty / whitespace-only (previously crashed with IndexError),
      - the whole phrase is not itself a stopword,
      - it neither begins nor ends with a punctuation character,
      - no token in it is a stopword,
      - at least one token is POS-tagged as a noun (NN*),
      - it contains none of '(', ')', ',' or the substring 'pinyin',
      - its first token is not 'more' or 'less' (comparative fragments).

    :param list_of_ngrams: list of candidate noun-phrase strings.
    :return: list of surviving phrase strings, original order preserved.
    """
    kept = []
    for ng in list_of_ngrams:
        tokens = ng.split()
        if not tokens:
            # Guard: empty/whitespace-only entries would crash the
            # ng[0] / tokens[-1] indexing below.
            continue
        if ng.lower() in STOPWORDS:
            continue
        if ng[0] in punctuation or ng[-1] in punctuation:
            continue
        # Any stopword token disqualifies the phrase (this also covers the
        # first and last tokens, so no separate edge checks are needed).
        if any(tok.lower() in STOPWORDS for tok in tokens):
            continue
        # Require at least one noun tag (NN, NNS, NNP, ...) in the phrase.
        if not any(pos.startswith('NN')
                   for _, pos in pos_tag(ng.lower().split())):
            continue
        if '(' in ng or ')' in ng or ',' in ng or 'pinyin' in ng:
            continue
        if tokens[0] in ['more', 'less']:
            continue
        kept.append(ng)
    return kept
# Parse optional "--output <path>" arguments; any missing argv index just
# means no output file was requested.
outfile = ""
try:
    if sys.argv[2] == '--output':
        outfile = sys.argv[3]
except IndexError:
    pass

# Open the output file once, and make sure it is closed (flushed) even if
# processing raises — the original leaked the handle.
fout = io.open(outfile, 'w', encoding='utf8') if outfile else None
try:
    with io.open(sys.argv[1], 'r', encoding='utf8') as fin:
        for line in fin:
            # Each input line is "<phrase>|<phrase>|...\t<rest>"; keep only
            # the pipe-separated noun-phrase field.
            list_of_ngrams = line.split('\t')[0].split('|')
            for ng in simple_filter(list_of_ngrams):
                if fout:
                    fout.write(ng + '\n')
                else:
                    print(ng)
finally:
    if fout:
        fout.close()