-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathword_count.py
More file actions
130 lines (95 loc) · 3.63 KB
/
word_count.py
File metadata and controls
130 lines (95 loc) · 3.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
#encoding:utf8
import os
import os.path
import bz2
import sys
import argparse
from joblib import Parallel, delayed
from collections import defaultdict
# Positional indices of the 10 tab-separated fields of a token line; used to
# index the per-token field lists produced by read_conll (e.g. token[FORM]).
(
    ID,
    FORM,
    LEMMA,
    CPOS,
    FPOS,
    MORPH,
    HEAD,
    REL,
    PHEAD,
    PREL,
) = range(10)
def get_conll_files(path):
    """Recursively collect the paths of all ``.conll`` files under *path*.

    A line naming each visited directory is written to stderr. Files found
    directly in *path* come first in the result, followed by the files of
    each sub-directory (in the order returned by ``os.listdir``).

    Returns a list of paths, each prefixed with *path*.
    """
    sys.stderr.write("Entering {}\n".format(path))

    entries = [os.path.join(path, name) for name in os.listdir(path)]

    # Regular files with the expected extension found at this level.
    found = [
        entry
        for entry in entries
        if os.path.isfile(entry) and entry.endswith(".conll")
    ]

    # Sub-directories are explored only after the current level is listed.
    subdirs = [entry for entry in entries if os.path.isdir(entry)]
    for subdir in subdirs:
        found += get_conll_files(subdir)

    return found
def read_conll(filename):
    """Read a CoNLL file into a list of sentences.

    The file content is split on blank lines ("\\n\\n"); every non-empty
    chunk is one sentence. A sentence is a list of tokens, and a token is the
    list of tab-separated fields of one line.

    An assertion checks that every token has exactly 10 fields.
    """
    with open(filename) as stream:
        chunks = stream.read().split("\n\n")

    corpus = []
    for chunk in chunks:
        chunk = chunk.strip()
        if not chunk:
            # Skip empty chunks (e.g. trailing blank lines at end of file).
            continue
        rows = [row.strip().split("\t") for row in chunk.split("\n")]
        assert all(len(row) == 10 for row in rows)
        corpus.append(rows)
    return corpus
def write_conll(corpus, filename):
    """Write *corpus* to *filename* in tab-separated, blank-line-delimited form.

    corpus   -- iterable of sentences; each sentence is an iterable of tokens,
                each token an iterable of strings (the fields of one line).
    filename -- path of the file to create or overwrite.

    Each token is written as one line of tab-joined fields, and each sentence
    is followed by an empty line.
    """
    # The context manager guarantees the file is closed even if a write fails
    # (e.g. a token containing a non-string field makes join() raise).
    with open(filename, "w") as of:
        for sent in corpus:
            for token in sent:
                of.write("{}\n".format("\t".join(token)))
            of.write("\n")
def dump_distribution_dict(filename, dic):
    """Write the counts in *dic* to *filename*, most frequent first.

    filename -- path of the file to create or overwrite.
    dic      -- mapping from item to count.

    One "item<TAB>count" line is written per entry, sorted by decreasing
    count.
    """
    ranked = sorted(dic.items(), key=lambda pair: pair[1], reverse=True)
    # The context manager guarantees the file is closed even if a write fails.
    with open(filename, "w") as ofstream:
        for item, count in ranked:
            ofstream.write("{}\t{}\n".format(item, count))
def main(root, output):
    """Compute corpus statistics over every ``.conll`` file under *root*.

    root   -- directory searched recursively for ``.conll`` files; each file
              is counted as one document.
    output -- directory receiving the result files (created if missing).

    Files written in *output*:
      stats.txt     -- number of documents, sentences, tokens and word types
      vocabulary    -- frequency of each FORM value
      tags          -- frequency of each CPOS value
      doc_length_w  -- distribution of document lengths, in tokens
      doc_length_s  -- distribution of document lengths, in sentences
      sent_length   -- distribution of sentence lengths, in tokens
    """
    # Create the directory through the os module instead of a shell
    # "mkdir -p": the path is never parsed by a shell, so names containing
    # spaces or shell metacharacters are handled correctly.
    if not os.path.isdir(output):
        os.makedirs(output)

    file_list = get_conll_files(root)

    voc = defaultdict(int)
    tags = defaultdict(int)

    sent_length = defaultdict(int)
    doc_length_s = defaultdict(int)
    doc_length_t = defaultdict(int)

    n_tokens = 0
    n_sentences = 0
    n_documents = 0

    for f in file_list:
        corpus = read_conll(f)
        doc_tokens = sum(len(s) for s in corpus)

        n_documents += 1
        n_sentences += len(corpus)
        n_tokens += doc_tokens
        doc_length_s[len(corpus)] += 1
        doc_length_t[doc_tokens] += 1

        for sentence in corpus:
            sent_length[len(sentence)] += 1
            for token in sentence:
                voc[token[FORM]] += 1
                tags[token[CPOS]] += 1

    # Use .get() rather than indexing: indexing a defaultdict would insert a
    # "PONCT" entry with count 0, which would then show up in the tags file
    # even though the tag never occurs in the corpus.
    n_punct = tags.get("PONCT", 0)

    with open(os.path.join(output, "stats.txt"), "w") as of:
        of.write("Number of documents : {}\n".format(n_documents))
        of.write("Number of sentences : {}\n".format(n_sentences))
        of.write("Number of tokens : {} (ignoring punctuation: {})\n".format(n_tokens, n_tokens - n_punct))
        of.write("Number of word types : {}\n".format(len(voc)))

    dump_distribution_dict(os.path.join(output, "vocabulary"), voc)
    dump_distribution_dict(os.path.join(output, "tags"), tags)
    dump_distribution_dict(os.path.join(output, "doc_length_w"), doc_length_t)
    dump_distribution_dict(os.path.join(output, "doc_length_s"), doc_length_s)
    dump_distribution_dict(os.path.join(output, "sent_length"), sent_length)
if __name__ == "__main__":
    # Command-line entry point: read the corpus root and the output directory
    # from the arguments, then run the statistics computation.
    usage = """
Computes some statistics about parsed corpus (number of tokens, word types, etc.):
- Number of documents
- size of documents (num tokens, num sentences)
- Number of sentences
- size of sentences (num tokens)
- Number of tokens
- Number of word types (all, excluding NPP and ET)
"""

    # RawTextHelpFormatter keeps the line breaks of the description above.
    cli = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        description=usage,
    )
    cli.add_argument(
        "root",
        type=str,
        help="Directory (every subfolder will be searched for conll files)",
    )
    cli.add_argument(
        "output",
        type=str,
        help="Output dir",
    )

    options = cli.parse_args()
    main(options.root, options.output)