forked from yanshao9798/segmenter
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathreader.py
More file actions
56 lines (49 loc) · 1.29 KB
/
reader.py
File metadata and controls
56 lines (49 loc) · 1.29 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
# -*- coding: utf-8 -*-
"""
@author: Yan Shao, yan.shao@lingfil.uu.se
"""
import codecs
def _read_conll_sentences(path):
    """Yield each sentence of the file at *path* as a list of row tuples.

    A row is a line that splits into exactly 10 tab-separated fields after
    stripping surrounding whitespace. Rows whose first field contains '.'
    are skipped (presumably CoNLL-U empty nodes such as "1.1"). Any line
    that is not a 10-field row (blank line, comment, ...) ends the current
    sentence if that sentence has at least one row.
    """
    sent = []
    # The context manager closes the handle even if decoding fails midway.
    with codecs.open(path, 'rb', encoding='utf8') as handle:
        for line in handle:
            segs = line.strip().split('\t')
            if len(segs) == 10:
                if '.' not in segs[0]:
                    sent.append(tuple(segs))
            elif sent:
                yield sent
                sent = []
    # The file may end without a trailing separator line; do not lose the
    # sentence that is still being collected.
    if sent:
        yield sent


def conll(path, is_dev=True):
    """Read a 10-column, tab-separated CoNLL-style file into sentences.

    Each sentence is a list of 10-tuples of strings, one tuple per kept row
    (see ``_read_conll_sentences`` in this block for what counts as a row).

    Args:
        path: path of the UTF-8 encoded file to read.
        is_dev: when True (default) every sentence is returned in a single
            list. When False the sentences are divided in two: counting
            from zero, every tenth sentence (index 9, 19, 29, ...) goes to
            a second, held-out list and the rest go to the first list.

    Returns:
        ``sents`` when *is_dev* is True, otherwise the pair
        ``(sents, sents_dev)``.
    """
    if is_dev:
        return list(_read_conll_sentences(path))

    sents = []
    sents_dev = []
    cter = 0  # sentences sent to `sents` since the last held-out one
    for sent in _read_conll_sentences(path):
        if cter == 9:
            sents_dev.append(sent)
            cter = 0
        else:
            sents.append(sent)
            cter += 1
    return sents, sents_dev
def raw(path):
    """Return every line of the UTF-8 file at *path*, whitespace-stripped.

    Lines are returned in file order with leading and trailing whitespace
    removed. Blank lines are kept as empty strings, so the result has one
    entry per line of the file.
    """
    # Use a context manager so the file handle is closed deterministically.
    with codecs.open(path, 'rb', encoding='utf-8') as handle:
        return [line.strip() for line in handle]
def conll_gold(path):
    """Return the sentences of a 10-column CoNLL-style file as plain text.

    For every row (a line with exactly 10 tab-separated fields after
    stripping) whose first field contains neither '.' nor '-', the second
    field is collected; rows with such IDs are presumably CoNLL-U empty
    nodes ("1.1") and multiword-token ranges ("1-2"). Any line that is not
    a 10-field row ends the current sentence, whose collected fields are
    joined with single spaces.

    Args:
        path: path of the UTF-8 encoded file to read.

    Returns:
        A list with one space-joined string per sentence, in file order.
    """
    sents = []
    words = []
    # The context manager closes the handle even if decoding fails midway.
    with codecs.open(path, 'rb', encoding='utf-8') as handle:
        for line in handle:
            segs = line.strip().split('\t')
            if len(segs) == 10:
                if '.' not in segs[0] and '-' not in segs[0]:
                    words.append(segs[1])
            elif words:
                sents.append(' '.join(words).strip())
                words = []
    # Keep the last sentence when the file lacks a trailing separator line.
    if words:
        sents.append(' '.join(words).strip())
    return sents