-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathsplit_sentence.py
More file actions
68 lines (62 loc) · 1.62 KB
/
split_sentence.py
File metadata and controls
68 lines (62 loc) · 1.62 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import re
class SimpleSplitSentence(object):
    """Naive sentence splitter that cuts text at every ideographic full stop.

    Newlines, carriage returns and tabs are removed first; the text is then
    split on ``delimiter_regex``. The delimiter itself is not kept in the
    returned sentences, and blank pieces are discarded.
    """

    def __init__(self, raw):
        # Text to split, and the regex pattern that marks sentence boundaries.
        self.raw = raw
        self.delimiter_regex = r'。'

    def remove_newline(self, raw):
        """Return ``raw`` with every newline, carriage return and tab removed."""
        dropped = frozenset('\n\r\t')
        return ''.join(ch for ch in raw if ch not in dropped)

    def split(self):
        """Return the non-empty, whitespace-trimmed sentences of ``self.raw``."""
        flattened = self.remove_newline(self.raw)
        trimmed = (piece.strip() for piece in re.split(self.delimiter_regex, flattened))
        return [piece for piece in trimmed if piece]
class SplitSentence(object):
    """Split text into sentences, ignoring terminators inside paired punctuation.

    A sentence ends at a character from ``delimiter`` (the terminator is kept
    as the last character of the sentence), but only when no quote, bracket
    or title mark is currently open. Nothing is stripped from the text, so
    concatenating the returned sentences reproduces the input except for a
    trailing run that consists solely of whitespace.
    """

    def __init__(self, raw):
        self.raw = raw
        # Sentence terminators, in both ASCII and full-width forms.
        self.delimiter = set("。!?！？")
        # Every character that takes part in a pair (kept for callers that
        # inspect it; split() itself only uses the two mappings below).
        self.punctuation_pair = set('《》“”‘’{}()（）【】""')
        # Opening mark -> closing mark.
        self.punctuation_pair_prefix = {
            '《': '》',
            '“': '”',
            '‘': '’',
            '【': '】',
            '(': ')',
            '（': '）',
            '{': '}',
            '"': '"',
        }
        # Closing mark -> opening mark, derived so the two tables cannot drift.
        self.punctuation_pair_suffix = {
            closer: opener
            for opener, closer in self.punctuation_pair_prefix.items()
        }
        # Currently open marks; rebuilt from scratch by every split() call.
        self.punctuation_pair_stack = []

    def split(self):
        """Return the list of sentences found in ``self.raw``.

        Returns:
            A list of strings. Each one ends with its terminator, except
            possibly the last, which holds any text left after the final
            terminator. A remainder made only of whitespace is discarded.
        """
        # Reset the stack so repeated calls do not inherit state left behind
        # by unbalanced punctuation in an earlier run.
        stack = self.punctuation_pair_stack = []
        sentences = []
        sentence = []
        for char in self.raw:
            sentence.append(char)
            opener = self.punctuation_pair_suffix.get(char)
            if opener is not None and opener in stack:
                # Closing mark with a matching opener: drop the nearest such
                # opener together with anything still open inside it. This
                # test comes first so that a symmetric mark such as '"'
                # closes an open quote instead of nesting forever.
                nearest = len(stack) - 1 - stack[::-1].index(opener)
                del stack[nearest:]
            elif char in self.punctuation_pair_prefix:
                stack.append(char)
            elif char in self.delimiter and not stack:
                # A closing mark without any matching opener never reaches
                # this point with an effect: it is simply ignored rather
                # than clearing unrelated open marks.
                sentences.append(''.join(sentence))
                sentence = []
        # Keep text that follows the last terminator (or that sits behind an
        # unclosed mark) instead of silently losing it.
        remainder = ''.join(sentence)
        if remainder.strip():
            sentences.append(remainder)
        return sentences