forked from maslinych/daba
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsyllables.py
More file actions
75 lines (68 loc) · 2.35 KB
/
syllables.py
File metadata and controls
75 lines (68 loc) · 2.35 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
#!/usr/bin/python
# -*- encoding: utf-8 -*-
import re
from orthograph import detone
import unicodedata
# Commented-out alternative inventories (not used by the code below):
#consonants = 'klbmsrdtgfjchzpɲŋ'
#literals = 'ywrln'
# Characters that split() tests the first character of a word form against
# to decide whether it is a vowel.
vowels = u'aiɛɔoeu'
# Characters that split() tests the character following a "vowel + n"
# sequence against; 'n' itself is not listed here and is compared
# separately inside split().
consonants = u'klbsrdtgfjchzp'
def segment(text, segs):
    """Cut text into consecutive pieces as directed by a boundary string.

    segs is a sequence of flags aligned with the characters of text:
    whenever segs[i] equals '1' a cut is made right after text[i].  Any
    other flag value means "no cut here".

    Returns the list of pieces in order.  The final piece (everything after
    the last cut) is always included, even when it is empty.
    """
    # Positions in text at which a new piece starts (one past each '1').
    cuts = [pos + 1 for pos, flag in enumerate(segs) if flag == '1']
    starts = [0] + cuts
    # None as the last end bound makes the final slice run to the end.
    ends = cuts + [None]
    return [text[begin:end] for begin, end in zip(starts, ends)]
def split(struct, segstring):
    """Enumerate the possible syllable-boundary strings for a word form.

    wordform -> [segstring]

    struct is the part of the word that still has to be analysed
    (syllabify() passes an NFD-normalized string, so diacritics show up as
    separate combining characters of Unicode category 'Mn').

    segstring is the list of boundary strings built so far.  One flag is
    appended per consumed character boundary: '0' keeps the two neighbouring
    characters together, '1' puts a syllable break between them (this is the
    format read by segment()).

    Returns the list of completed boundary strings.  More than one string is
    produced when a "vowel + n + consonant" sequence is met, because the
    break may fall either before or after the 'n'.
    """
    def appendtoall(prefixes, suffixes):
        # Continue every boundary string built so far with every alternative.
        return [prefix + suffix for prefix in prefixes for suffix in suffixes]

    if len(struct) <= 1:
        # $
        return segstring

    # Index of the first character after struct[0] and the combining marks
    # attached to it.  The whole run of marks is consumed (not just a single
    # one), so a break can never fall between a base character's marks.
    following = 1
    while (following < len(struct)
           and unicodedata.category(struct[following]) == 'Mn'):
        segstring = appendtoall(segstring, ['0'])
        following += 1
    if following == len(struct):
        # Nothing but combining marks was left after the first character.
        return segstring

    if struct[0] not in vowels:
        # [^v]: a non-vowel never closes a syllable here.
        return split(struct[following:], appendtoall(segstring, ['0']))

    # v...
    if struct[0] == struct[following]:
        # vv: a doubled vowel stays in one syllable.
        return split(struct[following:], appendtoall(segstring, ['0']))

    if len(struct) == following + 1:
        # v?$: exactly one character is left after the vowel.
        if struct[following] == u'n':
            # -vn$: final n stays with the vowel.
            return appendtoall(segstring, ['0'])
        # v-[^n]$
        return appendtoall(segstring, ['1'])

    if struct[following] != u'n':
        # v-[^n]: break right after the vowel.
        return split(struct[following:], appendtoall(segstring, ['1']))

    # Vowel followed by n and at least one more character.  Two flags are
    # appended (after the vowel, after the n): '10' breaks before the n,
    # '01' breaks after it.
    after_n = struct[following + 1]
    rest = struct[following + 1:]
    if after_n in consonants:
        # v-nc | vn-c: ambiguous, keep both readings.
        return split(rest, appendtoall(segstring, ['01', '10']))
    if after_n in vowels:
        # v-nv
        return split(rest, appendtoall(segstring, ['10']))
    # vn-[^cv]
    return split(rest, appendtoall(segstring, ['01']))
def syllabify(word):
    """Return every candidate syllabification of word.

    The word is decomposed (NFD) so that split() sees diacritics as separate
    combining characters, each boundary string produced by split() is turned
    into a list of pieces by segment(), and every piece is recomposed (NFC)
    before being returned.

    Returns a list of alternatives, each alternative being a list of
    syllable strings.
    """
    decomposed = unicodedata.normalize('NFD', word)
    alternatives = []
    for boundaries in split(decomposed, ['']):
        pieces = segment(decomposed, boundaries)
        alternatives.append(
            [unicodedata.normalize('NFC', piece) for piece in pieces])
    return alternatives