-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathferup_generateHand.py
More file actions
97 lines (81 loc) · 2.59 KB
/
ferup_generateHand.py
File metadata and controls
97 lines (81 loc) · 2.59 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
'''
generateHand.py
Combine the prosody (pw) and word-segmentation (ws) corpora into the handcraft format.
Author: Jeff Pan
Contact: junjie.pan@nuance.com
'''
#-*- coding:utf-8 -*-
#!/usr/bin/env python3
import os
import sys
import codecs
import logging
class GenHand(object):
    '''
    Generate a handcraft file from paired word-seg and prosody files.

    The base directory must contain two sub-directories, ``ws`` and ``pw``.
    Files are paired by identical file name; lines are paired by position.
    For every line, consecutive ``ws`` tokens are merged with ``-`` until
    their total character count equals the length of the next ``pw`` token.

    Attributes:
        basepath: directory that holds the ``ws`` and ``pw`` sub-directories.
        of: path of the output file written by ``process()``.
        cf: sorted names of the files present in both sub-directories.
        logging: logger object used for all diagnostics.
    '''

    def __init__(self, basepath, outfile, logger):
        '''
        Store the configuration and immediately scan the input directories.

        Args:
            basepath: directory that holds the ``ws`` and ``pw`` sub-directories.
            outfile: path of the file ``process()`` will write.
            logger: object exposing ``info()`` and ``warning()``.
        '''
        self.basepath = basepath
        self.of = outfile
        self.cf = []
        self.logging = logger
        self.filecheck()

    def filecheck(self):
        '''
        Collect the file names that exist in both ``ws`` and ``pw``.

        The result is stored in ``self.cf``. Word-seg files without a
        same-named prosody file are reported with a warning and skipped.
        '''
        pwfs = sorted(os.listdir(os.path.join(self.basepath, 'pw')))
        wsfs = sorted(os.listdir(os.path.join(self.basepath, 'ws')))
        pwset = set(pwfs)  # O(1) membership tests instead of scanning the list
        # Rebuild rather than append, so a repeated call cannot duplicate entries.
        self.cf = [f for f in wsfs if f in pwset]
        missfs = [f for f in wsfs if f not in pwset]
        self.logging.info(
            'Checking:\n\tword-seg files: %d\tprodosy files: %d\tmatched files:%d',
            len(wsfs), len(pwfs), len(self.cf))
        if missfs:
            self.logging.warning('no match word-seg files:')
            for f in missfs:
                self.logging.warning('%s', f)

    @staticmethod
    def _align(ws, pw):
        '''
        Merge word-seg tokens so that they line up with the prosody tokens.

        Args:
            ws: list of word-seg tokens of one line.
            pw: list of prosody tokens of the same line.

        Returns:
            A ``(hand, aligned)`` tuple. ``hand`` is the list of merged
            tokens built so far; ``aligned`` is False when the character
            counts stopped matching or when ``ws`` tokens were left over.
        '''
        hand = []
        pos = 0
        total = len(ws)
        for p in pw:
            plen = len(p)
            wlen = 0
            start = pos
            # Tokens are never empty, so the running length strictly grows:
            # once it passes plen it can no longer match.
            while pos < total and wlen < plen:
                wlen += len(ws[pos])
                pos += 1
            if wlen != plen:
                # Nothing after this point can be aligned; keep what matched.
                return hand, False
            hand.append('-'.join(ws[start:pos]))
        return hand, pos == total

    def process(self):
        '''
        Write one handcraft line per paired input line to ``self.of``.

        Every file in ``self.cf`` is read from both sub-directories as UTF-8.
        Lines that cannot be fully aligned are still written (with the part
        that did align) and reported with a warning, as is a difference in
        line count between the two files.
        '''
        with codecs.open(self.of, 'w', 'utf-8') as fo:
            for f in self.cf:
                self.logging.info('Process: %s', f)
                wsfn = os.path.join(self.basepath, 'ws', f)
                pwfn = os.path.join(self.basepath, 'pw', f)
                with codecs.open(wsfn, 'r', 'utf-8') as wsfi, \
                        codecs.open(pwfn, 'r', 'utf-8') as pwfi:
                    wslines = wsfi.readlines()
                    pwlines = pwfi.readlines()
                if len(wslines) != len(pwlines):
                    # zip() below stops at the shorter file; make that visible.
                    self.logging.warning(
                        '%s: line count differs (word-seg: %d, prosody: %d)',
                        f, len(wslines), len(pwlines))
                for lineno, (wsline, pwline) in enumerate(zip(wslines, pwlines), 1):
                    hand, aligned = self._align(wsline.split(), pwline.split())
                    if not aligned:
                        self.logging.warning(
                            '%s line %d: word-seg and prosody tokens do not align',
                            f, lineno)
                    fo.write('%s\n' % (' '.join(hand)))
if __name__ == '__main__':
    from argparse import ArgumentParser

    # Command line: two positional paths (input base directory, output file)
    # plus a --version flag.
    parser = ArgumentParser(description='generate handcraft file')
    parser.add_argument('--version', action='version', version='1.0')
    for positional in ('base', 'outfile'):
        parser.add_argument(positional, type=str)
    args = parser.parse_args()

    # Root logger at INFO level, mirrored to a log file (overwritten on each
    # run) and to the default stream, both with the same record layout.
    formatter = logging.Formatter(
        '[%(asctime)s][*%(levelname)s*][%(filename)s:%(lineno)d|%(funcName)s] - %(message)s',
        '%Y%m%d-%H:%M:%S',
    )
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    file_handler = logging.FileHandler('log-GenHand.txt', 'w', 'utf-8')
    stream_handler = logging.StreamHandler()
    for handler in (file_handler, stream_handler):
        handler.setFormatter(formatter)
        logger.addHandler(handler)

    # Scan the input directories, then write the combined output file.
    handcraft = GenHand(args.base, args.outfile, logger)
    handcraft.process()