-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathstatistic.py
More file actions
138 lines (112 loc) · 4.13 KB
/
statistic.py
File metadata and controls
138 lines (112 loc) · 4.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# encoding:utf-8
import docx2txt
import os
import pandas as pd
import re
import string
def printPath(level, path):
    """Print the non-hidden directory tree under *path* and list its files.

    Each non-hidden sub-directory of *path* is printed on its own line,
    prefixed by one dash per nesting level and a space, and is then walked
    recursively with the level increased by one.

    Args:
        level: Nesting depth of *path* (int or numeric string); it sets
            the number of dashes printed before each sub-directory name.
        path: Directory to scan.

    Returns:
        The names (not full paths) of the regular files located directly
        in *path*. Files inside sub-directories are printed over but are
        not part of the returned list.
    """
    depth = int(level)
    sub_dirs = []
    file_names = []
    for entry in os.listdir(path):
        full_path = os.path.join(path, entry)
        if os.path.isdir(full_path):
            # Hidden directories are skipped entirely (neither printed
            # nor descended into).
            if not entry.startswith('.'):
                sub_dirs.append(entry)
        elif os.path.isfile(full_path):
            file_names.append(entry)

    for sub_dir in sub_dirs:
        # A single pre-formatted argument keeps the output identical under
        # both the Python 2 print statement and the Python 3 print function.
        print('%s %s' % ('-' * depth, sub_dir))
        # The files found deeper in the tree are deliberately not collected.
        printPath(depth + 1, os.path.join(path, sub_dir))
    return file_names
# Names of the files directly inside the current working directory.
# printPath also prints the nested directory tree as a side effect, but
# files located in sub-directories are not part of the returned list.
file_list = printPath(1, os.getcwd())
def cut2sentence(text):
    """Split *text* on newlines and return its non-empty lines in order.

    Only exactly empty lines are dropped; lines made of whitespace alone
    are kept unchanged.

    Args:
        text: The text to split.

    Returns:
        A list with every non-empty line of *text*.
    """
    # str.split replaces string.split(), which no longer exists on Python 3.
    return [line for line in text.split('\n') if line]
# One row per processed document:
# [english title, chinese title, translator, checker,
#  english word count, chinese character count]
result = []
# Patterns tried in order to locate a person's name inside a credit line.
# Each one starts with a single separator character (a space or a colon)
# that is dropped from the match afterwards.
# NOTE(review): the last two patterns are identical; one of them was
# presumably meant to use a different colon character -- confirm.
tc_pattern = [re.compile(u' [\u4e00-\u9fa5]+'), re.compile(r' \w+'), re.compile(u':[\u4e00-\u9fa5]+'), re.compile(u':[\u4e00-\u9fa5]+')]
# A paragraph counts as Chinese when it holds five consecutive characters
# from this range.
chinese_pattern = re.compile(u'[\u4e00-\u9fa5]{5}')
# Matches one character of the same range; compiled once instead of once
# per character inside the counting loop.
_chinese_char_pattern = re.compile(u'[\u4e00-\u9fa5]')


def _extract_name(paragraph):
    """Return the name found by the first matching entry of tc_pattern.

    The leading separator character and any spaces are removed from the
    match. Returns an empty string when no pattern matches.
    """
    for pattern in tc_pattern:
        match = pattern.search(paragraph)
        if match:
            return match.group()[1:].replace(' ', '')
    return ''


def _count_words(paragraphs):
    """Return ``(english_words, chinese_characters)`` for *paragraphs*.

    Paragraphs matching chinese_pattern contribute their number of Chinese
    characters; every other paragraph contributes its number of
    whitespace-separated tokens.
    """
    english_words = 0
    chinese_characters = 0
    for paragraph in paragraphs:
        if chinese_pattern.search(paragraph):
            chinese_characters += len(_chinese_char_pattern.findall(paragraph))
        else:
            english_words += len(paragraph.split())
    return english_words, chinese_characters


for filename in file_list:
    # Only real .docx files are processed; a plain substring test would
    # also pick up names that merely contain "docx". Names containing
    # '~$' are skipped (presumably Word's temporary lock files).
    if not filename.endswith('.docx') or '~$' in filename:
        continue
    print(filename)
    text = docx2txt.process(filename)
    all_paragraph = cut2sentence(text)
    # The first two non-empty paragraphs are taken as the English and
    # Chinese titles; a shorter document yields empty titles instead of
    # raising IndexError.
    english_subject = all_paragraph[0] if all_paragraph else u''
    chinese_subject = all_paragraph[1] if len(all_paragraph) > 1 else u''
    # Credits are looked for in paragraphs 1..9; slicing keeps documents
    # with fewer paragraphs from raising IndexError.
    translator_name = ''
    checker_name = ''
    for paragraph in all_paragraph[1:10]:
        first_word = paragraph[0]
        if first_word in (u'翻', u'译'):
            translator_name += _extract_name(paragraph)
        elif first_word in (u'审', u'校'):
            checker_name += _extract_name(paragraph)
    num_english, num_chinese = _count_words(all_paragraph)
    # Expand the one-letter alias to the full name.
    if translator_name == u'M':
        translator_name = u'Meatle'
    if checker_name == u'M':
        checker_name = u'Meatle'
    result.append([english_subject, chinese_subject, translator_name, checker_name, num_english, num_chinese])

# Keep the builtin len() intact by using a dedicated name for the row count.
row_count = len(result)
# Passing the labels to the constructor avoids the legacy
# set_axis(axis, labels) call order and in-place mutation, and also works
# when no document was processed. Rows are numbered from 1.
result = pd.DataFrame(
    result,
    columns=[u'英文题目', u'中文题目', u'翻译', u'审校', u'英文字数', u'中文字数'],
    index=range(1, row_count + 1)
)
result.to_csv('统计结果.csv', encoding='utf-8')