Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file not shown.
92 changes: 92 additions & 0 deletions homeworks/A14075/homework1/GenerateDicts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
# -*- coding: utf-8 -*-

import codecs
import os

# 1. Scan the new_dicts folder for dictionaries waiting to be converted
def get_file_from_folder():
    """Return the bare names of all files found under ``new_dicts``.

    The walk is recursive, but only the file name (no directory part) is
    kept for each hit.
    """
    return [
        name
        for _root, _dirs, names in os.walk('new_dicts')
        for name in names
    ]

# 2.1 Cut the text at its first non-space character
def get_alph(text):
    """Return ``text`` with its leading space characters removed.

    Only ' ' is skipped (tabs and other whitespace are kept). A string
    that consists solely of spaces is returned unchanged, because it has
    no non-space character to cut at.
    """
    trimmed = text.lstrip(' ')
    return trimmed if trimmed else text

# 2. Read one new dictionary
def get_dict(filename):
    """Parse ``new_dicts/<filename>`` into a ``{word: meaning}`` dict.

    Every non-blank line is expected to look like ``word<TAB>meaning``;
    only the first tab splits the line. Commas are replaced by spaces.
    Entries whose word or meaning is shorter than two characters are
    dropped. Non-blank lines without a tab are reported on stdout and
    skipped.

    Args:
        filename: name of a file inside the ``new_dicts`` folder.

    Returns:
        dict mapping each word to its meaning; if a word occurs more than
        once, the last occurrence wins.
    """
    dict_path = os.path.join('new_dicts', filename)
    dicts = {}
    # 'utf-8-sig' reads plain UTF-8 as well, but also drops a leading BOM so
    # it cannot end up glued to the first word of the file.
    with codecs.open(dict_path, 'r', 'utf-8-sig') as f:
        for raw_line in f:  # TODO: other dictionary formats need a different split rule
            # Commas are removed to avoid a bug when the data is exported for Excel.
            line = raw_line.strip().replace(",", ' ')
            if not line:
                continue
            word, sep, value = line.partition("\t")
            if not sep:
                print(filename, "本行读取错误:", line)
                continue
            if len(word) < 2 or len(value) < 2:
                continue
            dicts[word] = value
    print("已读取", filename)
    return dicts

# 2.2 Delete a source dictionary
def delete_dict(filename):
    """Remove ``new_dicts/<filename>`` when the ``Delete_On`` switch is set.

    Does nothing when ``Delete_On`` is false or has not been defined. A
    failed removal (missing file, permission problem, ...) is reported on
    stdout and otherwise ignored.

    Args:
        filename: name of a file inside the ``new_dicts`` folder.
    """
    # Delete_On is only assigned inside the ``__main__`` guard; look it up
    # defensively so that importing this module and calling the function
    # does not fail with NameError.
    if not globals().get('Delete_On', False):
        return
    dict_path = os.path.join('new_dicts', filename)
    try:
        os.remove(dict_path)
    except OSError:
        print("删除失败", filename)

# 3. Write the converted dictionary
def generate_dicts(filename, dicts):
    """Write ``dicts`` to ``buildin_dicts/<filename>``, one entry per line.

    Each line has the form ``word_____meaning``. The ``buildin_dicts``
    folder must already exist. The file is opened without an explicit
    encoding, i.e. with the platform default.

    Args:
        filename: name of the file to create inside ``buildin_dicts``.
        dicts: mapping of word to meaning.
    """
    save_path = os.path.join('buildin_dicts', filename)
    # ``with`` guarantees the handle is closed even if something unexpected
    # is raised while writing.
    with open(save_path, 'w+') as nfile:
        for key, value in dicts.items():
            try:
                nfile.write("%s_____%s\n" % (key, value))  # "_____" separates word and meaning
            except (UnicodeError, OSError):
                # Best effort: an entry that cannot be written (for example
                # because the platform encoding cannot represent it) is
                # reported and skipped, the rest is still written.
                print('写入失败', key, value)
    print("已生成", filename)

def main():
    """Convert every file found under ``new_dicts`` into ``buildin_dicts``.

    For each file: parse it, write the converted copy, then hand the source
    to ``delete_dict`` (which only removes it when ``Delete_On`` is set).
    """
    for dict_file in get_file_from_folder():
        dicts = get_dict(dict_file)
        # Write the converted copy first: if writing raises (for example
        # because ``buildin_dicts`` is missing), the source file must still
        # be there, so deletion comes last.
        generate_dicts(dict_file, dicts)
        delete_dict(dict_file)

# Check a generated dictionary
def test(dict_path):
    """Load a generated dictionary file and print how many entries it holds.

    Each line is expected to look like ``word_____meaning``; only the first
    ``_____`` splits the line. Lines without the separator (blank lines,
    for instance) are skipped instead of raising IndexError. The file is
    opened without an explicit encoding, i.e. with the platform default.

    Args:
        dict_path: path of the generated dictionary file.

    Returns:
        dict mapping each word to its meaning.
    """
    dicts = {}
    with codecs.open(dict_path, 'r') as f:
        for raw_line in f:
            word, sep, value = raw_line.strip().partition("_____")
            if not sep:
                continue
            dicts[word] = value
    print("已获取", len(dicts), '个单词释义')
    return dicts


if __name__ == '__main__':
    # Module-level switch read by delete_dict(); it has to be assigned
    # before main() runs.
    Delete_On = False  # delete each source dictionary automatically after it has been read
    main()
    #test('buildin_dicts\\liuji.txt')
233 changes: 233 additions & 0 deletions homeworks/A14075/homework1/Main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,233 @@
# -*- coding: utf-8 -*-

import codecs
import os

# 1. Collect the list of file paths
def get_file_from_folder(folder_path):
    """Return the path of every file found under ``folder_path``.

    The folder is walked recursively. When the module-level
    ``show_details`` flag is set, the number of files is printed.
    """
    paths = [
        os.path.join(parent, name)  # directory + file name -> path
        for parent, _subdirs, names in os.walk(folder_path)
        for name in names
    ]
    if show_details:
        print("总共", len(paths), "个文件")
    return paths

# 3. Tokenizing
## 3.1 Remove every non-English character
def format_word(text):
    """Return ``text`` lower-cased, with every character that is not an
    ASCII letter or a space replaced by a single space.

    The replacement is done in one pass over the text instead of one
    ``str.replace`` scan per offending character, so the cost stays linear
    in the length of the input.
    """
    allowed = frozenset('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ ')  # note the space
    cleaned = ''.join(char if char in allowed else ' ' for char in text)
    return cleaned.lower()  # TODO: deal with upper-case words such as JAVA

## 3.2 Drop single-letter tokens
def discard_single(words):
    """Return a new list holding only the items longer than one character.

    Empty strings are dropped as well; the input list is not modified.
    """
    return [token for token in words if len(token) > 1]

## 3.0 Entry point; ``text`` is handed over by the file-reading step
def process_words(text):
    """Split raw text into words.

    The text is cleaned with ``format_word``, split on single spaces, and
    the pieces are filtered through ``discard_single``.
    """
    pieces = format_word(text).split(' ')
    return discard_single(pieces)

# 2. File handling
## 2.2 Read a single file and return its words
def read_file(file_path):
    """Return the words of the UTF-8 text file at ``file_path``.

    Each non-blank line is stripped and passed to ``process_words``; the
    results are concatenated in file order.
    """
    words = []
    # ``with`` closes the handle; the original never closed it.
    with codecs.open(file_path, 'r', "utf-8") as f:
        for line in f:
            line = line.strip()  # removes the trailing newline and surrounding whitespace
            if line:  # skip blank lines
                words.extend(process_words(line))
    return words

## 2.1 Read all files and merge their words
def read_files(paths):
    """Return the words of every file in ``paths`` as one flat list.

    Files are read with ``read_file`` in the given order. When the
    module-level ``show_details`` flag is set, the total is printed.
    """
    world_words = []
    for path in paths:
        world_words += read_file(path)
    if show_details:
        print("分词over,总共", len(world_words), '个单词')
    return world_words

# 4. Count word frequencies
def statictics_words(words):
    """Return a dict mapping each distinct word to its number of occurrences.

    The number of distinct words is always printed.
    """
    counts = {}
    for token in words:
        counts[token] = counts.get(token, 0) + 1
    print("统计词频over.共", len(counts), '个不重复单词')
    return counts


## 5.5 Conversion
def list_to_dict(wlist):
    """Turn a list of sequences into a dict.

    The first element of each item becomes the key and the remaining
    elements (``item[1:]``) become the value. When a key occurs more than
    once, the last item wins.
    """
    return {entry[0]: entry[1:] for entry in wlist}

# 5. Sorting
def sort_by_value(word_dict):
    """Return the entries of ``word_dict`` ordered by count, highest first.

    Entries with equal counts are ordered by word, descending. The result
    is built with ``list_to_dict``, so every value is a one-element list
    ``[count]``.
    """
    # Putting the count first lets a plain descending sort rank by count
    # and break ties on the word.
    ranked = sorted(
        ([count, word] for word, count in word_dict.items()),
        reverse=True,
    )
    return list_to_dict([[word, count] for count, word in ranked])

# 6. Cumulative percentage statistics
def rate_statistics(items_Dict, total_num, rate_on):
    """Attach the cumulative frequency percentage to each word.

    Iterates ``items_Dict`` in its own order, accumulating ``item[0]`` as a
    share of ``total_num``. Words whose cumulative percentage is below the
    lower bound are skipped; iteration stops at the first word above the
    upper bound. The bounds are 0 and 100 unless ``rate_on`` is true, in
    which case they come from the module-level ``rate`` pair.

    Returns:
        dict mapping word to ``[count, percentage, '']``, where count and
        percentage are strings (three decimals) and the last slot is left
        empty for the meaning.
    """
    lower, upper = (rate[0], rate[1]) if rate_on else (0, 100)
    final_Dict = {}
    running = 0
    for word, item in items_Dict.items():
        running += item[0]
        percent = float(running) / total_num * 100
        if percent < lower:
            continue
        if percent > upper:
            break
        # value layout: frequency - percentage - meaning
        final_Dict[word] = [str(item[0]), format(percent, '.3f'), '']
    if show_details:
        print('排序over')
    return final_Dict

# 7. Filter the vocabulary
# 7.1 Remove the words listed in the clear list
def clear_words(wordDict, clearlist):
    """Delete every word of ``clearlist`` from ``wordDict`` and return it.

    ``wordDict`` is modified in place; words that are not present are
    ignored. A confirmation message is always printed.
    """
    for known in clearlist:
        wordDict.pop(known, None)
    print('已去除已掌握单词')
    return wordDict

# 7.2 Read a single clear-list file
def read_clearlist(clear_path):
    """Return the words listed in the file at ``clear_path``, one per line.

    Lines are stripped of surrounding whitespace. Blank lines are skipped
    so they are not returned (and later counted) as empty "words". The
    file is opened without an explicit encoding, i.e. with the platform
    default.
    """
    clearlist = []
    # ``with`` closes the handle; the original never closed it.
    with codecs.open(clear_path, 'r') as f:
        for raw_line in f:
            word = raw_line.strip()
            if word:
                clearlist.append(word)
    return clearlist

# 7.3 Merge several clear-list files
def read_clearlist_from_folder(folder):
    """Return the words of every clear-list file under ``folder`` as one list.

    Files are located with ``get_file_from_folder`` and read with
    ``read_clearlist``. When the module-level ``show_details`` flag is set,
    the total is printed.
    """
    clearlists = []
    for clear_path in get_file_from_folder(folder):
        clearlists += read_clearlist(clear_path)
    if show_details:
        print("共有", len(clearlists), '个已掌握单词')
    return clearlists

# 8. Attach meanings
## 8.1 Load a dictionary
def read_dict(dict_path, dicts):
    """Merge the entries of one generated dictionary file into ``dicts``.

    Each line is expected to look like ``word_____meaning``; only the first
    ``_____`` splits the line, so the meaning may itself contain the
    separator. Lines without the separator (blank lines, for instance) are
    skipped instead of aborting the whole load with IndexError. The file
    is opened without an explicit encoding, i.e. with the platform default.

    Args:
        dict_path: path of the dictionary file.
        dicts: dict to update in place; existing words are overwritten.

    Returns:
        The same ``dicts`` object.
    """
    with codecs.open(dict_path, 'r') as f:
        for raw_line in f:
            word, sep, value = raw_line.strip().partition("_____")
            if sep:
                dicts[word] = value
    return dicts

def read_dicts_from_folder(folder):
    """Load every dictionary file under ``folder`` into one dict.

    Files are located with ``get_file_from_folder`` and merged one after
    another with ``read_dict``. The number of entries is always printed.
    """
    merged = {}
    for path in get_file_from_folder(folder):
        merged = read_dict(path, merged)
    print("已获取", len(merged), '个单词释义')
    return merged

# 8.2 Attach the meanings
def add_meaning(final_dict, dicts):
    """Split ``final_dict`` into words with and without a known meaning.

    For a word present in ``dicts``, the meaning is stored in slot 2 of
    its value list (the list is modified in place). When the module-level
    ``show_details`` flag is set, the number of words with a meaning is
    printed.

    Returns:
        ``(found_words, notfound_words)``, two dicts sharing the value
        lists of ``final_dict``.
    """
    with_meaning, without_meaning = {}, {}
    for term, record in final_dict.items():
        if term not in dicts:
            without_meaning[term] = record
            continue
        record[2] = dicts[term]
        with_meaning[term] = record
    if show_details:
        print("有释义的单词", len(with_meaning), '个')
    return with_meaning, without_meaning

# 9. Write a CSV file
def print_to_csv(word_Dict, filename):
    """Write ``word_Dict`` to ``output/<filename>`` as comma-separated lines.

    With the module-level ``show_statistics`` flag set, each line is
    ``word,item[0],item[1],item[2]``; otherwise it is ``word,item[2]``.
    Fields are written as-is, without CSV quoting. The ``output`` folder
    must already exist. The file is opened without an explicit encoding,
    i.e. with the platform default.

    Args:
        word_Dict: mapping of word to a sequence with at least three items.
        filename: name of the file to create inside ``output``.
    """
    to_file_path = os.path.join('output', filename)
    # ``with`` closes the handle even if a write raises part-way through.
    with codecs.open(to_file_path, 'w+') as nfile:
        for word, item in word_Dict.items():
            if show_statistics:
                nfile.write("%s,%s,%s,%s\n" % (word, item[0], item[1], item[2]))
            else:
                nfile.write("%s,%s\n" % (word, item[2]))
    print("输出文件", to_file_path)

# 9.2 Write the Anki import file
def print_to_anki(word_Dict, filename):
    """Write ``word_Dict`` to ``output/<filename>`` for import into Anki.

    Each line is ``word<TAB>item[2]`` terminated by CRLF, encoded as
    UTF-8. The ``output`` folder must already exist.

    Args:
        word_Dict: mapping of word to a sequence with at least three items.
        filename: name of the file to create inside ``output``.
    """
    to_file_path = os.path.join('output', filename)
    # ``with`` closes the handle even if a write raises part-way through.
    with codecs.open(to_file_path, 'w+', 'utf-8') as nfile:
        for word, item in word_Dict.items():
            nfile.write("%s\t%s\r\n" % (word, item[2]))
    print("输出文件", to_file_path)

def main():
    """Run the whole pipeline.

    Reads the files under ``text``, counts and ranks the words, applies the
    cumulative-percentage window, removes the words listed under
    ``clear_lists``, attaches meanings from ``buildin_dicts`` and writes
    the results into ``output``. Behaviour is controlled by the
    module-level flags ``show_details`` and ``rate_on`` read here, plus the
    ones the called functions read themselves.
    """
    # words = read_file('data1/dt01.txt')
    file_path = get_file_from_folder('text') # 1. collect the list of file paths

    words = read_files(file_path) # 2./3. read the files and split them into words

    total_num = len(words)
    word_dict = statictics_words(words) # 4. count word frequencies

    word_dict = sort_by_value(word_dict) # 5. sort


    # 6. percentage statistics; rate_on True -> filter by the percentage window
    word_dict = rate_statistics(word_dict,total_num, rate_on)
    if show_details:
        print_to_csv(word_dict, 'before_clear.csv') # snapshot before the clear lists are applied

    # 7. filter out the words that are already mastered
    clearlists = read_clearlist_from_folder('clear_lists')
    word_dict = clear_words(word_dict,clearlists)
    if show_details:
        print_to_csv(word_dict, 'after_clear.csv') # snapshot after the clear lists are applied

    Real_dicts = read_dicts_from_folder('buildin_dicts') # 8.1 load the dictionaries
    found_words, notfound_words = add_meaning(word_dict,Real_dicts) # 8.2 attach the meanings

    print_to_csv(found_words, 'found.csv')
    print_to_csv(notfound_words, 'notfound.csv') # 9. write the result files

    print_to_anki(found_words,'anki.txt') # 9.2 write the Anki import file

if __name__ == "__main__":
    # Parameters meant to be edited are collected here so they are easy to change.
    # They are module-level names that the functions above read as globals,
    # so they must be assigned before main() runs.
    show_details = True # print debugging / progress information
    rate_on = False # switch for filtering by cumulative percentage
    rate = [50,70] # start and end of the percentage window
    show_statistics = True # include frequency & percentage in the CSV output
    eachday_recite_num = 20 # number of words to recite per day (not read by any function in this file)
    main()
52 changes: 52 additions & 0 deletions homeworks/A14075/homework1/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
具体可看: https://github.com/hydewww/TermFrequency

# 词频分析

```coding in python3.6```

为摆脱无脑谷歌翻译而诞生的程序,希望能帮助自己啃下绝望的众英文文档。

## 用法

- **程序所需文件夹需先行创建**,可复制```example```中文件进行测试

- 用户添加的文件最好存为utf-8格式,可参考```example```中文件

### Main.py

分析文本,筛去常用词汇(即```clear_lists```中列出的已掌握单词)后,按```词频顺序```输出固定比例(可自定义)的表格

- 文本放在```text```

- 英汉词典位于```buildin_dicts```(内置牛津词典)

- 常用词汇位于```clear_lists```

- 输出文件夹为```output```

- 可改参数位于代码末尾

### GenerateDicts.py

将希望导入的词典转为程序所需格式

- 需根据新词典的格式修改```get_dict```中拆分单词与释义的代码(默认以制表符分隔)

- 新词典放在```new_dicts```

- 生成词典所在位置```buildin_dicts```

- Tips:若```buildin_dicts```只有中高考/四六级词汇,输出的就是文本中考纲范围内的单词

### 导入至anki

anki是本人强推的背诵软件,各平台均有客户端,官网:https://apps.ankiweb.net/index.html

点击```导入文件```选中```anki.txt```即可导入,记忆库中不会添加重复项。

关于anki的各种奇技淫巧可关注anki的知乎专栏:https://zhuanlan.zhihu.com/-anki


## Todo

- 用爬虫抓取文本
Loading