guoylyy · Baoshizhongri · Aug 18, 2017
diff --git a/homeworks/A10300/A10300-饱食终日-S2班个人学习总结.docx b/homeworks/A10300/A10300-饱食终日-S2班个人学习总结.docx
diff --git a/homeworks/A10300/A10300-饱食终日-homework1/._homework1.py b/homeworks/A10300/A10300-饱食终日-homework1/._homework1.py
diff --git a/homeworks/A10300/A10300-饱食终日-homework1/Day 3 use case.png b/homeworks/A10300/A10300-饱食终日-homework1/Day 3 use case.png
diff --git a/homeworks/A10300/A10300-饱食终日-homework1/Day3 flowchart.png b/homeworks/A10300/A10300-饱食终日-homework1/Day3 flowchart.png
diff --git a/homeworks/A10300/A10300-饱食终日-homework1/Day4 Homework.py b/homeworks/A10300/A10300-饱食终日-homework1/Day4 Homework.py
@@ -0,0 +1,113 @@
+# -*- coding: utf-8 -*-
+
+import codecs
+import os
+
+# 1. read source file
+# ['aa', 'aaa-bbb-sds'] => ['aa', 'aaa', 'bbb', 'sds']
+def word_split(words):
+    """Split every hyphenated token into its hyphen-separated parts.
+
+    Tokens without a hyphen are passed through untouched, so
+    ['aa', 'aaa-bbb-sds'] becomes ['aa', 'aaa', 'bbb', 'sds'].
+    Leading, trailing or doubled hyphens yield empty strings.
+    """
+    # str.split returns a one-element list when the separator is absent,
+    # so hyphen-free tokens need no special case.
+    return [part for token in words for part in token.split('-')]
+
+
+def read_file(file_path):
+    """Return every token found in the UTF-8 text file at file_path.
+
+    Each line is stripped, split on single spaces and then on hyphens
+    (via word_split). Consecutive spaces produce empty tokens, which are
+    returned as-is.
+    """
+    # The context manager closes the handle even if decoding fails midway;
+    # previously the file was opened and never closed.
+    with codecs.open(file_path, 'r', "utf-8") as f:
+        lines = f.readlines()
+    word_list = []
+    for line in lines:
+        words = line.strip().split(" ")  # space split
+        word_list.extend(word_split(words))  # hyphen split
+    return word_list
+
+
+def get_file_from_folder(folder_path):
+    """Recursively list the path of every file below folder_path.
+
+    Each path is the directory visited by os.walk joined with a file name
+    found in it. Directories themselves are not included.
+    """
+    return [
+        os.path.join(current_dir, name)
+        for current_dir, _subdirs, names in os.walk(folder_path)
+        for name in names
+    ]
+
+
+# Gather the words of all text files into a single list.
+def read_files(file_paths):
+    """Return the concatenated read_file() output for each given path."""
+    return [word for path in file_paths for word in read_file(path)]
+
+
+# 2. normalise a single word
+def format_word(word):
+    """Lower-case word and drop every character except a-z and '-'.
+
+    Lower-casing happens before filtering so capital letters are kept
+    ('Hello,' -> 'hello') instead of being discarded as invalid, which
+    is what filtering against a lowercase-only alphabet first would do.
+    Returns an empty string when nothing valid remains.
+    """
+    allowed = 'abcdefghijklmnopqrstuvwxyz-'
+    # A single pass with join replaces one str.replace call per bad char.
+    return ''.join(char for char in word.lower() if char in allowed)
+
+
+def format_words(words):
+    """Normalise each word with format_word and drop the empty results."""
+    cleaned = (format_word(raw) for raw in words)
+    return [wd for wd in cleaned if wd]
+
+
+# 3. count words
+# ['aa', 'bb', 'aa'] => [('aa', 2), ('bb', 1)]
+def statictcs_words(words):
+    """Count occurrences of each word and rank them by frequency.
+
+    Returns a list of (word, count) tuples, most frequent first. Words
+    with equal counts are ordered alphabetically so the ranking is
+    reproducible. Only constructs valid on both Python 2 and Python 3
+    are used (dict.has_key and dict.iteritems do not exist on Python 3).
+    """
+    s_word_dict = {}
+    for word in words:
+        s_word_dict[word] = s_word_dict.get(word, 0) + 1
+    # word ranking: descending count, then ascending word
+    return sorted(s_word_dict.items(), key=lambda d: (-d[1], d[0]))
+
+
+# 4.export csv
+def print_to_csv(volcaulay_list, to_file_path, total_count):
+    """Write the ranked vocabulary as 'word,count,cumulative %' rows.
+
+    volcaulay_list -- sequence of (word, count) pairs, in output order
+    to_file_path   -- destination file; overwritten if it exists
+    total_count    -- total number of words, the denominator of the
+                      cumulative percentage (written with 2 decimals)
+    """
+    current_count = 0
+    # 'with' guarantees the file is flushed and closed even if a row fails.
+    with open(to_file_path, 'w+') as nfile:
+        for val in volcaulay_list:
+            current_count = current_count + val[1]
+            word_rate = (float(current_count) / total_count) * 100
+            nfile.write("%s,%s,%0.2f\n" % (val[0], str(val[1]), word_rate))
+
+
+def main():
+    """Run the pipeline: read 'data1', clean, rank and export to CSV.
+
+    The result is written to 'output/test.csv'; open() does not create
+    directories, so the 'output' folder must already exist.
+    """
+    # 1. read files
+    words = read_files(get_file_from_folder('data1'))
+    # print(...) with a single argument behaves the same on Python 2 and 3,
+    # whereas the bare print statement is a syntax error on Python 3.
+    print('获取未格式化的单词 %d 个' % (len(words)))
+
+    # 2. format file
+    f_words = format_words(words)
+    total_word_count = len(f_words)
+    print('获取了已格式化的单词 %d 个' % (total_word_count))
+
+    # 3. count word&ranking
+    word_list = statictcs_words(f_words)
+
+    # 4. file export
+    print_to_csv(word_list, 'output/test.csv', total_word_count)
+
+
+if __name__ == "__main__":
+    main()
+
diff --git a/homeworks/A10300/A10300-饱食终日-homework1/homework1.py b/homeworks/A10300/A10300-饱食终日-homework1/homework1.py
@@ -0,0 +1,105 @@
+# -*- coding: utf-8 -*-
+
+import codecs
+import os
+
+def handle_split(words):
+  """Expand hyphenated entries of words into their hyphen-separated parts.
+
+  Entries without '-' are kept unchanged and the input order is preserved.
+  """
+  expanded = []
+  for entry in words:
+    # split('-') gives [entry] when there is no hyphen to split on
+    expanded += entry.split('-')
+  return expanded
+
+# 1. read a file
+def read_file(file_path):
+  """Return the tokens of the UTF-8 text file at file_path.
+
+  Every line is stripped, split on single spaces and then on hyphens
+  (via handle_split). Empty tokens from repeated spaces are kept.
+  """
+  # 'with' closes the handle, which was previously left open.
+  with codecs.open(file_path, 'r', "utf-8") as f:
+    lines = f.readlines()
+  words = []
+  for line in lines:
+    # split(" ") always returns at least one element, so the result
+    # needs no emptiness check before being processed.
+    words.extend(handle_split(line.strip().split(" ")))
+  return words
+
+def format_word(word):
+    """Return word lower-cased with everything but a-z and '-' removed.
+
+    The word is lower-cased first; filtering against the lowercase
+    alphabet before lower-casing would delete capital letters
+    ('The' -> 'he') rather than normalise them.
+    """
+    fmt = 'abcdefghijklmnopqrstuvwxyz-'
+    kept = [char for char in word.lower() if char in fmt]
+    return ''.join(kept)
+
+def format_words(words):
+  """Apply format_word to each entry, keeping only the non-empty results."""
+  formatted = [format_word(word) for word in words]
+  return [item for item in formatted if item]
+
+def statictics_words(words):
+  """Return a dict mapping each word to the number of times it occurs."""
+  s_dict = {}
+  for word in words:
+    # dict.get replaces has_key, which no longer exists on Python 3
+    s_dict[word] = s_dict.get(word, 0) + 1
+  return s_dict
+
+# 4. export as csv
+def print_to_csv(items_list, to_file_path, total_num):
+    """Write one 'word,count,cumulative %' row per item to to_file_path.
+
+    items_list   -- sequence of [count, word] pairs, in output order
+    to_file_path -- destination file; overwritten if it exists
+    total_num    -- total word count used as the percentage denominator
+
+    The percentage is the running sum of counts over total_num, written
+    with three decimals.
+    """
+    curr_total = 0
+    # 'with' closes the file even when formatting or writing a row raises.
+    with open(to_file_path, 'w+') as nfile:
+        for item in items_list:
+            curr_total = curr_total + item[0]
+            curr_percent = (float(curr_total) / total_num) * 100
+            curr_percent_str = '%0.3f' % (curr_percent)
+            nfile.write("%s,%s,%s\n" % (item[1], str(item[0]), curr_percent_str))
+
+def get_file_from_folder(folder_path):
+  """Return the paths of all files below folder_path, found via os.walk.
+
+  The collected list is also printed before it is returned.
+  """
+  paths = []
+  for root, dirs, files in os.walk(folder_path):
+    # 'name' avoids shadowing the Python 2 builtin 'file'
+    for name in files:
+      paths.append(os.path.join(root, name))
+  # print(...) with one argument works on Python 2 and 3 alike; the bare
+  # print statement is a syntax error on Python 3.
+  print(paths)
+  return paths
+
+
+def read_files(paths):
+  """Read every path with read_file and chain the resulting words."""
+  collected = []
+  for single_path in paths:
+    collected += read_file(single_path)
+  return collected
+
+def sort_by_value(word_dict):
+  """Turn a {word: count} dict into [count, word] lists, largest first.
+
+  The pairs are compared as lists, so equal counts fall back to reverse
+  alphabetical order of the word.
+  """
+  pairs = [[count, word] for word, count in word_dict.items()]
+  return sorted(pairs, reverse=True)
+
+def main():
+  """Count word frequencies of the files under 'data2' into output/test.csv."""
+  # Load and normalise the words from every file in the folder.
+  f_words = format_words(read_files(get_file_from_folder('data2')))
+
+  # Rank by frequency; the cleaned word total is the percentage base.
+  items_list = sort_by_value(statictics_words(f_words))
+  print_to_csv(items_list, 'output/test.csv', len(f_words))
+
+
+if __name__ == "__main__":
+  main()