forked from jeina7/GPT2-essay-writer
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsave_data.py
More file actions
59 lines (43 loc) · 1.88 KB
/
save_data.py
File metadata and controls
59 lines (43 loc) · 1.88 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
#-*- coding:utf-8 -*-
import re, os
import pandas as pd
from glob import glob
# Boilerplate removed from crawled text, applied in this order.
# Raw strings keep regex escapes such as \S from being read as (invalid)
# string escapes.
_BOILERPLATE_PATTERNS = tuple(re.compile(p) for p in (
    # Blog header with a spaced date:
    # "Posted on 2019 05 03 ... Posted in <category> "
    r'Posted on [0-9]{4} [0-9]{2} [0-9]{2} .+ Posted in \S+ \s?',
    # Blog header with a compact date:
    # "Posted on 20190503 ... Posted in <category> "
    r'Posted on [0-9]{8} .+ Posted in \S+ \s?',
    # Korean timestamp: "2019년 5월 3일 10시 20분 30초"
    r'[0-9]{4}년 [0-9]{,2}월 [0-9]{,2}일 [0-9]{,2}시 [0-9]{,2}분 [0-9]{,2}초',
    # Dotted numeric date: "2019. 5. 3". The dots are escaped and each
    # component needs at least one digit; otherwise ordinary text such as
    # "2019, I " would match and be deleted.
    r'[0-9]{4}\. [0-9]{1,2}\. [0-9]{1,2}',
    # Runs of standalone Hangul jamo (consonants/vowels not forming a syllable).
    r'[ㄱ-ㅣ]+',
    # Anything outside the allowed alphabet: Hangul syllables, ASCII digits
    # and letters, space, and the punctuation . , ' " ? !
    r"""[^가-힣0-9a-zA-Z .,'"?!]+""",
))


def PreProcess(text):
    """Strip crawl boilerplate and disallowed characters from *text*.

    The following are removed, in order: "Posted on ... Posted in ..." blog
    headers, Korean "년 월 일 시 분 초" timestamps, dotted numeric dates
    ("2019. 5. 3"), runs of standalone Hangul jamo, and finally every
    character that is not a Hangul syllable, an ASCII letter or digit, a
    space, or one of . , ' " ? !  (so newlines and tabs are removed too).

    Args:
        text: The raw text to clean. A non-string value (for example None,
            or the NaN pandas produces for an empty CSV cell) is treated
            as empty text.

    Returns:
        The cleaned string; '' when *text* is not a string.
    """
    if not isinstance(text, str):
        return ''
    for pattern in _BOILERPLATE_PATTERNS:
        text = pattern.sub('', text)
    return text
def save_data(dir_path, save_path):
    """Clean every crawled file under *dir_path* and write one text corpus.

    Each path matching ``dir_path + "*"`` is handled by its extension:

    * ``csv``: the ``content`` column is read, every non-missing cell is
      cleaned with ``PreProcess``, and the cells are joined with newlines.
    * ``txt``: the whole file is read as UTF-8 and cleaned with
      ``PreProcess``. Files whose name starts with ``data`` are skipped.

    Other paths are ignored. The per-file results are joined with a newline
    and written to *save_path* as UTF-8.

    Args:
        dir_path: Prefix for the glob pattern; ``"*"`` is appended directly,
            so pass a directory path that ends with a separator.
        save_path: Path of the output text file (overwritten if it exists).
    """
    chunks = []
    # glob returns paths in arbitrary order; sort so the output is reproducible.
    for file in sorted(glob(dir_path + "*", recursive=True)):
        name = os.path.basename(file)
        suffix = name.split(".")[-1]
        if suffix == 'csv':
            df = pd.read_csv(file)
            print('{} data saving. size:'.format(name), df.shape[0])
            # Empty cells are read as NaN; skip them rather than crash in re.sub.
            contents = df['content'].dropna().astype(str)
            chunks.append("\n".join(PreProcess(text) for text in contents))
        elif suffix == 'txt' and not name.startswith("data"):
            print('{} data saving.'.format(name))
            with open(file, 'r', encoding='utf-8') as f:
                chunks.append(PreProcess(f.read()))

    # Write with the same encoding the inputs are read with, instead of the
    # platform default. Joining with a newline keeps the last line of one
    # file from fusing with the first line of the next.
    with open(save_path, 'w', encoding='utf-8') as f:
        f.write("\n".join(chunks))
    print("\nAll saved.")
# Input location handed to save_data, which appends "*" to it to build its
# glob pattern, so the trailing slash matters.
data_dir = './Crawling/textcrawler/use_data/'

# Output corpus file. It sits in the input directory; save_data skips .txt
# files whose name starts with "data", so it is not read back in on a re-run.
save_path = './Crawling/textcrawler/use_data/data.txt'


if __name__ == "__main__":
    # Run the conversion only when executed as a script, not on import.
    save_data(dir_path=data_dir, save_path=save_path)