-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsaveData.py
More file actions
executable file
·65 lines (46 loc) · 1.48 KB
/
saveData.py
File metadata and controls
executable file
·65 lines (46 loc) · 1.48 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#!/usr/bin/python
# -*- coding: utf-8 -*-
#comment
from util import *
import os
# First day of logs to process.
# The month is written as a plain ``2``: a leading-zero literal such as ``02``
# is a SyntaxError on Python 3 (and an octal literal on Python 2).
# NOTE(review): ``dt`` is not imported here; presumably it is the datetime
# module re-exported by ``from util import *`` -- confirm.
start_day = dt.datetime(2015, 2, 13)

# Directory that receives the .pkl files (the trailing slash is required,
# file names are appended to it by plain string concatenation).
target_dir = "data/"

# Big archive holding the raw logs.
archive = target_dir + "data.7z"

# When True, every daily frame is subsampled before being saved.
sample = True

# Number of consecutive days (time windows) to process, starting at start_day.
num_tests = 10
train_window = 6
stats_list = []

# Options handed to loadData for every day.
# NOTE(review): ``col_idx`` and ``names`` presumably come from util, and
# ``names`` must support indexing by ``col_idx`` -- confirm.
save_params = {
    "usecols": col_idx,  # range(1,len(names)+1),
    "names": names[col_idx],
    "sep": '\t',
}
def subsample(df_logs, head_k=10, tail_k=20):
    """Keep the rows whose ``target_ip`` is neither very frequent nor very rare.

    The distinct ``target_ip`` values are ranked by number of occurrences,
    most frequent first. The ``head_k`` most frequent and the ``tail_k``
    least frequent values are discarded; every row carrying one of the
    remaining values is returned.

    Parameters
    ----------
    df_logs : pandas.DataFrame
        Frame with a ``target_ip`` column.
    head_k : int, optional
        Number of most frequent values to drop (default 10).
    tail_k : int, optional
        Number of least frequent values to drop (default 20).

    Returns
    -------
    pandas.DataFrame
        The selected rows of ``df_logs``. The result is empty when the frame
        has no more than ``head_k + tail_k`` distinct values, including when
        the frame itself is empty.
    """
    from collections import Counter

    # most_common() sorts the (value, count) pairs by count, descending.
    ranked = [ip for ip, _ in Counter(df_logs["target_ip"]).most_common()]

    # Use an explicit stop index: slicing with ``-tail_k`` would select
    # nothing at all when tail_k is 0.
    stop = max(len(ranked) - tail_k, 0)
    kept = ranked[head_k:stop]

    # One vectorised membership test instead of re-slicing and scanning the
    # ranking for every single row.
    return df_logs[df_logs["target_ip"].isin(kept)]
#extract raw logs files
#for i in range(0,15):
# cur_day = start_day + dt.timedelta(days=i)
# os.system(r"7za x {} {} {}".format(archive, "logs{}.txt".format( cur_day.date().isoformat() ) , target_dir) )
# Save one pickle per day for num_tests consecutive days, starting at start_day.
for i in range(num_tests):
    cur_day = start_day + dt.timedelta(days=i)
    df_logs = loadData(cur_day, save_params)

    if sample:
        df_logs = subsample(df_logs)

    # Subsampled dumps are stored under a distinct "df_sample_" prefix.
    fn = "{}{}{}.pkl".format(
        target_dir,
        "df_sample_" if sample else "df_",
        cur_day.date().isoformat(),
    )
    df_logs.to_pickle(fn)

    # Drop the reference to this day's frame before the next day is loaded.
    del df_logs
#delete all raw logs
#os.system( r"rm -rf {}df_*.txt".format(target_dir) )