# common.py -- shared helpers from the travisbrady/numer.ai repository (GitHub, master branch).

import json
from time import time
import pandas as pd
from scipy.stats import percentileofscore
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, confusion_matrix, classification_report
from sklearn.cross_validation import cross_val_predict, cross_val_score

# CSV inputs read by the loader functions, and the default JSON-lines file
# that metric records are appended to.
FN = 'numerai_training_data.csv'
FN_TOURN = 'numerai_tournament_data.csv'
DB_FN = 'results.json'

# Feature column names: 'f1' .. 'f14' followed by 'c1_int'.
X_cols = ['f%d' % i for i in range(1, 15)]
X_cols.append('c1_int')

def load_train():
    """Read the training CSV named by ``FN`` and add a ``c1_int`` column.

    ``c1_int`` holds the second ``'_'``-separated piece of each ``c1`` value,
    exactly as returned by ``str.split`` (i.e. still a string; no numeric
    conversion happens here).

    Returns:
        pandas.DataFrame: the CSV contents plus the ``c1_int`` column.
    """
    frame = pd.read_csv(FN)
    suffixes = frame['c1'].apply(lambda label: label.split('_')[1])
    frame['c1_int'] = suffixes
    return frame

def l():
    """Load the training CSV with one-hot ``c1`` columns, split on ``validation``.

    The CSV named by ``FN`` is read, ``pd.get_dummies`` is applied to its
    ``c1`` column, and the indicator columns are appended to the right of the
    original columns.

    Returns:
        tuple: ``(df_train, df_val)`` -- the rows where ``validation == 0``
        and the rows where ``validation == 1``, respectively.
    """
    raw = pd.read_csv(FN)
    one_hot = pd.get_dummies(raw['c1'])
    combined = pd.concat([raw, one_hot], axis=1)
    train_rows = combined[combined['validation'] == 0]
    val_rows = combined[combined['validation'] == 1]
    return train_rows, val_rows

def load_train_dummies():
    """Return train/validation feature matrices and target vectors.

    The frames come from ``l()``.  A column is used as a feature when its
    name begins with ``'f'`` or with ``'c1_'``; the selection is made on the
    training frame and the same column list is applied to the validation
    frame.

    Returns:
        tuple: ``(X_train, X_val, y_train, y_val)`` where the ``X`` entries
        are the ``.values`` of the selected columns and the ``y`` entries are
        the ``.values`` of the ``target`` column.
    """
    train_frame, val_frame = l()

    feature_cols = []
    for col in train_frame:
        if col[0] == 'f' or col.startswith('c1_'):
            feature_cols.append(col)

    X_train = train_frame.loc[:, feature_cols].values
    X_val = val_frame.loc[:, feature_cols].values
    y_train = train_frame.target.values
    y_val = val_frame.target.values
    return X_train, X_val, y_train, y_val

def load_tourn_dummies():
    """Load the tournament CSV with one-hot ``c1`` columns, indexed by ``t_id``.

    The CSV named by ``FN_TOURN`` is read, ``pd.get_dummies`` is applied to
    its ``c1`` column, and the indicator columns are appended to the right of
    the original columns.

    Returns:
        pandas.DataFrame: the combined frame with ``t_id`` as its index.
    """
    raw = pd.read_csv(FN_TOURN)
    one_hot = pd.get_dummies(raw['c1'])
    combined = pd.concat([raw, one_hot], axis=1)
    return combined.set_index(['t_id'])

def load_tourn():
    """Read the tournament CSV named by ``FN_TOURN`` and add ``c1_int``.

    ``c1_int`` holds the second ``'_'``-separated piece of each ``c1`` value,
    exactly as returned by ``str.split`` (a string; no numeric conversion is
    applied).  The frame is then indexed by its ``t_id`` column.

    Returns:
        pandas.DataFrame: the tournament data with ``c1_int`` added and
        ``t_id`` as the index.
    """
    df_tourn = pd.read_csv(FN_TOURN)
    # Derive the column from the frame just loaded.  The earlier code read
    # from an undefined name ``df`` and raised NameError on every call.
    df_tourn['c1_int'] = df_tourn.c1.apply(lambda x: x.split('_')[1])
    df_tourn.set_index(['t_id'], inplace=True)
    return df_tourn

def dump_to_db(est, auc, acc, f1, start_time, end_time, fn=DB_FN):
    """Append one metrics record to a JSON-lines file.

    Args:
        est: JSON-serialisable description of the estimator (callers in this
            file pass ``repr(est)``).
        auc: AUC value to record.
        acc: accuracy value to record.
        f1: F1 value to record.
        start_time: timestamp taken before the run.
        end_time: timestamp taken after the run.
        fn: path of the file to append to; defaults to ``DB_FN``.

    Raises:
        TypeError: propagated from ``json.dumps`` if a value is not
            JSON-serialisable.
    """
    out_row = dict(est=est, auc=auc, acc=acc, f1=f1, start_time=start_time, end_time=end_time)
    # Serialise before opening so a serialisation failure never touches the
    # file, and write inside ``with`` so the handle is closed even on error.
    line = json.dumps(out_row) + '\n'
    with open(fn, 'a') as db:
        db.write(line)

def compare_to_history(auc, fn=DB_FN):
    """Print how ``auc`` ranks against the AUC values recorded in ``fn``.

    ``fn`` is read as one JSON object per line.  Two lines are printed: the
    number of records found, and the percentile of ``auc`` among the distinct
    recorded ``auc`` values (via ``scipy.stats.percentileofscore``).

    Args:
        auc: the AUC value to rank.
        fn: path of the JSON-lines history file; defaults to ``DB_FN``.
    """
    # ``with`` closes the handle (it used to be left open); whitespace-only
    # lines are skipped so a stray blank line does not break json.loads.
    with open(fn) as db:
        recs = [json.loads(line) for line in db if line.strip()]
    df = pd.DataFrame.from_records(recs)
    # Single-argument print(...) emits the same text under the Python 2
    # print statement and the Python 3 print function.
    print('[compare_to_history] N = %d' % (df.shape[0]))
    print('[compare_to_history] latest auc of %.6f is in the %.1fth percentile' % (auc, percentileofscore(df.auc.unique(), auc)))

def predict_and_report(est, X, y, cv=5):
    """Cross-validate ``est``, print a report, and log the metrics.

    Predictions come from ``cross_val_predict(est, X, y, cv=cv)``.  The
    confusion matrix, classification report and AUC are printed; a record is
    then appended through ``dump_to_db`` (default file) and the AUC is ranked
    with ``compare_to_history``.

    Args:
        est: estimator handed to ``cross_val_predict``.
        X: feature matrix.
        y: target values.
        cv: cross-validation argument forwarded to ``cross_val_predict``.

    NOTE(review): the AUC is computed from the values returned by
    ``cross_val_predict`` -- confirm these are meant to be used as scores.
    """
    started = time()
    y_pred = cross_val_predict(est, X, y, cv=cv)

    print(confusion_matrix(y, y_pred))
    print(classification_report(y, y_pred))

    auc = roc_auc_score(y, y_pred)
    print('AUC: %f' % (auc))

    # The end timestamp is taken before accuracy/F1 are computed.
    finished = time()
    accuracy = accuracy_score(y, y_pred)
    f1 = f1_score(y, y_pred)
    dump_to_db(repr(est), auc, accuracy, f1, started, finished)
    compare_to_history(auc)

def predict_and_report_val(est, X_train, X_val, y_train, y_val):
    """Fit ``est`` on the training split, report and log validation metrics.

    The estimator is fitted on ``(X_train, y_train)`` and ``est.predict`` is
    applied to ``X_val``.  The confusion matrix, classification report and
    AUC are printed; a record is appended to ``'validation_metrics.json'``
    through ``dump_to_db`` and the AUC is ranked against that same file with
    ``compare_to_history``.

    Args:
        est: estimator exposing ``fit`` and ``predict``; fitted in place.
        X_train: training feature matrix.
        X_val: validation feature matrix.
        y_train: training targets.
        y_val: validation targets.

    NOTE(review): the AUC is computed from ``est.predict`` output -- confirm
    these are meant to be used as scores.
    """
    metrics_file = 'validation_metrics.json'

    started = time()
    est.fit(X_train, y_train)
    y_pred = est.predict(X_val)

    print(confusion_matrix(y_val, y_pred))
    print(classification_report(y_val, y_pred))

    auc = roc_auc_score(y_val, y_pred)
    print('AUC: %f' % (auc))

    # The end timestamp is taken before accuracy/F1 are computed.
    finished = time()
    accuracy = accuracy_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)
    dump_to_db(repr(est), auc, accuracy, f1, started, finished, fn=metrics_file)
    compare_to_history(auc, fn=metrics_file)