tensorflow-rec-practice/main.py at master · shlin168/tensorflow-rec-practice · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
from __future__ import print_function
import pandas as pd
import numpy as np
import tensorflow as tf
import time
import os
from datetime import timedelta
from sklearn.model_selection import train_test_split
from preprocess.features import Feature_engineering
from tf_model.tf_feature_builder import FeatureColumnsBuilder
from tf_model.model import Model

os.environ["TF_CPP_MIN_LOG_LEVEL"]="3"

def main():
    start_time = time.time()
    print('read data ...')
    data = pd.read_csv('dataset/rec_data.csv', low_memory=False)

    print('feature engineering ...')
    features = Feature_engineering.get_numeric_features(data)
    label = 'product_code'
    float2int_list = ['employee_idx', 'country_residence', 'sex', 'relation_type',
                        'residence_idx', 'foreigner_idx', 'spouse_idx', 'channel',
                        'deceased_idx', 'province_name', 'type', 'new_customer',
                        'primary', 'address_type','province_code','activity_idx',
                        'segmentation', 'age', 'seniority']
    data = Feature_engineering.fillna(data, features, -1)
    data = Feature_engineering.float2int(data, float2int_list)
    train_data = data[(data['partition'] != '2016-06-28') & data['product'].notnull()]
    train_data = Feature_engineering.label_encoder(train_data, 'product', 'product_code')

    feature_dict = {
        'num': ['foreigner_idx',  'spouse_idx', 'deceased_idx', 'new_customer',
                'primary', 'address_type','province_code','activity_idx'],
        'norm_num': ['age', 'seniority', 'gross_income_household'],
        'bucket': [('age', list(range(0,170,10)))],
        'embeddings': [('channel', 10), ('province_code', 10), ('country_residence', 10)]
    }
    # return feature columns in tensorflow format
    tf_cols = FeatureColumnsBuilder.get_all(train_data, feature_dict)

    # seperate train and valid data
    print('split data to train and valid data')
    X_train, X_valid, y_train, y_valid = train_test_split(train_data[features], train_data[label], test_size=0.2, random_state=0)
    train_dataset = X_train.copy()
    train_dataset[label] = y_train
    valid_dataset  = X_valid.copy()
    valid_dataset[label] = y_valid

    # params for input
    BATCH_SIZE = 300000
    VALID_ROWS = len(X_valid)
    dup_cols_map = {
        'bucket': ['age']
    }

    # get input
    train_X, train_y = Model.rename_dup_cols(train_dataset, features, label, dup_cols_map)
    valid_X, valid_y = Model.rename_dup_cols(valid_dataset, features, label, dup_cols_map)

    # params for model
    N_CLASSES = train_data[label].nunique()
    TOP_K = 7

    # create model
    classifier = tf.estimator.Estimator(
        model_fn = Model.create_model,
        model_dir = 'my_model',
        config = tf.estimator.RunConfig(
                save_checkpoints_steps=50,
                save_summary_steps=10
            ),
        params={
            'feature_columns': tf_cols,
            'hidden_units': [100, 50, 25],
            'n_classes': N_CLASSES,
            'k': TOP_K
    })

    # hide logging
    tf.logging.set_verbosity(tf.logging.ERROR)

    # params for training
    EPOCHS = 2
    DISPLAY_STEPS = 1

    # start training
    print('start training ...')
    for n in range(EPOCHS):
        classifier.train(
            input_fn = lambda: Model.train_input_fn(train_X, train_y, BATCH_SIZE),
            steps = 10
        )
        results = classifier.evaluate(
            input_fn = lambda: Model.eval_input_fn(valid_X, valid_y, VALID_ROWS)
        )

        if (n+1) % DISPLAY_STEPS == 0:
            print(n + 1, 'rounds')
            # display evaluation metrics
            print('Results at epoch', (n + 1) * DISPLAY_STEPS)
            print('-' * 30)
            for key in sorted(results):
                print('%s: %s' % (key, results[key]))

    print('execution time: {}'.format(timedelta(seconds=time.time() - start_time)))

if __name__ == '__main__':
    main()