CCDM2014-contest/multilabel.py at master · hitalex/CCDM2014-contest · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
#coding=utf8

"""
Task1: Multi-label task
"""

import numpy as np

from dimension_reduction import PCA_transform
from evaluation import predition2dict
from evaluation import score_list_v2 as caculate_AP_dict

def average_precision(outputs, test_target):
    """ Example-based AP for multilable classification
    outputs: shape=n, num_class, probility prediction
    test_targe: the same size of outputs, 0/1 indicates whether belong to the class
    """
    assert(outputs.shape == test_target.shape)
    num_instance, num_class = outputs.shape

    aveprec = 0.0
    num_valid_instance = 0 # 记录那些至少属于一个类的样本个数
    for i in range(num_instance):
        # sort prediction prob.
        tmp = outputs[i, :]
        # decending order
        index = list(np.argsort(-1 * tmp)) # [tempvalue,index] = sort(temp)

        indicator = np.array([0] * num_class, int)
        # Which and how many classes does this sample belong to ?
        label_list = np.array(range(num_class))[test_target[i, :] == 1]
        label_size = len(label_list)
        for m in range(label_size):
            label = label_list[m]
            loc = index[label]
            indicator[loc] = 1

        summary = 0
        for m in range(label_size):
            label = label_list[m]
            loc = index[label]
            #summary = summary + sum(indicator[loc:num_class]) * 1.0 / (num_class-loc+1)
            summary = summary + sum(indicator[:loc+1]) * 1.0 / (loc+1)

        # 不考虑那些不属于任何一类的样本
        if label_size > 0:
            aveprec = aveprec + summary / label_size
            num_valid_instance += 1

    Average_Precision = aveprec / num_valid_instance

    return Average_Precision

def OneVsRest_multilabel(train_feature, train_label, test_feature, BinaryClassifier, **kwargs):
    """ multi-label classification
    """
    from sklearn.multiclass import OneVsRestClassifier
    clf = OneVsRestClassifier(BinaryClassifier(**kwargs)).fit(train_feature, train_label)

    train_pred = clf.predict_proba(train_feature)
    test_pred = clf.predict_proba(test_feature)

    return train_pred, test_pred

def save_multilabel_result(method_name, n_components, model_test_AP, test_pred):
    """ Save result to file
    """
    # save the final prediction
    index = 0
    path = 'results/task1-%s-%d-%f-.csv' % (method_name, n_components, model_test_AP)
    print 'Saving result to file: ', path
    f = open(path, 'w')
    n = len(test_pred)
    for i in range(n):
        result = ','.join(str(x) for x in test_pred[i, :])
        f.write('%d,%s\n' % (index+1, result))
        index += 1

    f.close()

if __name__ == '__main__':
    import sys
    n_components = int(sys.argv[1])
    n_folds = int(sys.argv[2])

    # load dataset
    f = open('task1-dataset/task1-dataset.pickle')
    import pickle
    # the following variables are accessed throughout this script, be careful
    train_feature, train_label, test_feature = pickle.load(f)
    f.close()

    # apply the PCA dimension reduction
    train_feature, test_feature = PCA_transform(train_feature, test_feature, 'task1-dataset/task1-PCA-decomp.mat')
    train_feature = train_feature[:, :n_components]
    test_feature = test_feature[:, :n_components]

    train_count, num_class = train_label.shape

    from sklearn.naive_bayes import GaussianNB as BinaryClassifier
    method_name = 'GaussianNB+OneVsRest'
    kwargs = {}

    #from sklearn.svm import SVC as BinaryClassifier
    #method_name = 'SVC+OneVsRest'
    #kwargs = {'C':5, 'gamma':0.05, 'probability':'True'}

    #method_name = 'kNN+OneVsRest'
    #from sklearn.neighbors import KNeighborsClassifier as BinaryClassifier
    #kwargs = {'k':1}

    print 'Method: ', method_name
    from sklearn.cross_validation import KFold
    kf = KFold(len(train_label), n_folds, indices=True)
    index = 0
    model_test_AP = [0] * n_folds
    for train_index, test_index in kf:
        print 'Prepare cv dataset: %d' % index
        model_train_feature = train_feature[train_index, :]
        model_test_feature = train_feature[test_index, :]
        model_train_label = train_label[train_index, :]
        model_test_label = train_label[test_index, :]

        print 'Train classifiers using the CLR method...'
        train_pred, test_pred = OneVsRest_multilabel(model_train_feature, model_train_label, model_test_feature, BinaryClassifier, **kwargs)

        #import ipdb; ipdb.set_trace()
        model_test_AP[index] = average_precision(test_pred, model_test_label)
        # use the evaluation metric provided by CCDM2014 host
        #model_test_AP[index] = caculate_AP_dict(predition2dict(test_pred), predition2dict(model_test_label))
        print 'Model testing AP:', model_test_AP[index]

        index += 1

    avg_AP = sum(model_test_AP) / n_folds
    print '\nAverage AP:', avg_AP

    print 'Train the whole dataset...'
    train_pred, test_pred = OneVsRest_multilabel(train_feature, train_label, test_feature, BinaryClassifier, **kwargs)
    save_multilabel_result(method_name, n_components, avg_AP, test_pred)