ml/adaboost.py at master · mazefeng/ml · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# coding=utf-8

import sys
import random
import numpy as np
from math import log

from common import read_dense_data

random.seed(1024 * 1024)

class WeakLearner:

    def train(self, X, Y, D, alpha = 1.0, max_update = 100, eps = 5e-2):
        m, n = X.shape
        update = 0

        min_loss = 1.0
        w_best = np.matrix(np.zeros([n, 1]))
        Y_best = np.matrix(np.zeros([m, 1]))

        w = np.matrix(np.zeros([n, 1]))

        while update <= max_update and min_loss > eps:
            i = random.randint(0, m - 1)
            if np.sign(X[i] * w) == Y[i]: continue
            w += alpha * X[i].T * Y[i] * D[i]
            update += 1
            # w += alpha * m * X[i].T * Y[i] * D[i]
            loss, Y_pred = self.loss(X, Y, D, w)
            if loss < min_loss:
                min_loss = loss
                w_best = np.matrix(w)
                Y_best = np.matrix(Y_pred)

        return min_loss, w_best, Y_best

    def classify(self, X, w):
        Y_pred= np.sign(X * w)
        return Y_pred

    def loss(self, X, Y, D, w):
        Y_pred = self.classify(X, w)
        P = np.matrix(np.zeros(Y.shape))
        P[np.where(Y_pred != Y)] = 1
        return (D.T * P).sum(), Y_pred

class AdaBoost:
    def __init__(self):

        self.weak_learner = WeakLearner()

    def train(self, X, Y, run = 40):

        m, n = X.shape

        D = (1.0 / m) * np.matrix(np.ones([m, 1]))

        self.alpha = [0.0] * run
        self.W = np.matrix(np.zeros([n, run]))

        for i in range(run):

            loss, self.W[:, i], Y_pred = self.weak_learner.train(X, Y, D)

            self.alpha[i] = 0.5 * log((1.0 - loss) / loss)
            assert self.alpha[i] > 0

            D = np.multiply(D, np.exp(-self.alpha[i] * np.multiply(Y, Y_pred)))
            D = D / D.sum()

            # test training data after each round
            # set a larger run, loss on training data will be 0.0
            print >> sys.stderr, 'run: %d' % i
            self.test(X, Y)

    def classify(self, X):
        m, n = X.shape
        S = np.matrix(np.zeros([m, 1]))
        for i in range(len(self.alpha)):
            S += self.alpha[i] * self.weak_learner.classify(X, self.W[:, i])
        Y_pred = np.sign(S)
        return Y_pred

    def test(self, X, Y):
        P = np.matrix(np.zeros(Y.shape))
        Y_pred = self.classify(X)
        P[np.where(Y_pred == Y)] = 1
        c = P.sum()

        print >> sys.stderr, 'Accuracy = %lf%% (%d/%d)' % (100.0 * c / len(Y), c, len(Y))

        return 1.0 * c / len(Y)


if __name__ == '__main__':

    train_path = 'data/heart_scale.train'
    test_path = 'data/heart_scale.test'

    X_train, Y_train = read_dense_data(open(train_path))
    X_test, Y_test = read_dense_data(open(test_path))

    X_train = np.matrix(X_train)
    Y_train = [int(y) for y in Y_train]
    Y_train = np.matrix(Y_train).T

    X_test = np.matrix(X_test)
    Y_test = [int(y) for y in Y_test]
    Y_test = np.matrix(Y_test).T

    clf = AdaBoost()
    clf.train(X_train, Y_train)
    acc = clf.test(X_test, Y_test)
    print >> sys.stderr, 'Accuracy for AdaBoost : %lf%%' % (100 *  acc)