# lda_cgs_numpy.py
import sys

import numpy as np
from scipy.special import gammaln

from lda_cgs import Sample


def sample_numpy(random_state, n_burn, n_samples, n_thin,
                 D, N, K, document_indices,
                 alpha, beta,
                 Z, cdk, cd, previous_K,
                 ckn, ck, previous_ckn, previous_ck):
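    """Run collapsed Gibbs sampling for LDA using NumPy.

    Parameter meanings below are inferred from how the arguments are used in
    this function (the original module does not document them):

    random_state -- numpy RandomState used for all random draws
    n_burn, n_samples, n_thin -- burn-in length, total iterations, thinning gap
    D, N, K -- number of documents, vocabulary size, number of topics
    document_indices -- maps document d to a list of (position, word_index) pairs
    alpha, beta -- Dirichlet hyperparameter vectors of length K and N
    Z -- dict mapping (d, position) to the current topic assignment
    cdk, cd -- (D, K) document-topic counts and (D,) per-document word totals
    ckn, ck -- (K, N) topic-word counts and (K,) per-topic word totals
    previous_K, previous_ckn, previous_ck -- topic count and count matrices of
        a previously trained model; previous_K = 0 selects the training branch

    Returns (all_lls, samples): per-iteration joint log likelihoods and the
    thinned list of Sample objects.
    """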
    samples = []
    all_lls = []
    thin = 0
    N_beta = np.sum(beta)    # sum of the topic-word hyperparameters
    K_alpha = np.sum(alpha)  # sum of the document-topic hyperparameters
    for samp in range(n_samples):
        s = samp + 1
        if s > n_burn:
            print("Sample " + str(s) + " ", end="")
        else:
            print("Burn-in " + str(s) + " ", end="")
        for d in range(D):
            if d % 10 == 0:
                sys.stdout.write('.')
                sys.stdout.flush()
            word_locs = document_indices[d]
            for pos, n in word_locs:
                # remove this word's current assignment from the count matrices
                k = Z[(d, pos)]
                cdk[d, k] -= 1
                cd[d] -= 1
                ckn[k, n] -= 1
                ck[k] -= 1
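                # Collapsed Gibbs conditional for this word (computed in log
                # space below as log_likelihood + log_prior):
                #   p(z = k | .) ∝ (ckn[k, n] + beta[n]) / (ck[k] + N_beta)
                #               * (cdk[d, k] + alpha[k]) / (cd[d] + K_alpha)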
                if previous_K == 0:
                    # training: likelihood comes from the current topic-word counts
                    log_likelihood = np.log(ckn[:, n] + beta[n]) - np.log(ck + N_beta)
                else:
                    # testing on unseen data: combine the previously trained model
                    # with the counts accumulated on the new data
                    log_likelihood_previous = np.log(previous_ckn[:, n] + beta[n]) - np.log(previous_ck + N_beta)
                    log_likelihood_current = np.log(ckn[:, n] + beta[n]) - np.log(ck + N_beta)
                    # The combined likelihood takes its first previous_K entries
                    # from the previous topic-word distribution and the rest from
                    # the current one. Because beta enters both terms, we cannot
                    # simply set log_likelihood = log_likelihood_previous +
                    # log_likelihood_current.
                    front = log_likelihood_previous[0:previous_K]
                    back = log_likelihood_current[previous_K:]
                    log_likelihood = np.hstack((front, back))
                log_prior = np.log(cdk[d, :] + alpha) - np.log(cd[d] + K_alpha)
                # sample a new topic k from the posterior distribution log_post
                log_post = log_likelihood + log_prior
                post = np.exp(log_post - log_post.max())  # shift before exp for stability
                post = post / post.sum()
                # inverse-CDF draw from the categorical distribution; equivalent
                # to random_state.multinomial(1, post).argmax()
                random_number = random_state.rand()
                cumsum = np.cumsum(post)
                k = 0
                for k in range(len(cumsum)):
                    if random_number <= cumsum[k]:
                        break
                # reassign the word back into the model under its new topic
                cdk[d, k] += 1
                cd[d] += 1
                ckn[k, n] += 1
                ck[k] += 1
                Z[(d, pos)] = k
        # joint log likelihood p(w, z) of the model, used to monitor convergence;
        # vectorised sums over the topic-word and document-topic counts
        ll = K * (gammaln(N_beta) - np.sum(gammaln(beta)))
        ll += np.sum(gammaln(ckn + beta)) - np.sum(gammaln(ck + N_beta))
        ll += D * (gammaln(K_alpha) - np.sum(gammaln(alpha)))
        ll += np.sum(gammaln(cdk + alpha)) - np.sum(gammaln(cd + K_alpha))
        all_lls.append(ll)
        print(" Log likelihood = %.3f " % ll)
        # store samples after the burn-in period, keeping every n_thin-th one
        if n_burn > 0 and s > n_burn:
            thin += 1
            if thin % n_thin == 0:
                cdk_copy = np.copy(cdk)
                ckn_copy = np.copy(ckn)
                to_store = Sample(cdk_copy, ckn_copy)
                samples.append(to_store)

    # with no burn-in period, store the last sample only
    if n_burn == 0:
        cdk_copy = np.copy(cdk)
        ckn_copy = np.copy(ckn)
        to_store = Sample(cdk_copy, ckn_copy)
        samples.append(to_store)

    all_lls = np.array(all_lls)
    return all_lls, samples
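

# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original module): builds a tiny
# synthetic corpus, initialises the count matrices consistently with random
# topic assignments, and runs a short training chain (previous_K = 0). All
# sizes and hyperparameter values here are illustrative assumptions, and the
# companion lda_cgs module (providing Sample) must be importable as above.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    rng = np.random.RandomState(0)
    D, N, K = 5, 20, 3                   # documents, vocabulary size, topics
    alpha = np.full(K, 0.1)              # document-topic hyperparameters
    beta = np.full(N, 0.01)              # topic-word hyperparameters

    # random documents: each maps to a list of (position, word_index) pairs
    document_indices = {
        d: [(pos, rng.randint(N)) for pos in range(rng.randint(10, 20))]
        for d in range(D)
    }

    # assign every word a random topic and build the matching count matrices
    Z = {}
    cdk = np.zeros((D, K), dtype=int)
    cd = np.zeros(D, dtype=int)
    ckn = np.zeros((K, N), dtype=int)
    ck = np.zeros(K, dtype=int)
    for d, word_locs in document_indices.items():
        for pos, n in word_locs:
            k = rng.randint(K)
            Z[(d, pos)] = k
            cdk[d, k] += 1
            cd[d] += 1
            ckn[k, n] += 1
            ck[k] += 1

    # previous_K = 0 selects the training branch, so the previous_* count
    # matrices are never read and can be left empty
    all_lls, samples = sample_numpy(rng, 5, 20, 2,
                                    D, N, K, document_indices,
                                    alpha, beta,
                                    Z, cdk, cd, 0,
                                    ckn, ck, np.zeros((K, N)), np.zeros(K))
    print("Collected %d samples, final log likelihood %.3f"
          % (len(samples), all_lls[-1]))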