# lda_cgs_numpy.py
import sys

import numpy as np
from scipy.special import gammaln

from lda_cgs import Sample


def sample_numpy(random_state, n_burn, n_samples, n_thin,
                 D, N, K, document_indices,
                 alpha, beta,
                 Z, cdk, cd, previous_K,
                 ckn, ck, previous_ckn, previous_ck):
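    """Run collapsed Gibbs sampling for LDA using NumPy.

    Parameter meanings below are inferred from how the arguments are used in
    this function (the original module does not document them):

    random_state -- numpy RandomState used for all random draws
    n_burn, n_samples, n_thin -- burn-in length, total iterations, thinning gap
    D, N, K -- number of documents, vocabulary size, number of topics
    document_indices -- maps document d to a list of (position, word_index) pairs
    alpha, beta -- Dirichlet hyperparameter vectors of length K and N
    Z -- dict mapping (d, position) to the current topic assignment
    cdk, cd -- (D, K) document-topic counts and (D,) per-document word totals
    ckn, ck -- (K, N) topic-word counts and (K,) per-topic word totals
    previous_K, previous_ckn, previous_ck -- topic count and count matrices of
        a previously trained model; previous_K = 0 selects the training branch

    Returns (all_lls, samples): per-iteration joint log likelihoods and the
    thinned list of Sample objects.
    """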
    samples = []
    all_lls = []
    thin = 0
    N_beta = np.sum(beta)    # sum of the topic-word hyperparameters
    K_alpha = np.sum(alpha)  # sum of the document-topic hyperparameters
    for samp in range(n_samples):
        s = samp + 1
        if s > n_burn:
            print("Sample " + str(s) + " ", end="")
        else:
            print("Burn-in " + str(s) + " ", end="")
        for d in range(D):
            if d % 10 == 0:
                sys.stdout.write('.')
                sys.stdout.flush()
            word_locs = document_indices[d]
            for pos, n in word_locs:
                # remove this word's current assignment from the count matrices
                k = Z[(d, pos)]
                cdk[d, k] -= 1
                cd[d] -= 1
                ckn[k, n] -= 1
                ck[k] -= 1
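                # Collapsed Gibbs conditional for this word (computed in log
                # space below as log_likelihood + log_prior):
                #   p(z = k | .) ∝ (ckn[k, n] + beta[n]) / (ck[k] + N_beta)
                #               * (cdk[d, k] + alpha[k]) / (cd[d] + K_alpha)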
                if previous_K == 0:
                    # training: likelihood comes from the current topic-word counts
                    log_likelihood = np.log(ckn[:, n] + beta[n]) - np.log(ck + N_beta)
                else:
                    # testing on unseen data: combine the previously trained model
                    # with the counts accumulated on the new data
                    log_likelihood_previous = np.log(previous_ckn[:, n] + beta[n]) - np.log(previous_ck + N_beta)
                    log_likelihood_current = np.log(ckn[:, n] + beta[n]) - np.log(ck + N_beta)
                    # The combined likelihood takes its first previous_K entries
                    # from the previous topic-word distribution and the rest from
                    # the current one. Because beta enters both terms, we cannot
                    # simply set log_likelihood = log_likelihood_previous +
                    # log_likelihood_current.
                    front = log_likelihood_previous[0:previous_K]
                    back = log_likelihood_current[previous_K:]
                    log_likelihood = np.hstack((front, back))
                log_prior = np.log(cdk[d, :] + alpha) - np.log(cd[d] + K_alpha)
                # sample a new topic k from the posterior distribution log_post
                log_post = log_likelihood + log_prior
                post = np.exp(log_post - log_post.max())  # shift before exp for stability
                post = post / post.sum()
                # inverse-CDF draw from the categorical distribution; equivalent
                # to random_state.multinomial(1, post).argmax()
                random_number = random_state.rand()
                cumsum = np.cumsum(post)
                k = 0
                for k in range(len(cumsum)):
                    if random_number <= cumsum[k]:
                        break
                # reassign the word back into the model under its new topic
                cdk[d, k] += 1
                cd[d] += 1
                ckn[k, n] += 1
                ck[k] += 1
                Z[(d, pos)] = k
        # joint log likelihood p(w, z) of the model, used to monitor convergence;
        # vectorised sums over the topic-word and document-topic counts
        ll = K * (gammaln(N_beta) - np.sum(gammaln(beta)))
        ll += np.sum(gammaln(ckn + beta)) - np.sum(gammaln(ck + N_beta))
        ll += D * (gammaln(K_alpha) - np.sum(gammaln(alpha)))
        ll += np.sum(gammaln(cdk + alpha)) - np.sum(gammaln(cd + K_alpha))
        all_lls.append(ll)
        print(" Log likelihood = %.3f " % ll)
        # store samples after the burn-in period, keeping every n_thin-th one
        if n_burn > 0 and s > n_burn:
            thin += 1
            if thin % n_thin == 0:
                cdk_copy = np.copy(cdk)
                ckn_copy = np.copy(ckn)
                to_store = Sample(cdk_copy, ckn_copy)
                samples.append(to_store)

    # with no burn-in period, store the last sample only
    if n_burn == 0:
        cdk_copy = np.copy(cdk)
        ckn_copy = np.copy(ckn)
        to_store = Sample(cdk_copy, ckn_copy)
        samples.append(to_store)

    all_lls = np.array(all_lls)
    return all_lls, samples
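

# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original module): builds a tiny
# synthetic corpus, initialises the count matrices consistently with random
# topic assignments, and runs a short training chain (previous_K = 0). All
# sizes and hyperparameter values here are illustrative assumptions, and the
# companion lda_cgs module (providing Sample) must be importable as above.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    rng = np.random.RandomState(0)
    D, N, K = 5, 20, 3                   # documents, vocabulary size, topics
    alpha = np.full(K, 0.1)              # document-topic hyperparameters
    beta = np.full(N, 0.01)              # topic-word hyperparameters

    # random documents: each maps to a list of (position, word_index) pairs
    document_indices = {
        d: [(pos, rng.randint(N)) for pos in range(rng.randint(10, 20))]
        for d in range(D)
    }

    # assign every word a random topic and build the matching count matrices
    Z = {}
    cdk = np.zeros((D, K), dtype=int)
    cd = np.zeros(D, dtype=int)
    ckn = np.zeros((K, N), dtype=int)
    ck = np.zeros(K, dtype=int)
    for d, word_locs in document_indices.items():
        for pos, n in word_locs:
            k = rng.randint(K)
            Z[(d, pos)] = k
            cdk[d, k] += 1
            cd[d] += 1
            ckn[k, n] += 1
            ck[k] += 1

    # previous_K = 0 selects the training branch, so the previous_* count
    # matrices are never read and can be left empty
    all_lls, samples = sample_numpy(rng, 5, 20, 2,
                                    D, N, K, document_indices,
                                    alpha, beta,
                                    Z, cdk, cd, 0,
                                    ckn, ck, np.zeros((K, N)), np.zeros(K))
    print("Collected %d samples, final log likelihood %.3f"
          % (len(samples), all_lls[-1]))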