-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathAB_test.py
More file actions
123 lines (105 loc) · 4.34 KB
/
AB_test.py
File metadata and controls
123 lines (105 loc) · 4.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
"""
Split Test Analysis (A/B test)
An A/B test measures the effectiveness of new features introduced into a
population, judged against a statistical significance level chosen a priori.
This code performs a split test analysis on built-in test data, but also accepts
input from an (n x 2) CSV file. The first row should be the control, given as a
(conversion, view) tuple or list, and each following row is one variation in the
same format. Example:
Sample data
Baseline: 32 conversions out of 595 viewers
Variation 1: 30 conversions out of 599 viewers
Variation 2: 18 conversions out of 622 viewers
Variation 3: 51 conversions out of 606 viewers
Variation 4: 38 conversions out of 578 viewers
stored as
data = [np.array([32.,595.]),
np.array([30.,599.]),
np.array([18.,622.]),
np.array([51.,606.]),
np.array([38.,578.])]
The test assumes each view is an independent Bernoulli trial that converts with
the true population conversion rate, so the number of conversions is binomially
distributed. This allows a Central Limit Theorem (normal approximation)
estimate of statistical significance.
What's your interpretation of these results? What conclusions would you draw? What
questions would you ask me about my goals and methodology? Do you have any thoughts on the
experimental design? Please provide statistical justification for your conclusions and
explain the choices you made in your analysis.
Ricky Kwok, rickyk9487@gmail.com, 2014-10-08."""
#import csv
import numpy as np
import math
import scipy.stats as stats
class AB_test(object):
    """Run an A/B (split) test of several variations against one control.

    ``data`` is a sequence of (conversions, views) pairs: ``data[0]`` is the
    control and ``data[1:]`` are the variations.  Call ``stats()`` to compute
    the results, then read them from the ``prob``, ``se``, ``zscore`` and
    ``pvalue`` attributes or print them with ``print_stats()``.
    """

    def __init__(self, data):
        """Store the raw data and split it into control and variations."""
        self._data = data
        self.control = data[0]
        self.test = data[1:]

    def unpack(self):
        """Return the variations as two parallel lists.

        Returns:
            A ``(conversion, view)`` tuple of lists holding, for each
            variation in order, its conversion count and its view count.
        """
        conversion = [row[0] for row in self.test]
        view = [row[1] for row in self.test]
        return conversion, view

    def stats(self):
        """Compute rates, standard errors, Z-scores and one-sided p-values.

        For the control and for every variation the conversion rate is
        ``p = conversions / views`` and its standard error is
        ``sqrt(p * (1 - p) / views)``.  Each variation is compared with the
        control through

            Z = (p_test - p_ctrl) / sqrt(SE_test ** 2 + SE_ctrl ** 2)

        and the stored p-value is the standard normal CDF evaluated at Z,
        i.e. the lower-tail probability.
        NOTE(review): a variation far above the control therefore gets a
        value near 1; confirm this is the intended direction of the test.

        The results are stored as NumPy arrays with one entry per variation
        in ``self.prob``, ``self.se``, ``self.zscore`` and ``self.pvalue``.
        Nothing is returned.

        Raises:
            ValueError: if a view count is not positive, or a conversion
                count is negative or larger than its view count.
        """
        conversion, view = self.unpack()
        # The control goes first so it shares the validation and formulas.
        # dtype=float keeps integer counts away from Python 2 floor division,
        # which would otherwise turn every rate into 0.
        conversions = np.array([self.control[0]] + list(conversion),
                               dtype=float)
        views = np.array([self.control[1]] + list(view), dtype=float)

        # Fail with a clear message instead of a "math domain error" or a
        # silent nan further down.
        if np.any(views <= 0):
            raise ValueError("view counts must be positive")
        if np.any(conversions < 0) or np.any(conversions > views):
            raise ValueError(
                "conversions must lie between 0 and the number of views")

        rates = conversions / views
        errors = np.sqrt(rates * (1 - rates) / views)
        prob_ctrl, se_ctrl = rates[0], errors[0]
        prob, se = rates[1:], errors[1:]

        # If both standard errors are zero the quotient is inf or nan.
        zscore = (prob - prob_ctrl) / np.sqrt(se ** 2 + se_ctrl ** 2)
        # ``stats`` here is the scipy.stats module imported at file level;
        # the method name lives on the class and does not shadow it.
        pvalue = stats.norm.cdf(zscore)

        self.prob = prob
        self.se = se
        self.zscore = zscore
        self.pvalue = pvalue

    def print_stats(self):
        """Print the rates, standard errors, Z-scores and p-values.

        Runs ``stats()`` first when the results have not been computed yet,
        so calling this on a fresh instance no longer raises AttributeError.
        """
        if not hasattr(self, "pvalue"):
            self.stats()
        # A single parenthesised argument keeps print valid on both Python 2
        # and Python 3.  The doubled space reproduces what the former
        # trailing-comma print statements emitted.
        print("List of ratios of conversion to view:  %s" % (self.prob,))
        print("List of standard errors:  %s" % (self.se,))
        print("List of Z-scores with corresponding one-sided p-values: ")
        print("Z-scores: %s" % (self.zscore,))
        print("p-values: %s" % (self.pvalue,))
def openfile(filename):
    """Load a comma-separated table of counts as a list of rows.

    Args:
        filename: passed straight to ``numpy.genfromtxt`` and parsed with
            "," as the delimiter.  The file is expected to hold one
            ``conversions,views`` pair per line, control first.

    Returns:
        A list with one 1-D NumPy array per row, in file order, so that
        element 0 is the first row of the file.  Values are floats, because
        no dtype is passed to ``genfromtxt``.
    """
    table = np.genfromtxt(filename, delimiter=",")
    if table.size == 0:
        # Nothing was parsed, so there are no rows to hand back.
        return []
    # A one-line file comes back 1-D; atleast_2d restores the row axis so a
    # 1-D result is treated as a single row.
    return list(np.atleast_2d(table))
def get_data():
    """Return the built-in sample data set used to demonstrate the A/B test.

    Returns:
        A list of five NumPy arrays, each ``[conversions, views]``.
    """
    samples = (
        (32., 595.),
        (30., 599.),
        (18., 622.),
        (51., 606.),
        (38., 578.),
    )
    return [np.array(pair) for pair in samples]
def main():
    """Run the A/B analysis on the sample data and print the results."""
    # To analyse a two-column CSV file instead, build the input with
    # openfile("filename") in place of get_data().
    analysis = AB_test(get_data())
    analysis.stats()
    analysis.print_stats()


# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    main()