-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathLIME_Education_ex.py
More file actions
121 lines (109 loc) · 5 KB
/
LIME_Education_ex.py
File metadata and controls
121 lines (109 loc) · 5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import sys
sys.path.insert(0, '../LIMEaid/LIMEaid/controller')
sys.path.insert(0, '../LIMEaid/LIMEaid/model')
sys.path.insert(0, '../LIMEaid/LIMEaid/view')
import fit_sklearn_models as fsm
import LIMEaid as la
import LIMEdisplay as ld
import load_college_dataset as gcd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn import tree
# Sampling configuration:
#   n        -- number of perturbed samples to be generated.
#   num_bins -- number of bins for the histograms of continuous attributes.
n, num_bins = 100000, 25

# Load the college dataset and split it into its parts. The first column
# is used as the class label (salary_class); every remaining column is an
# attribute. `features` keeps the attribute column names, and `college` is
# rebound from the loaded table to a plain NumPy array of attribute values.
college = gcd.load_college_dataset()
salary_class = np.array(college.iloc[:, 0])
features = list(college.columns[1:])
college = np.array(college.iloc[:, 1:])

# Fit the logistic regression model. The helper is called without
# arguments, so it does not receive the arrays built above.
# NOTE(review): presumably it loads the same dataset itself -- confirm.
clf = fsm.fit_multiclass_logistic_regression()
# Interpreting the output of the Logistic Regression model using LIME.
#
# Starting from the attribute columns only, la.lime_sample is called once
# per attribute to draw n random values for it (per the original author's
# description, with a distribution similar to that attribute's values).
#
# Unlike the Iris example, where all attributes had continuous values, the
# first attribute of this dataset is discrete. The "continuous" parameter
# (second argument of lime_sample) is therefore False for column 0 and
# True for every remaining column.
continuous_flags = [False] + [True] * (college.shape[1] - 1)
sampled_columns = [
    la.lime_sample(n, is_continuous, college[:, col], num_bins)
    for col, is_continuous in enumerate(continuous_flags)
]

# Stack all sampled attributes in one call instead of re-copying the
# growing array on every loop iteration, then transpose so that each row is
# one perturbed sample and each column one attribute (assuming every
# lime_sample call returns n values). Stacking a list also keeps the result
# two-dimensional when the dataset has a single attribute.
perturbed_samples = np.transpose(np.vstack(sampled_columns))

# Feed the perturbed samples to the Logistic Regression model fitted
# earlier and record the class it predicts for each of them.
class_perturb_samples = clf.predict(perturbed_samples)
# We select a single instance from the dataset. This is the instance
# we will try to interpret using LIME.
# Normalizing the data.
college_norm = preprocessing.scale(college)
# Selecting the instance to interpret: a row index drawn uniformly at
# random. np.random.randint excludes its upper bound, so the index is
# always within range. Rounding a uniform float drawn from [0, n_rows)
# could yield n_rows itself (an IndexError) and made the first and last
# rows half as likely to be chosen as the others.
inst_num = np.random.randint(0, college_norm.shape[0])
# x is the selected instance (normalized attribute values), and x_class is
# the class recorded for that instance in salary_class, i.e. the label
# column of the dataset.
x = college_norm[inst_num, :]
x_class = salary_class[inst_num]
# Calling LIME to get interpretation.
# la.lime_fit is given the instance being interpreted, its class, the
# perturbed samples and the classes predicted for those samples. Its three
# return values are unpacked as, per the original author's description,
# the coefficients and intercept of the LIME linear model and the weight
# of each random sample based on its L2 distance to the instance.
# NOTE(review): x is taken from the standardized data, while
# perturbed_samples were drawn from the raw attribute columns -- confirm
# that lime_fit puts both on the same scale.
lime_fit_args = (x, x_class, perturbed_samples, class_perturb_samples)
lime_beta, lime_int, lime_weight = la.lime_fit(*lime_fit_args)
# Interpreting the results.
# Below we print the output of the LIME linear regression model and
# identify the attributes whose LIME coefficient is non-zero for the
# selected instance.

# First list the attributes and values for the instance we are looking
# at: raw value, normalized value, and its class.
print("Instance to be interpreted:")
for j in range(len(lime_beta)):
    print("Feature: ", features[j], "\tvalue: ",
          college[inst_num, j], "\tnormalized value: ",
          college_norm[inst_num, j])
print("Classification: ", x_class)

# Second, list the attributes with a non-zero LIME coefficient, along with
# those coefficients and the LIME intercept.
print("\nSignificant coefficients from LIME adjusted"
      " linear model:")
# Collect the indices in a list so the result is a valid (possibly empty)
# integer array even when no coefficient is non-zero. Seeding with the
# scalar 0 and slicing it off afterwards raised a TypeError in that case.
significant_indices = []
for j, coefficient in enumerate(lime_beta):
    if coefficient != 0:
        significant_indices.append(j)
        print("Feature: ", features[j],
              "\tCoefficient: ", coefficient)
significant_attributes = np.array(significant_indices, dtype=int)
print("Intercept: ", lime_int)
# Plotting, as described by the original author (the behavior lives in
# ld.lime_display and is not visible in this script):
#   * If exactly two attributes are significant for the instance, the
#     random samples and the instance are plotted on a Cartesian plane. In
#     some cases the classes can mostly be separated by those two
#     attributes. Samples classified differently from the instance are
#     merged into a single out-of-class representation.
#   * The output of the fitted LIME linear regression model is plotted for
#     each sample against its class as given by the Logistic Regression
#     model; in most cases the classes can be told apart by this model
#     alone.
#
# full_data holds the perturbed samples with their predicted class
# appended as the last column.
full_data = np.column_stack([perturbed_samples, class_perturb_samples])

# NOTE(review): the last argument is the hard-coded list [1, 2, 3], and
# significant_attributes computed earlier is not passed anywhere --
# confirm this is intended.
ld.lime_display(
    full_data,
    lime_beta,
    lime_int,
    x,
    x_class,
    features,
    [1, 2, 3],
)