-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsortinghat_functions.py
More file actions
69 lines (63 loc) · 2.58 KB
/
sortinghat_functions.py
File metadata and controls
69 lines (63 loc) · 2.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
__author__ = 'Michael Silverstein'
def generate_feature(means, stds, n, labels, feature=None, seed=None, MIN=None, MAX=None):
    """
    Generate feature data assuming features are normally distributed.
    For `means`, `stds` and `n`, either a list (associated with the order of `labels`) or a single value can be passed.
    Passing a single value will assume that all classes share that parameter
    Inputs:
    | means: <list> or <float> Mean(s) of feature distribution
    | stds: <list> or <float> Standard deviation(s) of feature distribution
    | n: <list> or <int> Number of samples for each class
    | labels: <array> List of labels (this order is associated with the other passed parameters)
    | feature: <str> Name of feature (falls back to 'feature' when empty or None)
    | seed: <int> Random seed for sampling; any value other than None (including 0) re-seeds
    |       NumPy's global random state
    | {MIN, MAX}: <float> Minimum and maximum thresholds; values beyond a threshold are set to
    |       that threshold. None (default) disables the corresponding threshold
    Output:
    | data: <dataframe> || `feature` | class ||
    """
    # If any inputs are single values, convert them to lists
    if np.isscalar(means):
        means = [means] * len(labels)
    if np.isscalar(stds):
        stds = [stds] * len(labels)
    if np.isscalar(n):
        n = [n] * len(labels)

    # Assign parameters to each class
    params = {label: {'mean': m, 'std': s, 'n': size}
              for label, m, s, size in zip(labels, means, stds, n)}

    # Compare against None rather than relying on truthiness, so that seed=0 is honored
    if seed is not None:
        np.random.seed(seed)

    # For each class (in `labels` order), sample `n` points from a normal distribution
    # with that class's mean and standard deviation
    data = [[x, label]
            for label in labels
            for x in np.random.normal(params[label]['mean'], params[label]['std'], params[label]['n'])]

    # Place into dataframe
    if not feature:
        feature = 'feature'
    data = pd.DataFrame(data, columns=[feature, 'class'])

    # Apply thresholds (upper first, then lower) only when they were actually supplied
    if MAX is not None:
        data.loc[data[feature] >= MAX, feature] = MAX
    if MIN is not None:
        data.loc[data[feature] <= MIN, feature] = MIN
    return data
def rescale(x, axis=0):
    """
    Rescale a matrix X between [0, 1] along `axis`
    Inputs:
    | x: <array> or <dataframe> Values to rescale (anything exposing `.min(axis)` / `.max(axis)`)
    | axis: <int> Axis along which the minimum and maximum are taken
    Output:
    | rescaled: Same shape as `x`, computed as (x - min) / (max - min) along `axis`
    Note: there is no guard for a zero range (max == min along `axis`); the division by zero is left as-is.
    """
    if isinstance(x, pd.DataFrame):
        lo = x.min(axis)
        span = x.max(axis) - lo
        if axis in (1, 'columns'):
            # Row-wise extrema are indexed like the rows, so they must be aligned on the index;
            # plain `x - lo` would align them against the column labels instead
            return x.sub(lo, axis=0).div(span, axis=0)
        return (x - lo) / span

    if isinstance(x, np.ndarray) and not isinstance(x, np.matrix):
        # keepdims keeps the reduced axis as length 1 so the extrema broadcast against `x`
        # for any `axis`, not only axis=0 (np.matrix reductions take no `keepdims` argument)
        lo = x.min(axis=axis, keepdims=True)
        span = x.max(axis=axis, keepdims=True) - lo
        return (x - lo) / span

    # Any other object (e.g. a Series): use its own min/max directly
    lo = x.min(axis)
    return (x - lo) / (x.max(axis) - lo)
def zebra(ax=None, color='gray', alpha=.3, zorder=0, **kwargs):
    """
    Stripe figure - shade a one-unit-wide band around every other x tick with `fill_between()`
    Inputs:
    | ax: Axes to draw on; when not provided, the current axes (`plt.gca()`) are used
    | color, alpha, zorder: Appearance of the stripes, forwarded to `fill_between()`
    | kwargs: Any further keyword arguments forwarded to `fill_between()`
    Output:
    | ax: The axes that were striped
    """
    target = ax if ax else plt.gca()

    # Every second tick position becomes the center of a stripe
    stripe_centers = target.get_xticks()[::2]

    # Remember the y-limits so the stripes span them and so they can be put back afterwards
    limits = target.get_ylim()
    low, high = limits[0], limits[1]

    kwargs.update(color=color, alpha=alpha, zorder=zorder)
    for center in stripe_centers:
        band = (center - .5, center + .5)
        target.fill_between(band, low, high, **kwargs)

    # Restore the y-limits recorded before the stripes were drawn
    target.set_ylim(limits)
    return target