-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathno_face_value.R
More file actions
96 lines (72 loc) · 2.82 KB
/
no_face_value.R
File metadata and controls
96 lines (72 loc) · 2.82 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
#### run the main function by typing:
# success_rate = test_venter(rounds, n)
#n is the number of people in each group
#rounds is the number of simulation rounds.
#success_rate is the identifiability power of demographic iddentifiers
# The function runs a simple procedure that matches the Venter et al.
# definition of identifiability.
# In each round, the function genreates sex, age, and self-reported ethnicity labels
# for n people according to the distributions of the Venter paper.
# It then takes the first person to be the person of interest
# and compares whether this person is unique in the n people.
# If the combination of labels for this person is unique, it says: "Success!"
# Venter had a team of 30 researchers that developed fancy face morphology predictions,
# voice signatures, and many other sophisticated algorithms. Using the same success creterion,
# they had a success rate of 80%.
# You are about to test a procedure that uses age, sex, and ethnic group which are not protected by
#HIPAA and took me less than an hour to develop.
# Try running test_venter(1000, 10)
# and see the success rate.
library(reshape)
freq <- function(x){
return (x/sum(x))
}
inverse_cdf<-function(cdf) {
p = runif(1, 0, 1)
for (i in 1:length(cdf)) {
if (p < cdf[i]) {
return (i)
}
}
return (length(cdf))
}
test_venter<-function(rounds, n) {
#rounds: the number of times to run the procedure
#n: the group of individuals to re-identify the sample
#lading data
male = c(381, 126, 39, 39, 8, 39)
female = c(188, 147, 24, 24, 10, 36)
people = data.frame(male,female)
row.names(people) = c('African', 'European','Latino','East Asian','South Asian', 'Other')
age = c(295, 190, 130, 90, 95, 120, 100, 35, 5, 1)
#calculating pdfs
people.freq = freq(people)
age.freq = freq(age)
people.freq_v = melt(people.freq) #table to a long vector
people.cdf = cumsum(people.freq_v$value)
age.cdf = cumsum(age.freq)
success = 0
#start sampling
for (i in 1:rounds){
#let draw n people from the distribution
a_group_of_people = (simulate_a_group_of_people(n, people.cdf, age.cdf))
our_person = a_group_of_people[1] # we decided arbitrarly that the person of interest is always first
z = length(grep(our_person, a_group_of_people))
if (z == 1) {
success = success + 1
}
}
return(success/rounds)
}
simulate_a_group_of_people<-function(n, people.cdf, age.cdf) {
take_n_people = c()
#let's generate a group of n people from Venter's cohort
for (i in 1:n){
sex_ethnicity_label = inverse_cdf(people.cdf)
age_label = inverse_cdf(age.cdf)
final_label = paste(sex_ethnicity_label, age_label, sep='_')
take_n_people = rbind(take_n_people, final_label)
}
colnames(take_n_people)[1] <- 'labels' #the name of the column
return(take_n_people)
}