-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathmodel-sl-basic.Rmd
More file actions
125 lines (94 loc) · 3.6 KB
/
model-sl-basic.Rmd
File metadata and controls
125 lines (94 loc) · 3.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
---
title: "Model: SL basic"
output: html_document
---
```{r setup, include=TRUE}
# Show the source of every chunk in the rendered document.
knitr::opts_chunk$set(echo = TRUE)

# Pull in the project's initial pair of startup functions.
source(file = "R/_startup.R")

# Load the libraries this analysis needs. Pass auto_install = TRUE to have
# startup() try to install any package that is missing.
startup(auto_install = FALSE, verbose = FALSE)

# Source every .R file in the R/ subdirectory.
ck37r::load_all_code("R", verbose = TRUE)

# Restore the objects written by clean.Rmd.
load(file = "data/clean.RData")

# Model name that is specific to this Rmd file.
task$model_name <- "sl-basic"
```
## Define estimators
```{r define_estimators}
# Screening wrapper: forwards all arguments to screen.corRank2() with
# rank = 15, i.e. keeps only the top 15 covariates based on correlation with
# the outcome.
screen.corRank15 <- function(...) {
  screen.corRank2(..., rank = 15)
}

# Register the parallel backend that glmnet_fast relies on, using the core
# count reported by RhpcBLASctl.
doParallel::registerDoParallel(cores = RhpcBLASctl::get_num_cores())

# Learner library. Each of the heavier learners is paired with the screener so
# that estimation is not as slow; SL.mean is included unscreened.
# TODO: test wider numbers of covariates, e.g. 15 or all.
# TODO: grid search on xgboost hyperparameters.
sl_lib <- c(
  lapply(
    c("SL.xgboost_fast", "SL.ranger_fast", "SL.glmnet_fast"),
    function(learner) c(learner, "screen.corRank15")
  ),
  list("SL.mean")
)
```
## Run estimation
```{r superlearner}
# Seed the RNG (L'Ecuyer-CMRG generator) so the fit can be reproduced.
set.seed(3137033, "L'Ecuyer-CMRG")

# Fit the SuperLearner ensemble on the training task.
sl <- SuperLearner(
  Y = task$outcome,
  # drop = FALSE keeps X a data frame even when task$covariates names a
  # single column; without it the selection would collapse to a bare vector.
  X = task$data[, task$covariates, drop = FALSE],
  family = binomial(),
  verbose = TRUE,
  SL.library = sl_lib,
  # TODO: consider method = "method.NNloglik"
  # Re-run with 20 folds when we want to finalize our model:
  # cvControl = SuperLearner.CV.control(V = 20L)
  cvControl = SuperLearner.CV.control(V = 3L)
)

# Print the fitted ensemble.
sl

# sl$times$everything is in seconds; report elapsed wall-clock time in minutes.
cat("Execution time:", round(sl$times$everything["elapsed"] / 60, 1), "minutes.\n")

# Save our results and our task for posterity.
save(sl, task, file = paste0("data/model-", task$model_name, ".RData"))
```
## Review model
```{r review_model}
# Review auc of the learners.
# xgboost and ranger are both around 0.88, glmnet at 0.77
ck37r::auc_table(sl, y = task$outcome)
# TODO: export table.

# Plot ROC curve.
ck37r::plot_roc(sl, y = task$outcome)

# Make sure the output directory exists before writing into it; on a fresh
# checkout it may be missing and the save below would fail.
dir.create("visuals", showWarnings = FALSE, recursive = TRUE)

# No plot object is passed, so ggsave() writes the most recently created ggplot.
ggsave(paste0("visuals/roc-", task$model_name, ".png"))
```
## Predict on test
TODO: convert more of this into a general function so that we can reuse it across Rmd files.
```{r test_prediction}
# Read the raw test set as a plain data.frame rather than a data.table.
test <- data.table::fread("data-raw/test.csv", data.table = FALSE)
dim(test)

# Lower-case the column names (presumably to match the names stored in
# task$covariates); the outer parentheses print the result.
(names(test) <- tolower(names(test)))

# Restrict to columns that we want.
# drop = FALSE keeps a data frame even when only one covariate is selected.
test_df <- test[, task$covariates, drop = FALSE]

# Apply SuperLearner to generate predictions, timing the call.
system.time({
  predictions <-
    predict(sl, test_df,
            # Only estimate the models that are used in the SL ensemble.
            onlySL = TRUE,
            # Allow multithreaded prediction to speed this up.
            num.threads = RhpcBLASctl::get_num_cores())$pred
})

# Review prediction distribution.
summary(predictions)
qplot(predictions) + theme_minimal()

# Make sure the output directories exist before writing into them; on a fresh
# checkout they may be missing and the saves below would fail.
dir.create("visuals", showWarnings = FALSE, recursive = TRUE)
dir.create("exports", showWarnings = FALSE, recursive = TRUE)

# No plot object is passed, so ggsave() writes the most recently created ggplot.
ggsave(paste0("visuals/test-hist-", task$model_name, ".png"))

# Create a dataframe that contains just what we need to submit an entry.
# $eventid is the primary id for each observation.
export <- data.frame(EventId = test$eventid, Label = predictions)

# Convert probability prediction to a class prediction.
# TODO: run optimal threshold analysis to decide best probability threshold.
export$Label <- ifelse(export$Label > 0.5, "s", "b")

# Review predicted class labels.
table(export$Label)
prop.table(table(export$Label))

# Generate a csv file to upload to competition submission page.
rio::export(export,
            file = paste0("exports/submission-", task$model_name, ".csv"))
```