diff --git a/DESCRIPTION b/DESCRIPTION index 4ddd23c..abeb684 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,20 +1,23 @@ Package: SuperLearner Type: Package Title: Super Learner Prediction -Version: 2.0-30-9000 -Date: 2024-02-06 -Authors@R: c(person("Eric", "Polley", email = "epolley@uchicago.edu", role = c("aut", "cre")), person("Erin", "LeDell", role = c("aut")), person("Chris", "Kennedy", role = c("aut")), person("Sam", "Lendle", role = c("ctb")), person("Mark", "van der Laan", role = c("aut", "ths"))) -Maintainer: Eric Polley +Version: 2.0-30-9001 +Authors@R: c(person("Eric", "Polley", email = "epolley@uchicago.edu", role = c("aut", "cre")), + person("Erin", "LeDell", role = c("aut")), + person("Chris", "Kennedy", role = c("aut")), + person("Sam", "Lendle", role = c("ctb")), + person("Mark", "van der Laan", role = c("aut", "ths"))) Description: Implements the super learner prediction method and contains a library of prediction algorithms to be used in the super learner. License: GPL-3 URL: https://github.com/ecpolley/SuperLearner +BugReports: https://github.com/ecpolley/SuperLearner/issues Depends: - R (>= 2.14.0), - nnls + R (>= 4.0.0) Imports: cvAUC, - methods + nnls, + stats Suggests: arm, bartMachine, @@ -40,6 +43,7 @@ Suggests: mlbench, nloptr, nnet, + parallel, party, polspline, prettydoc, @@ -55,7 +59,9 @@ Suggests: spls, sva, testthat, - xgboost (>= 0.6) + xgboost (>= 3.1.2.1) LazyLoad: yes -VignetteBuilder: knitr, rmarkdown -RoxygenNote: 6.0.1 +Encoding: UTF-8 +VignetteBuilder: knitr +RoxygenNote: 7.3.3 +Roxygen: list(markdown = TRUE) diff --git a/NAMESPACE b/NAMESPACE index 02b87c6..d5d8bab 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,57 +1,120 @@ -import(nnls) -exportPattern("^[^\\.]") +# Generated by roxygen2: do not edit by hand -importFrom(stats, coef) -importFrom(stats, predict) -importFrom(graphics, plot) -importFrom(methods, is) - -importFrom("stats", "as.formula", "binomial", "cor.test", "dlogis", "formula", "gaussian", "glm", "lm", "loess", "loess.control", "model.matrix", "optim", "plogis", "pnorm", "predict.lm", "qnorm", "reorder", "sd", "step", "var", "weighted.mean") -importFrom("utils", "capture.output", "RShowDoc", "packageVersion") - -S3method(print, SuperLearner) -S3method(coef, SuperLearner) -S3method(predict, SuperLearner) - -S3method(print, CV.SuperLearner) -S3method(coef, CV.SuperLearner) -S3method(plot, CV.SuperLearner) -S3method(summary, CV.SuperLearner) -S3method(print, summary.CV.SuperLearner) - -S3method(predict, SL.bartMachine) -S3method(predict, SL.bayesglm) -S3method(predict, SL.biglasso) -S3method(predict, SL.caret) -S3method(predict, SL.cforest) -S3method(predict, SL.earth) -S3method(predict, SL.gam) -S3method(predict, SL.gbm) -S3method(predict, SL.glm) -S3method(predict, SL.glmnet) -S3method(predict, SL.ipredbagg) -S3method(predict, SL.knn) -S3method(predict, SL.kernelKnn) -S3method(predict, SL.ksvm) -S3method(predict, SL.leekasso) -S3method(predict, SL.lda) -S3method(predict, SL.lm) -S3method(predict, SL.loess) -S3method(predict, SL.logreg) -S3method(predict, SL.mean) -S3method(predict, SL.nnet) -S3method(predict, SL.nnls) -S3method(predict, SL.polymars) -S3method(predict, SL.qda) -S3method(predict, SL.randomForest) -S3method(predict, SL.ranger) -S3method(predict, SL.ridge) -S3method(predict, SL.rpart) -S3method(predict, SL.speedglm) -S3method(predict, SL.speedlm) -S3method(predict, SL.step) -S3method(predict, SL.stepAIC) -S3method(predict, SL.svm) -S3method(predict, SL.template) -S3method(predict, SL.xgboost) -S3method(predict, SuperLearner) +S3method(plot,CV.SuperLearner) +S3method(predict,SL.bartMachine) +S3method(predict,SL.bayesglm) +S3method(predict,SL.biglasso) +S3method(predict,SL.caret) +S3method(predict,SL.cforest) +S3method(predict,SL.earth) +S3method(predict,SL.gam) +S3method(predict,SL.gbm) +S3method(predict,SL.glm) +S3method(predict,SL.glmnet) +S3method(predict,SL.ipredbagg) +S3method(predict,SL.kernelKnn) +S3method(predict,SL.knn) +S3method(predict,SL.ksvm) +S3method(predict,SL.lda) +S3method(predict,SL.leekasso) +S3method(predict,SL.lm) +S3method(predict,SL.loess) +S3method(predict,SL.logreg) +S3method(predict,SL.mean) +S3method(predict,SL.nnet) +S3method(predict,SL.nnls) +S3method(predict,SL.polymars) +S3method(predict,SL.qda) +S3method(predict,SL.randomForest) +S3method(predict,SL.ranger) +S3method(predict,SL.ridge) +S3method(predict,SL.rpart) +S3method(predict,SL.speedglm) +S3method(predict,SL.speedlm) +S3method(predict,SL.step) +S3method(predict,SL.stepAIC) +S3method(predict,SL.svm) +S3method(predict,SL.xgboost) +S3method(predict,SuperLearner) +S3method(print,CV.SuperLearner) +S3method(print,SuperLearner) +S3method(print,summary.CV.SuperLearner) +S3method(stats::coef,CV.SuperLearner) +S3method(stats::coef,SuperLearner) +S3method(summary,CV.SuperLearner) +export(All) +export(CVFolds) +export(SL.bartMachine) +export(SL.bayesglm) +export(SL.biglasso) +export(SL.caret) +export(SL.caret.rpart) +export(SL.cforest) +export(SL.earth) +export(SL.gam) +export(SL.gbm) +export(SL.glm) +export(SL.glm.interaction) +export(SL.glmnet) +export(SL.ipredbagg) +export(SL.kernelKnn) +export(SL.knn) +export(SL.ksvm) +export(SL.lda) +export(SL.leekasso) +export(SL.lm) +export(SL.loess) +export(SL.logreg) +export(SL.mean) +export(SL.nnet) +export(SL.nnls) +export(SL.polymars) +export(SL.qda) +export(SL.randomForest) +export(SL.ranger) +export(SL.ridge) +export(SL.rpart) +export(SL.rpartPrune) +export(SL.speedglm) +export(SL.speedlm) +export(SL.step) +export(SL.step.forward) +export(SL.step.interaction) +export(SL.stepAIC) +export(SL.svm) +export(SL.xgboost) +export(SampleSplitSuperLearner) +export(SuperLearner) +export(SuperLearner.control) +export(SuperLearnerDocs) +export(SuperLearnerNews) +export(`CV.SuperLearner`) +export(`SL.template`) +export(`SuperLearner.CV.control`) +export(`create.Learner`) +export(`predict.SL.template`) +export(`write.SL.template`) +export(`write.predict.SL.template`) +export(`write.screen.template`) +export(create.SL.xgboost) +export(listWrappers) +export(mcSuperLearner) +export(method.AUC) +export(method.CC_LS) +export(method.CC_nloglik) +export(method.NNLS) +export(method.NNLS2) +export(method.NNloglik) +export(method.template) +export(recombineCVSL) +export(screen.SIS) +export(screen.corP) +export(screen.corRank) +export(screen.glmnet) +export(screen.randomForest) +export(screen.template) +export(screen.ttest) +export(snowSuperLearner) +export(trimLogit) +export(write.method.template) +import(stats) diff --git a/R/CV.SuperLearner.R b/R/CV.SuperLearner.R index 7674519..fee2b49 100644 --- a/R/CV.SuperLearner.R +++ b/R/CV.SuperLearner.R @@ -1,12 +1,176 @@ -# V-fold Cross-validation wrapper for SuperLearner +#' V-fold cross-validated risk estimate for SuperLearner +#' +#' This function simply splits the data into V folds and then calls SuperLearner. +#' Most of the arguments are passed directly to SuperLearner. +#' +#' The \code{SuperLearner} function builds a estimator, but does not contain an +#' estimate on the performance of the estimator. Various methods exist for +#' estimator performance evaluation. If you are familiar with the super learner +#' algorithm, it should be no surprise we recommend using cross-validation to +#' evaluate the honest performance of the super learner estimator. The function +#' \code{CV.SuperLearner} computes the usual V-fold cross-validated risk +#' estimate for the super learner (and all algorithms in \code{SL.library} for +#' comparison). +#' +#' @param Y The outcome. +#' @param X The covariates. +#' @param V The number of folds for \code{CV.SuperLearner}. This argument will +#' be depreciated and moved into the \code{cvControl}. If Both \code{V} and +#' \code{cvControl} set the number of cross-validation folds, an error message +#' will appear. The recommendation is to use \code{cvControl}. This is not the +#' number of folds for \code{SuperLearner}. The number of folds for +#' \code{SuperLearner} is controlled with \code{innerCvControl}. +#' @param family Currently allows \code{gaussian} or \code{binomial} to +#' describe the error distribution. Link function information will be ignored +#' and should be contained in the method argument below. +#' @param SL.library Either a character vector of prediction algorithms or a +#' list containing character vectors. See details below for examples on the +#' structure. A list of functions included in the SuperLearner package can be +#' found with [listWrappers()]. +#' @param method A list (or a function to create a list) containing details on +#' estimating the coefficients for the super learner and the model to combine +#' the individual algorithms in the library. See \code{?method.template} for +#' details. Currently, the built in options are either `"method.NNLS"` (the +#' default), `"method.NNLS2"`, `"method.NNloglik"`, `"method.CC_LS"`, +#' `"method.CC_nloglik"`, or `"method.AUC"`. NNLS and NNLS2 are non-negative least +#' squares based on the Lawson-Hanson algorithm and the dual method of Goldfarb +#' and Idnani, respectively. NNLS and NNLS2 will work for both gaussian and +#' binomial outcomes. NNloglik is a non-negative binomial likelihood +#' maximization using the BFGS quasi-Newton optimization method. NN* methods +#' are normalized so weights sum to one. CC_LS uses Goldfarb and Idnani's +#' quadratic programming algorithm to calculate the best convex combination of +#' weights to minimize the squared error loss. CC_nloglik calculates the convex +#' combination of weights that minimize the negative binomial log likelihood on +#' the logistic scale using the sequential quadratic programming algorithm. +#' AUC, which only works for binary outcomes, uses the Nelder-Mead method via +#' the optim function to minimize rank loss (equivalent to maximizing AUC). +#' @param id Optional cluster identification variable. For the cross-validation +#' splits, \code{id} forces observations in the same cluster to be in the same +#' validation fold. \code{id} is passed to the prediction and screening +#' algorithms in `SL.library`, but be sure to check the individual wrappers as +#' many of them ignore the information. +#' @param verbose Logical; `TRUE` for printing progress during the computation +#' (helpful for debugging). +#' @param control A list of parameters to control the estimation process. +#' Parameters include \code{saveFitLibrary} and \code{trimLogit}. See +#' \code{\link{SuperLearner.control}} for details. +#' @param cvControl A list of parameters to control the outer cross-validation +#' process. The outer cross-validation is the sample spliting for evaluating +#' the SuperLearner. Parameters include \code{V}, \code{stratifyCV}, +#' \code{shuffle} and \code{validRows}. See +#' \code{\link{SuperLearner.CV.control}} for details. +#' @param innerCvControl A list of lists of parameters to control the inner +#' cross-validation process. It should have \code{V} elements in the list, each +#' a valid \code{cvControl} list. If only a single value, then replicated +#' across all folds. The inner cross-validation are the values passed to each +#' of the \code{V} \code{SuperLearner} calls. Parameters include \code{V}, +#' \code{stratifyCV}, \code{shuffle} and \code{validRows}. See +#' \code{\link{SuperLearner.CV.control}} for details. +#' @param obsWeights Optional observation weights variable. As with \code{id} +#' above, \code{obsWeights} is passed to the prediction and screening +#' algorithms, but many of the built in wrappers ignore (or can't use) the +#' information. If you are using observation weights, make sure the library you +#' specify uses the information. +#' @param saveAll Logical; Should the entire \code{SuperLearner} object be +#' saved for each fold? +#' @param parallel Options for parallel computation of the V-fold step. Use +#' "seq" (the default) for sequential computation. \code{parallel = 'multicore'} to use \code{mclapply} for the V-fold step (but note that \code{SuperLearner()} will still be sequential). The default for `mclapply()` is +#' to check the \code{mc.cores} option, and if not set to default to 2 cores. +#' Be sure to set \code{options()$mc.cores} to the desired number of cores if +#' you don't want the default. Or \code{parallel} can be the name of a snow +#' cluster and will use \code{parLapply} for the V-fold step. For both +#' multicore and snow, the inner \code{SuperLearner} calls will be sequential. +#' @param env Environment containing the learner functions. Defaults to the +#' calling environment. +#' +#' @returns +#' An object of class \code{CV.SuperLearner} (a list) with components: +#' +#' \item{call}{ The matched call. } +#' \item{AllSL}{ If \code{saveAll = TRUE}, a +#' list with output from each call to \code{SuperLearner}, otherwise NULL. } +#' \item{SL.predict}{ The predicted values from the super learner when each +#' particular row was part of the validation fold. } +#' \item{discreteSL.predict}{ +#' The traditional cross-validated selector. Picks the algorithm with the +#' smallest cross-validated risk (in super learner terms, gives that algorithm +#' coefficient 1 and all others 0). } +#' \item{whichDiscreteSL}{ A list of length +#' \code{V}. The elements in the list are the algorithm that had the smallest +#' cross-validated risk estimate for that fold. } +#' \item{library.predict}{ A +#' matrix with the predicted values from each algorithm in \code{SL.library}. +#' The columns are the algorithms in \code{SL.library} and the rows represent +#' the predicted values when that particular row was in the validation fold +#' (i.e. not used to fit that estimator). } +#' \item{coef}{ A matrix with the coefficients for the super learner on each fold. The columns are the +#' algorithms in \code{SL.library} the rows are the folds. } +#' \item{folds}{ A list containing the row numbers for each validation fold. } +#' \item{V}{ Number of folds for \code{CV.SuperLearner}. } +#' \item{libraryNames}{ A character +#' vector with the names of the algorithms in the library. The format is +#' 'predictionAlgorithm_screeningAlgorithm' with '_All' used to denote the +#' prediction algorithm run on all variables in X. } +#' \item{SL.library}{ Returns \code{SL.library} in the same format as the argument with the same name above. } +#' \item{method}{ A list with the method functions. } +#' \item{Y}{ The outcome } +#' +#' @author Eric C Polley \email{epolley@@uchicago.edu} +#' +#' @seealso \code{\link{SuperLearner}} +#' +#' @keywords models +#' @examples +#' +#' \dontrun{ +#' set.seed(23432) +#' ## training set +#' n <- 500 +#' p <- 50 +#' X <- matrix(rnorm(n*p), nrow = n, ncol = p) +#' colnames(X) <- paste("X", 1:p, sep="") +#' X <- data.frame(X) +#' Y <- X[, 1] + sqrt(abs(X[, 2] * X[, 3])) + X[, 2] - X[, 3] + rnorm(n) +#' +#' ## build Library and run Super Learner +#' SL.library <- c("SL.glm", "SL.randomForest", "SL.gam", "SL.polymars", "SL.mean") +#' +#' test <- CV.SuperLearner(Y = Y, X = X, V = 10, SL.library = SL.library, +#' verbose = TRUE, method = "method.NNLS") +#' test +#' summary(test) +#' ## Look at the coefficients across folds +#' coef(test) +#' +#' # Example with specifying cross-validation options for both +#' # CV.SuperLearner (cvControl) and the internal SuperLearners (innerCvControl) +#' test <- CV.SuperLearner(Y = Y, X = X, SL.library = SL.library, +#' cvControl = list(V = 10, shuffle = FALSE), +#' innerCvControl = list(list(V = 5)), +#' verbose = TRUE, method = "method.NNLS") +#' +#' ## examples with snow +#' library(parallel) +#' cl <- makeCluster(2, type = "PSOCK") # can use different types here +#' clusterSetRNGStream(cl, iseed = 2343) +#' testSNOW <- CV.SuperLearner(Y = Y, X = X, SL.library = SL.library, method = "method.NNLS", +#' parallel = cl) +#' summary(testSNOW) +#' stopCluster(cl) +#' } +#' -CV.SuperLearner <- function(Y, X, V = NULL, family = gaussian(), SL.library, method = 'method.NNLS', id = NULL, verbose = FALSE, control = list(saveFitLibrary = FALSE), cvControl = list(), innerCvControl = list(), obsWeights = NULL, saveAll = TRUE, parallel = "seq", env = parent.frame()) { +#' @export `CV.SuperLearner` +CV.SuperLearner <- function(Y, X, V = NULL, family = gaussian(), SL.library, method = 'method.NNLS', id = NULL, + verbose = FALSE, control = list(saveFitLibrary = FALSE), cvControl = list(), + innerCvControl = list(), obsWeights = NULL, saveAll = TRUE, parallel = "seq", + env = parent.frame()) { call <- match.call() N <- dim(X)[1L] p <- dim(X)[2L] # create CV folds: - if(any(names(cvControl) == "V") & !is.null(V)) { + if (any(names(cvControl) == "V") && !is.null(V)) { stop(paste0("You specified a value for V and a value in the cvControl, please only use one, preferably the cvControl")) } @@ -19,8 +183,8 @@ CV.SuperLearner <- function(Y, X, V = NULL, family = gaussian(), SL.library, met folds <- CVFolds(N = N, id = id, Y = Y, cvControl = cvControl) V <- cvControl$V # save this because it appears in the output value - if(length(innerCvControl) > 0) { - if(length(innerCvControl) == 1) { + if (length(innerCvControl) > 0) { + if (length(innerCvControl) == 1) { warning("Only a single innerCvControl is given, will be replicated across all cross-validation split calls to SuperLearner") newInnerCvControl <- vector("list", cvControl$V) for(ii in seq(cvControl$V)) { diff --git a/R/CVFolds.R b/R/CVFolds.R index 83e3228..c5d7fc9 100644 --- a/R/CVFolds.R +++ b/R/CVFolds.R @@ -1,14 +1,33 @@ -# create a list of row numbers for the V-fold cross validation. -# based on sample size N, id, Y, and cvControl -# -# inside cvControl: -# V : number of folds -# stratifyCV : if Y is binary, stratify folds to try and keep proportion constant -# shuffle : should the rows of X,Y be shuffled since the split function is deterministic +#' Generate list of row numbers for each fold in the cross-validation +#' +#' \code{CVFolds} is used in the \code{SuperLearner} to create the +#' cross-validation splits. +#' +#' @param N Sample size +#' @param id Optional cluster id variable. If present, all observations in the +#' same cluster will always be in the same split. +#' @param Y outcome +#' @param cvControl Control parameters for the cross-validation step. See +#' \code{\link{SuperLearner.CV.control}} for details. +#' +#' @returns +#' A list of length V where each element in the list +#' is a vector with the row numbers of the corresponding validation sample. +#' +#' @author Eric C Polley \email{epolley@@uchicago.edu} +#' @keywords utilities +#' +#' @export +CVFolds <- function(N, id, Y, cvControl) { + # create a list of row numbers for the V-fold cross validation. + # based on sample size N, id, Y, and cvControl + # + # inside cvControl: + # V : number of folds + # stratifyCV : if Y is binary, stratify folds to try and keep proportion constant + # shuffle : should the rows of X,Y be shuffled since the split function is deterministic + # created by Eric Polley on 2011-01-18. -# created by Eric Polley on 2011-01-18. - -CVFolds <- function(N, id, Y, cvControl){ # validRows would be a user specified list of row numbers for the validation sets if(!is.null(cvControl$validRows)) { return(cvControl$validRows) @@ -16,7 +35,7 @@ CVFolds <- function(N, id, Y, cvControl){ stratifyCV <- cvControl$stratifyCV shuffle <- cvControl$shuffle V <- cvControl$V - + if(!stratifyCV) { if(shuffle) { if(is.null(id)) { @@ -74,6 +93,6 @@ CVFolds <- function(N, id, Y, cvControl){ stop("stratified sampling with id not currently implemented") } } - } + } return(validRows) } diff --git a/R/SL.bartMachine.R b/R/SL.bartMachine.R index 7dbd481..f71aa9c 100644 --- a/R/SL.bartMachine.R +++ b/R/SL.bartMachine.R @@ -1,46 +1,41 @@ #' Wrapper for bartMachine learner #' -#' Support bayesian additive regression trees via the bartMachine package. +#' Support Bayesian additive regression trees via the \pkg{bartMachine} package. #' -#' @param Y Outcome variable -#' @param X Covariate dataframe -#' @param newX Optional dataframe to predict the outcome +#' @inheritParams SL.template +#' @inheritParams predict.SL.template #' @param obsWeights Optional observation-level weights (supported but not tested) #' @param id Optional id to group observations from the same unit (not used #' currently). -#' @param family "gaussian" for regression, "binomial" for binary -#' classification #' @param num_trees The number of trees to be grown in the sum-of-trees model. #' @param num_burn_in Number of MCMC samples to be discarded as "burn-in". #' @param num_iterations_after_burn_in Number of MCMC samples to draw from the -#' posterior distribution of f(x). +#' posterior distribution of \eqn{f(x)}. #' @param alpha Base hyperparameter in tree prior for whether a node is #' nonterminal or not. #' @param beta Power hyperparameter in tree prior for whether a node is #' nonterminal or not. -#' @param k For regression, k determines the prior probability that E(Y|X) is -#' contained in the interval (y_{min}, y_{max}), based on a normal -#' distribution. For example, when k=2, the prior probability is 95\%. For -#' classification, k determines the prior probability that E(Y|X) is between -#' (-3,3). Note that a larger value of k results in more shrinkage and a more +#' @param k For regression, `k` determines the prior probability that \eqn{E(Y|X)} is +#' contained in the interval \eqn{[y_\text{min}, y_\text{max}]}, based on a normal +#' distribution. For example, when `k = 2`, the prior probability is 95%. For +#' classification, `k` determines the prior probability that \eqn{E(Y|X)} is between +#' -3 and 3. Note that a larger value of `k` results in more shrinkage and a more #' conservative fit. #' @param q Quantile of the prior on the error variance at which the data-based #' estimate is placed. Note that the larger the value of q, the more #' aggressive the fit as you are placing more prior weight on values lower #' than the data-based estimate. Not used for classification. -#' @param nu Degrees of freedom for the inverse chi^2 prior. Not used for +#' @param nu Degrees of freedom for the inverse \eqn{\chi^2} prior. Not used for #' classification. #' @param verbose Prints information about progress of the algorithm to the #' screen. #' @param ... Additional arguments (not used) -#' -#' @encoding utf-8 + #' @export -SL.bartMachine <- function(Y, X, newX, family, obsWeights, id, +SL.bartMachine <- function(Y, X, newX = X, family = gaussian(), obsWeights = NULL, id, num_trees = 50, num_burn_in = 250, verbose = F, alpha = 0.95, beta = 2, k = 2, q = 0.9, nu = 3, - num_iterations_after_burn_in = 1000, - ...) { + num_iterations_after_burn_in = 1000, ...) { .SL.require("bartMachine") model = bartMachine::bartMachine(X, Y, num_trees = num_trees, num_burn_in = num_burn_in, verbose = verbose, @@ -56,18 +51,10 @@ SL.bartMachine <- function(Y, X, newX, family, obsWeights, id, return(out) } -#' bartMachine prediction -#' @param object SuperLearner object -#' @param newdata Dataframe to predict the outcome -#' @param family "gaussian" for regression, "binomial" for binary -#' classification. (Not used) -#' @param Y Outcome variable (not used) -#' @param X Covariate dataframe (not used) -#' @param ... Additional arguments (not used) -#' -#' @export -predict.SL.bartMachine <- function(object, newdata, family, X = NULL, Y = NULL,...) { +#' @exportS3Method predict SL.bartMachine +#' @rdname SL.bartMachine +predict.SL.bartMachine <- function(object, newdata, ...) { .SL.require("bartMachine") - pred <- predict(object$object, newdata) - return(pred) + + predict(object$object, newdata) } diff --git a/R/SL.bayesglm.R b/R/SL.bayesglm.R index b8f05e8..d7068a1 100644 --- a/R/SL.bayesglm.R +++ b/R/SL.bayesglm.R @@ -1,7 +1,13 @@ -# bayesglm{arm} -# Bayesian generalized linear regression +#' Wrapper for Bayesian GLM learner using `arm` +#' +#' Support Bayesian GLM via the \pkg{arm} package. +#' +#' @inheritParams SL.template +#' @inheritParams predict.SL.template +#' @inheritParams SL.glm -SL.bayesglm <- function(Y, X, newX, family, obsWeights, ...){ +#' @export +SL.bayesglm <- function(Y, X, newX = X, family = gaussian(), obsWeights = NULL, ...){ .SL.require('arm') fit.glm <- arm::bayesglm(Y ~ ., data = X, family = family, weights = obsWeights) pred <- predict(fit.glm, newdata = newX, type = "response") @@ -11,8 +17,10 @@ SL.bayesglm <- function(Y, X, newX, family, obsWeights, ...){ return(out) } -predict.SL.bayesglm <- function(object, newdata, ...){ +#' @exportS3Method predict SL.bayesglm +#' @rdname SL.bayesglm +predict.SL.bayesglm <- function(object, newdata, ...) { .SL.require('arm') - pred <- predict(object = object$object, newdata = newdata, type = "response") - return(pred) + + predict(object = object$object, newdata = newdata, type = "response") } \ No newline at end of file diff --git a/R/SL.biglasso.R b/R/SL.biglasso.R index f970923..be478ce 100644 --- a/R/SL.biglasso.R +++ b/R/SL.biglasso.R @@ -1,11 +1,8 @@ #' @title SL wrapper for biglasso -#' @description SL wrapper for biglasso #' -#' @param Y Outcome variable -#' @param X Training dataframe -#' @param newX Test dataframe -#' @param family Gaussian or binomial -#' @param obsWeights Observation-level weights +#' @inheritParams SL.template +#' @inheritParams predict.SL.template +#' @inheritParams SL.glm #' @param penalty The penalty to be applied to the model. Either "lasso" #' (default), "ridge", or "enet" (elastic net). #' @param alg.logistic The algorithm used in logistic regression. If "Newton" @@ -30,13 +27,8 @@ #' @param ncores The number of cores to use for parallel execution across a #' cluster created by the \code{parallel} package. #' @param nfolds The number of cross-validation folds. Default is 5. -#' @param ... Any additional arguments, not currently used. -#' -#' @importFrom biglasso cv.biglasso -#' @importFrom bigmemory as.big.matrix #' #' @examples -#' #' data(Boston, package = "MASS") #' Y = Boston$medv #' # Remove outcome from covariate dataframe. @@ -66,20 +58,18 @@ #' \code{\link[biglasso]{predict.biglasso}} \code{\link{SL.glmnet}} #' #' @export -SL.biglasso <- - function(Y, X, newX, family, - obsWeights, - penalty = "lasso", - alg.logistic = "Newton", - screen = "SSR", - alpha = 1, - nlambda = 100, - eval.metric = "default", - ncores = 1, - nfolds = 5, - ...) { +SL.biglasso <- function(Y, X, newX = X, family = gaussian(), + penalty = "lasso", + alg.logistic = "Newton", + screen = "SSR", + alpha = 1, + nlambda = 100, + eval.metric = "default", + ncores = 1, + nfolds = 5, + ...) { .SL.require("biglasso") - .SL.require("bigmemory") + .SL.require("bigmemory") # If binomial, biglasso still wants Y to be a numeric. @@ -90,25 +80,25 @@ SL.biglasso <- } # This will give a warning if X is only a single covariate. - X = bigmemory::as.big.matrix(X) + X <- bigmemory::as.big.matrix(X) - fit = biglasso::cv.biglasso(X, Y, family = family$family, - penalty = penalty, - alg.logistic = alg.logistic, - screen = screen, - eval.metric = eval.metric, - ncores = ncores, - alpha = alpha, - nfolds = nfolds, - nlambda = nlambda) + fit <- biglasso::cv.biglasso(X, Y, family = family$family, + penalty = penalty, + alg.logistic = alg.logistic, + screen = screen, + eval.metric = eval.metric, + ncores = ncores, + alpha = alpha, + nfolds = nfolds, + nlambda = nlambda) if (!is.matrix(newX)) { - newX = model.matrix(~ ., newX) + newX <- model.matrix(~ ., newX) # Remove intercept that was added. - newX = newX[, -1] + newX <- newX[, -1] } - newX = bigmemory::as.big.matrix(newX) + newX <- bigmemory::as.big.matrix(newX) pred <- predict(fit, newX, type = "response") fit <- list(object = fit) @@ -119,32 +109,19 @@ SL.biglasso <- return(out) } -#' @title Prediction wrapper for SL.biglasso -#' -#' @description Prediction wrapper for SL.biglasso objects. -#' -#' @param object SL.kernlab object -#' @param newdata Dataframe to generate predictions -#' @param ... Unused additional arguments -#' -#' @importFrom bigmemory as.big.matrix -#' -#' @seealso \code{\link{SL.biglasso}} \code{\link[biglasso]{biglasso}} -#' \code{\link[biglasso]{predict.biglasso}} -#' -#' @export -predict.SL.biglasso <- function(object, newdata, - ...) { +#' @exportS3Method predict SL.biglasso +#' @rdname SL.biglasso +predict.SL.biglasso <- function(object, newdata, ...) { .SL.require("biglasso") .SL.require("bigmemory") if (!is.matrix(newdata)) { - newdata = model.matrix(~ ., newdata) + newdata <- model.matrix(~ ., newdata) # Remove intercept that was added. - newdata = newdata[, -1] + newdata <- newdata[, -1] } - newdata = bigmemory::as.big.matrix(newdata) + newdata <- bigmemory::as.big.matrix(newdata) # Binomial and gaussian prediction is the same. pred <- predict(object$object, newdata, type = "response") diff --git a/R/SL.caret.R b/R/SL.caret.R index ff8c86c..14ad3fa 100644 --- a/R/SL.caret.R +++ b/R/SL.caret.R @@ -1,14 +1,34 @@ -SL.caret <- function(Y, X, newX, family, obsWeights, method = "rf", tuneLength = 3, trControl = caret::trainControl(method = "cv", number = 10, verboseIter = TRUE), metric = ifelse(family$family == 'gaussian', 'RMSE', 'Accuracy'), ...) { +#' SL wrapper for machine learning learner using `caret` +#' +#' Support machine learning via the \pkg{caret} package. +#' +#' @inheritParams SL.template +#' @inheritParams predict.SL.template +#' @inheritParams SL.glm +#' @param method,tuneLength,trControl,metric arguments passed to `caret::train()` +#' +#' @examples +#' # how to change to a different method: +#' SL.caret.rpart <- function(..., method = "rpart") { +#' SL.caret(..., method = method) +#' } + +#' @export +SL.caret <- function(Y, X, newX = X, family = gaussian(), obsWeights = NULL, method = "rf", tuneLength = 3, + trControl = caret::trainControl(method = "cv", number = 10, verboseIter = TRUE), + metric = ifelse(family$family == 'gaussian', 'RMSE', 'Accuracy'), ...) { .SL.require('caret') if (family$family == "gaussian") { - fit.train <- caret::train(x = X, y = Y, weights = obsWeights, metric = metric, method = method, tuneLength = tuneLength, trControl = trControl) + fit.train <- caret::train(x = X, y = Y, weights = obsWeights, metric = metric, + method = method, tuneLength = tuneLength, trControl = trControl) pred <- predict(fit.train, newdata = newX, type = "raw") } if (family$family == "binomial") { # outcome must be factor, and have real labels Y.f <- as.factor(Y) levels(Y.f) <- c("A0", "A1") - fit.train <- caret::train(x = X, y = Y.f, weights = obsWeights, metric = metric, method = method, tuneLength = tuneLength, trControl = trControl) + fit.train <- caret::train(x = X, y = Y.f, weights = obsWeights, metric = metric, + method = method, tuneLength = tuneLength, trControl = trControl) pred <- predict(fit.train, newdata = newX, type = "prob")[, 2] } fit <- list(object = fit.train) @@ -17,17 +37,19 @@ SL.caret <- function(Y, X, newX, family, obsWeights, method = "rf", tuneLength = return(out) } +#' @exportS3Method predict SL.caret predict.SL.caret <- function(object, newdata, ...) { .SL.require('caret') if (object$object$modelType == "Regression") { pred <- predict(object$object, newdata = newdata, type = "raw") } else if (object$object$modelType == "Classification") { - pred <- predict(object$object, newdata = newdata, type = "prob")[, 2] + pred <- predict(object$object, newdata = newdata, type = "prob")[, 2L] } return(pred) } # how to change to a different method: +#' @export SL.caret.rpart <- function(..., method = "rpart") { SL.caret(..., method = method) } diff --git a/R/SL.cforest.R b/R/SL.cforest.R index 79c5221..809a7f2 100644 --- a/R/SL.cforest.R +++ b/R/SL.cforest.R @@ -1,24 +1,16 @@ -#' cforest {party} +#' SL wrapper for `party::cforest()` #' -#' These defaults emulate cforest_unbiased() but allow customization. +#' These defaults emulate `party::cforest_unbiased()` but allow customization. #' -#' @param Y Outcome variable -#' @param X Covariate dataframe -#' @param newX Optional dataframe to predict the outcome -#' @param family "gaussian" for regression, "binomial" for binary -#' classification -#' @param obsWeights Optional observation-level weights (supported but not tested) -#' @param id Optional id to group observations from the same unit (not used -#' currently). -#' @param ntree Number of trees -#' @param mtry Number of randomly selected features per node -#' @param mincriterion See ?cforest_control -#' @param teststat See ?cforest_control -#' @param testtype See ?cforest_control -#' @param replace See ?cforest_control -#' @param fraction See ?cforest_control -#' @param ... Remaining arguments (unused) -SL.cforest <- function(Y, X, newX, family, obsWeights, id, ntree = 1000, +#' @inheritParams SL.template +#' @inheritParams predict.SL.template +#' @inheritParams SL.glm +#' @param ntree Number of trees. +#' @param mtry Number of randomly selected features per node. +#' @param mincriterion,teststat,testtype,replace,fraction Arguments passed to `party::cforest_control()`. + +#' @export +SL.cforest <- function(Y, X, newX = X, family = gaussian(), obsWeights = NULL, ntree = 1000, mtry = max(floor(ncol(X) / 3), 1), mincriterion = 0, teststat = "quad", testtype = "Univ", replace = F, fraction = 0.632, ...) { @@ -37,9 +29,10 @@ SL.cforest <- function(Y, X, newX, family, obsWeights, id, ntree = 1000, return(out) } -# +#' @exportS3Method predict SL.cforest +#' @rdname SL.cforest predict.SL.cforest <- function(object, newdata, ...) { .SL.require('party') - pred <- predict(object = object$object, newdata = newdata) - return(pred) + + predict(object = object$object, newdata = newdata) } \ No newline at end of file diff --git a/R/SL.earth.R b/R/SL.earth.R index b45c967..49ddc05 100644 --- a/R/SL.earth.R +++ b/R/SL.earth.R @@ -1,11 +1,25 @@ -# earth {earth} -SL.earth <- function(Y, X, newX, family, obsWeights, id, degree = 2, penalty = 3, nk = max(21, 2*ncol(X) + 1), pmethod = "backward", nfold = 0, ncross = 1, minspan = 0, endspan = 0,...) { +#' SL wrapper for `earth::earth()` +#' +#' @inheritParams SL.template +#' @inheritParams predict.SL.template +#' @inheritParams SL.glm +#' @param degree,penalty,nk,pmethod,nfold,ncross,minspan,endspan Arguments passed to `earth::earth()`. + +#' @export +SL.earth <- function(Y, X, newX = X, family = gaussian(), degree = 2, penalty = 3, + nk = max(21, 2*ncol(X) + 1), pmethod = "backward", nfold = 0, + ncross = 1, minspan = 0, endspan = 0, ...) { .SL.require('earth') + if(family$family == "gaussian") { - fit.earth <- earth::earth(x = X, y = Y, degree = degree, nk = nk, penalty = penalty, pmethod = pmethod, nfold = nfold, ncross = ncross, minspan = minspan, endspan = endspan) + fit.earth <- earth::earth(x = X, y = Y, degree = degree, nk = nk, penalty = penalty, + pmethod = pmethod, nfold = nfold, ncross = ncross, + minspan = minspan, endspan = endspan) } if(family$family == "binomial") { - fit.earth <- earth::earth(x = X, y = Y, degree = degree, nk = nk, penalty = penalty, pmethod = pmethod, nfold = nfold, ncross = ncross, minspan = minspan, endspan = endspan, glm = list(family = binomial)) + fit.earth <- earth::earth(x = X, y = Y, degree = degree, nk = nk, penalty = penalty, + pmethod = pmethod, nfold = nfold, ncross = ncross, + minspan = minspan, endspan = endspan, glm = list(family = binomial)) } pred <- predict(fit.earth, newdata = newX, type = "response") fit <- list(object = fit.earth) @@ -14,9 +28,10 @@ SL.earth <- function(Y, X, newX, family, obsWeights, id, degree = 2, penalty = 3 return(out) } -# +#' @exportS3Method predict SL.earth +#' @rdname SL.earth predict.SL.earth <- function(object, newdata,...) { .SL.require('earth') - pred <- predict(object$object, newdata = newdata, type = "response") - return(pred) + + predict(object$object, newdata = newdata, type = "response") } diff --git a/R/SL.gam.R b/R/SL.gam.R index 36528e4..78ca3dc 100644 --- a/R/SL.gam.R +++ b/R/SL.gam.R @@ -1,27 +1,49 @@ -## gam{gam} -## generalized additive models (degree = 2) -# functions considers any variable with more than 4 (change with cts.num) unique values to be continuous and able to be in smoothing splines. -# easy to add additional algorithms with different degrees -# SL.gam.3 <- function(...,deg.gam = 3) SL.gam(..., deg.gam = deg.gam) +#' SL wrapper for GAMs +#' +#' Wrapper for generalized additivie models (GAMs) using `gam::gam()`. +#' +#' @inheritParams SL.template +#' @inheritParams predict.SL.template +#' @inheritParams SL.glm +#' @param deg.gam Degree of smoothing (passed to the `df` argument of `gam::s()`). +#' @param cts.num Number of levels required for a variable to be consider continuous (and therefore given a smoothing spline). +#' +#' @examples +#' # easy to add additional algorithms with different degrees +# SL.gam.3 <- function(...,deg.gam = 3) SL.gam(..., deg.gam = deg.gam) -SL.gam <- function(Y, X, newX, family, obsWeights, deg.gam = 2, cts.num = 4, ...) { +#' @export +## generalized additive models (degree = 2) +# functions considers any variable with more than 4 (change with cts.num) unique values to be continuous and able to be in smoothing splines. +SL.gam <- function(Y, X, newX = X, family = gaussian(), obsWeights = NULL, deg.gam = 2, cts.num = 4, ...) { # requireNamespace() alone does not work. requireNamespace, unlike require(), does not attached the package and allow the formula to parse correctly with s(), gam::s() doesn't work, is not recognized as a special function - if(!requireNamespace('gam')) {stop("SL.gam requires the gam package, but it isn't available")} - if (!"package:gam" %in% search()) attachNamespace('gam') # check if gam attached, if not, then attached - if("mgcv" %in% loadedNamespaces()) warning("mgcv and gam packages are both in use. You might see an error because both packages use the same function names.") + .SL.require('gam') + # require("gam") + + if ("mgcv" %in% loadedNamespaces()) { + warning("mgcv and gam packages are both in use. You might see an error because both packages use the same function names.") + } + # create the formula for gam with a spline for each continuous variable cts.x <- apply(X, 2, function(x) (length(unique(x)) > cts.num)) - if (sum(!cts.x) > 0) { + + if (sum(!cts.x) > 0) { gam.model <- as.formula(paste("Y~", paste(paste("s(", colnames(X[, cts.x, drop = FALSE]), ",", deg.gam,")", sep=""), collapse = "+"), "+", paste(colnames(X[, !cts.x, drop=FALSE]), collapse = "+"))) - } else { + } + else { gam.model <- as.formula(paste("Y~", paste(paste("s(", colnames(X[, cts.x, drop = FALSE]), ",", deg.gam, ")", sep=""), collapse = "+"))) } + # fix for when all variables are binomial if (sum(!cts.x) == length(cts.x)) { gam.model <- as.formula(paste("Y~", paste(colnames(X), collapse = "+"), sep = "")) } - fit.gam <- gam::gam(gam.model, data = X, family = family, control = gam::gam.control(maxit = 50, bf.maxit = 50), weights = obsWeights) - if(packageVersion('gam') >= "1.15") { + + fit.gam <- gam::gam(gam.model, data = X, family = family, + control = gam::gam.control(maxit = 50, bf.maxit = 50), + weights = obsWeights) + + if (utils::packageVersion('gam') >= "1.15") { pred <- gam::predict.Gam(fit.gam, newdata = newX, type = "response") # updated gam class in version 1.15 } else { stop("This SL.gam wrapper requires gam version >= 1.15, please update the gam package with 'update.packages('gam')'") @@ -32,14 +54,16 @@ SL.gam <- function(Y, X, newX, family, obsWeights, deg.gam = 2, cts.num = 4, ... return(out) } +#' @exportS3Method predict SL.gam +#' @rdname SL.gam predict.SL.gam <- function(object, newdata, ...){ .SL.require('gam') - if(packageVersion('gam') >= "1.15") { + if(utils::packageVersion('gam') >= "1.15") { pred <- gam::predict.Gam(object = object$object, newdata = newdata, type = "response") # updated gam class in version 1.15 } else { stop("This SL.gam wrapper requires gam version >= 1.15, please update the gam package with 'update.packages('gam')'") } - + return(pred) } diff --git a/R/SL.gbm.R b/R/SL.gbm.R index fb3a6e8..65e036a 100644 --- a/R/SL.gbm.R +++ b/R/SL.gbm.R @@ -2,26 +2,30 @@ # generalized boosting algorithm # can alter number ot trees in intial fit # also alter interaction depth (SL.gbm.1 and SL.gbm.2) -SL.gbm <- function(Y, X, newX, family, obsWeights, gbm.trees = 10000, interaction.depth = 2, shrinkage = 0.001, bag.fraction = 0.5, cv.folds = 5, n.minobsinnode = 10, n.cores = NULL, ...) { +#' @export +SL.gbm <- function(Y, X, newX = X, family = gaussian(), obsWeights = NULL, gbm.trees = 10000, + interaction.depth = 2, shrinkage = 0.001, bag.fraction = 0.5, cv.folds = 5, + n.minobsinnode = 10, n.cores = NULL, ...) { .SL.require('gbm') - gbm.model <- as.formula(paste("Y~", paste(colnames(X), collapse="+"))) - if(family$family == "gaussian") { - fit.gbm <- gbm::gbm(formula = gbm.model, data = X, distribution = "gaussian", n.trees = gbm.trees, interaction.depth = interaction.depth, shrinkage = shrinkage, bag.fraction = bag.fraction, n.minobsinnode = n.minobsinnode, cv.folds = cv.folds, keep.data = TRUE, weights = obsWeights, verbose = FALSE, n.cores = n.cores) - } - if(family$family == "binomial") { - fit.gbm <- gbm::gbm(formula = gbm.model, data = X, distribution = "bernoulli", n.trees = gbm.trees, interaction.depth = interaction.depth, shrinkage = shrinkage, bag.fraction = bag.fraction, n.minobsinnode = n.minobsinnode, cv.folds = cv.folds, keep.data = TRUE, weights = obsWeights, verbose = FALSE, n.cores = n.cores) - } - best.iter <- gbm::gbm.perf(fit.gbm, method = "cv", plot.it = FALSE) - pred <- predict(fit.gbm, newdata = newX, n.trees = best.iter, type = "response") - fit <- list(object = fit.gbm, n.trees = best.iter) - out <- list(pred = pred, fit = fit) - class(out$fit) <- c("SL.gbm") - return(out) + gbm.model <- reformulate(colnames(X), "Y") + if (family$family == "gaussian") { + fit.gbm <- gbm::gbm(formula = gbm.model, data = X, distribution = "gaussian", n.trees = gbm.trees, interaction.depth = interaction.depth, shrinkage = shrinkage, bag.fraction = bag.fraction, n.minobsinnode = n.minobsinnode, cv.folds = cv.folds, keep.data = TRUE, weights = obsWeights, verbose = FALSE, n.cores = n.cores) + } + else if (family$family == "binomial") { + fit.gbm <- gbm::gbm(formula = gbm.model, data = X, distribution = "bernoulli", n.trees = gbm.trees, interaction.depth = interaction.depth, shrinkage = shrinkage, bag.fraction = bag.fraction, n.minobsinnode = n.minobsinnode, cv.folds = cv.folds, keep.data = TRUE, weights = obsWeights, verbose = FALSE, n.cores = n.cores) + } + best.iter <- gbm::gbm.perf(fit.gbm, method = "cv", plot.it = FALSE) + pred <- predict(fit.gbm, newdata = newX, n.trees = best.iter, type = "response") + fit <- list(object = fit.gbm, n.trees = best.iter) + out <- list(pred = pred, fit = fit) + class(out$fit) <- c("SL.gbm") + return(out) } -predict.SL.gbm <- function(object, newdata,...){ +#' @exportS3Method predict SL.gbm +predict.SL.gbm <- function(object, newdata,...) { .SL.require('gbm') - pred <- predict(object$object, newdata = newdata, n.trees = object$n.trees, type = "response") - return(pred) + + predict(object$object, newdata = newdata, n.trees = object$n.trees, type = "response") } diff --git a/R/SL.glm.R b/R/SL.glm.R index c7a6c7a..6af7edd 100755 --- a/R/SL.glm.R +++ b/R/SL.glm.R @@ -1,20 +1,25 @@ -#' @title Wrapper for glm -#' @description Wrapper for generalized linear models via glm(). +#' SL wrapper for `glm()` #' -#' Note that for outcomes bounded by [0, 1] the binomial family can be used in -#' addition to gaussian. +#' `SL.glm()` is a wrapper for generalized linear models (GLMs) fit with [glm()]. `SL.glm.interaction()` is the same but automatically includes all 2-way interactions between predictors. `SL.lm()` is a wrapper for linear models fit with [lm()]. `SL.mean()` is a wrapper for an intercept-only model. #' -#' @param Y Outcome variable -#' @param X Training dataframe -#' @param newX Test dataframe -#' @param family Gaussian or binomial -#' @param obsWeights Observation-level weights -#' @param model Whether to save model.matrix of data in fit object. Set to FALSE +#' @inheritParams SL.template +#' @inheritParams predict.SL.template +#' @param model Whether to save `model.matrix` of data in fit object. Set to `FALSE` #' to save memory. -#' @param ... Any remaining arguments, not used. +#' @param ... Ignored. #' -#' @examples +#' @details +#' Note that for outcomes bounded by \[0, 1\] the binomial family can be used in +#' addition to gaussian. +#' +#' @references +#' Fox, J. (2015). Applied regression analysis and generalized linear models. +#' Sage Publications. +#' +#' @seealso \code{\link{predict.SL.glm}} \code{\link[stats]{glm}} +#' \code{\link[stats]{predict.glm}} \code{\link{SL.speedglm}} #' +#' @examples #' data(Boston, package = "MASS") #' Y = Boston$medv #' # Remove outcome from covariate dataframe. @@ -22,21 +27,17 @@ #' #' set.seed(1) #' -#' sl = SuperLearner(Y, X, family = gaussian(), -#' SL.library = c("SL.mean", "SL.glm")) +#' sl <- SuperLearner(Y, X, family = gaussian(), +#' SL.library = c("SL.mean", "SL.glm", +#' "SL.glm.interaction")) #' #' print(sl) -#' -#' @references -#' -#' Fox, J. (2015). Applied regression analysis and generalized linear models. -#' Sage Publications. -#' -#' @seealso \code{\link{predict.SL.glm}} \code{\link[stats]{glm}} -#' \code{\link[stats]{predict.glm}} \code{\link{SL.speedglm}} -#' + #' @export -SL.glm <- function(Y, X, newX, family, obsWeights, model = TRUE, ...) { +SL.glm <- function(Y, X, newX = X, family = gaussian(), obsWeights = NULL, model = TRUE, ...) { + if (is.null(obsWeights)) { + obsWeights <- rep.int(1, length(Y)) + } # X must be a dataframe, not a matrix. if (is.matrix(X)) { @@ -58,34 +59,20 @@ SL.glm <- function(Y, X, newX, family, obsWeights, model = TRUE, ...) { return(out) } -#' @title Prediction for SL.glm -#' @description Prediction for SL.glm -#' -#' @param object SL.glm object -#' @param newdata Dataframe to generate predictions -#' @param ... Unused additional arguments -#' -#' @seealso \code{\link{SL.glm}} \code{\link[stats]{glm}} -#' \code{\link[stats]{predict.glm}} \code{\link{SL.speedglm}} -#' #' @export -predict.SL.glm <- function(object, newdata, ...) { - # newdata must be a dataframe, not a matrix. - if (is.matrix(newdata)) { - newdata = as.data.frame(newdata) +#' @rdname SL.glm +SL.glm.interaction <- function(Y, X, newX = X, family = gaussian(), obsWeights = NULL, model = TRUE, ...) { + if (is.null(obsWeights)) { + obsWeights <- rep.int(1, length(Y)) } - pred <- predict(object = object$object, newdata = newdata, type = "response") - pred -} - -SL.glm.interaction <- function(Y, X, newX, family, obsWeights, ...) { # X must be a dataframe, not a matrix. if (is.matrix(X)) { X = as.data.frame(X) } - fit.glm <- glm(Y ~ .^2, data = X, family = family, weights = obsWeights) + fit.glm <- glm(Y ~ .^2, data = X, family = family, weights = obsWeights, + model = model) # newX must be a dataframe, not a matrix. if (is.matrix(newX)) { @@ -98,3 +85,88 @@ SL.glm.interaction <- function(Y, X, newX, family, obsWeights, ...) { out <- list(pred = pred, fit = fit) return(out) } + +#' @export +#' @rdname SL.glm +SL.lm <- function(Y, X, newX = X, family = gaussian(), obsWeights = NULL, model = TRUE, ...) { + if (is.null(obsWeights)) { + obsWeights <- rep.int(1, length(Y)) + } + + # X must be a dataframe, not a matrix. + if (is.matrix(X)) { + X = as.data.frame(X) + } + + # qr element is needed in order to predict(), unless we extract coefficents. + fit <- stats::lm(Y ~ ., data = X, weights = obsWeights, model = model) + + if (is.matrix(newX)) { + newX = as.data.frame(newX) + } + + pred <- predict(fit, newdata = newX, type = "response") + + # For binomial family restrict predicted probability to [0, 1]. + if (family$family == "binomial") { + pred = pmin(pmax(pred, 0), 1) + } + + fit <- list(object = fit, family = family) + class(fit) <- "SL.lm" + + out <- list(pred = pred, fit = fit) + + return(out) +} + +#' @export +#' @rdname SL.glm +SL.mean <- function(Y, X, newX = X, family = gaussian(), obsWeights = NULL, model = TRUE, ...) { + if (is.null(obsWeights)) { + obsWeights <- rep.int(1, length(Y)) + } + + meanY <- weighted.mean(Y, w = obsWeights) + + pred <- rep.int(meanY, times = nrow(newX)) + fit <- list(object = meanY) + out <- list(pred = pred, fit = fit) + class(out$fit) <- c("SL.mean") + + return(out) +} + +#' @exportS3Method predict SL.glm +#' @rdname SL.glm +predict.SL.glm <- function(object, newdata, ...) { + # newdata must be a dataframe, not a matrix. + if (is.matrix(newdata)) { + newdata = as.data.frame(newdata) + } + + predict(object = object$object, newdata = newdata, type = "response") +} + +#' @exportS3Method predict SL.lm +predict.SL.lm <- function(object, newdata, ...) { + + # newdata must be a dataframe, not a matrix. + if (is.matrix(newdata)) { + newdata = as.data.frame(newdata) + } + + pred <- predict(object = object$object, newdata = newdata, type = "response") + + # For binomial family restrict predicted probability to [0, 1]. + if (object$family$family == "binomial") { + pred = pmin(pmax(pred, 0), 1) + } + + pred +} + +#' @exportS3Method predict SL.mean +predict.SL.mean <- function(object, newdata, ...) { + rep.int(object$object, times = nrow(newdata)) +} diff --git a/R/SL.glmnet.R b/R/SL.glmnet.R index 4116699..10c17d0 100644 --- a/R/SL.glmnet.R +++ b/R/SL.glmnet.R @@ -1,4 +1,4 @@ -#' @title Elastic net regression, including lasso and ridge +#' SL wrapper for elastic net regression, including lasso and ridge #' #' @description #' Penalized regression using elastic net. Alpha = 0 corresponds to ridge @@ -7,17 +7,10 @@ #' See \code{vignette("glmnet_beta", package = "glmnet")} for a nice tutorial on #' glmnet. #' -#' @param Y Outcome variable -#' @param X Covariate dataframe -#' @param newX Dataframe to predict the outcome -#' @param obsWeights Optional observation-level weights -#' @param id Optional id to group observations from the same unit (not used -#' currently). -#' @param family "gaussian" for regression, "binomial" for binary -#' classification. Untested options: "multinomial" for multiple classification -#' or "mgaussian" for multiple response, "poisson" for non-negative outcome -#' with proportional mean and variance, "cox". -#' @param alpha Elastic net mixing parameter, range [0, 1]. 0 = ridge regression +#' @inheritParams SL.template +#' @inheritParams predict.SL.template +#' @inheritParams SL.glm +#' @param alpha Elastic net mixing parameter, range \[0, 1\]. 0 = ridge regression #' and 1 = lasso. #' @param nfolds Number of folds for internal cross-validation to optimize lambda. #' @param nlambda Number of lambda values to check, recommended to be 100 or more. @@ -27,7 +20,11 @@ #' standard-error rule which chooses a higher penalty with performance within #' one standard error of the minimum (see Breiman et al. 1984 on CART for #' background). -#' @param ... Any additional arguments are passed through to cv.glmnet. +#' @param ... Any additional arguments are passed through to `cv.glmnet()`. +#' @param remove_extra_cols Remove any extra columns in the new data that were +#' not part of the original model. +#' @param add_missing_cols Add any columns from original data that do not exist +#' in the new data, and set values to 0. #' #' @examples #' @@ -65,7 +62,7 @@ #' #' @seealso \code{\link{predict.SL.glmnet}} \code{\link[glmnet]{cv.glmnet}} #' \code{\link[glmnet]{glmnet}} -#' + #' @export SL.glmnet <- function(Y, X, newX, family, obsWeights, id, alpha = 1, nfolds = 10, nlambda = 100, useMin = TRUE, @@ -102,24 +99,11 @@ SL.glmnet <- function(Y, X, newX, family, obsWeights, id, return(out) } -#' @title Prediction for an SL.glmnet object -#' -#' @description Prediction for the glmnet wrapper. -#' -#' @param object Result object from SL.glmnet -#' @param newdata Dataframe or matrix that will generate predictions. -#' @param remove_extra_cols Remove any extra columns in the new data that were -#' not part of the original model. -#' @param add_missing_cols Add any columns from original data that do not exist -#' in the new data, and set values to 0. -#' @param ... Any additional arguments (not used). -#' -#' @seealso \code{\link{SL.glmnet}} -#' -#' @export +#' @exportS3Method predict SL.glmnet +#' @rdname SL.glmnet predict.SL.glmnet <- function(object, newdata, - remove_extra_cols = T, - add_missing_cols = T, + remove_extra_cols = TRUE, + add_missing_cols = TRUE, ...) { .SL.require('glmnet') @@ -135,7 +119,7 @@ predict.SL.glmnet <- function(object, newdata, extra_cols = setdiff(colnames(newdata), original_cols) if (length(extra_cols) > 0) { warning(paste("Removing extra columns in prediction data:", - paste(extra_cols, collapse = ", "))) + toString(extra_cols))) newdata = newdata[, !colnames(newdata) %in% extra_cols, drop = FALSE] } @@ -146,7 +130,7 @@ predict.SL.glmnet <- function(object, newdata, missing_cols = setdiff(original_cols, colnames(newdata)) if (length(missing_cols) > 0) { warning(paste("Adding missing columns in prediction data:", - paste(missing_cols, collapse = ", "))) + toString(missing_cols))) new_cols = matrix(0, nrow = nrow(newdata), ncol = length(missing_cols)) colnames(new_cols) = missing_cols @@ -159,8 +143,7 @@ predict.SL.glmnet <- function(object, newdata, # If we predict with the cv.glmnet object we can specify lambda using a # string. - pred <- predict(object$object, newx = newdata, type = "response", - s = ifelse(object$useMin, "lambda.min", "lambda.1se")) - return(pred) + predict(object$object, newx = newdata, type = "response", + s = ifelse(object$useMin, "lambda.min", "lambda.1se")) } diff --git a/R/SL.ipredbagg.R b/R/SL.ipredbagg.R index c988f17..b243b56 100644 --- a/R/SL.ipredbagg.R +++ b/R/SL.ipredbagg.R @@ -1,13 +1,16 @@ # bagging {ipred} -# +# -SL.ipredbagg <- function(Y, X, newX, family, nbagg = 100, control = rpart::rpart.control(xval = 0, maxsurrogate = 0, minsplit = 20, cp = 0.01, maxdepth = 30), ...){ +#' @export +SL.ipredbagg <- function(Y, X, newX = X, family = gaussian(), nbagg = 100, + control = rpart::rpart.control(xval = 0, maxsurrogate = 0, minsplit = 20, cp = 0.01, maxdepth = 30), + ...) { .SL.require('ipred') - if(family$family == "gaussian"){ + if (family$family == "gaussian"){ fit.bag <- ipred::ipredbagg(y=Y, X = X, nbagg = nbagg, control = control) pred <- predict(fit.bag, newdata = newX, aggregation = "average") } - if(family$family == "binomial"){ + if (family$family == "binomial"){ fit.bag <- ipred::ipredbagg(y = as.factor(Y), X = X, nbagg = nbagg, control = control) pred <- predict(fit.bag, newdata = newX, type = "prob", aggregation = "average")[, 2] } @@ -17,15 +20,14 @@ SL.ipredbagg <- function(Y, X, newX, family, nbagg = 100, control = rpart::rpart return(out) } -# -predict.SL.ipredbagg <- function(object, newdata, family, X=NULL, Y=NULL,...) { +#' @exportS3Method predict SL.ipredbagg +predict.SL.ipredbagg <- function(object, newdata, family, ...) { .SL.require('ipred') - if(family$family=="gaussian"){ + if (family$family == "gaussian") { pred <- predict(object = object$object, newdata = newdata, aggregation = "average") } - if(family$family=="binomial"){ - pred <- predict(object = object$object, newdata = newdata, type = "prob", aggregation = "average")[, 2] + else if (family$family == "binomial"){ + pred <- predict(object = object$object, newdata = newdata, type = "prob", aggregation = "average")[, 2L] } return(pred) } - diff --git a/R/SL.kernelKnn.R b/R/SL.kernelKnn.R index 4c6a31e..22a7d72 100644 --- a/R/SL.kernelKnn.R +++ b/R/SL.kernelKnn.R @@ -1,11 +1,11 @@ -#' @title SL wrapper for KernelKNN +#' SL wrapper for KernelKNN #' -#' @description Wrapper for a configurable implementation of k-nearest +#' Wrapper for a configurable implementation of k-nearest #' neighbors. Supports both binomial and gaussian outcome distributions. -#' @param Y Outcome variable -#' @param X Training dataframe -#' @param newX Test dataframe -#' @param family Gaussian or binomial +#' +#' @inheritParams SL.template +#' @inheritParams predict.SL.template +#' @inheritParams SL.glm #' @param k Number of nearest neighbors to use #' @param method Distance method, can be 'euclidean' (default), 'manhattan', #' 'chebyshev', 'canberra', 'braycurtis', 'pearson_correlation', @@ -20,12 +20,8 @@ #' k-nearest-neighbors will be removed (can be thought as outlier removal). #' @param h the bandwidth, applicable if the weights_function is not NULL. #' Defaults to 1.0. -#' @param ... Any additional parameters, not currently passed through. -#' @return List with predictions and the original training data & -#' hyperparameters. #' #' @examples -#' #' # Load a test dataset. #' data(PimaIndiansDiabetes2, package = "mlbench") #' @@ -42,20 +38,20 @@ #' sl = SuperLearner(Y_bin, X, family = binomial(), #' SL.library = c("SL.mean", "SL.kernelKnn")) #' sl -#' + #' @export -SL.kernelKnn = function(Y, X, newX, family, - k = 10, - method = "euclidean", - weights_function = NULL, - extrema = F, - h = 1, - ...) { +SL.kernelKnn <- function(Y, X, newX = X, family = gaussian(), + k = 10, + method = "euclidean", + weights_function = NULL, + extrema = FALSE, + h = 1, + ...) { .SL.require("KernelKnn") if (family$family != "gaussian" && min(Y) == 0) { # Make sure that Y starts at 1 rather than 0. - Y = Y + 1 + Y <- Y + 1 } if (family$family == "gaussian") { @@ -77,8 +73,8 @@ SL.kernelKnn = function(Y, X, newX, family, # Save configuration plus original X and Y data to the fit object. fit = list(k = k, method = method, weights_function = weights_function, - extrema = extrema, h = h, - X = X, Y = Y, family = family) + extrema = extrema, h = h, + X = X, Y = Y, family = family) out = list(pred = pred, fit = fit) @@ -86,11 +82,8 @@ SL.kernelKnn = function(Y, X, newX, family, return(out) } -#' Prediction for SL.kernelKnn -#' @param object SL.kernelKnn object -#' @param newdata Dataframe to generate predictions -#' @param ... Unused additional arguments -#' @export +#' @exportS3Method predict SL.kernelKnn +#' @rdname SL.kernelKnn predict.SL.kernelKnn <- function(object, newdata, ...) { .SL.require("KernelKnn") @@ -112,7 +105,7 @@ predict.SL.kernelKnn <- function(object, newdata, ...) { if (object$family$family == "binomial") { # Pred is a two-column matrix, where column 1 is Pr(Y = 0), 2 is Pr(Y = 1) - pred = pred[, 2] + pred = pred[, 2L] } return(pred) diff --git a/R/SL.knn.R b/R/SL.knn.R index fabfb2f..bc0d31a 100644 --- a/R/SL.knn.R +++ b/R/SL.knn.R @@ -3,9 +3,10 @@ # to create additional algorithms with different values of k, for example k=20 # SL.knn20 <- function(..., k = 20) SL.knn(...,k = k) +#' @export SL.knn <- function(Y, X, newX, family, k = 10, ...) { .SL.require('class') - if(family$family=="gaussian") { + if(family$family=="gaussian") { stop("SL.knn only available for family = binomial()") } fit.knn <- class::knn(train = X, test = newX, k = k, cl = Y, prob = TRUE) @@ -17,9 +18,10 @@ SL.knn <- function(Y, X, newX, family, k = 10, ...) { } # will need original Y and X data for this +#' @exportS3Method predict SL.knn predict.SL.knn <- function(object, newdata, X, Y, ...){ .SL.require('class') fit.knn <- class::knn(train = X, test = newdata, k = object$k, cl = Y, prob = TRUE) - pred <- (as.numeric(fit.knn) - 1) * attr(fit.knn, "prob") + (1 - (as.numeric(fit.knn) - 1)) * (1 - attr(fit.knn, "prob")) - return(pred) + + (as.numeric(fit.knn) - 1) * attr(fit.knn, "prob") + (1 - (as.numeric(fit.knn) - 1)) * (1 - attr(fit.knn, "prob")) } \ No newline at end of file diff --git a/R/SL.ksvm.R b/R/SL.ksvm.R index df06e62..56d39a9 100644 --- a/R/SL.ksvm.R +++ b/R/SL.ksvm.R @@ -1,11 +1,10 @@ -#' @title Wrapper for Kernlab's SVM algorithm +#' SL wrapper for Kernlab's SVM algorithm #' -#' @description Wrapper for Kernlab's support vector machine algorithm. +#' Wrapper for Kernlab's support vector machine algorithm. #' -#' @param Y Outcome variable -#' @param X Training dataframe -#' @param newX Test dataframe -#' @param family Gaussian or binomial +#' @inheritParams SL.template +#' @inheritParams predict.SL.template +#' @inheritParams SL.glm #' @param type ksvm can be used for classification , for regression, or for #' novelty detection. Depending on whether y is a factor or not, the default #' setting for type is C-svc or eps-svr, respectively, but can be overwritten @@ -43,13 +42,10 @@ #' @param cache cache memory in MB (default 40) #' @param tol tolerance of termination criterion (default: 0.001) #' @param shrinking option whether to use the shrinking-heuristics (default: TRUE) -#' @param ... Any additional parameters, not currently passed through. -#' -#' @return List with predictions and the original training data & -#' hyperparameters. +#' @param coupler Coupling method used in the multiclass case, can be one of +#' minpair or pkpd (see kernlab package for details). For future usage. #' #' @references -#' #' Hsu, C. W., Chang, C. C., & Lin, C. J. (2016). A practical guide to support #' vector classification. \url{http://www.csie.ntu.edu.tw/~cjlin/papers/guide/guide.pdf} #' @@ -63,7 +59,6 @@ #' 1-20. #' #' @examples -#' #' data(Boston, package = "MASS") #' Y = Boston$medv #' # Remove outcome from covariate dataframe. @@ -78,29 +73,27 @@ #' pred = predict(sl, X) #' summary(pred$pred) #' -#' @importFrom kernlab predict #' #' @seealso \code{\link{predict.SL.ksvm}} \code{\link[kernlab]{ksvm}} #' \code{\link[kernlab]{predict.ksvm}} #' -#' @encoding utf-8 -#' + #' @export -SL.ksvm = function(Y, X, newX, family, - type = NULL, - kernel = "rbfdot", - kpar = "automatic", - scaled = T, - C = 1, - nu = 0.2, - epsilon = 0.1, - cross = 0, - prob.model = family$family == "binomial", - class.weights = NULL, - cache = 40, - tol = 0.001, - shrinking = T, - ...) { +SL.ksvm <- function(Y, X, newX = X, family = gaussian(), + type = NULL, + kernel = "rbfdot", + kpar = "automatic", + scaled = TRUE, + C = 1, + nu = 0.2, + epsilon = 0.1, + cross = 0, + prob.model = family$family == "binomial", + class.weights = NULL, + cache = 40, + tol = 0.001, + shrinking = TRUE, + ...) { .SL.require("kernlab") # Make sure X is a matrix rather than a dataframe. @@ -151,20 +144,8 @@ SL.ksvm = function(Y, X, newX, family, return(out) } -#' Prediction for SL.ksvm -#' -#' @param object SL.kernlab object -#' @param newdata Dataframe to generate predictions -#' @param family Gaussian or binomial -#' @param coupler Coupling method used in the multiclass case, can be one of -#' minpair or pkpd (see kernlab package for details). For future usage. -#' @param ... Unused additional arguments -#' -#' @importFrom kernlab predict -#' -#' @seealso \code{\link{SL.ksvm}} \code{\link[kernlab]{ksvm}} \code{\link[kernlab]{predict.ksvm}} -#' -#' @export +#' @exportS3Method predict SL.ksvm +#' @rdname SL.ksvm predict.SL.ksvm <- function(object, newdata, family, coupler = "minpair", ...) { .SL.require("kernlab") @@ -182,7 +163,7 @@ predict.SL.ksvm <- function(object, newdata, family, coupler = "minpair", ...) { } # CK: I could not get this to work using simply predict(). Not sure why. - pred = kernlab::predict(object$object, newdata, predict_type, coupler = coupler) + pred <- kernlab::predict(object$object, newdata, predict_type, coupler = coupler) if (family$family == "binomial") { # Second column is P(Y = 1 | X). diff --git a/R/SL.lda.R b/R/SL.lda.R index 0591f29..59ece6b 100644 --- a/R/SL.lda.R +++ b/R/SL.lda.R @@ -1,28 +1,30 @@ -#' @title SL wrapper for MASS:lda -#' @description Linear discriminant analysis, used for classification. +#' SL wrappers for `MASS::lda()` and `MASS::qda()` #' -#' @param Y Outcome variable -#' @param X Training dataframe -#' @param newX Test dataframe +#' Linear and quadratic discriminant analysis, used for classification. +#' +#' @inheritParams SL.template +#' @inheritParams predict.SL.template +#' @inheritParams SL.glm #' @param family Binomial only, cannot be used for regression. -#' @param obsWeights Observation-level weights -#' @param id Not supported. #' @param verbose If TRUE, display additional output during execution. -# -# Algorithm-specific arguments: -# #' @param prior the prior probabilities of class membership. If unspecified, the -#' class proportions for the training set are used. If present, the -#' probabilities should be specified in the order of the factor levels. -#' @param method "moment" for standard estimators of the mean and variance, +#' class proportions for the training set are used. For `SL.lda()` and `SL.qda()`, if present, the +#' probabilities should be specified in the order of the factor levels. For `predict.SL.lda()`, defaults to the +#' proportions in the training set or what was set in the call to `lda()` or `qda()`. +#' @param method for `SL.lda()` and `SL.qda()`, "moment" for standard estimators of the mean and variance, #' "mle" for MLEs, "mve" to use cov.mve, or "t" for robust estimates based on -#' a t distribution. +#' a t distribution. For `predict.SL.lda()` and `predict.SL.qda()`, this determines how the parameter estimation is handled. With "plug-in" (the default) the usual unbiased parameter estimates are used and +#' assumed to be correct. With "debiased" an unbiased estimator of the log +#' posterior probabilities is used, and with "predictive" the parameter +#' estimates are integrated out using a vague prior. #' @param tol tolerance -#' @param CV If true, returns results (classes and posterior probabilities) for +#' @param CV If `TRUE`, returns results (classes and posterior probabilities) for #' leave-one-out cross-validation. Note that if the prior is estimated, the #' proportions in the whole dataset are used. -#' @param nu degrees of freedom for method = "t". -#' @param ... Any additional arguments, not currently used. +#' @param nu degrees of freedom for `method = "t"`. +#' @param dimen the dimension of the space to be used. If this is less than +#' `min(p, ng-1)`, only the first dimen discriminant components are used (except +#' for `method="predictive"`), and only those dimensions are returned in x. #' #' @examples #' @@ -35,7 +37,8 @@ #' #' # Use only 2 CV folds to speed up example. #' sl = SuperLearner(Y, X, family = binomial(), cvControl = list(V = 2), -#' SL.library = c("SL.mean", "SL.lda")) +#' SL.library = c("SL.mean", "SL.lda", +#' "SL.qda")) #' sl #' #' pred = predict(sl, X) @@ -49,18 +52,15 @@ #' @seealso \code{\link{predict.SL.lda}} \code{\link[MASS]{lda}} #' \code{\link[MASS]{predict.lda}} \code{\link{SL.qda}} #' -#' @importFrom utils capture.output -#' #' @export -SL.lda = - function(Y, X, newX, family, obsWeights = rep(1, nrow(X)), - id = NULL, verbose = F, - prior = as.vector(prop.table(table(Y))), - method = "mle", - tol = 1.0e-4, - CV = F, - nu = 5, - ...) { +SL.lda <- function(Y, X, newX = X, family = binomial(), + verbose = FALSE, + prior = as.vector(prop.table(table(Y))), + method = "mle", + tol = 1.0e-4, + CV = FALSE, + nu = 5, + ...) { .SL.require("MASS") @@ -74,7 +74,7 @@ SL.lda = # X can be a matrix or dataframe. # If method = "t" this will print a lot of unnecessary output, so capture it. - capture.output({ + utils::capture.output({ fit = MASS::lda(x = X, grouping = Y, prior = prior, @@ -95,28 +95,52 @@ SL.lda = return(out) } -#' @title Prediction wrapper for SL.lda -#' -#' @description Prediction wrapper for SL.lda -#' -#' @param object SL.lda object -#' @param newdata Dataframe to generate predictions -#' @param prior The prior probabilities of the classes, by default the -#' proportions in the training set or what was set in the call to lda. -#' @param dimen the dimension of the space to be used. If this is less than -#' min(p, ng-1), only the first dimen discriminant components are used (except -#' for method="predictive"), and only those dimensions are returned in x. -#' @param method This determines how the parameter estimation is handled. With -#' "plug-in" (the default) the usual unbiased parameter estimates are used and -#' assumed to be correct. With "debiased" an unbiased estimator of the log -#' posterior probabilities is used, and with "predictive" the parameter -#' estimates are integrated out using a vague prior. -#' @param ... Unused additional arguments -#' -#' @seealso \code{\link{SL.lda}} \code{\link[MASS]{lda}} -#' \code{\link[MASS]{predict.lda}} -#' #' @export +#' @rdname SL.lda +SL.qda <- function(Y, X, newX = X, family = binomial(), + verbose = FALSE, + prior = as.vector(prop.table(table(Y))), + method = "mle", + tol = 1.0e-4, + CV = FALSE, + nu = 5, + ...) { + + .SL.require("MASS") + + if (family$family != "binomial") { + stop("SL.qda only supports binomial outcomes.") + } + + if (!is.factor(Y)) { + Y = as.factor(Y) + } + + # X can be a matrix or dataframe. + # If method = "t" this will print a lot of unnecessary output, so capture it. + utils::capture.output({ + fit = MASS::qda(x = X, + grouping = Y, + prior = prior, + method = method, + tol = tol, + CV = CV, + nu = nu) + }) + + pred = predict(fit, newX)$posterior + + # $posterior is a two-column matrix; we want P(Y = 1 | X). + pred = pred[, "1"] + + fit = list(object = fit, verbose = verbose) + class(fit) = "SL.qda" + out = list(pred = pred, fit = fit) + return(out) +} + +#' @exportS3Method predict SL.lda +#' @rdname SL.lda predict.SL.lda <- function(object, newdata, prior = object$object$prior, dimen = NULL, @@ -134,3 +158,7 @@ predict.SL.lda <- function(object, newdata, return(pred) } + +#' @exportS3Method predict SL.qda +#' @rdname SL.lda +predict.SL.qda <- predict.SL.lda \ No newline at end of file diff --git a/R/SL.leekasso.R b/R/SL.leekasso.R index 92b826c..5826580 100644 --- a/R/SL.leekasso.R +++ b/R/SL.leekasso.R @@ -1,5 +1,5 @@ -SL.leekasso <- function (Y, X, newX, family, obsWeights, id, ...) -{ +#' @export +SL.leekasso <- function (Y, X, newX = X, family = gaussian(), ...) { .SL.require("sva") # Bioconductor package, but really only need the f.pvalue function, might just replace it with internal function? N <- length(Y) mod <- cbind(rep.int(1, N), Y) @@ -7,8 +7,8 @@ SL.leekasso <- function (Y, X, newX, family, obsWeights, id, ...) pValues <- sva::f.pvalue(t(X), mod, mod0) index <- which(rank(pValues) <= 10) # always 10! - lm1 <- lm(Y ~ ., data = X[, index]) - pred <- predict.lm(lm1, newdata = newX[, index]) + lm1 <- glm(Y ~ ., data = X[, index], family = family) + pred <- predict(lm1, newdata = newX[, index], type = "response") # pred <- numeric() fit <- list(object = lm1, index = index) class(fit) <- c("SL.leekasso") @@ -16,13 +16,13 @@ SL.leekasso <- function (Y, X, newX, family, obsWeights, id, ...) return(out) } -predict.SL.leekasso <- function(object, newdata, ...){ - pred <- predict(object = object$object, newdata = newdata[, object$index], type = "response") - pred +#' @exportS3Method predict SL.leekasso +predict.SL.leekasso <- function(object, newdata, ...) { + predict(object = object$object, newdata = newdata[, object$index], type = "response") } -## +## ## f.pvalue function from sva package: -# f.pvalue <- function (dat, mod, mod0) +# f.pvalue <- function (dat, mod, mod0) # { # n <- dim(dat)[2] # m <- dim(dat)[1] @@ -30,11 +30,11 @@ predict.SL.leekasso <- function(object, newdata, ...){ # df0 <- dim(mod0)[2] # p <- rep(0, m) # Id <- diag(n) -# resid <- dat %*% (Id - mod %*% solve(t(mod) %*% mod) %*% +# resid <- dat %*% (Id - mod %*% solve(t(mod) %*% mod) %*% # t(mod)) # rss1 <- rowSums(resid * resid) # rm(resid) -# resid0 <- dat %*% (Id - mod0 %*% solve(t(mod0) %*% mod0) %*% +# resid0 <- dat %*% (Id - mod0 %*% solve(t(mod0) %*% mod0) %*% # t(mod0)) # rss0 <- rowSums(resid0 * resid0) # rm(resid0) diff --git a/R/SL.lm.R b/R/SL.lm.R deleted file mode 100644 index 0b3bbf0..0000000 --- a/R/SL.lm.R +++ /dev/null @@ -1,91 +0,0 @@ -#' @title Wrapper for lm -#' @description Wrapper for OLS via lm(), which may be faster than glm(). -#' -#' @param Y Outcome variable -#' @param X Training dataframe -#' @param newX Test dataframe -#' @param family Gaussian or binomial -#' @param obsWeights Observation-level weights -#' @param model Whether to save model.matrix of data in fit object. Set to FALSE -#' to save memory. -#' @param ... Any remaining arguments, not used. -#' -#' @examples -#' -#' data(Boston, package = "MASS") -#' Y = Boston$medv -#' # Remove outcome from covariate dataframe. -#' X = Boston[, -14] -#' -#' set.seed(1) -#' -#' sl = SuperLearner(Y, X, family = gaussian(), -#' SL.library = c("SL.mean", "SL.lm")) -#' -#' print(sl) -#' -#' @references -#' -#' Fox, J. (2015). Applied regression analysis and generalized linear models. -#' Sage Publications. -#' -#' @seealso \code{\link{predict.SL.lm}} \code{\link[stats]{lm}} -#' \code{\link[stats]{predict.lm}} \code{\link{SL.speedlm}} -#' -#' @export -SL.lm <- function(Y, X, newX, family, obsWeights, model = TRUE, ...) { - - # X must be a dataframe, not a matrix. - if (is.matrix(X)) { - X = as.data.frame(X) - } - - # qr element is needed in order to predict(), unless we extract coefficents. - fit <- stats::lm(Y ~ ., data = X, weights = obsWeights, model = model) - - if (is.matrix(newX)) { - newX = as.data.frame(newX) - } - - pred <- predict(fit, newdata = newX, type = "response") - - # For binomial family restrict predicted probability to [0, 1]. - if (family$family == "binomial") { - pred = pmin(pmax(pred, 0), 1) - } - - fit <- list(object = fit, family = family) - class(fit) <- "SL.lm" - - out <- list(pred = pred, fit = fit) - - return(out) -} - -#' @title Prediction for SL.lm -#' @description Prediction for SL.lm -#' -#' @param object SL.lm object -#' @param newdata Dataframe to generate predictions -#' @param ... Unused additional arguments -#' -#' @seealso \code{\link{SL.lm}} \code{\link[stats]{lm}} -#' \code{\link[stats]{predict.lm}} \code{\link{SL.speedlm}} -#' -#' @export -predict.SL.lm <- function(object, newdata, ...) { - - # newdata must be a dataframe, not a matrix. - if (is.matrix(newdata)) { - newdata = as.data.frame(newdata) - } - - pred <- predict(object = object$object, newdata = newdata, type = "response") - - # For binomial family restrict predicted probability to [0, 1]. - if (object$family$family == "binomial") { - pred = pmin(pmax(pred, 0), 1) - } - - pred -} \ No newline at end of file diff --git a/R/SL.loess.R b/R/SL.loess.R index ff1f492..2056edf 100644 --- a/R/SL.loess.R +++ b/R/SL.loess.R @@ -1,9 +1,11 @@ # loess {stats} # l.family can be either 'gaussian' (least squares) or 'symmetric' (M-estimator with Tukey's biweight) -SL.loess <- function(Y, X, newX, family, obsWeights, span = 0.75, l.family = "gaussian", ...) { +#' @export +SL.loess <- function(Y, X, newX = X, family = gaussian(), obsWeights = NULL, span = 0.75, l.family = "gaussian", ...) { if(family$family == "gaussian") { - fit.loess <- loess(Y ~ ., data = X, family = l.family, span = span, control = loess.control(surface = "direct"), weights = obsWeights) + fit.loess <- loess(Y ~ ., data = X, family = l.family, span = span, + control = loess.control(surface = "direct"), weights = obsWeights) } if(family$family == "binomial") { stop('family = binomial() not currently implemented for SL.loess') @@ -15,9 +17,8 @@ SL.loess <- function(Y, X, newX, family, obsWeights, span = 0.75, l.family = "ga return(out) } -# +#' @exportS3Method predict SL.loess predict.SL.loess <- function(object, newdata, ...) { - pred <- predict(object = object$object, newdata = newdata) - return(pred) + predict(object = object$object, newdata = newdata) } diff --git a/R/SL.logreg.R b/R/SL.logreg.R index d17dbb8..8bad93b 100644 --- a/R/SL.logreg.R +++ b/R/SL.logreg.R @@ -1,7 +1,8 @@ # logreg {LogicReg} # currently uses CV to select model size -SL.logreg <- function(Y, X, newX, family, ntrees = c(1, 3), nleaves = c(1, 7), kfold = 10, ...) { +#' @export +SL.logreg <- function(Y, X, newX = X, family = gaussian(), ntrees = c(1, 3), nleaves = c(1, 7), kfold = 10, ...) { .SL.require('LogicReg') if(family$family == "gaussian") { fit.cv.logreg <- LogicReg::logreg(resp = Y, bin = X, type = 2, select = 3, ntrees = ntrees, nleaves = nleaves, kfold = kfold) @@ -40,10 +41,9 @@ SL.logreg <- function(Y, X, newX, family, ntrees = c(1, 3), nleaves = c(1, 7), k return(out) } -# -predict.SL.logreg <- function(object, newdata, family, X=NULL, Y=NULL,...) { +#' @exportS3Method predict SL.logreg +predict.SL.logreg <- function(object, newdata, ...) { .SL.require('LogicReg') - pred <- predict(object$object, newbin=newdata) - return(pred) + predict(object$object, newbin = newdata) } diff --git a/R/SL.mean.R b/R/SL.mean.R deleted file mode 100644 index ee2e9f8..0000000 --- a/R/SL.mean.R +++ /dev/null @@ -1,15 +0,0 @@ -SL.mean <- function (Y, X, newX, family, obsWeights, id, ...) -{ - meanY <- weighted.mean(Y, w = obsWeights) - pred <- rep.int(meanY, times = nrow(newX)) - fit <- list(object = meanY) - out <- list(pred = pred, fit = fit) - class(out$fit) <- c("SL.mean") - return(out) -} - -predict.SL.mean <- function (object, newdata, family, X = NULL, Y = NULL, ...) -{ - pred <- rep.int(object$object, times = nrow(newdata)) - return(pred) -} diff --git a/R/SL.nnet.R b/R/SL.nnet.R index 2186fd8..b214569 100644 --- a/R/SL.nnet.R +++ b/R/SL.nnet.R @@ -2,7 +2,8 @@ # can change the size # SL.nnet.3 <- function(..., size = 3) SL.nnet(..., size = size) -SL.nnet <- function(Y, X, newX, family, obsWeights, size = 2, ...){ +#' @export +SL.nnet <- function(Y, X, newX = X, family = gaussian(), obsWeights = NULL, size = 2, ...){ .SL.require('nnet') if(family$family == "gaussian") { fit.nnet <- nnet::nnet(x = X, y = Y, size = size, linout = TRUE, trace = FALSE, maxit = 500, weights = obsWeights) @@ -17,9 +18,9 @@ SL.nnet <- function(Y, X, newX, family, obsWeights, size = 2, ...){ return(out) } -predict.SL.nnet <- function(object, newdata,...) { +#' @exportS3Method predict SL.nnet +predict.SL.nnet <- function(object, newdata, ...) { .SL.require('nnet') - pred <- predict(object$object, newdata = newdata, type = "raw") - return(pred) + predict(object$object, newdata = newdata, type = "raw") } diff --git a/R/SL.nnls.R b/R/SL.nnls.R index 05c5329..af8fa54 100644 --- a/R/SL.nnls.R +++ b/R/SL.nnls.R @@ -1,10 +1,17 @@ # SL-wrapper for Non-negative least squares algorithm -# Same functionality as the method.NNLS metalearner, but +# Same functionality as the method.NNLS metalearner, but # in the SL-wrapper format -SL.nnls <- function(Y, X, newX, family, obsWeights, ...) { +#' @export +SL.nnls <- function(Y, X, newX = X, family = gaussian(), obsWeights = NULL, ...) { .SL.require("nnls") - fit.nnls <- nnls::nnls(sqrt(obsWeights)*as.matrix(X), sqrt(obsWeights)*Y) + if (is.null(obsWeights)) { + fit.nnls <- nnls::nnls(as.matrix(X), Y) + } + else { + fit.nnls <- nnls::nnls(sqrt(obsWeights)*as.matrix(X), sqrt(obsWeights)*Y) + } + initCoef <- coef(fit.nnls) initCoef[is.na(initCoef)] <- 0 if (sum(initCoef) > 0) { @@ -20,15 +27,17 @@ SL.nnls <- function(Y, X, newX, family, obsWeights, ...) { return(out) } +#' @exportS3Method predict SL.nnls predict.SL.nnls <- function(object, newdata, ...) { initCoef <- coef(object$object) initCoef[is.na(initCoef)] <- 0 if (sum(initCoef) > 0) { - coef <- initCoef/sum(initCoef) - } else { + coef <- initCoef / sum(initCoef) + } + else { warning("All algorithms have zero weight", call. = FALSE) coef <- initCoef } - pred <- crossprod(t(as.matrix(newdata)), coef) - return(pred) + + crossprod(t(as.matrix(newdata)), coef) } diff --git a/R/SL.polymars.R b/R/SL.polymars.R index 9ec8494..966c8bc 100644 --- a/R/SL.polymars.R +++ b/R/SL.polymars.R @@ -1,8 +1,9 @@ ## polymars{polspline} # in the binomial case, drop the cv=5 selects model based on AIC -SL.polymars <- function(Y, X, newX, family, obsWeights, ...){ +#' @export +SL.polymars <- function(Y, X, newX = X, family = gaussian(), obsWeights = NULL, ...){ .SL.require('polspline') - if(family$family == "gaussian") { + if(family$family == "gaussian") { fit.mars <- polspline::polymars(Y, X, weights = obsWeights) pred <- predict(fit.mars, x = newX) fit <- list(object = fit.mars) @@ -17,12 +18,13 @@ SL.polymars <- function(Y, X, newX, family, obsWeights, ...){ return(out) } +#' @exportS3Method predict SL.polymars predict.SL.polymars <- function(object, newdata, family, ...) { .SL.require('polspline') - if(family$family=="gaussian"){ + if (family$family == "gaussian"){ pred <- predict(object = object$object, x = newdata) } - if(family$family=="binomial"){ + else if (family$family == "binomial"){ pred <- polspline::ppolyclass(cov=newdata, fit=object$fit)[, 2] } return(pred) diff --git a/R/SL.qda.R b/R/SL.qda.R deleted file mode 100644 index 4dc33c9..0000000 --- a/R/SL.qda.R +++ /dev/null @@ -1,137 +0,0 @@ -#' @title SL wrapper for MASS:qda -#' @description Quadratic discriminant analysis, used for classification. -#' -#' @param Y Outcome variable -#' @param X Training dataframe -#' @param newX Test dataframe -#' @param family Binomial only, cannot be used for regression. -#' @param obsWeights Observation-level weights -#' @param id Not supported. -#' @param verbose If TRUE, display additional output during execution. -# -# Algorithm-specific arguments: -# -#' @param prior the prior probabilities of class membership. If unspecified, the -#' class proportions for the training set are used. If present, the -#' probabilities should be specified in the order of the factor levels. -#' @param method "moment" for standard estimators of the mean and variance, -#' "mle" for MLEs, "mve" to use cov.mve, or "t" for robust estimates based on -#' a t distribution. -#' @param tol tolerance -#' @param CV If true, returns results (classes and posterior probabilities) for -#' leave-one-out cross-validation. Note that if the prior is estimated, the -#' proportions in the whole dataset are used. -#' @param nu degrees of freedom for method = "t". -#' @param ... Any additional arguments, not currently used. -#' -#' @examples -#' -#' data(Boston, package = "MASS") -#' Y = as.numeric(Boston$medv > 23) -#' # Remove outcome from covariate dataframe. -#' X = Boston[, -14] -#' -#' set.seed(1) -#' -#' # Use only 2 CV folds to speed up example. -#' sl = SuperLearner(Y, X, family = binomial(), cvControl = list(V = 2), -#' SL.library = c("SL.mean", "SL.qda")) -#' sl -#' -#' pred = predict(sl, X) -#' summary(pred$pred) -#' -#' -#' @references -#' -#' James, G., Witten, D., Hastie, T., & Tibshirani, R. (2013). An Introduction -#' to Statistical Learning (Vol. 6). New York: Springer. Section 4.4. -#' -#' @seealso \code{\link{predict.SL.qda}} \code{\link[MASS]{qda}} -#' \code{\link[MASS]{predict.qda}} \code{\link{SL.lda}} -#' -#' @importFrom utils capture.output -#' -#' @export -SL.qda = - function(Y, X, newX, family, obsWeights = rep(1, nrow(X)), - verbose = F, id = NULL, - prior = as.vector(prop.table(table(Y))), - method = "mle", - tol = 1.0e-4, - CV = F, - nu = 5, - ...) { - - .SL.require("MASS") - - if (family$family != "binomial") { - stop("SL.qda only supports binomial outcomes.") - } - - if (!is.factor(Y)) { - Y = as.factor(Y) - } - - # X can be a matrix or dataframe. - # If method = "t" this will print a lot of unnecessary output, so capture it. - capture.output({ - fit = MASS::qda(x = X, - grouping = Y, - prior = prior, - method = method, - tol = tol, - CV = CV, - nu = nu) - }) - - pred = predict(fit, newX)$posterior - - # $posterior is a two-column matrix; we want P(Y = 1 | X). - pred = pred[, "1"] - - fit = list(object = fit, verbose = verbose) - class(fit) = "SL.qda" - out = list(pred = pred, fit = fit) - return(out) -} - -#' @title Prediction wrapper for SL.qda -#' -#' @description Prediction wrapper for SL.qda -#' -#' @param object SL.lda object -#' @param newdata Dataframe to generate predictions -#' @param prior The prior probabilities of the classes, by default the -#' proportions in the training set or what was set in the call to lda. -#' @param dimen the dimension of the space to be used. If this is less than -#' min(p, ng-1), only the first dimen discriminant components are used (except -#' for method="predictive"), and only those dimensions are returned in x. -#' @param method This determines how the parameter estimation is handled. With -#' "plug-in" (the default) the usual unbiased parameter estimates are used and -#' assumed to be correct. With "debiased" an unbiased estimator of the log -#' posterior probabilities is used, and with "predictive" the parameter -#' estimates are integrated out using a vague prior. -#' @param ... Unused additional arguments -#' -#' @seealso \code{\link{SL.qda}} \code{\link[MASS]{qda}} -#' \code{\link[MASS]{predict.qda}} -#' -#' @export -predict.SL.qda <- function(object, newdata, - prior = object$object$prior, - dimen = NULL, - method = "plug-in", - ...) { - .SL.require("MASS") - - pred = predict(object$object, newdata, - prior = prior, - dimen = dimen, - method = method)$posterior - - # $posterior is a two-column matrix. - pred = pred[, "1"] - - return(pred) -} diff --git a/R/SL.randomForest.R b/R/SL.randomForest.R index 199a16f..b7aec0c 100644 --- a/R/SL.randomForest.R +++ b/R/SL.randomForest.R @@ -1,6 +1,7 @@ # randomForest{randomForest} -SL.randomForest <- function(Y, X, newX, family, mtry = ifelse(family$family == "gaussian", +#' @export +SL.randomForest <- function(Y, X, newX = X, family = gaussian(), mtry = ifelse(family$family == "gaussian", max(floor(ncol(X)/3), 1), floor(sqrt(ncol(X)))), ntree = 1000, nodesize = ifelse(family$family == "gaussian", 5, 1), maxnodes = NULL, @@ -21,12 +22,13 @@ SL.randomForest <- function(Y, X, newX, family, mtry = ifelse(family$family == " return(out) } +#' @exportS3Method predict SL.randomForest predict.SL.randomForest <- function(object, newdata, family, ...) { .SL.require('randomForest') if (family$family == "gaussian") { pred <- predict(object$object, newdata = newdata, type = 'response') } - if (family$family == "binomial") { + else if (family$family == "binomial") { pred <- predict(object$object, newdata = newdata, type = 'vote')[,2] } pred diff --git a/R/SL.ranger.R b/R/SL.ranger.R index a5b60b0..4a72fc6 100644 --- a/R/SL.ranger.R +++ b/R/SL.ranger.R @@ -1,14 +1,13 @@ -#' @title SL wrapper for ranger -#' @description Ranger is a fast implementation of Random Forest (Breiman 2001) +#' SL wrapper for ranger +#' +#' Ranger is a fast implementation of Random Forest (Breiman 2001) #' or recursive partitioning, particularly suited for high dimensional data. #' #' Extending code by Eric Polley from the SuperLearnerExtra package. #' -#' @param Y Outcome variable -#' @param X Training dataframe -#' @param newX Test dataframe -#' @param family Gaussian or binomial -#' @param obsWeights Observation-level weights +#' @inheritParams SL.template +#' @inheritParams predict.SL.template +#' @inheritParams SL.glm #' @param num.trees Number of trees. #' @param mtry Number of variables to possibly split at in each node. Default is #' the (rounded down) square root of the number variables. @@ -21,8 +20,7 @@ #' @param sample.fraction Fraction of observations to sample. Default is 1 for #' sampling with replacement and 0.632 for sampling without replacement. #' @param num.threads Number of threads to use. -#' @param verbose If TRUE, display additional output during execution. -#' @param ... Any additional arguments, not currently used. +#' @param verbose If `TRUE`, display additional output during execution. #' #' @examples #' @@ -51,11 +49,10 @@ #' #' @seealso \code{\link{SL.ranger}} \code{\link[ranger]{ranger}} #' \code{\link[ranger]{predict.ranger}} -#' + #' @export -SL.ranger <- - function(Y, X, newX, family, - obsWeights, +SL.ranger <- function(Y, X, newX = X, family = gaussian(), + obsWeights = NULL, num.trees = 500, mtry = floor(sqrt(ncol(X))), write.forest = TRUE, @@ -64,7 +61,7 @@ SL.ranger <- replace = TRUE, sample.fraction = ifelse(replace, 1, 0.632), num.threads = 1, - verbose = T, + verbose = FALSE, ...) { # need write.forest = TRUE for predict method .SL.require("ranger") @@ -107,22 +104,9 @@ SL.ranger <- return(out) } -#' @title Prediction wrapper for ranger random forests -#' -#' @description Prediction wrapper for SL.ranger objects. -#' -#' @param object SL.kernlab object -#' @param newdata Dataframe to generate predictions -#' @param family Gaussian or binomial -#' @param num.threads Number of threads used for parallelization -#' @param verbose If TRUE output additional information during execution. -#' @param ... Unused additional arguments -#' -#' @seealso \code{\link{SL.ranger}} \code{\link[ranger]{ranger}} -#' \code{\link[ranger]{predict.ranger}} -#' -#' @export -predict.SL.ranger <- function(object, newdata, family, +#' @exportS3Method predict SL.ranger +#' @rdname SL.ranger +predict.SL.ranger <- function(object, newdata, num.threads = 1, verbose = object$verbose, ...) { @@ -133,7 +117,7 @@ predict.SL.ranger <- function(object, newdata, family, num.threads = num.threads)$predictions # For binomial family $predictions is a two-column matrix. - if (family$family == "binomial") { + if (NCOL(pred) > 1L) { # P(Y = 1 | X) for binomial. pred = pred[, "1"] } diff --git a/R/SL.ridge.R b/R/SL.ridge.R index b2ff77f..a8148ae 100644 --- a/R/SL.ridge.R +++ b/R/SL.ridge.R @@ -1,7 +1,9 @@ ## lm.ridge{MASS} # may want to change range lambda searches over # will only work with guassian -SL.ridge <- function(Y, X, newX, family, lambda = seq(1, 20, .1), ...) { + +#' @export +SL.ridge <- function(Y, X, newX = X, family = gaussian(), lambda = seq(1, 20, .1), ...) { .SL.require('MASS') if(family$family=="binomial"){ stop("Currently only works with gaussian data") @@ -9,7 +11,7 @@ SL.ridge <- function(Y, X, newX, family, lambda = seq(1, 20, .1), ...) { fit.ridge <- MASS::lm.ridge(Y ~ ., data = X, lambda = lambda) bestCoef <- as.matrix(coef(fit.ridge)[which.min(fit.ridge$GCV), ]) m <- dim(newX)[1] - newx.ridge <- as.matrix(cbind(rep(1, m), newX)) + newx.ridge <- cbind(rep(1, m), as.matrix(newX)) pred <- newx.ridge %*% bestCoef fit <- list(bestCoef = bestCoef) out <- list(pred = pred, fit = fit) @@ -17,10 +19,10 @@ SL.ridge <- function(Y, X, newX, family, lambda = seq(1, 20, .1), ...) { return(out) } -predict.SL.ridge <- function(object, newdata,...){ - .SL.require('MASS') - m <- dim(newdata)[1] - newx.ridge <- as.matrix(cbind(rep(1, m), newdata)) - pred <- newx.ridge %*% object$bestCoef - return(pred) +#' @exportS3Method predict SL.ridge +predict.SL.ridge <- function(object, newdata, ...){ + m <- nrow(newdata) + newx.ridge <- as.matrix(cbind(rep.int(1, m), newdata)) + + newx.ridge %*% object$bestCoef } \ No newline at end of file diff --git a/R/SL.rpart.R b/R/SL.rpart.R index 96c6f88..bdc9909 100644 --- a/R/SL.rpart.R +++ b/R/SL.rpart.R @@ -1,7 +1,10 @@ # rpart {rpart} -SL.rpart <- function(Y, X, newX, family, obsWeights, cp = 0.01, minsplit = 20, xval = 0L, maxdepth = 30, minbucket = round(minsplit/3), ...) { +#' @export +SL.rpart <- function(Y, X, newX = X, family = gaussian(), obsWeights = NULL, + cp = 0.01, minsplit = 20, xval = 0L, + maxdepth = 30, minbucket = round(minsplit/3), ...) { .SL.require('rpart') - if(family$family == "gaussian"){ + if (family$family == "gaussian"){ fit.rpart <- rpart::rpart(Y~., data = data.frame(Y, X), control = rpart::rpart.control(cp = cp, minsplit = minsplit, xval = xval, maxdepth = maxdepth, minbucket = minbucket), method = "anova", weights = obsWeights) pred <- predict(fit.rpart, newdata = newX) } @@ -15,15 +18,37 @@ SL.rpart <- function(Y, X, newX, family, obsWeights, cp = 0.01, minsplit = 20, x return(out) } -# -predict.SL.rpart <- function(object, newdata, family, ...) { - .SL.require('rpart') - if(family$family=="gaussian") { - pred <- predict(object$object, newdata = newdata) +#' @export +SL.rpartPrune <- function(Y, X, newX = X, family = gaussian(), obsWeights = NULL, + cp = 0.001, minsplit = 20, xval = 10, maxdepth = 20, minbucket = 5, ...) { + .SL.require("rpart") + if (family$family == "gaussian") { + fit.rpart <- rpart::rpart(Y ~ ., data = data.frame(Y, X), control = rpart::rpart.control(cp = cp, minsplit = minsplit, xval = xval, maxdepth = maxdepth, minbucket = minbucket), method = "anova", weights = obsWeights) + CP <- fit.rpart$cptable[which.min(fit.rpart$cptable[, "xerror"]), "CP"] + fitPrune <- rpart::prune(fit.rpart, cp = CP) + pred <- predict(fitPrune, newdata = newX) } - if(family$family=="binomial") { - pred <- predict(object$object, newdata = newdata)[, 2] + if (family$family == "binomial") { + fit.rpart <- rpart::rpart(Y ~ ., data = data.frame(Y, X), control = rpart::rpart.control(cp = cp, minsplit = minsplit, xval = xval, maxdepth = maxdepth, minbucket = minbucket), method = "class", weights = obsWeights) + CP <- fit.rpart$cptable[which.min(fit.rpart$cptable[, "xerror"]), "CP"] + fitPrune <- rpart::prune(fit.rpart, cp = CP) + pred <- predict(fitPrune, newdata = newX)[, 2] } + fit <- list(object = fitPrune, fit = fit.rpart, cp = CP) + out <- list(pred = pred, fit = fit) + class(out$fit) <- c("SL.rpart") + return(out) +} + +#' @exportS3Method predict SL.rpart +predict.SL.rpart <- function(object, newdata, ...) { + .SL.require('rpart') + pred <- predict(object$object, newdata = newdata) + + if (NCOL(pred) > 1L) { + pred <- pred[, 2] + } + return(pred) } diff --git a/R/SL.rpartPrune.R b/R/SL.rpartPrune.R deleted file mode 100644 index 3cd50e0..0000000 --- a/R/SL.rpartPrune.R +++ /dev/null @@ -1,20 +0,0 @@ -SL.rpartPrune <- function (Y, X, newX, family, obsWeights, cp = 0.001, minsplit = 20, xval = 10, maxdepth = 20, minbucket = 5, ...) -{ - .SL.require("rpart") - if (family$family == "gaussian") { - fit.rpart <- rpart::rpart(Y ~ ., data = data.frame(Y, X), control = rpart::rpart.control(cp = cp, minsplit = minsplit, xval = xval, maxdepth = maxdepth, minbucket = minbucket), method = "anova", weights = obsWeights) - CP <- fit.rpart$cptable[which.min(fit.rpart$cptable[, "xerror"]), "CP"] - fitPrune <- rpart::prune(fit.rpart, cp = CP) - pred <- predict(fitPrune, newdata = newX) - } - if (family$family == "binomial") { - fit.rpart <- rpart::rpart(Y ~ ., data = data.frame(Y, X), control = rpart::rpart.control(cp = cp, minsplit = minsplit, xval = xval, maxdepth = maxdepth, minbucket = minbucket), method = "class", weights = obsWeights) - CP <- fit.rpart$cptable[which.min(fit.rpart$cptable[, "xerror"]), "CP"] - fitPrune <- rpart::prune(fit.rpart, cp = CP) - pred <- predict(fitPrune, newdata = newX)[, 2] - } - fit <- list(object = fitPrune, fit = fit.rpart, cp = CP) - out <- list(pred = pred, fit = fit) - class(out$fit) <- c("SL.rpart") - return(out) -} \ No newline at end of file diff --git a/R/SL.speedglm.R b/R/SL.speedglm.R index f438ab9..053ade3 100644 --- a/R/SL.speedglm.R +++ b/R/SL.speedglm.R @@ -1,15 +1,13 @@ -#' @title Wrapper for speedglm -#' @description Speedglm is a fast version of glm() +#' Wrapper for `speedglm` #' -#' @param Y Outcome variable -#' @param X Training dataframe -#' @param newX Test dataframe -#' @param family Gaussian or binomial -#' @param obsWeights Observation-level weights +#' `speedglm::speedglm()` and `speedglm::speedlm()` are fast versions of `glm()` and `lm()`, respectively. +#' +#' @inheritParams SL.template +#' @inheritParams predict.SL.template +#' @inheritParams SL.glm #' @param maxit Maximum number of iterations before stopping. #' @param k numeric, the penalty per parameter to be used; the default k = 2 is #' the classical AIC. -#' @param ... Any remaining arguments, not used. #' #' @references #' @@ -19,12 +17,10 @@ #' #' @seealso \code{\link{predict.SL.speedglm}} \code{\link[speedglm]{speedglm}} #' \code{\link[speedglm]{predict.speedglm}} -#' + #' @export -SL.speedglm <- function(Y, X, newX, family, obsWeights, - maxit = 25, - k = 2, - ...) { +SL.speedglm <- function(Y, X, newX = X, family = gaussian(), obsWeights = NULL, + maxit = 25, k = 2, ...) { .SL.require("speedglm") # X must be a dataframe, not a matrix. @@ -33,9 +29,9 @@ SL.speedglm <- function(Y, X, newX, family, obsWeights, } fit <- speedglm::speedglm(Y ~ ., data = X, family = family, - weights = obsWeights, - maxit = maxit, - k = k) + weights = obsWeights, + maxit = maxit, + k = k) if (is.matrix(newX)) { newX = as.data.frame(newX) @@ -51,17 +47,34 @@ SL.speedglm <- function(Y, X, newX, family, obsWeights, return(out) } -#' @title Prediction for SL.speedglm -#' @description Prediction for SL.speedglm -#' -#' @param object SL.speedglm object -#' @param newdata Dataframe to generate predictions -#' @param ... Unused additional arguments -#' -#' @seealso \code{\link{SL.speedglm}} \code{\link[speedglm]{speedglm}} -#' \code{\link[speedglm]{predict.speedglm}} -#' #' @export +#' @rdname SL.speedglm +SL.speedlm <- function(Y, X, newX = X, family = gaussian(), obsWeights = NULL, ...) { + .SL.require("speedglm") + + # X must be a dataframe, not a matrix. + if (is.matrix(X)) { + X = as.data.frame(X) + } + + fit <- speedglm::speedlm(Y ~ ., data = X, weights = obsWeights) + + if (is.matrix(newX)) { + newX = as.data.frame(newX) + } + + pred <- predict(fit, newdata = newX, type = "response") + + fit <- list(object = fit) + class(fit) <- "SL.speedlm" + + out <- list(pred = pred, fit = fit) + + return(out) +} + +#' @exportS3Method predict SL.speedglm +#' @rdname SL.speedglm predict.SL.speedglm <- function(object, newdata, ...) { .SL.require("speedglm") @@ -70,7 +83,9 @@ predict.SL.speedglm <- function(object, newdata, ...) { newdata = as.data.frame(newdata) } - pred <- predict(object = object$object, newdata = newdata, type = "response") + predict(object = object$object, newdata = newdata, type = "response") +} - pred -} \ No newline at end of file +#' @exportS3Method predict SL.speedlm +#' @rdname SL.speedglm +predict.SL.speedlm <- predict.SL.speedglm \ No newline at end of file diff --git a/R/SL.speedlm.R b/R/SL.speedlm.R deleted file mode 100644 index 0e0030f..0000000 --- a/R/SL.speedlm.R +++ /dev/null @@ -1,68 +0,0 @@ -#' @title Wrapper for speedlm -#' @description Speedlm is a fast version of lm() -#' -#' @param Y Outcome variable -#' @param X Training dataframe -#' @param newX Test dataframe -#' @param family Gaussian or binomial -#' @param obsWeights Observation-level weights -#' @param ... Any remaining arguments, not used. -#' -#' @references -#' -#' Enea, M. A. R. C. O. (2013). Fitting linear models and generalized linear -#' models with large data sets in R. Statistical Methods for the Analysis of -#' Large Datasets: book of short papers, 411-414. -#' -#' @seealso \code{\link{predict.SL.speedlm}} \code{\link[speedglm]{speedlm}} -#' \code{\link[speedglm]{predict.speedlm}} \code{\link{SL.speedglm}} -#' -#' @export -SL.speedlm <- function(Y, X, newX, family, obsWeights, - ...) { - .SL.require("speedglm") - - # X must be a dataframe, not a matrix. - if (is.matrix(X)) { - X = as.data.frame(X) - } - - fit <- speedglm::speedlm(Y ~ ., data = X, weights = obsWeights) - - if (is.matrix(newX)) { - newX = as.data.frame(newX) - } - - pred <- predict(fit, newdata = newX, type = "response") - - fit <- list(object = fit) - class(fit) <- "SL.speedlm" - - out <- list(pred = pred, fit = fit) - - return(out) -} - -#' @title Prediction for SL.speedlm -#' @description Prediction for SL.speedlm, a fast lm() -#' -#' @param object SL.speedlm object -#' @param newdata Dataframe to generate predictions -#' @param ... Unused additional arguments -#' -#' @seealso \code{\link{SL.speedlm}} \code{\link[speedglm]{speedlm}} -#' \code{\link[speedglm]{predict.speedlm}} \code{\link{SL.speedglm}} -#' -#' @export -predict.SL.speedlm <- function(object, newdata, ...){ - .SL.require("speedglm") - - # newdata must be a dataframe, not a matrix. - if (is.matrix(newdata)) { - newdata = as.data.frame(newdata) - } - - pred <- predict(object = object$object, newdata = newdata, type = "response") - - pred -} \ No newline at end of file diff --git a/R/SL.step.R b/R/SL.step.R index 1e82ef7..8e94a45 100644 --- a/R/SL.step.R +++ b/R/SL.step.R @@ -1,15 +1,16 @@ - -SL.step <- function(Y, X, newX, family, direction = "both", trace = 0, k = 2, ...) { +#' @export +SL.step <- function(Y, X, newX = X, family = gaussian(), direction = "both", trace = 0, k = 2, ...) { fit.glm <- glm(Y ~ ., data = X, family = family) fit.step <- step(fit.glm, direction = direction, trace = trace, k = k) pred <- predict(fit.step, newdata = newX, type = "response") fit <- list(object = fit.step) - out <- list(pred = pred, fit=fit) + out <- list(pred = pred, fit = fit) class(out$fit) <- c("SL.step") return(out) } -SL.step.forward <- function(Y, X, newX, family, direction = "forward", trace = 0, k = 2, ...) { +#' @export +SL.step.forward <- function(Y, X, newX = X, family = gaussian(), direction = "forward", trace = 0, k = 2, ...) { fit.glm <- glm(Y ~ ., data = X, family = family) fit.step <- step(glm(Y ~ 1, data = X, family = family), scope = formula(fit.glm), direction = direction, trace = trace, k = k) pred <- predict(fit.step, newdata = newX, type = "response") @@ -19,7 +20,8 @@ SL.step.forward <- function(Y, X, newX, family, direction = "forward", trace = 0 return(out) } -SL.step.interaction <- function(Y, X, newX, family, direction = "both", trace = 0, k = 2, ...) { +#' @export +SL.step.interaction <- function(Y, X, newX = X, family = gaussian(), direction = "both", trace = 0, k = 2, ...) { fit.glm <- glm(Y ~ ., data = X, family = family) fit.step <- step(fit.glm, scope = Y ~ .^2, direction = direction, trace = trace, k = k) pred <- predict(fit.step, newdata = newX, type = "response") @@ -29,12 +31,11 @@ SL.step.interaction <- function(Y, X, newX, family, direction = "both", trace = return(out) } -# -predict.SL.step <- function(object, newdata,...) { - predict(object = object$object, newdata = newdata, type = "response") -} +#' @exportS3Method predict SL.step +predict.SL.step <- predict.SL.glm -SL.stepAIC <- function(Y, X, newX, family, direction = "both", steps = 30, k = log(nrow(X)), ...) { +#' @export +SL.stepAIC <- function(Y, X, newX = X, family = gaussian(), direction = "both", steps = 30, k = log(nrow(X)), ...) { .SL.require('MASS') g0 <- glm(Y ~ 1, data = X, family = family) upper <- formula(paste("~", paste(colnames(X), collapse="+"))) @@ -47,8 +48,9 @@ SL.stepAIC <- function(Y, X, newX, family, direction = "both", steps = 30, k = l return(out) } +#' @exportS3Method predict SL.stepAIC predict.SL.stepAIC <- function(object, newdata, ...) { .SL.require('MASS') - pred <- predict(object = object$object, newdata = newdata, type = "response") - return(pred) + + predict(object = object$object, newdata = newdata, type = "response") } \ No newline at end of file diff --git a/R/SL.svm.R b/R/SL.svm.R index b222a55..d32a8c6 100644 --- a/R/SL.svm.R +++ b/R/SL.svm.R @@ -2,16 +2,20 @@ # two types for regression: "nu-regression" and "eps-regression". # two types for classification: "nu-classification" and "C-classification" # many other tuning parameters to consider -SL.svm <- function(Y, X, newX, family, type.reg = "nu-regression", type.class = "nu-classification", kernel = -"radial", nu = 0.5, degree = 3, cost = 1, coef0 = 0, ...) { + +#' @export +SL.svm <- function(Y, X, newX = X, family = gaussian(), type.reg = "nu-regression", type.class = "nu-classification", + kernel = "radial", nu = 0.5, degree = 3, cost = 1, coef0 = 0, ...) { .SL.require('e1071') - if(family$family == "gaussian") { - fit.svm <- e1071::svm(y = Y, x = X, nu = nu, type = type.reg, fitted = FALSE, kernel = kernel, degree = degree, cost = cost, coef0 = coef0) + if (family$family == "gaussian") { + fit.svm <- e1071::svm(y = Y, x = X, nu = nu, type = type.reg, fitted = FALSE, + kernel = kernel, degree = degree, cost = cost, coef0 = coef0) pred <- predict(fit.svm, newdata = newX) fit <- list(object = fit.svm) } if(family$family == "binomial") { - fit.svm <- e1071::svm(y = as.factor(Y), x = X, nu = nu, type = type.class, fitted = FALSE, probability = TRUE, kernel = kernel, degree = degree, cost = cost, coef0 = coef0) + fit.svm <- e1071::svm(y = as.factor(Y), x = X, nu = nu, type = type.class, fitted = FALSE, + probability = TRUE, kernel = kernel, degree = degree, cost = cost, coef0 = coef0) pred <- attr(predict(fit.svm, newdata = newX, probability = TRUE), "prob")[, "1"] # assumes Y is 0/1 numeric fit <- list(object = fit.svm) } @@ -20,12 +24,13 @@ SL.svm <- function(Y, X, newX, family, type.reg = "nu-regression", type.class = return(out) } -predict.SL.svm <- function(object, newdata, family,...){ +#' @exportS3Method predict SL.svm +predict.SL.svm <- function(object, newdata, family, ...){ .SL.require('e1071') - if(family$family == "gaussian") { + if (family$family == "gaussian") { pred <- predict(object$object, newdata = newdata) } - if(family$family == "binomial") { + else if (family$family == "binomial") { pred <- attr(predict(object$object, newdata = newdata, probability = TRUE), "prob")[, "1"] } return(pred) diff --git a/R/SL.template.R b/R/SL.template.R index e472ed5..5914c79 100644 --- a/R/SL.template.R +++ b/R/SL.template.R @@ -1,28 +1,59 @@ -# +#' Wrapper functions for prediction algorithms in SuperLearner +#' @aliases SL.caret.rpart SL.gbm SL.ipredbagg SL.knn SL.leekasso SL.loess SL.logreg SL.nnet SL.nnls SL.polymars SL.randomForest SL.ridge SL.rpart SL.rpartPrune SL.step SL.step.forward SL.step.interaction SL.stepAIC SL.svm +#' +#' @description +#' Template function for SuperLearner prediction wrappers and built in options. +#' +#' @param Y The outcome in the training data set. Must be a numeric vector. +#' @param X The predictor variables in the training data set, usually a data.frame. +#' @param newX The predictor variables in the validation data set. The +#' structure should match X. +#' @param family Either [gaussian()] or [binomial()] to +#' describe the error distribution. Link function information will be ignored. +#' @param obsWeights Optional observation weights. +#' @param id Optional cluster identification variable. +#' @param \dots For SL wrappers, other remaining arguments. For `write.SL.template()`, arguments passed to [cat()]. +#' @param file A connection, or a character string naming a file to print to. +#' Passed to [cat()]. +#' +#' @returns +#' SL wrappers return a list with two elements: +#' \item{pred}{ The predicted values for the +#' rows in \code{newX}. } +#' \item{fit}{ A list. Contains all objects necessary to +#' get predictions for new observations from specific algorithm. } +#' +#' @author Eric C Polley \email{epolley@@uchicago.edu} +#' +#' @seealso \code{\link{SuperLearner}} +#' +#' @keywords utilities +#' +#' @examples +#' write.SL.template(file = '') +#' + +#' @export `SL.template` SL.template <- function(Y, X, newX, family, obsWeights, id, ...) { - if(family$family == "gaussian") { - # insert estimation and prediction function - } - if(family$family == "binomial") { - # insert estimation and prediction function - } - # pred returns predicted responses (on the scale of the outcome) - pred <- numeric() - # fit returns all objects needed for predict.SL.template - # fit <- list(object = ) - fit <- vector("list", length=0) - class(fit) <- c("SL.template") - out <- list(pred = pred, fit = fit) - return(out) + if (family$family == "gaussian") { + # insert estimation and prediction function + } + else if (family$family == "binomial") { + # insert estimation and prediction function + } + # pred returns predicted responses (on the scale of the outcome) + pred <- numeric() + # fit returns all objects needed for predict.SL.template + # fit <- list(object = ) + fit <- vector("list", length=0) + class(fit) <- c("SL.template") + out <- list(pred = pred, fit = fit) + return(out) } -# -predict.SL.template <- function(object, newdata, family, X = NULL, Y = NULL,...) { - # insert prediction function - pred <- numeric() - return(pred) +#' @export `write.SL.template` +#' @rdname SL.template +write.SL.template <- function(file = '', ...) { + cat('SL.template <- function(Y, X, newX, family, obsWeights, id, ...) {\n # load required packages\n # require(\'pkg\')\n if (family$family == \'gaussian\') {\n \n }\n elseif (family$family == \'binomial\') {\n \n }\n # pred is the predicted responses for newX (on the scale of the outcome)\n pred <- numeric()\n # fit returns all objects needed for predict.SL.template\n fit <- list(object = )\n # declare class of fit for predict.SL.template\n class(fit) <- \'SL.template\'\n # return a list with pred and fit\n out <- list(pred = pred, fit = fit)\n return(out)\n}', file = file, ...) } -write.SL.template <- function(file = '', ...) { - cat('SL.template <- function(Y, X, newX, family, obsWeights, id, ...) {\n # load required packages\n # require(\'pkg\')\n if(family$family == \'gaussian\') {\n \n }\n if(family$family == \'binomial\') {\n \n }\n # pred is the predicted responses for newX (on the scale of the outcome)\n pred <- numeric()\n # fit returns all objects needed for predict.SL.template\n fit <- list(object = )\n # declare class of fit for predict.SL.template\n class(fit) <- \'SL.template\'\n # return a list with pred and fit\n out <- list(pred = pred, fit = fit)\n return(out)\n}', file = file, ...) -} \ No newline at end of file diff --git a/R/SL.xgboost.R b/R/SL.xgboost.R index 420b3a1..d0db5da 100644 --- a/R/SL.xgboost.R +++ b/R/SL.xgboost.R @@ -1,6 +1,6 @@ -#' XGBoost SuperLearner wrapper +#' SL wrapper for XGBoost #' -#' Supports the Extreme Gradient Boosting package for SuperLearnering, which is +#' Supports the Extreme Gradient Boosting package for SuperLearning, which is #' a variant of gradient boosted machines (GBM). #' #' The performance of XGBoost, like GBM, is sensitive to the configuration @@ -12,14 +12,9 @@ #' XGBoost from drat as described here: #' \url{http://xgboost.readthedocs.io/en/latest/build.html} #' -#' @param Y Outcome variable -#' @param X Covariate dataframe -#' @param newX Optional dataframe to predict the outcome -#' @param obsWeights Optional observation-level weights (supported but not tested) -#' @param id Optional id to group observations from the same unit (not used -#' currently). -#' @param family "gaussian" for regression, "binomial" for binary -#' classification, "multinomial" for multiple classification (not yet supported). +#' @inheritParams SL.template +#' @inheritParams predict.SL.template +#' @inheritParams SL.glm #' @param ntrees How many trees to fit. Low numbers may underfit but high #' numbers may overfit, depending also on the shrinkage. #' @param max_depth How deep each tree can be. 1 means no interactions, aka tree @@ -39,8 +34,10 @@ #' @param verbose Verbosity of XGB fitting. #' @param ... Any remaining arguments (not supported though). #' +#' @seealso [create.SL.xgboost()] to create new xgboost wrappers with different parameters. + #' @export -SL.xgboost = function(Y, X, newX, family, obsWeights, id, ntrees = 1000, +SL.xgboost = function(Y, X, newX = X, family = gaussian(), obsWeights = NULL, ntrees = 1000, max_depth = 4, shrinkage = 0.1, minobspernode = 10, params = list(), nthread = 1, @@ -48,119 +45,64 @@ SL.xgboost = function(Y, X, newX, family, obsWeights, id, ntrees = 1000, save_period = NULL, ...) { .SL.require("xgboost") - if(packageVersion("xgboost") < "0.6") stop("SL.xgboost requires xgboost version >= 0.6, try help(\'SL.xgboost\') for details") - # X needs to be converted to a matrix first, then an xgb.DMatrix. - if (!is.matrix(X)) { - X = model.matrix(~ . - 1, X) + + if (utils::packageVersion("xgboost") < "0.6") { + stop("SL.xgboost requires xgboost version >= 0.6, try help(\'SL.xgboost\') for details") } # Convert to an xgboost compatible data matrix, using the sample weights. - xgmat = xgboost::xgb.DMatrix(data = X, label = Y, weight = obsWeights) + xgmat <- xgboost::xgb.DMatrix(data = X, label = Y, weight = obsWeights) # TODO: support early stopping, which requires a "watchlist". See ?xgb.train if (family$family == "gaussian") { - # reg:linear was deprecated in version 1.1.1.1, changed to reg:squarederror - if(packageVersion("xgboost") >= "1.1.1.1") { - objective <- 'reg:squarederror' - } else { - objective <- 'reg:linear' - } - model = xgboost::xgboost(data = xgmat, objective=objective, nrounds = ntrees, - max_depth = max_depth, min_child_weight = minobspernode, eta = shrinkage, - verbose = verbose, nthread = nthread, params = params, - save_period = save_period) + xgbpar <- xgboost::xgb.params(objective = "reg:squarederror", + nthread = nthread, + eta = shrinkage, + max_depth = max_depth, + min_child_weight = minobspernode) } - if (family$family == "binomial") { - model = xgboost::xgboost(data = xgmat, objective="binary:logistic", nrounds = ntrees, - max_depth = max_depth, min_child_weight = minobspernode, eta = shrinkage, - verbose = verbose, nthread = nthread, params = params, - save_period = save_period, eval_metric = "logloss") + else if (family$family == "binomial") { + xgbpar <- xgboost::xgb.params(objective = "binary:logistic", + nthread = nthread, + eta = shrinkage, + max_depth = max_depth, + min_child_weight = minobspernode, + eval_metric = "logloss") } - if (family$family == "multinomial") { + else if (family$family == "multinomial") { # TODO: test this. - model = xgboost::xgboost(data = xgmat, objective="multi:softmax", nrounds = ntrees, - max_depth = max_depth, min_child_weight = minobspernode, eta = shrinkage, - verbose = verbose, num_class = length(unique(Y)), nthread = nthread, - params = params, - save_period = save_period) + xgbpar <- xgboost::xgb.params(objective = "multi:softmax", + nthread = nthread, + eta = shrinkage, + max_depth = max_depth, + min_child_weight = minobspernode, + num_class = length(unique(Y))) } - # Newdata needs to be converted to a matrix first, then an xgb.DMatrix. - if (!is.matrix(newX)) { - newX = model.matrix(~ . - 1, newX) + for (i in intersect(names(params), names(formals(xgboost::xgb.params)))) { + xgbpar[[i]] <- params[[i]] } - pred = predict(model, newdata = newX) + model <- xgboost::xgb.train(params = xgbpar, data = xgmat, nrounds = ntrees, + save_period = save_period, verbose = verbose) - fit = list(object = model) - class(fit) = c("SL.xgboost") - out = list(pred = pred, fit = fit) - return(out) -} + pred <- predict(model, newdata = xgboost::xgb.DMatrix(data = newX)) -#' XGBoost prediction on new data -#' @param object Model fit object from SuperLearner -#' @param newdata Dataframe that will be converted to an xgb.DMatrix -#' @param family Binomial or gaussian -#' @param ... Any remaining arguments (not supported though). -predict.SL.xgboost <- function(object, newdata, family, ...) { - .SL.require("xgboost") - if(packageVersion("xgboost") < "0.6") stop("SL.xgboost requires xgboost version >= 0.6, try help(\'SL.xgboost\') for details") - # newdata needs to be converted to a matrix first - if (!is.matrix(newdata)) { - newdata = model.matrix(~ . - 1, newdata) - } - pred = predict(object$object, newdata = newdata) - return(pred) -} + fit <- list(object = model) + class(fit) <- c("SL.xgboost") -#' Factory for XGBoost SL wrappers -#' -#' Create multiple configurations of XGBoost learners based on the desired combinations of hyperparameters. -#' -#' @param tune List of hyperparameter settings to test. If specified, each hyperparameter will need to be defined. -#' @param detailed_names Set to T to have the function names include the parameter configurations. -#' @param env Environment in which to create the SL.xgboost functions. Defaults to the global environment. -#' @param name_prefix The prefix string for the name of each function that is generated. -#' -#' @examples -#' -#' # Create a new environment to store the learner functions. -#' # This keeps the global environment organized. -#' sl_env = new.env() -#' # Create 2 * 2 * 1 * 3 = 12 combinations of hyperparameters. -#' tune = list(ntrees = c(100, 500), max_depth = c(1, 2), minobspernode = 10, -#' shrinkage = c(0.1, 0.01, 0.001)) -#' # Generate a separate learner for each combination. -#' xgb_grid = create.SL.xgboost(tune = tune, env = sl_env) -#' # Review the function configurations. -#' xgb_grid -#' # Attach the environment so that the custom learner functions can be accessed. -#' attach(sl_env) -#' \dontrun{ -#' sl = SuperLearner(Y = Y, X = X, SL.library = xgb_grid$names) -#' } -#' detach(sl_env) -#' @export -create.SL.xgboost = function(tune = list(ntrees = c(1000), max_depth = c(4), shrinkage = c(0.1), - minobspernode = c(10)), detailed_names = F, env = .GlobalEnv, - name_prefix = "SL.xgb") { - # Create all combinations of hyperparameters, for grid-like search. - tuneGrid = expand.grid(tune, stringsAsFactors=F) + list(pred = pred, fit = fit) +} - names = rep("", nrow(tuneGrid)) +#' @exportS3Method predict SL.xgboost +#' @rdname SL.xgboost +predict.SL.xgboost <- function(object, newdata, ...) { + .SL.require("xgboost") - for (i in seq(nrow(tuneGrid))) { - g = tuneGrid[i,] - if (detailed_names) { - name = paste(name_prefix, g$ntrees, g$max_depth, g$shrinkage, g$minobspernode, sep=".") - } else { - name = paste(name_prefix, i, sep=".") - } - names[i] = name - eval(parse(text = paste0(name, "= function(..., ntrees = ", g$ntrees, ", max_depth = ", g$max_depth, ", shrinkage=", g$shrinkage, ", minobspernode=", g$minobspernode, ") SL.xgboost(..., ntrees = ntrees, max_depth = max_depth, shrinkage=shrinkage, minobspernode=minobspernode)")), envir = env) + if (utils::packageVersion("xgboost") < "0.6") { + stop("SL.xgboost requires xgboost version >= 0.6, try help(\'SL.xgboost\') for details") } - results = list(grid = tuneGrid, names = names) - invisible(results) + + predict(object$object, newdata = xgboost::xgb.DMatrix(data = newdata)) } diff --git a/R/SampleSplitSuperLearner.R b/R/SampleSplitSuperLearner.R index e9397e5..14186e0 100644 --- a/R/SampleSplitSuperLearner.R +++ b/R/SampleSplitSuperLearner.R @@ -1,8 +1,130 @@ -# SuperLearner for Sample Split instead of V-fold CV -# -# Created by Eric Polley on 2014-04-15. -# -SampleSplitSuperLearner <- function(Y, X, newX = NULL, family = gaussian(), SL.library, method = 'method.NNLS', id = NULL, verbose = FALSE, control = list(), split = 0.8, obsWeights = NULL) { +#' Sampling-Splitting SuperLearner Prediction Function +#' +#' @description +#' A Prediction Function for the SuperLearner. The \code{SuperLearner()} +#' function takes a training set pair (X, Y) and returns the predicted values +#' based on a validation set. `SampleSplitSuperLearner()` uses sample split +#' validation whereas [SuperLearner()] uses V-fold cross-validation. +#' +#' @inherit SuperLearner details +#' +#' @inheritParams SuperLearner +#' @param split Either a single value between 0 and 1 indicating the fraction +#' of the samples for the training split. A value of 0.8 will randomly assign +#' 80 percent of the samples to the training split and the other 20 percent to +#' the validation split. Alternatively, split can be a numeric vector with the +#' row numbers of \code{X} corresponding to the validation split. All other +#' rows not in the vector will be considered in the training split. +#' +#' @returns +#' \item{call}{ The matched call. } +#' \item{libraryNames}{ A character +#' vector with the names of the algorithms in the library. The format is +#' 'predictionAlgorithm_screeningAlgorithm' with '_All' used to denote the +#' prediction algorithm run on all variables in X. } +#' \item{SL.library}{ Returns \code{SL.library} in the same format as the argument with the same name +#' above. } +#' \item{SL.predict}{ The predicted values from the super learner for +#' the rows in \code{newX}. } +#' \item{coef}{ Coefficients for the super learner. +#' } +#' \item{library.predict}{ A matrix with the predicted values from each +#' algorithm in \code{SL.library} for the rows in \code{newX}. } +#' \item{Z}{ The +#' Z matrix (the cross-validated predicted values for each algorithm in +#' \code{SL.library}). } +#' \item{cvRisk}{ A numeric vector with the V-fold +#' cross-validated risk estimate for each algorithm in \code{SL.library}. Note +#' that this does not contain the CV risk estimate for the SuperLearner, only +#' the individual algorithms in the library. } +#' \item{family}{ Returns the +#' \code{family} value from above } +#' \item{fitLibrary}{ A list with the fitted +#' objects for each algorithm in \code{SL.library} on the full training data +#' set. } +#' \item{varNames}{ A character vector with the names of the variables +#' in \code{X}. } +#' \item{validRows}{ A list containing the row numbers for the +#' V-fold cross-validation step. } +#' \item{method}{ A list with the method +#' functions. } +#' \item{whichScreen}{ A logical matrix indicating which variables +#' passed each screening algorithm. } +#' \item{control}{ The \code{control} list. +#' } \item{split}{ The \code{split} value. } +#' \item{errorsInCVLibrary}{ A +#' logical vector indicating if any algorithms experienced an error within the +#' CV step. } +#' \item{errorsInLibrary}{ A logical vector indicating if any +#' algorithms experienced an error on the full data. } +#' +#' @author Eric C Polley \email{epolley@@uchicago.edu} +#' +#' @inherit SuperLearner references +#' +#' @keywords models +#' +#' @examples +#' \dontrun{ +#' ## simulate data +#' set.seed(23432) +#' ## training set +#' n <- 500 +#' p <- 50 +#' X <- matrix(rnorm(n*p), nrow = n, ncol = p) +#' colnames(X) <- paste("X", 1:p, sep="") +#' X <- data.frame(X) +#' Y <- X[, 1] + sqrt(abs(X[, 2] * X[, 3])) + X[, 2] - X[, 3] + rnorm(n) +#' +#' ## test set +#' m <- 1000 +#' newX <- matrix(rnorm(m*p), nrow = m, ncol = p) +#' colnames(newX) <- paste("X", 1:p, sep="") +#' newX <- data.frame(newX) +#' newY <- newX[, 1] + sqrt(abs(newX[, 2] * newX[, 3])) + newX[, 2] - +#' newX[, 3] + rnorm(m) +#' +#' # generate Library and run Super Learner +#' SL.library <- c("SL.glm", "SL.randomForest", "SL.gam", +#' "SL.polymars", "SL.mean") +#' +#' test <- SampleSplitSuperLearner(Y = Y, X = X, newX = newX, SL.library = SL.library, +#' verbose = TRUE, method = "method.NNLS") +#' +#' test +#' +#' # library with screening +#' SL.library <- list(c("SL.glmnet", "All"), +#' c("SL.glm", "screen.randomForest", "All", "screen.SIS"), +#' "SL.randomForest", +#' c("SL.polymars", "All"), +#' "SL.mean") +#' +#' test <- SampleSplitSuperLearner(Y = Y, X = X, newX = newX, SL.library = SL.library, +#' verbose = TRUE, method = "method.NNLS") +#' test +#' +#' # binary outcome +#' set.seed(1) +#' N <- 200 +#' X <- matrix(rnorm(N*10), N, 10) +#' X <- as.data.frame(X) +#' Y <- rbinom(N, 1, plogis(.2*X[, 1] + .1*X[, 2] - .2*X[, 3] + +#' .1*X[, 3]*X[, 4] - .2*abs(X[, 4]))) +#' +#' SL.library <- c("SL.glmnet", "SL.glm", "SL.knn", "SL.gam", "SL.mean") +#' +#' # least squares loss function +#' test.NNLS <- SampleSplitSuperLearner(Y = Y, X = X, SL.library = SL.library, +#' verbose = TRUE, method = "method.NNLS", +#' family = binomial()) +#' test.NNLS +#' } + +#' @export +SampleSplitSuperLearner <- function(Y, X, newX = NULL, family = gaussian(), SL.library, + method = 'method.NNLS', id = NULL, verbose = FALSE, control = list(), + split = 0.8, obsWeights = NULL) { if(is.character(method)) { if(exists(method, mode = 'list')) { method <- get(method, mode = 'list') @@ -145,7 +267,7 @@ SampleSplitSuperLearner <- function(Y, X, newX = NULL, family = gaussian(), SL.l for(s in seq(kScreen)) { testScreen <- try(do.call(library$screenAlgorithm[s], list(Y = tempOutcome, X = tempLearn, family = family, id = tempId, obsWeights = tempObsWeights))) if(inherits(testScreen, "try-error")) { - warning(paste("replacing failed screening algorithm,", library$screenAlgorithm[s], ", with All()", "\n ")) + warning(paste("replacing failed screening algorithm,", library$screenAlgorithm[s], ", with All()", "\n ")) tempWhichScreen[s, ] <- TRUE } else { tempWhichScreen[s, ] <- testScreen diff --git a/R/SuperLearner-package.R b/R/SuperLearner-package.R new file mode 100644 index 0000000..43882ec --- /dev/null +++ b/R/SuperLearner-package.R @@ -0,0 +1,7 @@ +#' @keywords internal +"_PACKAGE" + +## usethis namespace: start +#' @import stats +## usethis namespace: end +NULL diff --git a/R/SuperLearner.R b/R/SuperLearner.R index 8b7b6b4..fc57d5a 100644 --- a/R/SuperLearner.R +++ b/R/SuperLearner.R @@ -1,7 +1,315 @@ - # SuperLearner -# -# Created by Eric Polley on 2011-01-01. -# +#' Super Learner Prediction Function +#' +#' @description +#' \code{SuperLearner()} takes a training set pair (X,Y) and returns the predicted values +#' based on a validation set. +#' +#' @details +#' \code{SuperLearner()} fits the super learner prediction algorithm. The +#' weights for each algorithm in \code{SL.library} is estimated, along with the +#' fit of each algorithm. +#' +#' \emph{The prescreen algorithms} These algorithms first rank the variables in +#' \code{X} based on either a univariate regression p-value of the +#' \code{randomForest} variable importance. A subset of the variables in +#' \code{X} is selected based on a pre-defined cut-off. With this subset of +#' the X variables, the algorithms in \code{SL.library} are then fit. +#' +#' The SuperLearner package contains a few prediction and screening algorithm +#' wrappers. The full list of wrappers can be viewed with +#' \code{listWrappers()}. The design of the \pkg{SuperLearner} package is such that +#' the user can easily add their own wrappers. We also maintain a website with +#' additional examples of wrapper functions at +#' \url{https://github.com/ecpolley/SuperLearnerExtra}. +#' +#' @param Y The outcome in the training data set. Must be a numeric vector. +#' @param X The predictor variables in the training data set, usually a +#' data.frame. +#' @param newX The predictor variables in the validation data set. The +#' structure should match X. If missing, uses X for newX. +#' @param SL.library Either a character vector of prediction algorithms or a +#' list containing character vectors. See details below for examples on the +#' structure. A list of functions included in the SuperLearner package can be +#' found with \code{listWrappers()}. +#' @param verbose logical; TRUE for printing progress during the computation +#' (helpful for debugging). +#' @param family Currently allows \code{gaussian} or \code{binomial} to +#' describe the error distribution. Link function information will be ignored +#' and should be contained in the method argument below. +#' @param method A list (or a function to create a list) containing details on +#' estimating the coefficients for the super learner and the model to combine +#' the individual algorithms in the library. See \code{?method.template} for +#' details. Currently, the built in options are either "method.NNLS" (the +#' default), "method.NNLS2", "method.NNloglik", "method.CC_LS", +#' "method.CC_nloglik", or "method.AUC". NNLS and NNLS2 are non-negative least +#' squares based on the Lawson-Hanson algorithm and the dual method of Goldfarb +#' and Idnani, respectively. NNLS and NNLS2 will work for both gaussian and +#' binomial outcomes. NNloglik is a non-negative binomial likelihood +#' maximization using the BFGS quasi-Newton optimization method. NN* methods +#' are normalized so weights sum to one. CC_LS uses Goldfarb and Idnani's +#' quadratic programming algorithm to calculate the best convex combination of +#' weights to minimize the squared error loss. CC_nloglik calculates the convex +#' combination of weights that minimize the negative binomial log likelihood on +#' the logistic scale using the sequential quadratic programming algorithm. +#' AUC, which only works for binary outcomes, uses the Nelder-Mead method via +#' the optim function to minimize rank loss (equivalent to maximizing AUC). +#' @param id Optional cluster identification variable. For the cross-validation +#' splits, \code{id} forces observations in the same cluster to be in the same +#' validation fold. \code{id} is passed to the prediction and screening +#' algorithms in SL.library, but be sure to check the individual wrappers as +#' many of them ignore the information. +#' @param obsWeights Optional observation weights variable. As with \code{id} +#' above, \code{obsWeights} is passed to the prediction and screening +#' algorithms, but many of the built in wrappers ignore (or can't use) the +#' information. If you are using observation weights, make sure the library you +#' specify uses the information. +#' @param control A list of parameters to control the estimation process. +#' Parameters include \code{saveFitLibrary} and \code{trimLogit}. See +#' \code{\link{SuperLearner.control}} for details. +#' @param cvControl A list of parameters to control the cross-validation +#' process. Parameters include \code{V}, \code{stratifyCV}, \code{shuffle} and +#' \code{validRows}. See \code{\link{SuperLearner.CV.control}} for details. +#' @param env Environment containing the learner functions. Defaults to the +#' calling environment. +#' +#' @returns +#' \item{call}{ The matched call. } +#' \item{libraryNames}{ A character +#' vector with the names of the algorithms in the library. The format is +#' 'predictionAlgorithm_screeningAlgorithm' with '_All' used to denote the +#' prediction algorithm run on all variables in X. } +#' \item{SL.library}{ Returns \code{SL.library} in the same format as the argument with the same name +#' above. } +#' \item{SL.predict}{ The predicted values from the super learner for +#' the rows in \code{newX}. } +#' \item{coef}{ Coefficients for the super learner.} +#' \item{library.predict}{ A matrix with the predicted values from each +#' algorithm in \code{SL.library} for the rows in \code{newX}. } +#' \item{Z}{ The +#' Z matrix (the cross-validated predicted values for each algorithm in +#' \code{SL.library}). } +#' \item{cvRisk}{ A numeric vector with the V-fold +#' cross-validated risk estimate for each algorithm in \code{SL.library}. Note +#' that this does not contain the CV risk estimate for the SuperLearner, only +#' the individual algorithms in the library. } +#' \item{family}{ Returns the +#' \code{family} value from above } +#' \item{fitLibrary}{ A list with the fitted +#' objects for each algorithm in \code{SL.library} on the full training data +#' set. } +#' \item{cvFitLibrary}{ A list with fitted objects for each algorithm in +#' \code{SL.library} on each of \code{V} different training data sets. } +#' \item{varNames}{ A character vector with the names of the variables in +#' \code{X}. } +#' \item{validRows}{ A list containing the row numbers for the +#' V-fold cross-validation step. } +#' \item{method}{ A list with the method +#' functions. } +#' \item{whichScreen}{ A logical matrix indicating which variables +#' passed each screening algorithm. } +#' \item{control}{ The \code{control} list. +#' } +#' \item{cvControl}{ The \code{cvControl} list. } +#' \item{errorsInCVLibrary}{ A +#' logical vector indicating if any algorithms experienced an error within the +#' CV step. } +#' \item{errorsInLibrary}{ A logical vector indicating if any +#' algorithms experienced an error on the full data. } +#' \item{env}{ Environment +#' passed into function which will be searched to find the learner functions. +#' Defaults to the calling environment. } +#' \item{times}{ A list that contains +#' the execution time of the SuperLearner, plus separate times for model +#' fitting and prediction. } +#' +#' @author Eric C Polley \email{epolley@@uchicago.edu} +#' +#' @references +#' van der Laan, M. J., Polley, E. C. and Hubbard, A. E. (2008) Super Learner, \emph{Statistical Applications of Genetics and Molecular Biology}, \bold{6}, article 25. +#' +#' @seealso [CV.SuperLearner()], [SampleSplitSuperLearner()] +#' +#' @keywords models +#' +#' @examples +#' +#' \dontrun{ +#' ## simulate data +#' set.seed(23432) +#' ## training set +#' n <- 500 +#' p <- 50 +#' X <- matrix(rnorm(n*p), nrow = n, ncol = p) +#' colnames(X) <- paste("X", 1:p, sep="") +#' X <- data.frame(X) +#' Y <- X[, 1] + sqrt(abs(X[, 2] * X[, 3])) + X[, 2] - X[, 3] + rnorm(n) +#' +#' ## test set +#' m <- 1000 +#' newX <- matrix(rnorm(m*p), nrow = m, ncol = p) +#' colnames(newX) <- paste("X", 1:p, sep="") +#' newX <- data.frame(newX) +#' newY <- newX[, 1] + sqrt(abs(newX[, 2] * newX[, 3])) + newX[, 2] - +#' newX[, 3] + rnorm(m) +#' +#' # generate Library and run Super Learner +#' SL.library <- c("SL.glm", "SL.randomForest", "SL.gam", +#' "SL.polymars", "SL.mean") +#' test <- SuperLearner(Y = Y, X = X, newX = newX, SL.library = SL.library, +#' verbose = TRUE, method = "method.NNLS") +#' test +#' +#' # library with screening +#' SL.library <- list(c("SL.glmnet", "All"), c("SL.glm", "screen.randomForest", +#' "All", "screen.SIS"), "SL.randomForest", c("SL.polymars", "All"), "SL.mean") +#' test <- SuperLearner(Y = Y, X = X, newX = newX, SL.library = SL.library, +#' verbose = TRUE, method = "method.NNLS") +#' test +#' +#' # binary outcome +#' set.seed(1) +#' N <- 200 +#' X <- matrix(rnorm(N*10), N, 10) +#' X <- as.data.frame(X) +#' Y <- rbinom(N, 1, plogis(.2*X[, 1] + .1*X[, 2] - .2*X[, 3] + +#' .1*X[, 3]*X[, 4] - .2*abs(X[, 4]))) +#' +#' SL.library <- c("SL.glmnet", "SL.glm", "SL.knn", "SL.gam", "SL.mean") +#' +#' # least squares loss function +#' test.NNLS <- SuperLearner(Y = Y, X = X, SL.library = SL.library, +#' verbose = TRUE, method = "method.NNLS", family = binomial()) +#' test.NNLS +#' +#' # negative log binomial likelihood loss function +#' test.NNloglik <- SuperLearner(Y = Y, X = X, SL.library = SL.library, +#' verbose = TRUE, method = "method.NNloglik", family = binomial()) +#' test.NNloglik +#' +#' # 1 - AUC loss function +#' test.AUC <- SuperLearner(Y = Y, X = X, SL.library = SL.library, +#' verbose = TRUE, method = "method.AUC", family = binomial()) +#' test.AUC +#' +#' # 2 +#' # adapted from library(SIS) +#' set.seed(1) +#' # training +#' b <- c(2, 2, 2, -3*sqrt(2)) +#' n <- 150 +#' p <- 200 +#' truerho <- 0.5 +#' corrmat <- diag(rep(1-truerho, p)) + matrix(truerho, p, p) +#' corrmat[, 4] = sqrt(truerho) +#' corrmat[4, ] = sqrt(truerho) +#' corrmat[4, 4] = 1 +#' cholmat <- chol(corrmat) +#' x <- matrix(rnorm(n*p, mean=0, sd=1), n, p) +#' x <- x %*% cholmat +#' feta <- x[, 1:4] %*% b +#' fprob <- exp(feta) / (1 + exp(feta)) +#' y <- rbinom(n, 1, fprob) +#' +#' # test +#' m <- 10000 +#' newx <- matrix(rnorm(m*p, mean=0, sd=1), m, p) +#' newx <- newx %*% cholmat +#' newfeta <- newx[, 1:4] %*% b +#' newfprob <- exp(newfeta) / (1 + exp(newfeta)) +#' newy <- rbinom(m, 1, newfprob) +#' +#' DATA2 <- data.frame(Y = y, X = x) +#' newDATA2 <- data.frame(Y = newy, X=newx) +#' +#' create.SL.knn <- function(k = c(20, 30)) { +#' for(mm in seq(length(k))){ +#' eval(parse(text = paste('SL.knn.', k[mm], '<- function(..., k = ', k[mm], +#' ') SL.knn(..., k = k)', sep = '')), envir = .GlobalEnv) +#' } +#' invisible(TRUE) +#' } +#' create.SL.knn(c(20, 30, 40, 50, 60, 70)) +#' +#' # library with screening +#' SL.library <- list(c("SL.glmnet", "All"), c("SL.glm", "screen.randomForest"), +#' "SL.randomForest", "SL.knn", "SL.knn.20", "SL.knn.30", "SL.knn.40", +#' "SL.knn.50", "SL.knn.60", "SL.knn.70", +#' c("SL.polymars", "screen.randomForest")) +#' test <- SuperLearner(Y = DATA2$Y, X = DATA2[, -1], newX = newDATA2[, -1], +#' SL.library = SL.library, verbose = TRUE, family = binomial()) +#' test +#' +#' ## examples with multicore +#' set.seed(23432, "L'Ecuyer-CMRG") # use L'Ecuyer for multicore seeds. see ?set.seed for details +#' ## training set +#' n <- 500 +#' p <- 50 +#' X <- matrix(rnorm(n*p), nrow = n, ncol = p) +#' colnames(X) <- paste("X", 1:p, sep="") +#' X <- data.frame(X) +#' Y <- X[, 1] + sqrt(abs(X[, 2] * X[, 3])) + X[, 2] - X[, 3] + rnorm(n) +#' +#' ## test set +#' m <- 1000 +#' newX <- matrix(rnorm(m*p), nrow = m, ncol = p) +#' colnames(newX) <- paste("X", 1:p, sep="") +#' newX <- data.frame(newX) +#' newY <- newX[, 1] + sqrt(abs(newX[, 2] * newX[, 3])) + newX[, 2] - newX[, 3] + rnorm(m) +#' +#' # generate Library and run Super Learner +#' SL.library <- c("SL.glm", "SL.randomForest", "SL.gam", +#' "SL.polymars", "SL.mean") +#' +#' testMC <- mcSuperLearner(Y = Y, X = X, newX = newX, SL.library = SL.library, +#' method = "method.NNLS") +#' testMC +#' +#' ## examples with snow +#' library(parallel) +#' cl <- makeCluster(2, type = "PSOCK") # can use different types here +#' clusterSetRNGStream(cl, iseed = 2343) +#' # make SL functions available on the clusters, use assignment to avoid printing +#' foo <- clusterEvalQ(cl, library(SuperLearner)) +#' testSNOW <- snowSuperLearner(cluster = cl, Y = Y, X = X, newX = newX, +#' SL.library = SL.library, method = "method.NNLS") +#' testSNOW +#' stopCluster(cl) +#' +#' ## snow example with user-generated wrappers +#' # If you write your own wrappers and are using snowSuperLearner() +#' # These new wrappers need to be added to the SuperLearner namespace and exported to the clusters +#' # Using a simple example here, but can define any new SuperLearner wrapper +#' my.SL.wrapper <- function(...) SL.glm(...) +#' # assign function into SuperLearner namespace +#' environment(my.SL.wrapper) <-asNamespace("SuperLearner") +#' +#' cl <- makeCluster(2, type = "PSOCK") # can use different types here +#' clusterSetRNGStream(cl, iseed = 2343) +#' # make SL functions available on the clusters, use assignment to avoid printing +#' foo <- clusterEvalQ(cl, library(SuperLearner)) +#' clusterExport(cl, c("my.SL.wrapper")) # copy the function to all clusters +#' testSNOW <- snowSuperLearner(cluster = cl, Y = Y, X = X, newX = newX, +#' SL.library = c("SL.glm", "SL.mean", "my.SL.wrapper"), method = "method.NNLS") +#' testSNOW +#' stopCluster(cl) +#' +#' ## timing +#' replicate(5, system.time(SuperLearner(Y = Y, X = X, newX = newX, +#' SL.library = SL.library, method = "method.NNLS"))) +#' +#' replicate(5, system.time(mcSuperLearner(Y = Y, X = X, newX = newX, +#' SL.library = SL.library, method = "method.NNLS"))) +#' +#' cl <- makeCluster(2, type = 'PSOCK') +#' # make SL functions available on the clusters, use assignment to avoid printing +#' foo <- clusterEvalQ(cl, library(SuperLearner)) +#' replicate(5, system.time(snowSuperLearner(cl, Y = Y, X = X, newX = newX, +#' SL.library = SL.library, method = "method.NNLS"))) +#' stopCluster(cl) +#' +#' } + +#' @export SuperLearner <- function(Y, X, newX = NULL, family = gaussian(), SL.library, method = 'method.NNLS', id = NULL, verbose = FALSE, control = list(), cvControl = list(), obsWeights = NULL, env = parent.frame()) { @@ -106,7 +414,7 @@ SuperLearner <- function(Y, X, newX = NULL, family = gaussian(), SL.library, } # create function for the cross-validation step: - .crossValFUN <- function(valid, Y, dataX, id, obsWeights, library, + .crossValFUN <- function(valid, Y, dataX, id, obsWeights, library, kScreen, k, p, libraryNames, saveCVFitLibrary) { tempLearn <- dataX[-valid, , drop = FALSE] tempOutcome <- Y[-valid] @@ -163,10 +471,10 @@ SuperLearner <- function(Y, X, newX = NULL, family = gaussian(), SL.library, # need to unlist folds to put the rows back in the correct order time_train_start = proc.time() - crossValFUN_out <- lapply(validRows, FUN = .crossValFUN, - Y = Y, dataX = X, id = id, - obsWeights = obsWeights, - library = library, kScreen = kScreen, + crossValFUN_out <- lapply(validRows, FUN = .crossValFUN, + Y = Y, dataX = X, id = id, + obsWeights = obsWeights, + library = library, kScreen = kScreen, k = k, p = p, libraryNames = libraryNames, saveCVFitLibrary = control$saveCVFitLibrary) Z[unlist(validRows, use.names = FALSE), ] <- do.call('rbind', lapply(crossValFUN_out, "[[", "out")) diff --git a/R/SuperLearnerNews.R b/R/SuperLearnerNews.R index b530860..c308495 100755 --- a/R/SuperLearnerNews.R +++ b/R/SuperLearnerNews.R @@ -1,16 +1,34 @@ -SuperLearnerNews <- function(...) { - RShowDoc("NEWS", package = "SuperLearner", ...) -} +#' Show the NEWS and documentation files for the SuperLearner package +#' +#' Show the NEWS file of the SuperLearner package. The function is simply a +#' wrapper for [utils::RShowDoc()]. +#' +#' @param what specify what document to open. Currently supports the NEWS file +#' and the PDF files 'SuperLearner.pdf' and 'SuperLearnerR.pdf'. +#' @param \dots additional arguments passed to [utils::RShowDoc()]. +#' +#' @returns +#' A invisible character string given the path to the SuperLearner NEWS or documentation file. +#' +#' @keywords utilities +#' @export SuperLearnerDocs <- function(what = 'SuperLearnerR.pdf', ...) { - if(what == 'NEWS') { - RShowDoc('NEWS', package = "SuperLearner", ...) - } else { + if (what == 'NEWS') { + utils::RShowDoc('NEWS', package = "SuperLearner", ...) + } + else { f <- system.file('doc', what, package = 'SuperLearner') - if (.Platform$OS.type == 'windows') - shell.exec(f) - else system(paste(Sys.getenv('R_PDFVIEWER'), f, '&')) - return(f) + if (.Platform$OS.type == 'windows') shell.exec(f) + else system(paste(Sys.getenv('R_PDFVIEWER'), f, '&')) + + return(f) } - -} \ No newline at end of file +} + +#' @export +#' @rdname SuperLearnerDocs +SuperLearnerNews <- function(...) { + utils::RShowDoc("NEWS", package = "SuperLearner", ...) +} + diff --git a/R/control.R b/R/control.R index dfd94a4..0e8753f 100644 --- a/R/control.R +++ b/R/control.R @@ -1,27 +1,61 @@ -# control functions for SuperLearner() -# -# Created by Eric Polley on 2011-01-03. -# -SuperLearner.control <- function(saveFitLibrary = TRUE, +#' Control parameters for `SuperLearner()` +#' +#' @param saveFitLibrary Logical. Should the fit for each algorithm be saved in +#' the output from \code{SuperLearner()}. +#' @param saveCVFitLibrary Logical. Should cross-validated fits for each +#' algorithm be saved in the output from \code{SuperLearner()}. +#' @param trimLogit number between 0.0 and 0.5. What level to truncate the +#' logit transformation to maintain a bounded loss function when using the +#' NNloglik method. +#' +#' @returns +#' A list containing the control parameters. +#' +#' @keywords utilities +#' +#' @export +SuperLearner.control <- function(saveFitLibrary = TRUE, saveCVFitLibrary = FALSE, trimLogit = 0.001) { - if(trimLogit > 0.5) { + if (trimLogit > 0.5) { warning('trimLogit must be less than 0.5, will replace with trimLogit = 0.001') trimLogit <- 0.001 } + list(saveFitLibrary = saveFitLibrary, trimLogit = trimLogit, saveCVFitLibrary = saveCVFitLibrary) } +#' Control parameters for the cross validation steps in \code{SuperLearner()} +#' +#' @param V Integer. Number of splits for the V-fold cross-validation step. The +#' default is 10. In most cases, between 10 and 20 splits works well. +#' @param stratifyCV Logical. Should the data splits be stratified by a binary +#' response? Attempts to maintain the same ratio in each training and +#' validation sample. +#' @param shuffle Logical. Should the rows of \code{X} be shuffled before +#' creating the splits. +#' @param validRows A List. Use this to pass pre-specified rows for the sample +#' splits. The length of the list should be \code{V} and each entry in the list +#' should contain a vector with the row numbers of the corresponding validation +#' sample. +#' +#' @returns +#' A list containing the control parameters. +#' +#' @keywords utilities + +#' @export `SuperLearner.CV.control` SuperLearner.CV.control <- function(V = 10L, stratifyCV = FALSE, shuffle = TRUE, validRows = NULL){ # make sure V is an integer V <- as.integer(V) - + # Checks for user supplied validRows is present: - if(!is.null(validRows)) { - if(!is.list(validRows)) { + if (!is.null(validRows)) { + if (!is.list(validRows)) { stop('validRows must be a list of length V containing the row numbers for the corresponding validation set') } - if(!identical(V, length(validRows))) { + + if (!identical(V, length(validRows))) { stop('V and length(validRows) must be identical') } } diff --git a/R/create.Learner.R b/R/create.Learner.R index 35dcea1..fafe0cb 100644 --- a/R/create.Learner.R +++ b/R/create.Learner.R @@ -50,7 +50,7 @@ #' sl #' } #' -#' @export +#' @export `create.Learner` create.Learner = function(base_learner, params = list(), tune = list(), env = parent.frame(), name_prefix = base_learner, detailed_names = F, verbose = F) { @@ -71,11 +71,11 @@ create.Learner = function(base_learner, params = list(), tune = list(), if (length(tune) > 0) { # Specify drop=F in case tuneGrid is a single-column dataframe. - g = tuneGrid[i, , drop=F] + g = tuneGrid[i, , drop = FALSE] # Separate with "_" because some hyperparameters could be floats with a period. if (detailed_names) { - name = do.call(paste, c(list(name_prefix), g, list(sep="_"))) + name = do.call(paste, c(list(name_prefix), g, list(sep = "_"))) } } else { g = c() @@ -94,7 +94,7 @@ create.Learner = function(base_learner, params = list(), tune = list(), # May need to tweak this if someone really needs to pass a NULL for some reason. if (!is.null(val) && val != "NULL") { # Add quotes around val if it is a string rather than numeric. - if (is(val, "character")) { + if (is.character(val)) { val = paste0('"', val, '"') } fn_params = paste0(fn_params, ", ", name_i, "=", val) diff --git a/R/create.SL.xgboost.R b/R/create.SL.xgboost.R new file mode 100644 index 0000000..23d9c70 --- /dev/null +++ b/R/create.SL.xgboost.R @@ -0,0 +1,66 @@ +#' Factory for XGBoost SL wrappers +#' +#' Create multiple configurations of XGBoost learners based on the desired combinations of hyperparameters. +#' +#' @param tune List of hyperparameter settings to test. If specified, each hyperparameter will need to be defined. +#' @param detailed_names Set to T to have the function names include the parameter configurations. +#' @param env Environment in which to create the SL.xgboost functions. Defaults to the global environment. +#' @param name_prefix The prefix string for the name of each function that is generated. +#' +#' @seealso [SL.xgboost()] +#' +#' @examples +#' +#' # Create a new environment to store the learner functions. +#' # This keeps the global environment organized. +#' sl_env = new.env() +#' # Create 2 * 2 * 1 * 3 = 12 combinations of hyperparameters. +#' tune = list(ntrees = c(100, 500), max_depth = c(1, 2), minobspernode = 10, +#' shrinkage = c(0.1, 0.01, 0.001)) +#' # Generate a separate learner for each combination. +#' xgb_grid = create.SL.xgboost(tune = tune, env = sl_env) +#' # Review the function configurations. +#' xgb_grid +#' # Attach the environment so that the custom learner functions can be accessed. +#' attach(sl_env) +#' \dontrun{ +#' sl = SuperLearner(Y = Y, X = X, SL.library = xgb_grid$names) +#' } +#' detach(sl_env) +#' +#' @export +create.SL.xgboost <- function(tune = list(ntrees = c(1000), max_depth = c(4), shrinkage = c(0.1), + minobspernode = c(10)), + detailed_names = FALSE, + env = .GlobalEnv, + name_prefix = "SL.xgb") { + # Create all combinations of hyperparameters, for grid-like search. + tuneGrid <- expand.grid(tune, stringsAsFactors = FALSE) + + slnames <- character(nrow(tuneGrid)) + + for (i in seq_len(nrow(tuneGrid))) { + g <- tuneGrid[i,] + + if (detailed_names) { + slnames[i] <- paste(name_prefix, g$ntrees, g$max_depth, g$shrinkage, g$minobspernode, sep = ".") + } + else { + slnames[i] <- paste(name_prefix, i, sep = ".") + } + + text <- sprintf(" + %s <- function(..., ntrees = %s, max_depth = %s, shrinkage = %s, minobspernode = %s) { + SL.xgboost(..., ntrees = ntrees, max_depth = max_depth, shrinkage = shrinkage, minobspernode = minobspernode) + } + ", slnames[i], g$ntrees, g$max_depth, g$shrinkage, g$minobspernode) + + exp <- str2expression(text) + + eval(exp, envir = env) + } + + results <- list(grid = tuneGrid, names = slnames) + + invisible(results) +} \ No newline at end of file diff --git a/R/internals.R b/R/internals.R index 97a0e00..64f2354 100644 --- a/R/internals.R +++ b/R/internals.R @@ -1,12 +1,12 @@ # Internal functions for SuperLearner package -# +# # Created by Eric Polley on 2011-01-03 -# +# # .SL.require() extends the require() function to add my own error messages # If a package uses special functions in a formula, may need to use require() instead .SL.require <- function(package, message = paste('loading required package (', package, ') failed', sep = '')) { - if(!requireNamespace(package, quietly = FALSE)) { + if(!requireNamespace(package, quietly = TRUE)) { stop(message, call. = FALSE) } invisible(TRUE) @@ -22,7 +22,7 @@ # the list contains character vectors, and the scructure of the character vectors is always prediction algorithm first, followed by a list of screening algorithms to be coupled with the prediction algorithm. If no screening algorithm is given (as in "predAlg3" above) then the algorithm will run on the entire set of variables by default. .createLibrary <- function(SL.library) { - if (is.character(SL.library)) { + if (is.character(SL.library)) { k <- length(SL.library) whichScreen <- matrix(1, nrow = 1, ncol = k) screenAlgorithm <- "All" @@ -38,12 +38,12 @@ } screenAlgorithmFull <- unlist(lapply(SL.library, FUN="[", -1)) screenAlgorithm <- unique(screenAlgorithmFull) - + library <- data.frame(predAlgorithm = rep(predNames, times=NumberScreen), rowScreen = match(screenAlgorithmFull, screenAlgorithm), stringsAsFactors = FALSE) } else { stop('format for SL.library is not recognized') } - + out <- list(library = library, screenAlgorithm = screenAlgorithm) return(out) } diff --git a/R/listWrapper.R b/R/listWrappers.R similarity index 50% rename from R/listWrapper.R rename to R/listWrappers.R index 1fd5f6f..3342186 100644 --- a/R/listWrapper.R +++ b/R/listWrappers.R @@ -1,12 +1,36 @@ +#' List all wrapper functions in SuperLearner +#' +#' @param what What list to return. Can be \code{both} for both prediction +#' algorithms and screening algorithms, \code{SL} for the prediction +#' algorithms, \code{screen} for the screening algorithms, \code{method} for +#' the estimation method details, or anything else will return a list of all +#' (exported) functions in the \code{SuperLearner} package. Additional wrapper +#' functions are available at +#' \url{https://github.com/ecpolley/SuperLearnerExtra}. +#' @returns +#' Invisible character vector with all exported functions in the +#' \pkg{SuperLearner} package +#' +#' @author Eric C Polley \email{epolley@@uchicago.edu} +#' +#' @seealso \code{\link{SuperLearner}} +#' +#' @keywords utilities +#' @examples +#' +#' listWrappers(what = "SL") +#' listWrappers(what = "screen") + +#' @export listWrappers <- function(what = "both") { everything <- sort(getNamespaceExports("SuperLearner")) - if(what == "both") { + if (what == "both") { message("All prediction algorithm wrappers in SuperLearner:\n") print(everything[grepl(pattern="^[S]L", everything)]) message("\nAll screening algorithm wrappers in SuperLearner:\n") print("All") print(everything[grepl(pattern="screen", everything)]) - } else if(what == "SL") { + } else if (what == "SL") { message("All prediction algorithm wrappers in SuperLearner:\n") print(everything[grepl(pattern="^[S]L", everything)]) } else if(what == "screen") { diff --git a/R/mcSuperLearner.R b/R/mcSuperLearner.R index 9e920e0..ebb5e55 100644 --- a/R/mcSuperLearner.R +++ b/R/mcSuperLearner.R @@ -1,11 +1,20 @@ -# mcSuperLearner -# -# Created by Eric Polley on 2011-01-01. -# +#' SuperLearner with multicore support +#' +#' `mcSuperlearner()` and `snowSuperlearner()` allow for multicore parallelization for [SuperLearner()]. +#' +#' @param cluster a `cluster` object from `parallel::makeCluster()`. +#' @param mc.cores The number of cores to use, i.e. at most how many child processes will be run simultaneously. Passed to `parallel::mclapply()`. +#' @param Y,X,newX,family,SL.library,method,id,verbose,control,cvControl,obsWeights,env See [SuperLearner()]. +#' +#' @details +#' `mcSuperlearner()` uses `parallel::mclapply()`, and `snowSuperlearner()` uses `parallel::parLapply()`. +#' + +#' @export mcSuperLearner <- function(Y, X, newX = NULL, family = gaussian(), SL.library, method = 'method.NNLS', id = NULL, verbose = FALSE, control = list(), cvControl = list(), obsWeights = NULL, - env = parent.frame()) { + env = parent.frame(), mc.cores = getOption("mc.cores", 2L)) { # Begin timing how long SuperLearner takes to execute. time_start = proc.time() @@ -153,7 +162,8 @@ mcSuperLearner <- function(Y, X, newX = NULL, family = gaussian(), SL.library, # rbind unlists the output from lapply # need to unlist folds to put the rows back in the correct order time_train = system.time({ - Z[unlist(validRows, use.names = FALSE), ] <- do.call('rbind', parallel::mclapply(validRows, FUN = .crossValFUN, Y = Y, dataX = X, id = id, obsWeights = obsWeights, library = library, kScreen = kScreen, k = k, p = p, libraryNames = libraryNames)) + Z[unlist(validRows, use.names = FALSE), ] <- do.call('rbind', parallel::mclapply(validRows, FUN = .crossValFUN, Y = Y, dataX = X, id = id, obsWeights = obsWeights, library = library, kScreen = kScreen, k = k, p = p, libraryNames = libraryNames, + mc.cores = mc.cores)) # Check for errors. If any algorithms had errors, replace entire column with # 0 even if error is only in one fold. @@ -240,7 +250,8 @@ mcSuperLearner <- function(Y, X, newX = NULL, family = gaussian(), SL.library, } - foo <- parallel::mclapply(seq(k), FUN = .predFun, lib = library$library, Y = Y, dataX = X, newX = newX, whichScreen = whichScreen, family = family, id = id, obsWeights = obsWeights, verbose = verbose, control = control, libraryNames = libraryNames) + foo <- parallel::mclapply(seq(k), FUN = .predFun, lib = library$library, Y = Y, dataX = X, newX = newX, whichScreen = whichScreen, family = family, id = id, obsWeights = obsWeights, verbose = verbose, control = control, libraryNames = libraryNames, + mc.cores = mc.cores) predY <- do.call('cbind', lapply(foo, '[[', 'pred')) assign('fitLibrary', lapply(foo, '[[', 'fitLibrary'), envir = fitLibEnv) rm(foo) @@ -316,3 +327,325 @@ mcSuperLearner <- function(Y, X, newX = NULL, family = gaussian(), SL.library, class(out) <- c("SuperLearner") return(out) } + +#' @export +#' @rdname mcSuperLearner +snowSuperLearner <- function(cluster, + Y, X, newX = NULL, family = gaussian(), SL.library, + method = 'method.NNLS', id = NULL, verbose = FALSE, + control = list(), cvControl = list(), obsWeights = NULL, + env = parent.frame()) { + + # Begin timing how long SuperLearner takes to execute. + time_start = proc.time() + + .SL.require('parallel') + if (!inherits(cluster, 'cluster')) stop('\'cluster\' must be a cluster created using the makeCluster() function in the snow package') + if (is.character(method)) { + if (exists(method, mode = 'list')) { + method <- get(method, mode = 'list') + } else if (exists(method, mode = 'function')) { + method <- get(method, mode = 'function')() + } + } else if (is.function(method)) { + method <- method() + } + if(!is.list(method)) { + stop("method is not in the appropriate format. Check out help('method.template')") + } + if(!is.null(method$require)) { + sapply(method$require, function(x) require(force(x), character.only = TRUE)) + } + # get defaults for controls and make sure in correct format + control <- do.call('SuperLearner.control', control) + cvControl <- do.call('SuperLearner.CV.control', cvControl) + + # put together the library + # should this be in a new environment? + library <- .createLibrary(SL.library) + .check.SL.library(library = c(unique(library$library$predAlgorithm), library$screenAlgorithm)) + + call <- match.call(expand.dots = TRUE) + # should we be checking X and newX for data.frame? + # data.frame not required, but most of the built-in wrappers assume a data.frame + if(!inherits(X, 'data.frame')) message('X is not a data frame. Check the algorithms in SL.library to make sure they are compatible with non data.frame inputs') + varNames <- colnames(X) + N <- dim(X)[1L] + p <- dim(X)[2L] + k <- nrow(library$library) + kScreen <- length(library$screenAlgorithm) + Z <- matrix(NA, N, k) + libraryNames <- paste(library$library$predAlgorithm, library$screenAlgorithm[library$library$rowScreen], sep="_") + + if(p < 2 & !identical(library$screenAlgorithm, "All")) { + warning('Screening algorithms specified in combination with single-column X.') + } + + # put fitLibrary in it's own environment to locate later + fitLibEnv <- new.env() + assign('fitLibrary', vector('list', length = k), envir = fitLibEnv) + assign('libraryNames', libraryNames, envir = fitLibEnv) + evalq(names(fitLibrary) <- libraryNames, envir = fitLibEnv) + + # errors* records if an algorithm stops either in the CV step and/or in full data + errorsInCVLibrary <- rep(0, k) + errorsInLibrary <- rep(0, k) + + # if newX is missing, use X + if(is.null(newX)) { + newX <- X + } + # Are these checks still required? + if(!identical(colnames(X), colnames(newX))) { + stop("The variable names and order in newX must be identical to the variable names and order in X") + } + if (sum(is.na(X)) > 0 | sum(is.na(newX)) > 0 | sum(is.na(Y)) > 0) { + stop("missing data is currently not supported. Check Y, X, and newX for missing values") + } + if (!is.numeric(Y)) { + stop("the outcome Y must be a numeric vector") + } + # family can be either character or function, so these lines put everything together (code from glm()) + if (is.character(family)) + family <- get(family, mode="function", envir=env) + if (is.function(family)) + family <- family() + if (is.null(family$family)) { + print(family) + stop("'family' not recognized") + } + + if (family$family != "binomial" & isTRUE("cvAUC" %in% method$require)){ + stop("'method.AUC' is designed for the 'binomial' family only") + } + + # create CV folds + validRows <- CVFolds(N = N, id = id, Y = Y, cvControl = cvControl) + + # test id + if(is.null(id)) { + id <- seq(N) + } + if(!identical(length(id), N)) { + stop("id vector must have the same dimension as Y") + } + # test observation weights + if(is.null(obsWeights)) { + obsWeights <- rep(1, N) + } + if(!identical(length(obsWeights), N)) { + stop("obsWeights vector must have the same dimension as Y") + } + + # create function for the cross-validation step: + .crossValFUN <- function(valid, Y, dataX, id, obsWeights, library, kScreen, k, p, libraryNames, verbose) { + tempLearn <- dataX[-valid, , drop = FALSE] + tempOutcome <- Y[-valid] + tempValid <- dataX[valid, , drop = FALSE] + tempWhichScreen <- matrix(NA, nrow = kScreen, ncol = p) + tempId <- id[-valid] + tempObsWeights <- obsWeights[-valid] + + # should this be converted to a lapply also? + for(s in seq(kScreen)) { + screen_fn = get(library$screenAlgorithm[s], envir = env) + testScreen <- try(do.call(screen_fn, list(Y = tempOutcome, X = tempLearn, family = family, id = tempId, obsWeights = tempObsWeights))) + if(inherits(testScreen, "try-error")) { + warning(paste("replacing failed screening algorithm,", library$screenAlgorithm[s], ", with All()", "\n ")) + tempWhichScreen[s, ] <- TRUE + } else { + tempWhichScreen[s, ] <- testScreen + } + if(verbose) { + message(paste("Number of covariates in ", library$screenAlgorithm[s], " is: ", sum(tempWhichScreen[s, ]), sep = "")) + } + } #end screen + + # should this be converted to a lapply also? + out <- matrix(NA, nrow = nrow(tempValid), ncol = k) + for(s in seq(k)) { + pred_fn = library$library$predAlgorithm[s] + testAlg <- try(do.call(pred_fn, list(Y = tempOutcome, X = subset(tempLearn, select = tempWhichScreen[library$library$rowScreen[s], ], drop=FALSE), newX = subset(tempValid, select = tempWhichScreen[library$library$rowScreen[s], ], drop=FALSE), family = family, id = tempId, obsWeights = tempObsWeights))) + if(inherits(testAlg, "try-error")) { + warning(paste("Error in algorithm", library$library$predAlgorithm[s], "\n The Algorithm will be removed from the Super Learner (i.e. given weight 0) \n" )) + # errorsInCVLibrary[s] <<- 1 + # '<<-' doesn't work with snow. + } else { + out[, s] <- testAlg$pred + } + # verbose will not work in the GUI, but works in the terminal (test this) + if(verbose) message(paste("CV", libraryNames[s])) + } #end library + invisible(out) + } + # the lapply performs the cross-validation steps to create Z + # additional steps to put things in the correct order + # rbind unlists the output from lapply + # need to unlist folds to put the rows back in the correct order + time_train = system.time({ + Z[unlist(validRows, use.names = FALSE), ] <- do.call('rbind', parallel::parLapply(cl = cluster, X = validRows, fun = .crossValFUN, Y = Y, dataX = X, id = id, obsWeights = obsWeights, library = library, kScreen = kScreen, k = k, p = p, libraryNames = libraryNames, verbose = verbose)) + + # Check for errors. If any algorithms had errors, replace entire column with + # 0 even if error is only in one fold. + errorsInCVLibrary <- apply(Z, 2, function(x) any(is.na(x))) + if(sum(errorsInCVLibrary) > 0) { + Z[, as.logical(errorsInCVLibrary)] <- 0 + } + if(all(Z == 0)) { + stop("All algorithms dropped from library") + } + + # Compute weights for each algorithm in library. + getCoef <- method$computeCoef(Z = Z, Y = Y, libraryNames = libraryNames, + obsWeights = obsWeights, control = control, + verbose = verbose, + errorsInLibrary = errorsInCVLibrary) + coef <- getCoef$coef + names(coef) <- libraryNames + + }) # Finish timing. + + # Set a default in case the method does not return the optimizer result. + if(!("optimizer" %in% names(getCoef))) { + getCoef["optimizer"] <- NA + } + + # now fit all algorithms in library on entire learning data set and predict on newX + m <- dim(newX)[1L] + predY <- matrix(NA, nrow = m, ncol = k) + # whichScreen <- matrix(NA, nrow = kScreen, ncol = p) + + .screenFun <- function(fun, list) { + screen_fn = get(fun, envir = env) + testScreen <- try(do.call(screen_fn, list)) + if(inherits(testScreen, "try-error")) { + warning(paste("replacing failed screening algorithm,", fun, ", with All() in full data", "\n ")) + out <- rep(TRUE, ncol(list$X)) + } else { + out <- testScreen + } + return(out) + } + + time_predict = system.time({ + + whichScreen <- if (length(library$screenAlgorithm) < 2) { + sapply(library$screenAlgorithm, FUN = .screenFun, list = list(Y = Y, X = X, family = family, id = id, obsWeights = obsWeights), simplify = FALSE) + } else { + parallel::parSapply(cl = cluster, X = library$screenAlgorithm, FUN = .screenFun, list = list(Y = Y, X = X, family = family, id = id, obsWeights = obsWeights), simplify = FALSE) + } + whichScreen <- do.call(rbind, whichScreen) + # change to sapply? + # for(s in 1:k) { + # testAlg <- try(do.call(library$library$predAlgorithm[s], list(Y = Y, X = subset(X, select = whichScreen[library$library$rowScreen[s], ], drop=FALSE), newX = subset(newX, select = whichScreen[library$library$rowScreen[s], ], drop=FALSE), family = family, id = id, obsWeights = obsWeights))) + # if(inherits(testAlg, "try-error")) { + # warning(paste("Error in algorithm", library$library$predAlgorithm[s], " on full data", "\n The Algorithm will be removed from the Super Learner (i.e. given weight 0) \n" )) + # errorsInLibrary[s] <- 1 + # } else { + # predY[, s] <- testAlg$pred + # } + # if(control$saveFitLibrary) { + # fitLibrary[[s]] <- testAlg$fit + # } + # if(verbose) { + # message(paste("full", libraryNames[s])) + # } + # } + .predFun <- function(index, lib, Y, dataX, newX, whichScreen, family, id, obsWeights, verbose, control, libraryNames) { + out <- list(pred = NA, fitLibrary = NULL) + pred_fn = get(lib$predAlgorithm[index], envir = env) + testAlg <- try(do.call(pred_fn, list(Y = Y, X = subset(dataX, select = whichScreen[lib$rowScreen[index], ], drop=FALSE), newX = subset(newX, select = whichScreen[lib$rowScreen[index], ], drop=FALSE), family = family, id = id, obsWeights = obsWeights))) + if(inherits(testAlg, "try-error")) { + warning(paste("Error in algorithm", lib$predAlgorithm[index], " on full data", "\n The Algorithm will be removed from the Super Learner (i.e. given weight 0) \n" )) + out$pred <- rep.int(NA, times = nrow(newX)) + } else { + out$pred <- testAlg$pred + if(control$saveFitLibrary) { + # eval(bquote(fitLibrary[[.(index)]] <- .(testAlg$fit)), envir = fitLibEnv) + out$fitLibrary <- testAlg$fit + } + } + if(verbose) { + message(paste("full", libraryNames[index])) + } + invisible(out) + } + + + foo <- parallel::parLapply(cl = cluster, X = seq(k), fun = .predFun, lib = library$library, Y = Y, dataX = X, newX = newX, whichScreen = whichScreen, family = family, id = id, obsWeights = obsWeights, verbose = verbose, control = control, libraryNames = libraryNames) + predY <- do.call('cbind', lapply(foo, '[[', 'pred')) + assign('fitLibrary', lapply(foo, '[[', 'fitLibrary'), envir = fitLibEnv) + rm(foo) + + # predY <- do.call('cbind', parLapply(cluster, seq(k), fun = .predFun, lib = library$library, Y = Y, dataX = X, newX = newX, whichScreen = whichScreen, family = family, id = id, obsWeights = obsWeights, verbose = verbose, control = control, libraryNames = libraryNames)) + + # check for errors + errorsInLibrary <- apply(predY, 2, function(xx) any(is.na(xx))) + if (sum(errorsInLibrary) > 0) { + if (sum(coef[as.logical(errorsInLibrary)]) > 0) { + warning(paste0("Re-running estimation of coefficients removing failed algorithm(s)\n", + "Original coefficients are: \n", paste(coef, collapse = ", "), "\n")) + Z[, as.logical(errorsInLibrary)] <- 0 + if (all(Z == 0)) { + stop("All algorithms dropped from library") + } + getCoef <- method$computeCoef(Z = Z, Y = Y, libraryNames = libraryNames, + obsWeights = obsWeights, control = control, + verbose = verbose, + errorsInLibrary = errorsInLibrary) + coef <- getCoef$coef + names(coef) <- libraryNames + } else { + warning("coefficients already 0 for all failed algorithm(s)") + } + } + + # Compute super learner predictions on newX. + getPred <- method$computePred(predY = predY, coef = coef, control = control) + + }) # Finish timing. + + # Add names of algorithms to the predictions. + colnames(predY) <- libraryNames + + # Clean up when errors in library. + if(sum(errorsInCVLibrary) > 0) { + getCoef$cvRisk[as.logical(errorsInCVLibrary)] <- NA + } + + # Finish timing the full SuperLearner execution. + time_end = proc.time() + + # Compile execution times. + times = list(everything = time_end - time_start, + train = time_train, + predict = time_predict) + + # Put everything together in a list. + out <- list( + call = call, + libraryNames = libraryNames, + SL.library = library, + SL.predict = getPred, + coef = coef, + library.predict = predY, + Z = Z, + cvRisk = getCoef$cvRisk, + family = family, + fitLibrary = get('fitLibrary', envir = fitLibEnv), + id = id, + varNames = varNames, + validRows = validRows, + method = method, + whichScreen = whichScreen, + control = control, + errorsInCVLibrary = errorsInCVLibrary, + errorsInLibrary = errorsInLibrary, + obsWeights = obsWeights, + metaOptimizer = getCoef$optimizer, + env = env, + times = times + ) + class(out) <- c("SuperLearner") + return(out) +} \ No newline at end of file diff --git a/R/method.R b/R/method.R index bac3c40..0848753 100644 --- a/R/method.R +++ b/R/method.R @@ -1,3 +1,50 @@ +#' Methods to estimate the coefficients for the SuperLearner +#' @name method +#' +#' @description +#' These functions contain the information on the loss function and the model +#' to combine algorithms. +#' +#' @details +#' A \code{SuperLearner} method must be a list (or a function to create a list) +#' with exactly 3 elements. The 3 elements must be named \code{require}, +#' \code{computeCoef} and \code{computePred}. +#' +#' @param file A connection, or a character string naming a file to print to. +#' Passed to \code{\link{cat}}. +#' @param optim_method Passed to the \code{optim} call method. See +#' \code{\link{optim}} for details. +#' @param nlopt_method Either \code{optim_method} or \code{nlopt_method} must +#' be provided, the other must be \code{NULL} +#' @param bounds Bounds for parameter estimates +#' @param normalize Logical. Should the parameters be normalized to sum up to 1 +#' @param \dots Additional arguments passed to \code{\link{cat}}. +#' +#' @returns +#' A list containing 3 elements: +#' \item{require}{ A character vector +#' listing any required packages. Use \code{NULL} if no additional packages are +#' required } +#' \item{computeCoef}{ A function. The arguments are: \code{Z}, +#' \code{Y}, \code{libraryNames}, \code{obsWeights}, \code{control}, +#' \code{verbose}. The value is a list with two items: \code{cvRisk} and +#' \code{coef}. This function computes the coefficients of the super learner. +#' As the super learner minimizes the cross-validated risk, the loss function +#' information is contained in this function as well as the model to combine +#' the algorithms in \code{SL.library}. } +#' \item{computePred}{ A function. The +#' arguments are: \code{predY}, \code{coef}, \code{control}. The value is a +#' numeric vector with the super learner predicted values. } +#' +#' @author Eric C Polley \email{Polley.Eric@@mayo.edu} +#' +#' @seealso [SuperLearner()] +#' +#' @keywords utilities +#' @examples +#' write.method.template(file = '') +#' + # outline for SuperLearner methods # these should always have class 'SL.method' # @@ -7,6 +54,9 @@ # 1) compute coefficients # 2) compute predictions +#' +#' @export +#' @rdname SL.method method.template <- function() { out <- list( # require allows you to pass a character vector with required packages @@ -31,11 +81,15 @@ method.template <- function() { invisible(out) } +#' @export +#' @rdname SL.method write.method.template <- function(file = '', ...) { cat('method.template <- function() {\n out <- list(\n # require allows you to pass a character vector with required packages\n # use NULL if no required packages\n require = NULL,\n\n # computeCoef is a function that returns a list with two elements:\n # 1) coef: the weights (coefficients) for each algorithm\n # 2) cvRisk: the V-fold CV risk for each algorithm\n computeCoef = function(Z, Y, libraryNames, obsWeights, control, verbose, ...) {\n cvRisk <- numeric()\n coef <- numeric()\n out <- list(cvRisk = cvRisk, coef = coef)\n return(out)\n },\n\n # computePred is a function that takes the weights and the predicted values\n # from each algorithm in the library and combines them based on the model to\n # output the super learner predicted values\n computePred = function(predY, coef, control, ...) {\n out <- crossprod(t(predY), coef)\n return(out)\n }\n )\n invisible(out)\n }', file = file, ...) } # examples: +#' @export +#' @rdname SL.method method.NNLS <- function() { out <- list( require = 'nnls', @@ -45,7 +99,7 @@ method.NNLS <- function() { names(cvRisk) <- libraryNames # compute coef - fit.nnls <- nnls(sqrt(obsWeights) * Z, sqrt(obsWeights) * Y) + fit.nnls <- nnls::nnls(sqrt(obsWeights) * Z, sqrt(obsWeights) * Y) if (verbose) { message(paste("Non-Negative least squares convergence:", fit.nnls$mode == 1)) } @@ -77,6 +131,8 @@ method.NNLS <- function() { invisible(out) } +#' @export +#' @rdname SL.method method.NNLS2 <- function() { out <- list( require = 'quadprog', @@ -123,6 +179,8 @@ method.NNLS2 <- function() { invisible(out) } +#' @export +#' @rdname SL.method method.NNloglik <- function() { out <- list( require = NULL, @@ -187,6 +245,8 @@ method.NNloglik <- function() { invisible(out) } +#' @export +#' @rdname SL.method method.CC_LS <- function() { # Contributed by Sam Lendle # Edited by David Benkeser @@ -222,10 +282,10 @@ method.CC_LS <- function() { # set a tolerance level to avoid numerical instability tol <- 8 dupCols <- which(duplicated(round(Z, tol), MARGIN = 2)) - anyDupCols <- length(dupCols) > 0 + anyDupCols <- length(dupCols) > 0 if(anyDupCols){ # if present, throw warning identifying learners - warning(paste0(paste0(libraryNames[dupCols],collapse = ", "), + warning(paste0(paste0(libraryNames[dupCols],collapse = ", "), " are duplicates of previous learners.", " Removing from super learner.")) } @@ -266,6 +326,8 @@ method.CC_LS <- function() { invisible(out) } +#' @export +#' @rdname SL.method method.CC_nloglik <- function() { # Contributed by Sam Lendle # Edited by David Benkeser @@ -278,14 +340,14 @@ method.CC_nloglik <- function() { } computeCoef = function(Z, Y, libraryNames, obsWeights, control, verbose, ...) { # check for duplicated columns - # set a tolerance + # set a tolerance tol <- 8 dupCols <- which(duplicated(round(Z, tol), MARGIN = 2)) - anyDupCols <- length(dupCols) > 0 + anyDupCols <- length(dupCols) > 0 modZ <- Z if(anyDupCols){ # if present, throw warning identifying learners - warning(paste0(paste0(libraryNames[dupCols],collapse = ", "), + warning(paste0(paste0(libraryNames[dupCols],collapse = ", "), " are duplicates of previous learners.", " Removing from super learner.")) modZ <- modZ[,-dupCols] @@ -338,7 +400,7 @@ method.CC_nloglik <- function() { warning("Some algorithms have weights of NA, setting to 0.") coef[is.na(coef)] <- 0 } - # add in duplicated coefficients equal to 0 + # add in duplicated coefficients equal to 0 if(anyDupCols){ ind <- c(seq_along(coef), dupCols - 0.5) coef <- c(coef,rep(0, length(dupCols))) @@ -355,7 +417,9 @@ method.CC_nloglik <- function() { computeCoef = computeCoef, computePred = computePred) } - + +#' @export +#' @rdname SL.method method.AUC <- function(nlopt_method = NULL, optim_method = "L-BFGS-B", bounds = c(0, Inf), normalize = TRUE) { # Contributed by Erin LeDell diff --git a/R/plot.CV.SuperLearner.R b/R/plot.CV.SuperLearner.R index d242712..6376279 100644 --- a/R/plot.CV.SuperLearner.R +++ b/R/plot.CV.SuperLearner.R @@ -1,21 +1,52 @@ +#' Graphical display of the V-fold CV risk estimates +#' +#' The function plots the V-fold cross-validated risk estimates for the super +#' learner, the discrete super learner and each algorithm in the library. By +#' default the estimates will be sorted and include an asymptotic 95% +#' confidence interval. +#' +#' See [summary.CV.SuperLearner()] for details on how the estimates are +#' computed +#' +#' @param x The output from \code{CV.SuperLearner()}. +#' @param package Either `"ggplot2"` or `"lattice"`. The package selected must be +#' available. +#' @param constant A numeric value. The confidence interval is defined as `p +/- constant * se`, where `p` is the point estimate and `se` is the standard error. The default is the quantile of the standard normal corresponding to a 95% CI. +#' @param sort Logical. Should the rows in the plot be sorted from the smallest +#' to the largest point estimate. If `FALSE`, then the order is super learner, +#' discrete super learner, then the estimators in \code{SL.library}. +#' @param \dots Additional arguments for \code{summary.CV.SuperLearner()}. +#' +#' @returns +#' Returns the plot (either a ggplot2 object (class \code{ggplot}) or a +#' lattice object (class \code{trellis})) +#' +#' @author Eric C Polley \email{epolley@@uchicago.edu} +#' +#' @seealso \code{\link{summary.CV.SuperLearner}} and +#' \code{\link{CV.SuperLearner}} +#' +#' @keywords plot + +#' @exportS3Method plot CV.SuperLearner plot.CV.SuperLearner <- function(x, package = "ggplot2", constant = qnorm(0.975), sort = TRUE, ...) { sumx <- summary(x, ...) # if(sort) sumx$Table$Algorithm <- stats:::reorder.default(sumx$Table$Algorithm, -sumx$Table$Ave)\ - if(sort) sumx$Table$Algorithm <- reorder(sumx$Table$Algorithm, -sumx$Table$Ave) + if (sort) sumx$Table$Algorithm <- reorder(sumx$Table$Algorithm, -sumx$Table$Ave) Mean <- sumx$Table$Ave se <- sumx$Table$se Lower <- Mean - constant*se Upper <- Mean + constant*se # d <- data.frame(Y = Mean, X = sumx$Table$Algorithm, Lower = Lower, Upper = Upper) assign("d", data.frame(Y = Mean, X = sumx$Table$Algorithm, Lower = Lower, Upper = Upper)) - + if(package == "lattice") { .SL.require("lattice") p <- lattice::dotplot(X ~ Y, data = d, xlim = c(min(d$Lower) - 0.02, max(d$Upper) + 0.02), xlab = "V-fold CV Risk Estimate", ylab = "Method", panel = function(x, y){ lattice::panel.xyplot(x, y, pch = 16, cex = 1) lattice::panel.segments(d$Lower, y, d$Upper, y, lty = 1) }) - } + } if(package == "ggplot2") { .SL.require("ggplot2") p <- ggplot2::ggplot(d, ggplot2::aes_string(x = "X", y = "Y", ymin = "Lower", ymax = "Upper")) + ggplot2::geom_pointrange() + ggplot2::coord_flip() + ggplot2::ylab("V-fold CV Risk Estimate") + ggplot2::xlab("Method") diff --git a/R/predict.SL.template.R b/R/predict.SL.template.R new file mode 100644 index 0000000..b6b0e91 --- /dev/null +++ b/R/predict.SL.template.R @@ -0,0 +1,31 @@ +#' Wrapper functions for computing predictions from SL learners +#' +#' @inheritParams SL.template +#' @param object an object resulting from the fitting function. +#' @param newdata a data frame of predictors for which to compute predictions. +#' @param family Either [gaussian()] or [binomial()] to describe the error distribution. Link function information will be ignored. +#' @param X The predictor variables in the training data set, usually a data.frame. +#' @param Y The outcome in the training data set. Must be a numeric vector. +#' @param \dots For `predict.SL` wrappers, other remaining arguments. For `write.predict.SL.template()`, arguments passed to [cat()]. +#' +#' @returns +#' `predict.SL` wrappers return a vector of predictions for each unit in `newdata`. +#' +#' @keywords utilities +#' +#' @examples +#' write.predict.SL.template(file = '') + +#' @export `predict.SL.template` +predict.SL.template <- function(object, newdata, family, X = NULL, Y = NULL,...) { + # insert prediction function + pred <- numeric() + return(pred) +} + +#' @export `write.predict.SL.template` +#' @rdname predict.SL.template +write.predict.SL.template <- function(file = '', ...) { + cat('predict.SL.template <- function(object, newdata, family, X = NULL, Y = NULL,...) {\n # insert prediction function\n # pred <- numeric()\n return(pred)\n}', file = file, ...) +} + diff --git a/R/predict.SuperLearner.R b/R/predict.SuperLearner.R index f5e6113..cb04a6d 100644 --- a/R/predict.SuperLearner.R +++ b/R/predict.SuperLearner.R @@ -27,6 +27,7 @@ #' @seealso \code{\link{SuperLearner}} #' #' @keywords models +#' @exportS3Method predict SuperLearner predict.SuperLearner <- function(object, newdata, X = NULL, Y = NULL, onlySL = FALSE, ...) { if (missing(newdata)) { diff --git a/R/print.SuperLearner.R b/R/print.SuperLearner.R index 315dd65..3da269c 100644 --- a/R/print.SuperLearner.R +++ b/R/print.SuperLearner.R @@ -1,17 +1,21 @@ +#' @exportS3Method print SuperLearner print.SuperLearner <- function(x, ...) { cat("\nCall: ", deparse(x$call, width.cutoff = .9*getOption("width")), "\n\n", fill = getOption("width")) print(cbind(Risk = x$cvRisk, Coef = x$coef)) } +#' @exportS3Method stats::coef SuperLearner coef.SuperLearner <- function(object, ...) { object$coef } +#' @exportS3Method print CV.SuperLearner print.CV.SuperLearner <- function(x, ...) { cat("\nCall: ", deparse(x$call, width.cutoff = .9*getOption("width")), "\n\n", fill = getOption("width")) cat("Cross-validated predictions from the SuperLearner: SL.predict \n\nCross-validated predictions from the discrete super learner (cross-validation selector): discreteSL.predict \n\nWhich library algorithm was the discrete super learner: whichDiscreteSL \n\nCross-validated prediction for all algorithms in the library: library.predict\n") } +#' @exportS3Method stats::coef CV.SuperLearner coef.CV.SuperLearner <- function(object, ...) { object$coef } diff --git a/R/recombine.R b/R/recombine.R index 3c41812..1c80fdd 100644 --- a/R/recombine.R +++ b/R/recombine.R @@ -1,18 +1,98 @@ -# These functions take an existing SuperLearner or CV.SuperLearner fit, +#' Recombine a SuperLearner fit using a new metalearning method +#' +#' `recombineSL()` takes an existing SuperLearner fit and a new +#' metalearning method and returns a new SuperLearner fit with updated base +#' learner weights. +#' +#' \code{recombineSL()} re-fits the super learner prediction algorithm using a +#' new metalearning method. The weights for each algorithm in +#' \code{SL.library} are re-estimated using the new metalearner, however the +#' base learner fits are not regenerated, so this function saves a lot of +#' computation time as opposed to using the \code{SuperLearner()} function with a +#' new \code{method} argument. The output is identical to the output from the +#' \code{SuperLearner()} function. +#' +#' @inheritParams SuperLearner +#' @param object Fitted object from \code{SuperLearner}. +#' +#' @inherit SuperLearner return +#' +#' @author Erin LeDell \email{ledell@@berkeley.edu} +#' +#' @references +#' van der Laan, M. J., Polley, E. C. and Hubbard, A. E. (2008) Super Learner, \emph{Statistical Applications of Genetics and Molecular Biology}, \bold{6}, article 25. +#' +#' @keywords models +#' +#' @examples +#' \dontrun{ +#' # Binary outcome example adapted from SuperLearner examples +#' +#' set.seed(1) +#' N <- 200 +#' X <- matrix(rnorm(N*10), N, 10) +#' X <- as.data.frame(X) +#' Y <- rbinom(N, 1, plogis(.2*X[, 1] + .1*X[, 2] - .2*X[, 3] + +#' .1*X[, 3]*X[, 4] - .2*abs(X[, 4]))) +#' +#' SL.library <- c("SL.glmnet", "SL.glm", "SL.knn", "SL.gam", "SL.mean") +#' +#' # least squares loss function +#' set.seed(1) # for reproducibility +#' fit_nnls <- SuperLearner(Y = Y, X = X, SL.library = SL.library, +#' verbose = TRUE, method = "method.NNLS", family = binomial()) +#' fit_nnls +#' # Risk Coef +#' # SL.glmnet_All 0.2439433 0.01293059 +#' # SL.glm_All 0.2461245 0.08408060 +#' # SL.knn_All 0.2604000 0.09600353 +#' # SL.gam_All 0.2471651 0.40761918 +#' # SL.mean_All 0.2486049 0.39936611 +#' +#' +#' # negative log binomial likelihood loss function +#' fit_nnloglik <- recombineSL(fit_nnls, Y = Y, method = "method.NNloglik") +#' fit_nnloglik +#' # Risk Coef +#' # SL.glmnet_All 0.6815911 0.1577228 +#' # SL.glm_All 0.6918926 0.0000000 +#' # SL.knn_All Inf 0.0000000 +#' # SL.gam_All 0.6935383 0.6292881 +#' # SL.mean_All 0.6904050 0.2129891 +#' +#' # If we use the same seed as the original `fit_nnls`, then +#' # the recombineSL and SuperLearner results will be identical +#' # however, the recombineSL version will be much faster since +#' # it doesn't have to re-fit all the base learners. +#' set.seed(1) +#' fit_nnloglik2 <- SuperLearner(Y = Y, X = X, SL.library = SL.library, +#' verbose = TRUE, method = "method.NNloglik", family = binomial()) +#' fit_nnloglik2 +#' # Risk Coef +#' # SL.glmnet_All 0.6815911 0.1577228 +#' # SL.glm_All 0.6918926 0.0000000 +#' # SL.knn_All Inf 0.0000000 +#' # SL.gam_All 0.6935383 0.6292881 +#' # SL.mean_All 0.6904050 0.2129891 +#' +#' } +#' + +# These functions take an existing SuperLearner or CV.SuperLearner fit, # re-fits the ensemble metalearning step using a new metalearning method, -# specified in the `method` argument and returns the new fit. -# This saves a lot of computation time since we don't have to re-compute the Z +# specified in the `method` argument and returns the new fit. +# This saves a lot of computation time since we don't have to re-compute the Z # matrix of cv predicted values by cross-validating each base learner a second time. -# The recombineSL and recombineCVSL functions are stripped down versions of the +# The recombineSL and recombineCVSL functions are stripped down versions of the # original SuperLearner and CV.SuperLearner functions by Eric C. Polley. recombineSL <- function(object, Y, method = "method.NNloglik", verbose = FALSE) { - + if (!inherits(object, "SuperLearner")) { - stop("The supplied 'object' is not of class, SuperLearner.") + stop("The supplied 'object' is not of class SuperLearner.") } - + if (is.character(method)) { if (exists(method, mode = 'list')) { method <- get(method, mode = 'list') @@ -32,9 +112,8 @@ recombineSL <- function(object, Y, method = "method.NNloglik", verbose = FALSE) # May want to modify this "if" statement because method.AUC with different args look the same here warning("The new method supplied is identical to the existing method.") } - + # get relevant objects from the SuperLearner fit object - call <- object$call obsWeights <- object$obsWeights control <- object$control cvControl <- object$cvControl @@ -48,8 +127,8 @@ recombineSL <- function(object, Y, method = "method.NNloglik", verbose = FALSE) predY <- object$library.predict whichScreen <- object$whichScreen errorsInCVLibrary <- object$errorsInCVLibrary - errorsInLibrary <- object$errorsInLibrary - + errorsInLibrary <- object$errorsInLibrary + # put fitLibrary in it's own environment to locate later fitLibEnv <- new.env() assign('fitLibrary', fitLibrary, envir = fitLibEnv) @@ -67,88 +146,175 @@ recombineSL <- function(object, Y, method = "method.NNloglik", verbose = FALSE) # family can be either character or function, so these lines put everything together (code from glm()) if(is.character(family)) family <- get(family, mode="function", envir=parent.frame()) - if(is.function(family)) + if (is.function(family)) family <- family() if (is.null(family$family)) { print(family) stop("'family' not recognized") } + if (family$family != "binomial" & isTRUE("cvAUC" %in% method$require)) { stop("'method.AUC' is designed for the 'binomial' family only") } - + # Re-compute weights for each algorithm in library using the new metalearner method: - getCoef <- method$computeCoef(Z = Z, Y = Y, libraryNames = libraryNames, - obsWeights = obsWeights, control = control, + getCoef <- method$computeCoef(Z = Z, Y = Y, libraryNames = libraryNames, + obsWeights = obsWeights, control = control, verbose = verbose) coef <- getCoef$coef names(coef) <- libraryNames - + # compute super learner predictions on newX getPred <- method$computePred(predY = predY, coef = coef, control = control) - + # clean up when errors in library - if(sum(errorsInCVLibrary) > 0) { + if (sum(errorsInCVLibrary) > 0) { getCoef$cvRisk[as.logical(errorsInCVLibrary)] <- NA } - - # put everything together in a list - out <- list(call = call, - libraryNames = libraryNames, - SL.library = library, - SL.predict = getPred, - coef = coef, - library.predict = predY, - Z = Z, - cvRisk = getCoef$cvRisk, - family = family, - fitLibrary = get('fitLibrary', envir = fitLibEnv), - varNames = varNames, - validRows = validRows, - method = method, - whichScreen = whichScreen, - control = control, - cvControl = cvControl, - errorsInCVLibrary = errorsInCVLibrary, - errorsInLibrary = errorsInLibrary) - class(out) <- c("SuperLearner") - return(out) -} + object$SL.predict <- getPred + object$coef = coef + object$cvRisk = getCoef$cvRisk + object$family <- family + object$fitLibrary = get('fitLibrary', envir = fitLibEnv) + object$method <- method + object +} -# This function takes an existing CV.SuperLearner object and for each of the V -# cross-validation folds, it re-fits the ensemble using a new metalearning method, -# specified by the `method` argument, and returns a new CV.SuperLearner object. -# This saves a lot of computation time since, for all V iterations, we can skip re-computing -# the Z matrix of cv predicted values by cross-validating each base learner a second time. -# The recombineCVSL function is a re-worked version of the original CV.SuperLearner function by Eric C. Polley. +#' Recombine a CV.SuperLearner fit using a new metalearning method +#' +#' Function to re-compute the V-fold cross-validated risk estimate for super +#' learner using a new metalearning method. This function takes as input an +#' existing CV.SuperLearner fit and applies the \code{recombineSL} fit to each +#' of the V Super Learner fits. +#' +#' The function \code{recombineCVSL} computes the usual V-fold cross-validated +#' risk estimate for the super learner (and all algorithms in \code{SL.library} +#' for comparison), using a newly specified metalearning method. The weights for +#' each algorithm in \code{SL.library} are re-estimated using the new +#' metalearner, however the base learner fits are not regenerated, so this +#' function saves a lot of computation time as opposed to using the +#' \code{CV.SuperLearner} function with a new \code{method} argument. The +#' output is identical to the output from the \code{CV.SuperLearner} function. +#' +#' @inheritParams CV.SuperLearner +#' @param object Fitted object from \code{CV.SuperLearner}. +#' +#' @inherit CV.SuperLearner return +#' +#' @author Erin LeDell \email{ledell@@berkeley.edu} +#' +#' @seealso \code{\link{recombineSL}} +#' +#' @keywords models +#' +#' @examples +#' \dontrun{ +#' # Binary outcome example adapted from SuperLearner examples +#' +#' set.seed(1) +#' N <- 200 +#' X <- matrix(rnorm(N*10), N, 10) +#' X <- as.data.frame(X) +#' Y <- rbinom(N, 1, plogis(.2*X[, 1] + .1*X[, 2] - .2*X[, 3] + +#' .1*X[, 3]*X[, 4] - .2*abs(X[, 4]))) +#' +#' SL.library <- c("SL.glmnet", "SL.glm", "SL.knn", "SL.gam", "SL.mean") +#' +#' # least squares loss function +#' set.seed(1) # for reproducibility +#' cvfit_nnls <- CV.SuperLearner(Y = Y, X = X, V = 10, SL.library = SL.library, +#' verbose = TRUE, method = "method.NNLS", family = binomial()) +#' cvfit_nnls$coef +#' # SL.glmnet_All SL.glm_All SL.knn_All SL.gam_All SL.mean_All +#' # 1 0.0000000 0.00000000 0.000000000 0.4143862 0.5856138 +#' # 2 0.0000000 0.00000000 0.304802397 0.3047478 0.3904498 +#' # 3 0.0000000 0.00000000 0.002897533 0.5544075 0.4426950 +#' # 4 0.0000000 0.20322642 0.000000000 0.1121891 0.6845845 +#' # 5 0.1743973 0.00000000 0.032471026 0.3580624 0.4350693 +#' # 6 0.0000000 0.00000000 0.099881535 0.3662309 0.5338876 +#' # 7 0.0000000 0.00000000 0.234876082 0.2942472 0.4708767 +#' # 8 0.0000000 0.06424676 0.113988158 0.5600208 0.2617443 +#' # 9 0.0000000 0.00000000 0.338030342 0.2762604 0.3857093 +#' # 10 0.3022442 0.00000000 0.294226204 0.1394534 0.2640762 +#' +#' +#' # negative log binomial likelihood loss function +#' cvfit_nnloglik <- recombineCVSL(cvfit_nnls, method = "method.NNloglik") +#' cvfit_nnloglik$coef +#' # SL.glmnet_All SL.glm_All SL.knn_All SL.gam_All SL.mean_All +#' # 1 0.0000000 0.0000000 0.00000000 0.5974799 0.40252010 +#' # 2 0.0000000 0.0000000 0.31177345 0.6882266 0.00000000 +#' # 3 0.0000000 0.0000000 0.01377469 0.8544238 0.13180152 +#' # 4 0.0000000 0.1644188 0.00000000 0.2387919 0.59678930 +#' # 5 0.2142254 0.0000000 0.00000000 0.3729426 0.41283197 +#' # 6 0.0000000 0.0000000 0.00000000 0.5847150 0.41528502 +#' # 7 0.0000000 0.0000000 0.47538172 0.5080311 0.01658722 +#' # 8 0.0000000 0.0000000 0.00000000 1.0000000 0.00000000 +#' # 9 0.0000000 0.0000000 0.45384961 0.2923480 0.25380243 +#' # 10 0.3977816 0.0000000 0.27927906 0.1606384 0.16230097 +#' +#' # If we use the same seed as the original `cvfit_nnls`, then +#' # the recombineCVSL and CV.SuperLearner results will be identical +#' # however, the recombineCVSL version will be much faster since +#' # it doesn't have to re-fit all the base learners, V times each. +#' set.seed(1) +#' cvfit_nnloglik2 <- CV.SuperLearner(Y = Y, X = X, V = 10, SL.library = SL.library, +#' verbose = TRUE, method = "method.NNloglik", family = binomial()) +#' cvfit_nnloglik2$coef +#' # SL.glmnet_All SL.glm_All SL.knn_All SL.gam_All SL.mean_All +#' # 1 0.0000000 0.0000000 0.00000000 0.5974799 0.40252010 +#' # 2 0.0000000 0.0000000 0.31177345 0.6882266 0.00000000 +#' # 3 0.0000000 0.0000000 0.01377469 0.8544238 0.13180152 +#' # 4 0.0000000 0.1644188 0.00000000 0.2387919 0.59678930 +#' # 5 0.2142254 0.0000000 0.00000000 0.3729426 0.41283197 +#' # 6 0.0000000 0.0000000 0.00000000 0.5847150 0.41528502 +#' # 7 0.0000000 0.0000000 0.47538172 0.5080311 0.01658722 +#' # 8 0.0000000 0.0000000 0.00000000 1.0000000 0.00000000 +#' # 9 0.0000000 0.0000000 0.45384961 0.2923480 0.25380243 +#' # 10 0.3977816 0.0000000 0.27927906 0.1606384 0.16230097 +#' +#' } +#' @export recombineCVSL <- function(object, method = "method.NNloglik", verbose = FALSE, saveAll = TRUE, parallel = "seq") { - + # This function takes an existing CV.SuperLearner object and for each of the V + # cross-validation folds, it re-fits the ensemble using a new metalearning method, + # specified by the `method` argument, and returns a new CV.SuperLearner object. + # This saves a lot of computation time since, for all V iterations, we can skip re-computing + # the Z matrix of cv predicted values by cross-validating each base learner a second time. + # The recombineCVSL function is a re-worked version of the original CV.SuperLearner function by Eric C. Polley. + if (!inherits(object, "CV.SuperLearner")) { - stop("The supplied 'object' is not of class, CV.SuperLearner.") + stop("The supplied 'object' is not of class CV.SuperLearner.") } + if (is.character(method)) { if (exists(method, mode = 'list')) { method <- get(method, mode = 'list') - } else if (exists(method, mode = 'function')) { + } + else if (exists(method, mode = 'function')) { method <- get(method, mode = 'function')() } - } else if (is.function(method)) { + } + else if (is.function(method)) { method <- method() } + if (!is.list(method)) { stop("method is not in the appropriate format. Check out help('method.template')") } + if (!is.null(method$require)) { sapply(method$require, function(x) require(force(x), character.only = TRUE)) } + if (identical(object$method, method, ignore.environment=TRUE)) { # May want to modify this "if" statement because method.AUC with different args look the same here warning("The new method supplied is identical to the existing method.") } - + # get relevant objects from the CV.SuperLearner object call <- object$call library <- object$SL.library @@ -158,10 +324,11 @@ recombineCVSL <- function(object, method = "method.NNloglik", verbose = FALSE, s oldAllSL <- object$AllSL Y <- object$Y N <- length(object$Y) + if (N != length(Y)) { stop("length(Y) does not match nrow(Z). Verify that Y is the same outcome variable that 'fit' was trained on.") } - + k <- nrow(library$library) AllSL <- vector('list', V) #Need to relearn SL fits using recombineSL and update this list names(AllSL) <- paste("training", 1:V, sep=" ") @@ -169,7 +336,7 @@ recombineCVSL <- function(object, method = "method.NNloglik", verbose = FALSE, s discreteSL.predict <- object$discreteSL.predict #Maybe we should use the versions created below, and keep the NA versions here whichDiscreteSL <- object$whichDiscreteSL library.predict <- object$library.predict - + # Get required arguments family <- object$AllSL[[1]]$family vlist <- as.list(seq(V)) @@ -181,20 +348,28 @@ recombineCVSL <- function(object, method = "method.NNloglik", verbose = FALSE, s valid <- folds[[v]] cvOutcome <- Y[-valid] fit.SL <- recombineSL(object = fit, Y = cvOutcome, method = method, verbose = verbose) - out <- list(cvAllSL = if(saveAll) fit.SL, cvSL.predict = fit.SL$SL.predict, cvdiscreteSL.predict = fit.SL$library.predict[, which.min(fit.SL$cvRisk)], cvwhichDiscreteSL = names(which.min(fit.SL$cvRisk)), cvlibrary.predict = fit.SL$library.predict, cvcoef = fit.SL$coef) - return(out) + + list(cvAllSL = if (saveAll) fit.SL, + cvSL.predict = fit.SL$SL.predict, + cvdiscreteSL.predict = fit.SL$library.predict[, which.min(fit.SL$cvRisk)], + cvwhichDiscreteSL = names(which.min(fit.SL$cvRisk)), + cvlibrary.predict = fit.SL$library.predict, + cvcoef = fit.SL$coef) } - - if(parallel == "seq") { + + if (parallel == "seq") { cvList <- lapply(vlist, FUN = .crossValFun, folds = folds, oldAllSL = oldAllSL, Y = Y, method = method, verbose = verbose, saveAll = saveAll) - } else if (parallel == 'multicore') { + } + else if (parallel == 'multicore') { # not tested .SL.require('parallel') cvList <- parallel::mclapply(vlist, FUN = .crossValFun, folds = folds, oldAllSL = oldAllSL, Y = Y, method = method, verbose = verbose, saveAll = saveAll) - } else if (inherits(parallel, 'cluster')) { + } + else if (inherits(parallel, 'cluster')) { # not tested cvList <- parallel::parLapply(parallel, x = vlist, fun = .crossValFun, folds = folds, oldAllSL = oldAllSL, Y = Y, method = method, verbose = verbose, saveAll = saveAll) - } else { + } + else { stop('parallel option was not recognized, use parallel = "seq" for sequential computation.') } @@ -205,21 +380,14 @@ recombineCVSL <- function(object, method = "method.NNloglik", verbose = FALSE, s library.predict[unlist(folds, use.names = FALSE), ] <- do.call('rbind', lapply(cvList, '[[', 'cvlibrary.predict')) coef <- do.call('rbind', lapply(cvList, '[[', 'cvcoef')) colnames(coef) <- libraryNames - - # put everything together in a list - out <- list(call = call, - AllSL = AllSL, - SL.predict = SL.predict, - discreteSL.predict = discreteSL.predict, - whichDiscreteSL = whichDiscreteSL, - library.predict = library.predict, - coef = coef, - folds = folds, - V = V, - libraryNames = libraryNames, - SL.library = library, - method = method, - Y = Y) - class(out) <- "CV.SuperLearner" - return(out) + + object$AllSL <- AllSL + object$SL.predict <- SL.predict + object$discreteSL.predict <- discreteSL.predict + object$whichDiscreteSL <- whichDiscreteSL + object$library.predict <- library.predict + object$coef <- coef + object$method <- method + + object } \ No newline at end of file diff --git a/R/screen.R b/R/screen.R new file mode 100644 index 0000000..782892e --- /dev/null +++ b/R/screen.R @@ -0,0 +1,151 @@ +#' Screening algorithms for SuperLearner +#' @name screen +#' +#' @description +#' Screening algorithms for \code{SuperLearner} to be used with +#' \code{SL.library}. +#' +#' @aliases write.screen.template screen.template All screen.randomForest +#' screen.SIS screen.ttest screen.corP screen.corRank screen.glmnet +#' +#' @param file A connection, or a character string naming a file to print to. +#' Passed to \code{\link{cat}}. +#' @param \dots Additional arguments passed to \code{\link{cat}} +#' +#' @returns +#' A logical vector with the length equal to the +#' number of columns in \code{X}. TRUE indicates the variable (column of X) +#' should be included. +#' +#' @author Eric C Polley \email{polley.eric@@mayo.edu} +#' +#' @seealso \code{\link{SuperLearner}} +#' +#' @keywords utilities +#' +#' @examples +#' write.screen.template(file = '') +#' + +#' @export `write.screen.template` +#' @rdname screen +write.screen.template <- function(file = '', ...) { + cat('screen.template <- function(Y, X, family, obsWeights, id, ...) {\n # load required packages\n # library(\'pkg\')\n if (family$family == \'gaussian\') {\n \n }\n if (family$family == \'binomial\') {\n \n }\n # whichVariable is a logical vector,\n # TRUE indicates variable will be used\n whichVariable <- rep(TRUE, ncol(X))\n return(whichVariable)\n}', file = file, ...) +} + +#' @export +# screen functions must return a logical vector of length ncol(X) +screen.template <- function(Y, X, family, obsWeights, id, ...) +{ + # library('pkg') + if (family$family == "gaussian") { + + } + if (family$family == "binomial") { + + } + whichVariable <- rep(TRUE, ncol(X)) + return(whichVariable) +} + +#' @export +# Pass all variables: +All <- function(X, ...) { + rep.int(TRUE, ncol(X)) +} + +#' @export +screen.corP <- function(Y, X, family, obsWeights, id, method = 'pearson', minPvalue = 0.1, minscreen = 2, ...) { + listp <- apply(X, 2, function(x, Y, method) { + ifelse(var(x) <= 0, 1, cor.test(x, y = Y, method = method)$p.value) + }, Y = Y, method = method) + whichVariable <- (listp <= minPvalue) + if (sum(whichVariable) < minscreen) { + warning('number of variables with p value less than minPvalue is less than minscreen') + whichVariable[rank(listp) <= minscreen] <- TRUE + } + return(whichVariable) +} + +#' @export +screen.corRank <- function(Y, X, family, method = 'pearson', rank = 2, ...) { + # if(rank > ncol(X)) { + # rank <- ncol(X) + # } + # Don't really need that check, but might want to add a warning message + listp <- apply(X, 2, function(x, Y, method) { + ifelse(var(x) <= 0, 1, cor.test(x, y = Y, method = method)$p.value) + }, Y = Y, method = method) + whichVariable <- (rank(listp) <= rank) + return(whichVariable) +} + +#' @export +screen.glmnet <- function(Y, X, family, alpha = 1, minscreen = 2, nfolds = 10, nlambda = 100, ...) { + .SL.require('glmnet') + if(!is.matrix(X)) { + X <- model.matrix(~ -1 + ., X) + } + fitCV <- glmnet::cv.glmnet(x = X, y = Y, lambda = NULL, type.measure = 'deviance', nfolds = nfolds, family = family$family, alpha = alpha, nlambda = nlambda) + whichVariable <- (as.numeric(coef(fitCV$glmnet.fit, s = fitCV$lambda.min))[-1] != 0) + # the [-1] removes the intercept + if (sum(whichVariable) < minscreen) { + warning("fewer than minscreen variables passed the glmnet screen, increased lambda to allow minscreen variables") + sumCoef <- apply(as.matrix(fitCV$glmnet.fit$beta), 2, function(x) sum((x != 0))) + newCut <- which.max(sumCoef >= minscreen) + whichVariable <- (as.matrix(fitCV$glmnet.fit$beta)[, newCut] != 0) + } + return(whichVariable) +} + +#' @export +screen.randomForest <- function(Y, X, family, nVar = 10, ntree = 1000, + mtry = ifelse(family$family == "gaussian", floor(sqrt(ncol(X))), max(floor(ncol(X)/3), 1)), + nodesize = ifelse(family$family == "gaussian", 5, 1), maxnodes = NULL, ...) +{ + .SL.require('randomForest') + if (family$family == "gaussian") { + rank.rf.fit <- randomForest::randomForest(Y ~ ., data = X, ntree = ntree, mtry = mtry, nodesize = nodesize, keep.forest = FALSE, maxnodes = maxnodes) + } + else if (family$family == "binomial") { + rank.rf.fit <- randomForest::randomForest(as.factor(Y) ~ ., data=X, ntree = ntree, mtry = mtry, nodesize = nodesize, keep.forest = FALSE, maxnodes = maxnodes) + } + whichVariable <- (rank(-rank.rf.fit$importance) <= nVar) + return(whichVariable) +} + +#' @export +screen.SIS <- function (Y, X, family, nsis = NULL, tune.method = "bic", type.measure = "deviance", minScreen = 5, ...) +{ + .SL.require('SIS', "you have selected SIS as a library algorithm but do not have the SIS package installed") + if (!is.matrix(X)) { + warning("X is not a matrix, screen.SIS will convert to matrix for variable screening") + Xmat <- as.matrix(X) + } + if (family$family == "gaussian") { + fitSIS <- SIS::SIS(x = Xmat, y = Y, family = family$family, nsis = nsis, tune = tune.method, type.measure = type.measure) + } + if (family$family == "binomial") { + fitSIS <- SIS::SIS(x = Xmat, y = Y, family = family$family, nsis = nsis, tune = tune.method, type.measure = type.measure) + } + whichVariable <- (1:ncol(X) %in% fitSIS$ix) + # check more than minScreen in screened set + if(sum(whichVariable) < minScreen) { + warning("fewer than minScreen variables in screen.SIS, using initial ranking") + } + return(whichVariable) +} + +#' @export +screen.ttest <- function(Y, X, family, obsWeights, id, rank = 2, ...) { + # implemented with colttests from the genefilter package + .SL.require('genefilter') + if (family$family == "gaussian") { + stop('t-test screening undefined for gaussian family, look at screen.corP or screen.corRank') + } + if (family$family == "binomial") { + listP <- genefilter::colttests(x = as.matrix(X), fac = as.factor(Y), tstatOnly = FALSE)$p.value + } + whichVariable <- (rank(listP) <= rank) + return(whichVariable) +} \ No newline at end of file diff --git a/R/screen.SIS.R b/R/screen.SIS.R deleted file mode 100644 index 033c9c2..0000000 --- a/R/screen.SIS.R +++ /dev/null @@ -1,20 +0,0 @@ -screen.SIS <- function (Y, X, family, nsis = NULL, tune.method = "bic", type.measure = "deviance", minScreen = 5, ...) -{ - .SL.require('SIS', "you have selected SIS as a library algorithm but do not have the SIS package installed") - if (!is.matrix(X)) { - warning("X is not a matrix, screen.SIS will convert to matrix for variable screening") - Xmat <- as.matrix(X) - } - if (family$family == "gaussian") { - fitSIS <- SIS::SIS(x = Xmat, y = Y, family = family$family, nsis = nsis, tune = tune.method, type.measure = type.measure) - } - if (family$family == "binomial") { - fitSIS <- SIS::SIS(x = Xmat, y = Y, family = family$family, nsis = nsis, tune = tune.method, type.measure = type.measure) - } - whichVariable <- (1:ncol(X) %in% fitSIS$ix) - # check more than minScreen in screened set - if(sum(whichVariable) < minScreen) { - warning("fewer than minScreen variables in screen.SIS, using initial ranking") - } - return(whichVariable) -} \ No newline at end of file diff --git a/R/screen.corP.R b/R/screen.corP.R deleted file mode 100644 index e6249d0..0000000 --- a/R/screen.corP.R +++ /dev/null @@ -1,11 +0,0 @@ -screen.corP <- function(Y, X, family, obsWeights, id, method = 'pearson', minPvalue = 0.1, minscreen = 2, ...) { - listp <- apply(X, 2, function(x, Y, method) { - ifelse(var(x) <= 0, 1, cor.test(x, y = Y, method = method)$p.value) - }, Y = Y, method = method) - whichVariable <- (listp <= minPvalue) - if (sum(whichVariable) < minscreen) { - warning('number of variables with p value less than minPvalue is less than minscreen') - whichVariable[rank(listp) <= minscreen] <- TRUE - } - return(whichVariable) -} \ No newline at end of file diff --git a/R/screen.corRank.R b/R/screen.corRank.R deleted file mode 100644 index 8522384..0000000 --- a/R/screen.corRank.R +++ /dev/null @@ -1,11 +0,0 @@ -screen.corRank <- function(Y, X, family, method = 'pearson', rank = 2, ...) { - # if(rank > ncol(X)) { - # rank <- ncol(X) - # } - # Don't really need that check, but might want to add a warning message - listp <- apply(X, 2, function(x, Y, method) { - ifelse(var(x) <= 0, 1, cor.test(x, y = Y, method = method)$p.value) - }, Y = Y, method = method) - whichVariable <- (rank(listp) <= rank) - return(whichVariable) -} \ No newline at end of file diff --git a/R/screen.glmnet.R b/R/screen.glmnet.R deleted file mode 100644 index a841095..0000000 --- a/R/screen.glmnet.R +++ /dev/null @@ -1,16 +0,0 @@ -screen.glmnet <- function(Y, X, family, alpha = 1, minscreen = 2, nfolds = 10, nlambda = 100, ...) { - .SL.require('glmnet') - if(!is.matrix(X)) { - X <- model.matrix(~ -1 + ., X) - } - fitCV <- glmnet::cv.glmnet(x = X, y = Y, lambda = NULL, type.measure = 'deviance', nfolds = nfolds, family = family$family, alpha = alpha, nlambda = nlambda) - whichVariable <- (as.numeric(coef(fitCV$glmnet.fit, s = fitCV$lambda.min))[-1] != 0) - # the [-1] removes the intercept - if (sum(whichVariable) < minscreen) { - warning("fewer than minscreen variables passed the glmnet screen, increased lambda to allow minscreen variables") - sumCoef <- apply(as.matrix(fitCV$glmnet.fit$beta), 2, function(x) sum((x != 0))) - newCut <- which.max(sumCoef >= minscreen) - whichVariable <- (as.matrix(fitCV$glmnet.fit$beta)[, newCut] != 0) - } - return(whichVariable) -} \ No newline at end of file diff --git a/R/screen.randomForest.R b/R/screen.randomForest.R deleted file mode 100644 index 9b80b5c..0000000 --- a/R/screen.randomForest.R +++ /dev/null @@ -1,12 +0,0 @@ -screen.randomForest <- function (Y, X, family, nVar = 10, ntree = 1000, mtry = ifelse(family$family == "gaussian", floor(sqrt(ncol(X))), max(floor(ncol(X)/3), 1)), nodesize = ifelse(family$family == "gaussian", 5, 1), maxnodes = NULL,...) -{ - .SL.require('randomForest') - if (family$family == "gaussian") { - rank.rf.fit <- randomForest::randomForest(Y ~ ., data = X, ntree = ntree, mtry = mtry, nodesize = nodesize, keep.forest = FALSE, maxnodes = maxnodes) - } - if (family$family == "binomial") { - rank.rf.fit <- randomForest::randomForest(as.factor(Y) ~ ., data=X, ntree = ntree, mtry = mtry, nodesize = nodesize, keep.forest = FALSE, maxnodes = maxnodes) - } - whichVariable <- (rank(-rank.rf.fit$importance) <= nVar) - return(whichVariable) -} \ No newline at end of file diff --git a/R/screen.template.R b/R/screen.template.R deleted file mode 100644 index 46099b1..0000000 --- a/R/screen.template.R +++ /dev/null @@ -1,22 +0,0 @@ -# Pass all variables: -All <- function(X, ...) { - rep.int(TRUE, ncol(X)) -} - -# screen functions must return a logical vector of length ncol(X) -screen.template <- function (Y, X, family, obsWeights, id, ...) -{ - # library('pkg') - if (family$family == "gaussian") { - - } - if (family$family == "binomial") { - - } - whichVariable <- rep(TRUE, ncol(X)) - return(whichVariable) -} - -write.screen.template <- function(file = '', ...) { - cat('screen.template <- function(Y, X, family, obsWeights, id, ...) {\n # load required packages\n # library(\'pkg\')\n if (family$family == \'gaussian\') {\n \n }\n if (family$family == \'binomial\') {\n \n }\n # whichVariable is a logical vector,\n # TRUE indicates variable will be used\n whichVariable <- rep(TRUE, ncol(X))\n return(whichVariable)\n}', file = file, ...) -} diff --git a/R/screen.ttest.R b/R/screen.ttest.R deleted file mode 100644 index 4c4b5ad..0000000 --- a/R/screen.ttest.R +++ /dev/null @@ -1,12 +0,0 @@ -screen.ttest <- function(Y, X, family, obsWeights, id, rank = 2, ...) { - # implemented with colttests from the genefilter package - .SL.require('genefilter') - if (family$family == "gaussian") { - stop('t-test screening undefined for gaussian family, look at screen.corP or screen.corRank') - } - if (family$family == "binomial") { - listP <- genefilter::colttests(x = as.matrix(X), fac = as.factor(Y), tstatOnly = FALSE)$p.value - } - whichVariable <- (rank(listP) <= rank) - return(whichVariable) -} \ No newline at end of file diff --git a/R/snowSuperLearner.R b/R/snowSuperLearner.R deleted file mode 100644 index ea5b7af..0000000 --- a/R/snowSuperLearner.R +++ /dev/null @@ -1,322 +0,0 @@ -# snowSuperLearner -# -# Created by Eric Polley on 2011-04-19. -# -snowSuperLearner <- function(cluster, Y, X, newX = NULL, family = gaussian(), SL.library, - method = 'method.NNLS', id = NULL, verbose = FALSE, - control = list(), cvControl = list(), obsWeights = NULL, - env = parent.frame()) { - - # Begin timing how long SuperLearner takes to execute. - time_start = proc.time() - - .SL.require('parallel') - if (!inherits(cluster, 'cluster')) stop('\'cluster\' must be a cluster created using the makeCluster() function in the snow package') - if (is.character(method)) { - if (exists(method, mode = 'list')) { - method <- get(method, mode = 'list') - } else if (exists(method, mode = 'function')) { - method <- get(method, mode = 'function')() - } - } else if (is.function(method)) { - method <- method() - } - if(!is.list(method)) { - stop("method is not in the appropriate format. Check out help('method.template')") - } - if(!is.null(method$require)) { - sapply(method$require, function(x) require(force(x), character.only = TRUE)) - } - # get defaults for controls and make sure in correct format - control <- do.call('SuperLearner.control', control) - cvControl <- do.call('SuperLearner.CV.control', cvControl) - - # put together the library - # should this be in a new environment? - library <- .createLibrary(SL.library) - .check.SL.library(library = c(unique(library$library$predAlgorithm), library$screenAlgorithm)) - - call <- match.call(expand.dots = TRUE) - # should we be checking X and newX for data.frame? - # data.frame not required, but most of the built-in wrappers assume a data.frame - if(!inherits(X, 'data.frame')) message('X is not a data frame. Check the algorithms in SL.library to make sure they are compatible with non data.frame inputs') - varNames <- colnames(X) - N <- dim(X)[1L] - p <- dim(X)[2L] - k <- nrow(library$library) - kScreen <- length(library$screenAlgorithm) - Z <- matrix(NA, N, k) - libraryNames <- paste(library$library$predAlgorithm, library$screenAlgorithm[library$library$rowScreen], sep="_") - - if(p < 2 & !identical(library$screenAlgorithm, "All")) { - warning('Screening algorithms specified in combination with single-column X.') - } - - # put fitLibrary in it's own environment to locate later - fitLibEnv <- new.env() - assign('fitLibrary', vector('list', length = k), envir = fitLibEnv) - assign('libraryNames', libraryNames, envir = fitLibEnv) - evalq(names(fitLibrary) <- libraryNames, envir = fitLibEnv) - - # errors* records if an algorithm stops either in the CV step and/or in full data - errorsInCVLibrary <- rep(0, k) - errorsInLibrary <- rep(0, k) - - # if newX is missing, use X - if(is.null(newX)) { - newX <- X - } - # Are these checks still required? - if(!identical(colnames(X), colnames(newX))) { - stop("The variable names and order in newX must be identical to the variable names and order in X") - } - if (sum(is.na(X)) > 0 | sum(is.na(newX)) > 0 | sum(is.na(Y)) > 0) { - stop("missing data is currently not supported. Check Y, X, and newX for missing values") - } - if (!is.numeric(Y)) { - stop("the outcome Y must be a numeric vector") - } - # family can be either character or function, so these lines put everything together (code from glm()) - if (is.character(family)) - family <- get(family, mode="function", envir=env) - if (is.function(family)) - family <- family() - if (is.null(family$family)) { - print(family) - stop("'family' not recognized") - } - - if (family$family != "binomial" & isTRUE("cvAUC" %in% method$require)){ - stop("'method.AUC' is designed for the 'binomial' family only") - } - - # create CV folds - validRows <- CVFolds(N = N, id = id, Y = Y, cvControl = cvControl) - - # test id - if(is.null(id)) { - id <- seq(N) - } - if(!identical(length(id), N)) { - stop("id vector must have the same dimension as Y") - } - # test observation weights - if(is.null(obsWeights)) { - obsWeights <- rep(1, N) - } - if(!identical(length(obsWeights), N)) { - stop("obsWeights vector must have the same dimension as Y") - } - - # create function for the cross-validation step: - .crossValFUN <- function(valid, Y, dataX, id, obsWeights, library, kScreen, k, p, libraryNames, verbose) { - tempLearn <- dataX[-valid, , drop = FALSE] - tempOutcome <- Y[-valid] - tempValid <- dataX[valid, , drop = FALSE] - tempWhichScreen <- matrix(NA, nrow = kScreen, ncol = p) - tempId <- id[-valid] - tempObsWeights <- obsWeights[-valid] - - # should this be converted to a lapply also? - for(s in seq(kScreen)) { - screen_fn = get(library$screenAlgorithm[s], envir = env) - testScreen <- try(do.call(screen_fn, list(Y = tempOutcome, X = tempLearn, family = family, id = tempId, obsWeights = tempObsWeights))) - if(inherits(testScreen, "try-error")) { - warning(paste("replacing failed screening algorithm,", library$screenAlgorithm[s], ", with All()", "\n ")) - tempWhichScreen[s, ] <- TRUE - } else { - tempWhichScreen[s, ] <- testScreen - } - if(verbose) { - message(paste("Number of covariates in ", library$screenAlgorithm[s], " is: ", sum(tempWhichScreen[s, ]), sep = "")) - } - } #end screen - - # should this be converted to a lapply also? - out <- matrix(NA, nrow = nrow(tempValid), ncol = k) - for(s in seq(k)) { - pred_fn = library$library$predAlgorithm[s] - testAlg <- try(do.call(pred_fn, list(Y = tempOutcome, X = subset(tempLearn, select = tempWhichScreen[library$library$rowScreen[s], ], drop=FALSE), newX = subset(tempValid, select = tempWhichScreen[library$library$rowScreen[s], ], drop=FALSE), family = family, id = tempId, obsWeights = tempObsWeights))) - if(inherits(testAlg, "try-error")) { - warning(paste("Error in algorithm", library$library$predAlgorithm[s], "\n The Algorithm will be removed from the Super Learner (i.e. given weight 0) \n" )) - # errorsInCVLibrary[s] <<- 1 - # '<<-' doesn't work with snow. - } else { - out[, s] <- testAlg$pred - } - # verbose will not work in the GUI, but works in the terminal (test this) - if(verbose) message(paste("CV", libraryNames[s])) - } #end library - invisible(out) - } - # the lapply performs the cross-validation steps to create Z - # additional steps to put things in the correct order - # rbind unlists the output from lapply - # need to unlist folds to put the rows back in the correct order - time_train = system.time({ - Z[unlist(validRows, use.names = FALSE), ] <- do.call('rbind', parallel::parLapply(cl = cluster, X = validRows, fun = .crossValFUN, Y = Y, dataX = X, id = id, obsWeights = obsWeights, library = library, kScreen = kScreen, k = k, p = p, libraryNames = libraryNames, verbose = verbose)) - - # Check for errors. If any algorithms had errors, replace entire column with - # 0 even if error is only in one fold. - errorsInCVLibrary <- apply(Z, 2, function(x) any(is.na(x))) - if(sum(errorsInCVLibrary) > 0) { - Z[, as.logical(errorsInCVLibrary)] <- 0 - } - if(all(Z == 0)) { - stop("All algorithms dropped from library") - } - - # Compute weights for each algorithm in library. - getCoef <- method$computeCoef(Z = Z, Y = Y, libraryNames = libraryNames, - obsWeights = obsWeights, control = control, - verbose = verbose, - errorsInLibrary = errorsInCVLibrary) - coef <- getCoef$coef - names(coef) <- libraryNames - - }) # Finish timing. - - # Set a default in case the method does not return the optimizer result. - if(!("optimizer" %in% names(getCoef))) { - getCoef["optimizer"] <- NA - } - - # now fit all algorithms in library on entire learning data set and predict on newX - m <- dim(newX)[1L] - predY <- matrix(NA, nrow = m, ncol = k) - # whichScreen <- matrix(NA, nrow = kScreen, ncol = p) - - .screenFun <- function(fun, list) { - screen_fn = get(fun, envir = env) - testScreen <- try(do.call(screen_fn, list)) - if(inherits(testScreen, "try-error")) { - warning(paste("replacing failed screening algorithm,", fun, ", with All() in full data", "\n ")) - out <- rep(TRUE, ncol(list$X)) - } else { - out <- testScreen - } - return(out) - } - - time_predict = system.time({ - - whichScreen <- if (length(library$screenAlgorithm) < 2) { - sapply(library$screenAlgorithm, FUN = .screenFun, list = list(Y = Y, X = X, family = family, id = id, obsWeights = obsWeights), simplify = FALSE) - } else { - parallel::parSapply(cl = cluster, X = library$screenAlgorithm, FUN = .screenFun, list = list(Y = Y, X = X, family = family, id = id, obsWeights = obsWeights), simplify = FALSE) - } - whichScreen <- do.call(rbind, whichScreen) - # change to sapply? - # for(s in 1:k) { - # testAlg <- try(do.call(library$library$predAlgorithm[s], list(Y = Y, X = subset(X, select = whichScreen[library$library$rowScreen[s], ], drop=FALSE), newX = subset(newX, select = whichScreen[library$library$rowScreen[s], ], drop=FALSE), family = family, id = id, obsWeights = obsWeights))) - # if(inherits(testAlg, "try-error")) { - # warning(paste("Error in algorithm", library$library$predAlgorithm[s], " on full data", "\n The Algorithm will be removed from the Super Learner (i.e. given weight 0) \n" )) - # errorsInLibrary[s] <- 1 - # } else { - # predY[, s] <- testAlg$pred - # } - # if(control$saveFitLibrary) { - # fitLibrary[[s]] <- testAlg$fit - # } - # if(verbose) { - # message(paste("full", libraryNames[s])) - # } - # } - .predFun <- function(index, lib, Y, dataX, newX, whichScreen, family, id, obsWeights, verbose, control, libraryNames) { - out <- list(pred = NA, fitLibrary = NULL) - pred_fn = get(lib$predAlgorithm[index], envir = env) - testAlg <- try(do.call(pred_fn, list(Y = Y, X = subset(dataX, select = whichScreen[lib$rowScreen[index], ], drop=FALSE), newX = subset(newX, select = whichScreen[lib$rowScreen[index], ], drop=FALSE), family = family, id = id, obsWeights = obsWeights))) - if(inherits(testAlg, "try-error")) { - warning(paste("Error in algorithm", lib$predAlgorithm[index], " on full data", "\n The Algorithm will be removed from the Super Learner (i.e. given weight 0) \n" )) - out$pred <- rep.int(NA, times = nrow(newX)) - } else { - out$pred <- testAlg$pred - if(control$saveFitLibrary) { - # eval(bquote(fitLibrary[[.(index)]] <- .(testAlg$fit)), envir = fitLibEnv) - out$fitLibrary <- testAlg$fit - } - } - if(verbose) { - message(paste("full", libraryNames[index])) - } - invisible(out) - } - - - foo <- parallel::parLapply(cl = cluster, X = seq(k), fun = .predFun, lib = library$library, Y = Y, dataX = X, newX = newX, whichScreen = whichScreen, family = family, id = id, obsWeights = obsWeights, verbose = verbose, control = control, libraryNames = libraryNames) - predY <- do.call('cbind', lapply(foo, '[[', 'pred')) - assign('fitLibrary', lapply(foo, '[[', 'fitLibrary'), envir = fitLibEnv) - rm(foo) - - # predY <- do.call('cbind', parLapply(cluster, seq(k), fun = .predFun, lib = library$library, Y = Y, dataX = X, newX = newX, whichScreen = whichScreen, family = family, id = id, obsWeights = obsWeights, verbose = verbose, control = control, libraryNames = libraryNames)) - - # check for errors - errorsInLibrary <- apply(predY, 2, function(xx) any(is.na(xx))) - if (sum(errorsInLibrary) > 0) { - if (sum(coef[as.logical(errorsInLibrary)]) > 0) { - warning(paste0("Re-running estimation of coefficients removing failed algorithm(s)\n", - "Original coefficients are: \n", paste(coef, collapse = ", "), "\n")) - Z[, as.logical(errorsInLibrary)] <- 0 - if (all(Z == 0)) { - stop("All algorithms dropped from library") - } - getCoef <- method$computeCoef(Z = Z, Y = Y, libraryNames = libraryNames, - obsWeights = obsWeights, control = control, - verbose = verbose, - errorsInLibrary = errorsInLibrary) - coef <- getCoef$coef - names(coef) <- libraryNames - } else { - warning("coefficients already 0 for all failed algorithm(s)") - } - } - - # Compute super learner predictions on newX. - getPred <- method$computePred(predY = predY, coef = coef, control = control) - - }) # Finish timing. - - # Add names of algorithms to the predictions. - colnames(predY) <- libraryNames - - # Clean up when errors in library. - if(sum(errorsInCVLibrary) > 0) { - getCoef$cvRisk[as.logical(errorsInCVLibrary)] <- NA - } - - # Finish timing the full SuperLearner execution. - time_end = proc.time() - - # Compile execution times. - times = list(everything = time_end - time_start, - train = time_train, - predict = time_predict) - - # Put everything together in a list. - out <- list( - call = call, - libraryNames = libraryNames, - SL.library = library, - SL.predict = getPred, - coef = coef, - library.predict = predY, - Z = Z, - cvRisk = getCoef$cvRisk, - family = family, - fitLibrary = get('fitLibrary', envir = fitLibEnv), - id = id, - varNames = varNames, - validRows = validRows, - method = method, - whichScreen = whichScreen, - control = control, - errorsInCVLibrary = errorsInCVLibrary, - errorsInLibrary = errorsInLibrary, - obsWeights = obsWeights, - metaOptimizer = getCoef$optimizer, - env = env, - times = times - ) - class(out) <- c("SuperLearner") - return(out) -} diff --git a/R/summary.CV.SuperLearner.R b/R/summary.CV.SuperLearner.R index 138c241..278c13a 100644 --- a/R/summary.CV.SuperLearner.R +++ b/R/summary.CV.SuperLearner.R @@ -1,10 +1,46 @@ +#' Summary Function for Cross-Validated Super Learner +#' +#' Summary method for the \code{CV.SuperLearner()} function. +#' +#' Summary method for \code{CV.SuperLearner}. Calculates the V-fold +#' cross-validated estimate of either the mean squared error or the \eqn{-2 \log(L)} +#' depending on the loss function used. +#' +#' @param object An object of class "CV.SuperLearner", the result of a call to +#' \code{CV.SuperLearner}. +#' @param obsWeights Optional vector for observation weights. +#' @param x An object of class "summary.CV.SuperLearner", the result of a call +#' to \code{summary.CV.SuperLearner()}. +#' @param digits The number of significant digits to use when printing. +#' @param \dots Ignored. +#' +#' @returns +#' A list with components +#' \item{call}{ The function call from \code{CV.SuperLearner} } +#' \item{method}{ Describes the loss function used. Currently either least squares of negative log Likelihood. } +#' \item{V}{ Number of folds } +#' \item{Risk.SL}{ Risk estimate for the super learner } +#' \item{Risk.dSL}{ Risk estimate for the +#' discrete super learner (the cross-validation selector) } +#' \item{Risk.library}{ A matrix with the risk estimates for each algorithm in the library } +#' \item{Table}{ A table with the mean risk estimate and standard +#' deviation across the folds for the super learner and all algorithms in the +#' library } +#' +#' @author Eric C Polley \email{eric.polley@@nih.gov} +#' @seealso \code{\link{CV.SuperLearner}} +#' @keywords methods + +#' @exportS3Method summary CV.SuperLearner summary.CV.SuperLearner <- function(object, obsWeights = NULL, ...) { if ("env" %in% names(object)) { env = object$env - } else { + } + else { env = parent.frame() } + # Default is "method.NNLS". method <- if (is.null(as.list(object$call)[["method"]])) { method <- "method.NNLS" @@ -69,6 +105,8 @@ summary.CV.SuperLearner <- function(object, obsWeights = NULL, ...) { return(out) } +#' @exportS3Method print summary.CV.SuperLearner +#' @rdname summary.CV.SuperLearner print.summary.CV.SuperLearner <- function(x, digits = max(2, getOption("digits") - 2), ...) { cat("\nCall: ", deparse(x$call, width.cutoff = .9*getOption("width")), "\n", fill = getOption("width")) cat("Risk is based on: ") diff --git a/R/trimLogit.R b/R/trimLogit.R index bf795f0..163f217 100644 --- a/R/trimLogit.R +++ b/R/trimLogit.R @@ -1,6 +1,26 @@ -trimLogit <- function(x, trim=0.00001) { +#' Truncated-probabilities logit transformation +#' +#' Computes the logit transformation on the truncated probabilities. +#' +#' @param x vector of probabilities. +#' @param trim value to truncate probabilities at. Currently symmetric +#' truncation (trim and 1-trim). +#' +#' @returns +#' The logit-transformed trimmed values +#' +#' @keywords models +#' +#' @examples +#' x <- c(0.00000001, 0.0001, 0.001, 0.01, 0.1, 0.3, 0.7, 0.9, 0.99, +#' 0.999, 0.9999, 0.99999999) +#' trimLogit(x, trim = 0.001) +#' data.frame(Prob = x, Logit = qlogis(x), trimLogit = trimLogit(x, 0.001)) + +#' @export +trimLogit <- function(x, trim = 0.00001) { x[x < trim] <- trim - x[x > (1-trim)] <- (1-trim) - foo <- log(x/(1-x)) - return(foo) + x[x > (1 - trim)] <- 1 - trim + + qlogis(x) } \ No newline at end of file diff --git a/R/zzz.R b/R/zzz.R index 9f2edae..a8afc0b 100755 --- a/R/zzz.R +++ b/R/zzz.R @@ -1,5 +1,10 @@ -.onAttach <- function(...) { - packageStartupMessage('Super Learner') - packageStartupMessage('Version: ', utils::packageDescription('SuperLearner')$Version) - packageStartupMessage('Package created on ', utils::packageDescription('SuperLearner')$Date, '\n') +.onAttach <- function(libname, pkgname) { + v <- utils::packageVersion(pkgname) + b <- utils::packageDate(pkgname) + + foo <- sprintf("%s\nVersion: %s\nPackage created on: %s", + pkgname, + v, + if (anyNA(b)) "unknown" else format(b, "%F")) + packageStartupMessage(foo) } diff --git a/man/CV.SuperLearner.Rd b/man/CV.SuperLearner.Rd index 689aa1f..9e53ee2 100644 --- a/man/CV.SuperLearner.Rd +++ b/man/CV.SuperLearner.Rd @@ -1,125 +1,161 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/CV.SuperLearner.R \name{CV.SuperLearner} \alias{CV.SuperLearner} -\alias{print.CV.SuperLearner} -\alias{coef.CV.SuperLearner} - -\title{ -Function to get V-fold cross-validated risk estimate for super learner -} -\description{ -Function to get V-fold cross-validated risk estimate for super learner. This function simply splits the data into V folds and then calls SuperLearner. Most of the arguments are passed directly to SuperLearner. -} +\title{V-fold cross-validated risk estimate for SuperLearner} \usage{ -CV.SuperLearner(Y, X, V = NULL, family = gaussian(), SL.library, - method = "method.NNLS", id = NULL, verbose = FALSE, - control = list(saveFitLibrary = FALSE), cvControl = list(), - innerCvControl = list(), - obsWeights = NULL, saveAll = TRUE, parallel = "seq", env = parent.frame()) +CV.SuperLearner( + Y, + X, + V = NULL, + family = gaussian(), + SL.library, + method = "method.NNLS", + id = NULL, + verbose = FALSE, + control = list(saveFitLibrary = FALSE), + cvControl = list(), + innerCvControl = list(), + obsWeights = NULL, + saveAll = TRUE, + parallel = "seq", + env = parent.frame() +) } - \arguments{ - \item{Y}{ -The outcome. -} - \item{X}{ -The covariates. -} - \item{V}{ -The number of folds for \code{CV.SuperLearner}. This argument will be depreciated and moved into the \code{cvControl}. If Both \code{V} and \code{cvControl} set the number of cross-validation folds, an error message will appear. The recommendation is to use \code{cvControl}. This is not the number of folds for \code{SuperLearner}. The number of folds for \code{SuperLearner} is controlled with \code{innerCvControl}. -} - \item{family}{ -Currently allows \code{gaussian} or \code{binomial} to describe the error distribution. Link function information will be ignored and should be contained in the method argument below. -} - \item{SL.library}{ -Either a character vector of prediction algorithms or a list containing character vectors. See details below for examples on the structure. A list of functions included in the SuperLearner package can be found with \code{listWrappers()}. -} - \item{method}{ -A list (or a function to create a list) containing details on estimating the coefficients for the super learner and the model to combine the individual algorithms in the library. See \code{?method.template} for details. Currently, the built in options are either "method.NNLS" (the default), "method.NNLS2", "method.NNloglik", "method.CC_LS", "method.CC_nloglik", or "method.AUC". NNLS and NNLS2 are non-negative least squares based on the Lawson-Hanson algorithm and the dual method of Goldfarb and Idnani, respectively. NNLS and NNLS2 will work for both gaussian and binomial outcomes. NNloglik is a non-negative binomial likelihood maximization using the BFGS quasi-Newton optimization method. NN* methods are normalized so weights sum to one. CC_LS uses Goldfarb and Idnani's quadratic programming algorithm to calculate the best convex combination of weights to minimize the squared error loss. CC_nloglik calculates the convex combination of weights that minimize the negative binomial log likelihood on the logistic scale using the sequential quadratic programming algorithm. AUC, which only works for binary outcomes, uses the Nelder-Mead method via the optim function to minimize rank loss (equivalent to maximizing AUC). -} - \item{id}{ -Optional cluster identification variable. For the cross-validation splits, \code{id} forces observations in the same cluster to be in the same validation fold. \code{id} is passed to the prediction and screening algorithms in SL.library, but be sure to check the individual wrappers as many of them ignore the information. -} - \item{verbose}{ -Logical; TRUE for printing progress during the computation (helpful for debugging). -} - \item{control}{ -A list of parameters to control the estimation process. Parameters include \code{saveFitLibrary} and \code{trimLogit}. See \code{\link{SuperLearner.control}} for details. -} - \item{cvControl}{ -A list of parameters to control the outer cross-validation process. The outer cross-validation is the sample spliting for evaluating the SuperLearner. Parameters include \code{V}, \code{stratifyCV}, \code{shuffle} and \code{validRows}. See \code{\link{SuperLearner.CV.control}} for details. -} - \item{innerCvControl}{ -A list of lists of parameters to control the inner cross-validation process. It should have \code{V} elements in the list, each a valid \code{cvControl} list. If only a single value, then replicated across all folds. The inner cross-validation are the values passed to each of the \code{V} \code{SuperLearner} calls. Parameters include \code{V}, \code{stratifyCV}, \code{shuffle} and \code{validRows}. See \code{\link{SuperLearner.CV.control}} for details. -} - \item{obsWeights}{ -Optional observation weights variable. As with \code{id} above, \code{obsWeights} is passed to the prediction and screening algorithms, but many of the built in wrappers ignore (or can't use) the information. If you are using observation weights, make sure the library you specify uses the information. -} - \item{saveAll}{ -Logical; Should the entire \code{SuperLearner} object be saved for each fold? -} - \item{parallel}{ -Options for parallel computation of the V-fold step. Use "seq" (the default) for sequential computation. \code{parallel = 'multicore'} to use \code{mclapply} for the V-fold step (but note that \code{SuperLearner()} will still be sequential). The default for mclapply is to check the \code{mc.cores} option, and if not set to default to 2 cores. Be sure to set \code{options()$mc.cores} to the desired number of cores if you don't want the default. Or \code{parallel} can be the name of a snow cluster and will use \code{parLapply} for the V-fold step. For both multicore and snow, the inner \code{SuperLearner} calls will be sequential. - } - \item{env}{ - Environment containing the learner functions. Defaults to the calling environment. - } -} +\item{Y}{The outcome.} -\details{ -The \code{SuperLearner} function builds a estimator, but does not contain an estimate on the performance of the estimator. Various methods exist for estimator performance evaluation. If you are familiar with the super learner algorithm, it should be no surprise we recommend using cross-validation to evaluate the honest performance of the super learner estimator. The function \code{CV.SuperLearner} computes the usual V-fold cross-validated risk estimate for the super learner (and all algorithms in \code{SL.library} for comparison). +\item{X}{The covariates.} + +\item{V}{The number of folds for \code{CV.SuperLearner}. This argument will +be depreciated and moved into the \code{cvControl}. If Both \code{V} and +\code{cvControl} set the number of cross-validation folds, an error message +will appear. The recommendation is to use \code{cvControl}. This is not the +number of folds for \code{SuperLearner}. The number of folds for +\code{SuperLearner} is controlled with \code{innerCvControl}.} + +\item{family}{Currently allows \code{gaussian} or \code{binomial} to +describe the error distribution. Link function information will be ignored +and should be contained in the method argument below.} + +\item{SL.library}{Either a character vector of prediction algorithms or a +list containing character vectors. See details below for examples on the +structure. A list of functions included in the SuperLearner package can be +found with \code{\link[=listWrappers]{listWrappers()}}.} + +\item{method}{A list (or a function to create a list) containing details on +estimating the coefficients for the super learner and the model to combine +the individual algorithms in the library. See \code{?method.template} for +details. Currently, the built in options are either \code{"method.NNLS"} (the +default), \code{"method.NNLS2"}, \code{"method.NNloglik"}, \code{"method.CC_LS"}, +\code{"method.CC_nloglik"}, or \code{"method.AUC"}. NNLS and NNLS2 are non-negative least +squares based on the Lawson-Hanson algorithm and the dual method of Goldfarb +and Idnani, respectively. NNLS and NNLS2 will work for both gaussian and +binomial outcomes. NNloglik is a non-negative binomial likelihood +maximization using the BFGS quasi-Newton optimization method. NN* methods +are normalized so weights sum to one. CC_LS uses Goldfarb and Idnani's +quadratic programming algorithm to calculate the best convex combination of +weights to minimize the squared error loss. CC_nloglik calculates the convex +combination of weights that minimize the negative binomial log likelihood on +the logistic scale using the sequential quadratic programming algorithm. +AUC, which only works for binary outcomes, uses the Nelder-Mead method via +the optim function to minimize rank loss (equivalent to maximizing AUC).} + +\item{id}{Optional cluster identification variable. For the cross-validation +splits, \code{id} forces observations in the same cluster to be in the same +validation fold. \code{id} is passed to the prediction and screening +algorithms in \code{SL.library}, but be sure to check the individual wrappers as +many of them ignore the information.} + +\item{verbose}{Logical; \code{TRUE} for printing progress during the computation +(helpful for debugging).} + +\item{control}{A list of parameters to control the estimation process. +Parameters include \code{saveFitLibrary} and \code{trimLogit}. See +\code{\link{SuperLearner.control}} for details.} + +\item{cvControl}{A list of parameters to control the outer cross-validation +process. The outer cross-validation is the sample spliting for evaluating +the SuperLearner. Parameters include \code{V}, \code{stratifyCV}, +\code{shuffle} and \code{validRows}. See +\code{\link{SuperLearner.CV.control}} for details.} + +\item{innerCvControl}{A list of lists of parameters to control the inner +cross-validation process. It should have \code{V} elements in the list, each +a valid \code{cvControl} list. If only a single value, then replicated +across all folds. The inner cross-validation are the values passed to each +of the \code{V} \code{SuperLearner} calls. Parameters include \code{V}, +\code{stratifyCV}, \code{shuffle} and \code{validRows}. See +\code{\link{SuperLearner.CV.control}} for details.} + +\item{obsWeights}{Optional observation weights variable. As with \code{id} +above, \code{obsWeights} is passed to the prediction and screening +algorithms, but many of the built in wrappers ignore (or can't use) the +information. If you are using observation weights, make sure the library you +specify uses the information.} + +\item{saveAll}{Logical; Should the entire \code{SuperLearner} object be +saved for each fold?} + +\item{parallel}{Options for parallel computation of the V-fold step. Use +"seq" (the default) for sequential computation. \code{parallel = 'multicore'} to use \code{mclapply} for the V-fold step (but note that \code{SuperLearner()} will still be sequential). The default for \code{mclapply()} is +to check the \code{mc.cores} option, and if not set to default to 2 cores. +Be sure to set \code{options()$mc.cores} to the desired number of cores if +you don't want the default. Or \code{parallel} can be the name of a snow +cluster and will use \code{parLapply} for the V-fold step. For both +multicore and snow, the inner \code{SuperLearner} calls will be sequential.} + +\item{env}{Environment containing the learner functions. Defaults to the +calling environment.} } \value{ An object of class \code{CV.SuperLearner} (a list) with components: - \item{call}{ -The matched call. -} -\item{AllSL}{ -If \code{saveAll = TRUE}, a list with output from each call to \code{SuperLearner}, otherwise NULL. -} -\item{SL.predict}{ -The predicted values from the super learner when each particular row was part of the validation fold. -} +\item{call}{ The matched call. } +\item{AllSL}{ If \code{saveAll = TRUE}, a +list with output from each call to \code{SuperLearner}, otherwise NULL. } +\item{SL.predict}{ The predicted values from the super learner when each +particular row was part of the validation fold. } \item{discreteSL.predict}{ -The traditional cross-validated selector. Picks the algorithm with the smallest cross-validated risk (in super learner terms, gives that algorithm coefficient 1 and all others 0). -} -\item{whichDiscreteSL}{ -A list of length \code{V}. The elements in the list are the algorithm that had the smallest cross-validated risk estimate for that fold. -} -\item{library.predict}{ -A matrix with the predicted values from each algorithm in \code{SL.library}. The columns are the algorithms in \code{SL.library} and the rows represent the predicted values when that particular row was in the validation fold (i.e. not used to fit that estimator). -} -\item{coef}{ -A matrix with the coefficients for the super learner on each fold. The columns are the algorithms in \code{SL.library} the rows are the folds. -} -\item{folds}{ -A list containing the row numbers for each validation fold. -} -\item{V}{ -Number of folds for \code{CV.SuperLearner}. -} - \item{libraryNames}{ -A character vector with the names of the algorithms in the library. The format is 'predictionAlgorithm_screeningAlgorithm' with '_All' used to denote the prediction algorithm run on all variables in X. -} - \item{SL.library}{ -Returns \code{SL.library} in the same format as the argument with the same name above. -} -\item{method}{ -A list with the method functions. -} -\item{Y}{ -The outcome +The traditional cross-validated selector. Picks the algorithm with the +smallest cross-validated risk (in super learner terms, gives that algorithm +coefficient 1 and all others 0). } +\item{whichDiscreteSL}{ A list of length +\code{V}. The elements in the list are the algorithm that had the smallest +cross-validated risk estimate for that fold. } +\item{library.predict}{ A +matrix with the predicted values from each algorithm in \code{SL.library}. +The columns are the algorithms in \code{SL.library} and the rows represent +the predicted values when that particular row was in the validation fold +(i.e. not used to fit that estimator). } +\item{coef}{ A matrix with the coefficients for the super learner on each fold. The columns are the +algorithms in \code{SL.library} the rows are the folds. } +\item{folds}{ A list containing the row numbers for each validation fold. } +\item{V}{ Number of folds for \code{CV.SuperLearner}. } +\item{libraryNames}{ A character +vector with the names of the algorithms in the library. The format is +'predictionAlgorithm_screeningAlgorithm' with '_All' used to denote the +prediction algorithm run on all variables in X. } +\item{SL.library}{ Returns \code{SL.library} in the same format as the argument with the same name above. } +\item{method}{ A list with the method functions. } +\item{Y}{ The outcome } } +\description{ +This function simply splits the data into V folds and then calls SuperLearner. +Most of the arguments are passed directly to SuperLearner. } - - -\author{ Eric C Polley \email{epolley@uchicago.edu} } - -\seealso{ -\code{\link{SuperLearner}} +\details{ +The \code{SuperLearner} function builds a estimator, but does not contain an +estimate on the performance of the estimator. Various methods exist for +estimator performance evaluation. If you are familiar with the super learner +algorithm, it should be no surprise we recommend using cross-validation to +evaluate the honest performance of the super learner estimator. The function +\code{CV.SuperLearner} computes the usual V-fold cross-validated risk +estimate for the super learner (and all algorithms in \code{SL.library} for +comparison). } - \examples{ + \dontrun{ set.seed(23432) ## training set @@ -140,7 +176,7 @@ summary(test) ## Look at the coefficients across folds coef(test) -# Example with specifying cross-validation options for both +# Example with specifying cross-validation options for both # CV.SuperLearner (cvControl) and the internal SuperLearners (innerCvControl) test <- CV.SuperLearner(Y = Y, X = X, SL.library = SL.library, cvControl = list(V = 10, shuffle = FALSE), @@ -156,6 +192,12 @@ testSNOW <- CV.SuperLearner(Y = Y, X = X, SL.library = SL.library, method = "met summary(testSNOW) stopCluster(cl) } -} +} +\seealso{ +\code{\link{SuperLearner}} +} +\author{ +Eric C Polley \email{epolley@uchicago.edu} +} \keyword{models} diff --git a/man/CVFolds.Rd b/man/CVFolds.Rd index 9110793..faf0936 100644 --- a/man/CVFolds.Rd +++ b/man/CVFolds.Rd @@ -1,36 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/CVFolds.R \name{CVFolds} \alias{CVFolds} - -\title{ -Generate list of row numbers for each fold in the cross-validation -} -\description{ -Generate list of row numbers for each fold in the cross-validation. \code{CVFolds} is used in the \code{SuperLearner} to create the cross-validation splits. -} +\title{Generate list of row numbers for each fold in the cross-validation} \usage{ CVFolds(N, id, Y, cvControl) } \arguments{ - \item{N}{ -Sample size -} - \item{id}{ -Optional cluster id variable. If present, all observations in the same cluster will always be in the same split. -} - \item{Y}{ -outcome -} - \item{cvControl}{ -Control parameters for the cross-validation step. See \code{\link{SuperLearner.CV.control}} for details. - } -} +\item{N}{Sample size} +\item{id}{Optional cluster id variable. If present, all observations in the +same cluster will always be in the same split.} + +\item{Y}{outcome} + +\item{cvControl}{Control parameters for the cross-validation step. See +\code{\link{SuperLearner.CV.control}} for details.} +} \value{ -\item{validRows}{ -A list of length V where each element in the list is a vector with the row numbers of the corresponding validation sample. +A list of length V where each element in the list +is a vector with the row numbers of the corresponding validation sample. } +\description{ +\code{CVFolds} is used in the \code{SuperLearner} to create the +cross-validation splits. +} +\author{ +Eric C Polley \email{epolley@uchicago.edu} } - -\author{ Eric C Polley \email{epolley@uchicago.edu} } - \keyword{utilities} diff --git a/man/SL.bartMachine.Rd b/man/SL.bartMachine.Rd index dc42207..31b4077 100644 --- a/man/SL.bartMachine.Rd +++ b/man/SL.bartMachine.Rd @@ -1,23 +1,41 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/SL.bartMachine.R -\encoding{utf-8} \name{SL.bartMachine} \alias{SL.bartMachine} +\alias{predict.SL.bartMachine} \title{Wrapper for bartMachine learner} \usage{ -SL.bartMachine(Y, X, newX, family, obsWeights, id, num_trees = 50, - num_burn_in = 250, verbose = F, alpha = 0.95, beta = 2, k = 2, - q = 0.9, nu = 3, num_iterations_after_burn_in = 1000, ...) +SL.bartMachine( + Y, + X, + newX = X, + family = gaussian(), + obsWeights = NULL, + id, + num_trees = 50, + num_burn_in = 250, + verbose = F, + alpha = 0.95, + beta = 2, + k = 2, + q = 0.9, + nu = 3, + num_iterations_after_burn_in = 1000, + ... +) + +\method{predict}{SL.bartMachine}(object, newdata, ...) } \arguments{ -\item{Y}{Outcome variable} +\item{Y}{The outcome in the training data set. Must be a numeric vector.} -\item{X}{Covariate dataframe} +\item{X}{The predictor variables in the training data set, usually a data.frame.} -\item{newX}{Optional dataframe to predict the outcome} +\item{newX}{The predictor variables in the validation data set. The +structure should match X.} -\item{family}{"gaussian" for regression, "binomial" for binary -classification} +\item{family}{Either \code{\link[=gaussian]{gaussian()}} or \code{\link[=binomial]{binomial()}} to +describe the error distribution. Link function information will be ignored.} \item{obsWeights}{Optional observation-level weights (supported but not tested)} @@ -37,11 +55,11 @@ nonterminal or not.} \item{beta}{Power hyperparameter in tree prior for whether a node is nonterminal or not.} -\item{k}{For regression, k determines the prior probability that E(Y|X) is -contained in the interval (y_{min}, y_{max}), based on a normal -distribution. For example, when k=2, the prior probability is 95\%. For -classification, k determines the prior probability that E(Y|X) is between -(-3,3). Note that a larger value of k results in more shrinkage and a more +\item{k}{For regression, \code{k} determines the prior probability that \eqn{E(Y|X)} is +contained in the interval \eqn{[y_\text{min}, y_\text{max}]}, based on a normal +distribution. For example, when \code{k = 2}, the prior probability is 95\%. For +classification, \code{k} determines the prior probability that \eqn{E(Y|X)} is between +-3 and 3. Note that a larger value of \code{k} results in more shrinkage and a more conservative fit.} \item{q}{Quantile of the prior on the error variance at which the data-based @@ -49,14 +67,18 @@ estimate is placed. Note that the larger the value of q, the more aggressive the fit as you are placing more prior weight on values lower than the data-based estimate. Not used for classification.} -\item{nu}{Degrees of freedom for the inverse chi^2 prior. Not used for +\item{nu}{Degrees of freedom for the inverse \eqn{\chi^2} prior. Not used for classification.} \item{num_iterations_after_burn_in}{Number of MCMC samples to draw from the -posterior distribution of f(x).} +posterior distribution of \eqn{f(x)}.} \item{...}{Additional arguments (not used)} + +\item{object}{an object resulting from the fitting function.} + +\item{newdata}{a data frame of predictors for which to compute predictions.} } \description{ -Support bayesian additive regression trees via the bartMachine package. +Support Bayesian additive regression trees via the \pkg{bartMachine} package. } diff --git a/man/SL.bayesglm.Rd b/man/SL.bayesglm.Rd new file mode 100644 index 0000000..1b1c453 --- /dev/null +++ b/man/SL.bayesglm.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/SL.bayesglm.R +\name{SL.bayesglm} +\alias{SL.bayesglm} +\alias{predict.SL.bayesglm} +\title{Wrapper for Bayesian GLM learner using \code{arm}} +\usage{ +SL.bayesglm(Y, X, newX = X, family = gaussian(), obsWeights = NULL, ...) + +\method{predict}{SL.bayesglm}(object, newdata, ...) +} +\arguments{ +\item{Y}{The outcome in the training data set. Must be a numeric vector.} + +\item{X}{The predictor variables in the training data set, usually a data.frame.} + +\item{newX}{The predictor variables in the validation data set. The +structure should match X.} + +\item{family}{Either \code{\link[=gaussian]{gaussian()}} or \code{\link[=binomial]{binomial()}} to +describe the error distribution. Link function information will be ignored.} + +\item{obsWeights}{Optional observation weights.} + +\item{...}{For SL wrappers, other remaining arguments. For \code{write.SL.template()}, arguments passed to \code{\link[=cat]{cat()}}.} + +\item{object}{an object resulting from the fitting function.} + +\item{newdata}{a data frame of predictors for which to compute predictions.} +} +\description{ +Support Bayesian GLM via the \pkg{arm} package. +} diff --git a/man/SL.biglasso.Rd b/man/SL.biglasso.Rd index 6a403ea..f2506ff 100644 --- a/man/SL.biglasso.Rd +++ b/man/SL.biglasso.Rd @@ -2,22 +2,37 @@ % Please edit documentation in R/SL.biglasso.R \name{SL.biglasso} \alias{SL.biglasso} +\alias{predict.SL.biglasso} \title{SL wrapper for biglasso} \usage{ -SL.biglasso(Y, X, newX, family, obsWeights, penalty = "lasso", - alg.logistic = "Newton", screen = "SSR", alpha = 1, nlambda = 100, - eval.metric = "default", ncores = 1, nfolds = 5, ...) +SL.biglasso( + Y, + X, + newX = X, + family = gaussian(), + penalty = "lasso", + alg.logistic = "Newton", + screen = "SSR", + alpha = 1, + nlambda = 100, + eval.metric = "default", + ncores = 1, + nfolds = 5, + ... +) + +\method{predict}{SL.biglasso}(object, newdata, ...) } \arguments{ -\item{Y}{Outcome variable} +\item{Y}{The outcome in the training data set. Must be a numeric vector.} -\item{X}{Training dataframe} +\item{X}{The predictor variables in the training data set, usually a data.frame.} -\item{newX}{Test dataframe} +\item{newX}{The predictor variables in the validation data set. The +structure should match X.} -\item{family}{Gaussian or binomial} - -\item{obsWeights}{Observation-level weights} +\item{family}{Either \code{\link[=gaussian]{gaussian()}} or \code{\link[=binomial]{binomial()}} to +describe the error distribution. Link function information will be ignored.} \item{penalty}{The penalty to be applied to the model. Either "lasso" (default), "ridge", or "enet" (elastic net).} @@ -51,13 +66,16 @@ cluster created by the \code{parallel} package.} \item{nfolds}{The number of cross-validation folds. Default is 5.} -\item{...}{Any additional arguments, not currently used.} +\item{...}{For SL wrappers, other remaining arguments. For \code{write.SL.template()}, arguments passed to \code{\link[=cat]{cat()}}.} + +\item{object}{an object resulting from the fitting function.} + +\item{newdata}{a data frame of predictors for which to compute predictions.} } \description{ SL wrapper for biglasso } \examples{ - data(Boston, package = "MASS") Y = Boston$medv # Remove outcome from covariate dataframe. @@ -84,6 +102,6 @@ Data. https://CRAN.R-project.org/package=biglasso. } \seealso{ \code{\link{predict.SL.biglasso}} \code{\link[biglasso]{biglasso}} - \code{\link[biglasso]{cv.biglasso}} - \code{\link[biglasso]{predict.biglasso}} \code{\link{SL.glmnet}} +\code{\link[biglasso]{cv.biglasso}} +\code{\link[biglasso]{predict.biglasso}} \code{\link{SL.glmnet}} } diff --git a/man/SL.caret.Rd b/man/SL.caret.Rd new file mode 100644 index 0000000..6dc5893 --- /dev/null +++ b/man/SL.caret.Rd @@ -0,0 +1,45 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/SL.caret.R +\name{SL.caret} +\alias{SL.caret} +\title{SL wrapper for machine learning learner using \code{caret}} +\usage{ +SL.caret( + Y, + X, + newX = X, + family = gaussian(), + obsWeights = NULL, + method = "rf", + tuneLength = 3, + trControl = caret::trainControl(method = "cv", number = 10, verboseIter = TRUE), + metric = ifelse(family$family == "gaussian", "RMSE", "Accuracy"), + ... +) +} +\arguments{ +\item{Y}{The outcome in the training data set. Must be a numeric vector.} + +\item{X}{The predictor variables in the training data set, usually a data.frame.} + +\item{newX}{The predictor variables in the validation data set. The +structure should match X.} + +\item{family}{Either \code{\link[=gaussian]{gaussian()}} or \code{\link[=binomial]{binomial()}} to +describe the error distribution. Link function information will be ignored.} + +\item{obsWeights}{Optional observation weights.} + +\item{method, tuneLength, trControl, metric}{arguments passed to \code{caret::train()}} + +\item{...}{For SL wrappers, other remaining arguments. For \code{write.SL.template()}, arguments passed to \code{\link[=cat]{cat()}}.} +} +\description{ +Support machine learning via the \pkg{caret} package. +} +\examples{ +# how to change to a different method: +SL.caret.rpart <- function(..., method = "rpart") { + SL.caret(..., method = method) +} +} diff --git a/man/SL.cforest.Rd b/man/SL.cforest.Rd index 27adbdf..f8fc785 100644 --- a/man/SL.cforest.Rd +++ b/man/SL.cforest.Rd @@ -2,43 +2,52 @@ % Please edit documentation in R/SL.cforest.R \name{SL.cforest} \alias{SL.cforest} -\title{cforest {party}} +\alias{predict.SL.cforest} +\title{SL wrapper for \code{party::cforest()}} \usage{ -SL.cforest(Y, X, newX, family, obsWeights, id, ntree = 1000, - mtry = max(floor(ncol(X)/3), 1), mincriterion = 0, teststat = "quad", - testtype = "Univ", replace = F, fraction = 0.632, ...) +SL.cforest( + Y, + X, + newX = X, + family = gaussian(), + obsWeights = NULL, + ntree = 1000, + mtry = max(floor(ncol(X)/3), 1), + mincriterion = 0, + teststat = "quad", + testtype = "Univ", + replace = F, + fraction = 0.632, + ... +) + +\method{predict}{SL.cforest}(object, newdata, ...) } \arguments{ -\item{Y}{Outcome variable} +\item{Y}{The outcome in the training data set. Must be a numeric vector.} -\item{X}{Covariate dataframe} +\item{X}{The predictor variables in the training data set, usually a data.frame.} -\item{newX}{Optional dataframe to predict the outcome} +\item{newX}{The predictor variables in the validation data set. The +structure should match X.} -\item{family}{"gaussian" for regression, "binomial" for binary -classification} +\item{family}{Either \code{\link[=gaussian]{gaussian()}} or \code{\link[=binomial]{binomial()}} to +describe the error distribution. Link function information will be ignored.} -\item{obsWeights}{Optional observation-level weights (supported but not tested)} +\item{obsWeights}{Optional observation weights.} -\item{id}{Optional id to group observations from the same unit (not used -currently).} +\item{ntree}{Number of trees.} -\item{ntree}{Number of trees} +\item{mtry}{Number of randomly selected features per node.} -\item{mtry}{Number of randomly selected features per node} +\item{mincriterion, teststat, testtype, replace, fraction}{Arguments passed to \code{party::cforest_control()}.} -\item{mincriterion}{See ?cforest_control} +\item{...}{For SL wrappers, other remaining arguments. For \code{write.SL.template()}, arguments passed to \code{\link[=cat]{cat()}}.} -\item{teststat}{See ?cforest_control} +\item{object}{an object resulting from the fitting function.} -\item{testtype}{See ?cforest_control} - -\item{replace}{See ?cforest_control} - -\item{fraction}{See ?cforest_control} - -\item{...}{Remaining arguments (unused)} +\item{newdata}{a data frame of predictors for which to compute predictions.} } \description{ -These defaults emulate cforest_unbiased() but allow customization. +These defaults emulate \code{party::cforest_unbiased()} but allow customization. } diff --git a/man/SL.earth.Rd b/man/SL.earth.Rd new file mode 100644 index 0000000..ce70b44 --- /dev/null +++ b/man/SL.earth.Rd @@ -0,0 +1,47 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/SL.earth.R +\name{SL.earth} +\alias{SL.earth} +\alias{predict.SL.earth} +\title{SL wrapper for \code{earth::earth()}} +\usage{ +SL.earth( + Y, + X, + newX = X, + family = gaussian(), + degree = 2, + penalty = 3, + nk = max(21, 2 * ncol(X) + 1), + pmethod = "backward", + nfold = 0, + ncross = 1, + minspan = 0, + endspan = 0, + ... +) + +\method{predict}{SL.earth}(object, newdata, ...) +} +\arguments{ +\item{Y}{The outcome in the training data set. Must be a numeric vector.} + +\item{X}{The predictor variables in the training data set, usually a data.frame.} + +\item{newX}{The predictor variables in the validation data set. The +structure should match X.} + +\item{family}{Either \code{\link[=gaussian]{gaussian()}} or \code{\link[=binomial]{binomial()}} to +describe the error distribution. Link function information will be ignored.} + +\item{degree, penalty, nk, pmethod, nfold, ncross, minspan, endspan}{Arguments passed to \code{earth::earth()}.} + +\item{...}{For SL wrappers, other remaining arguments. For \code{write.SL.template()}, arguments passed to \code{\link[=cat]{cat()}}.} + +\item{object}{an object resulting from the fitting function.} + +\item{newdata}{a data frame of predictors for which to compute predictions.} +} +\description{ +SL wrapper for \code{earth::earth()} +} diff --git a/man/SL.gam.Rd b/man/SL.gam.Rd new file mode 100644 index 0000000..3047da9 --- /dev/null +++ b/man/SL.gam.Rd @@ -0,0 +1,49 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/SL.gam.R +\name{SL.gam} +\alias{SL.gam} +\alias{predict.SL.gam} +\title{SL wrapper for GAMs} +\usage{ +SL.gam( + Y, + X, + newX = X, + family = gaussian(), + obsWeights = NULL, + deg.gam = 2, + cts.num = 4, + ... +) + +\method{predict}{SL.gam}(object, newdata, ...) +} +\arguments{ +\item{Y}{The outcome in the training data set. Must be a numeric vector.} + +\item{X}{The predictor variables in the training data set, usually a data.frame.} + +\item{newX}{The predictor variables in the validation data set. The +structure should match X.} + +\item{family}{Either \code{\link[=gaussian]{gaussian()}} or \code{\link[=binomial]{binomial()}} to +describe the error distribution. Link function information will be ignored.} + +\item{obsWeights}{Optional observation weights.} + +\item{deg.gam}{Degree of smoothing (passed to the \code{df} argument of \code{gam::s()}).} + +\item{cts.num}{Number of levels required for a variable to be consider continuous (and therefore given a smoothing spline).} + +\item{...}{For SL wrappers, other remaining arguments. For \code{write.SL.template()}, arguments passed to \code{\link[=cat]{cat()}}.} + +\item{object}{an object resulting from the fitting function.} + +\item{newdata}{a data frame of predictors for which to compute predictions.} +} +\description{ +Wrapper for generalized additivie models (GAMs) using \code{gam::gam()}. +} +\examples{ +# easy to add additional algorithms with different degrees +} diff --git a/man/SL.glm.Rd b/man/SL.glm.Rd index b2471a1..2d45f81 100644 --- a/man/SL.glm.Rd +++ b/man/SL.glm.Rd @@ -2,34 +2,84 @@ % Please edit documentation in R/SL.glm.R \name{SL.glm} \alias{SL.glm} -\title{Wrapper for glm} +\alias{SL.glm.interaction} +\alias{SL.lm} +\alias{SL.mean} +\alias{predict.SL.glm} +\title{SL wrapper for \code{glm()}} \usage{ -SL.glm(Y, X, newX, family, obsWeights, model = TRUE, ...) +SL.glm( + Y, + X, + newX = X, + family = gaussian(), + obsWeights = NULL, + model = TRUE, + ... +) + +SL.glm.interaction( + Y, + X, + newX = X, + family = gaussian(), + obsWeights = NULL, + model = TRUE, + ... +) + +SL.lm( + Y, + X, + newX = X, + family = gaussian(), + obsWeights = NULL, + model = TRUE, + ... +) + +SL.mean( + Y, + X, + newX = X, + family = gaussian(), + obsWeights = NULL, + model = TRUE, + ... +) + +\method{predict}{SL.glm}(object, newdata, ...) } \arguments{ -\item{Y}{Outcome variable} +\item{Y}{The outcome in the training data set. Must be a numeric vector.} -\item{X}{Training dataframe} +\item{X}{The predictor variables in the training data set, usually a data.frame.} -\item{newX}{Test dataframe} +\item{newX}{The predictor variables in the validation data set. The +structure should match X.} -\item{family}{Gaussian or binomial} +\item{family}{Either \code{\link[=gaussian]{gaussian()}} or \code{\link[=binomial]{binomial()}} to +describe the error distribution. Link function information will be ignored.} -\item{obsWeights}{Observation-level weights} +\item{obsWeights}{Optional observation weights.} -\item{model}{Whether to save model.matrix of data in fit object. Set to FALSE +\item{model}{Whether to save \code{model.matrix} of data in fit object. Set to \code{FALSE} to save memory.} -\item{...}{Any remaining arguments, not used.} +\item{...}{Ignored.} + +\item{object}{an object resulting from the fitting function.} + +\item{newdata}{a data frame of predictors for which to compute predictions.} } \description{ -Wrapper for generalized linear models via glm(). - +\code{SL.glm()} is a wrapper for generalized linear models (GLMs) fit with \code{\link[=glm]{glm()}}. \code{SL.glm.interaction()} is the same but automatically includes all 2-way interactions between predictors. \code{SL.lm()} is a wrapper for linear models fit with \code{\link[=lm]{lm()}}. \code{SL.mean()} is a wrapper for an intercept-only model. +} +\details{ Note that for outcomes bounded by [0, 1] the binomial family can be used in addition to gaussian. } \examples{ - data(Boston, package = "MASS") Y = Boston$medv # Remove outcome from covariate dataframe. @@ -37,11 +87,11 @@ X = Boston[, -14] set.seed(1) -sl = SuperLearner(Y, X, family = gaussian(), - SL.library = c("SL.mean", "SL.glm")) +sl <- SuperLearner(Y, X, family = gaussian(), + SL.library = c("SL.mean", "SL.glm", + "SL.glm.interaction")) print(sl) - } \references{ Fox, J. (2015). Applied regression analysis and generalized linear models. @@ -49,5 +99,5 @@ Sage Publications. } \seealso{ \code{\link{predict.SL.glm}} \code{\link[stats]{glm}} - \code{\link[stats]{predict.glm}} \code{\link{SL.speedglm}} +\code{\link[stats]{predict.glm}} \code{\link{SL.speedglm}} } diff --git a/man/SL.glmnet.Rd b/man/SL.glmnet.Rd index 8cbb814..a66f2e5 100644 --- a/man/SL.glmnet.Rd +++ b/man/SL.glmnet.Rd @@ -2,27 +2,46 @@ % Please edit documentation in R/SL.glmnet.R \name{SL.glmnet} \alias{SL.glmnet} -\title{Elastic net regression, including lasso and ridge} +\alias{predict.SL.glmnet} +\title{SL wrapper for elastic net regression, including lasso and ridge} \usage{ -SL.glmnet(Y, X, newX, family, obsWeights, id, alpha = 1, nfolds = 10, - nlambda = 100, useMin = TRUE, loss = "deviance", ...) +SL.glmnet( + Y, + X, + newX, + family, + obsWeights, + id, + alpha = 1, + nfolds = 10, + nlambda = 100, + useMin = TRUE, + loss = "deviance", + ... +) + +\method{predict}{SL.glmnet}( + object, + newdata, + remove_extra_cols = TRUE, + add_missing_cols = TRUE, + ... +) } \arguments{ -\item{Y}{Outcome variable} +\item{Y}{The outcome in the training data set. Must be a numeric vector.} -\item{X}{Covariate dataframe} +\item{X}{The predictor variables in the training data set, usually a data.frame.} -\item{newX}{Dataframe to predict the outcome} +\item{newX}{The predictor variables in the validation data set. The +structure should match X.} -\item{family}{"gaussian" for regression, "binomial" for binary -classification. Untested options: "multinomial" for multiple classification -or "mgaussian" for multiple response, "poisson" for non-negative outcome -with proportional mean and variance, "cox".} +\item{family}{Either \code{\link[=gaussian]{gaussian()}} or \code{\link[=binomial]{binomial()}} to +describe the error distribution. Link function information will be ignored.} -\item{obsWeights}{Optional observation-level weights} +\item{obsWeights}{Optional observation weights.} -\item{id}{Optional id to group observations from the same unit (not used -currently).} +\item{id}{Optional cluster identification variable.} \item{alpha}{Elastic net mixing parameter, range [0, 1]. 0 = ridge regression and 1 = lasso.} @@ -39,7 +58,17 @@ background).} \item{loss}{Loss function, can be "deviance", "mse", or "mae". If family = binomial can also be "auc" or "class" (misclassification error).} -\item{...}{Any additional arguments are passed through to cv.glmnet.} +\item{...}{Any additional arguments are passed through to \code{cv.glmnet()}.} + +\item{object}{an object resulting from the fitting function.} + +\item{newdata}{a data frame of predictors for which to compute predictions.} + +\item{remove_extra_cols}{Remove any extra columns in the new data that were +not part of the original model.} + +\item{add_missing_cols}{Add any columns from original data that do not exist +in the new data, and set values to 0.} } \description{ Penalized regression using elastic net. Alpha = 0 corresponds to ridge @@ -84,5 +113,5 @@ Methodology), 67(2), 301-320. } \seealso{ \code{\link{predict.SL.glmnet}} \code{\link[glmnet]{cv.glmnet}} - \code{\link[glmnet]{glmnet}} +\code{\link[glmnet]{glmnet}} } diff --git a/man/SL.kernelKnn.Rd b/man/SL.kernelKnn.Rd index 3cb7d3b..4f8a920 100644 --- a/man/SL.kernelKnn.Rd +++ b/man/SL.kernelKnn.Rd @@ -2,19 +2,34 @@ % Please edit documentation in R/SL.kernelKnn.R \name{SL.kernelKnn} \alias{SL.kernelKnn} +\alias{predict.SL.kernelKnn} \title{SL wrapper for KernelKNN} \usage{ -SL.kernelKnn(Y, X, newX, family, k = 10, method = "euclidean", - weights_function = NULL, extrema = F, h = 1, ...) +SL.kernelKnn( + Y, + X, + newX = X, + family = gaussian(), + k = 10, + method = "euclidean", + weights_function = NULL, + extrema = FALSE, + h = 1, + ... +) + +\method{predict}{SL.kernelKnn}(object, newdata, ...) } \arguments{ -\item{Y}{Outcome variable} +\item{Y}{The outcome in the training data set. Must be a numeric vector.} -\item{X}{Training dataframe} +\item{X}{The predictor variables in the training data set, usually a data.frame.} -\item{newX}{Test dataframe} +\item{newX}{The predictor variables in the validation data set. The +structure should match X.} -\item{family}{Gaussian or binomial} +\item{family}{Either \code{\link[=gaussian]{gaussian()}} or \code{\link[=binomial]{binomial()}} to +describe the error distribution. Link function information will be ignored.} \item{k}{Number of nearest neighbors to use} @@ -35,18 +50,17 @@ k-nearest-neighbors will be removed (can be thought as outlier removal).} \item{h}{the bandwidth, applicable if the weights_function is not NULL. Defaults to 1.0.} -\item{...}{Any additional parameters, not currently passed through.} -} -\value{ -List with predictions and the original training data & - hyperparameters. +\item{...}{For SL wrappers, other remaining arguments. For \code{write.SL.template()}, arguments passed to \code{\link[=cat]{cat()}}.} + +\item{object}{an object resulting from the fitting function.} + +\item{newdata}{a data frame of predictors for which to compute predictions.} } \description{ Wrapper for a configurable implementation of k-nearest - neighbors. Supports both binomial and gaussian outcome distributions. +neighbors. Supports both binomial and gaussian outcome distributions. } \examples{ - # Load a test dataset. data(PimaIndiansDiabetes2, package = "mlbench") @@ -63,5 +77,4 @@ set.seed(1) sl = SuperLearner(Y_bin, X, family = binomial(), SL.library = c("SL.mean", "SL.kernelKnn")) sl - } diff --git a/man/SL.ksvm.Rd b/man/SL.ksvm.Rd index d96fb7a..a30af29 100644 --- a/man/SL.ksvm.Rd +++ b/man/SL.ksvm.Rd @@ -1,23 +1,43 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/SL.ksvm.R -\encoding{utf-8} \name{SL.ksvm} \alias{SL.ksvm} -\title{Wrapper for Kernlab's SVM algorithm} +\alias{predict.SL.ksvm} +\title{SL wrapper for Kernlab's SVM algorithm} \usage{ -SL.ksvm(Y, X, newX, family, type = NULL, kernel = "rbfdot", - kpar = "automatic", scaled = T, C = 1, nu = 0.2, epsilon = 0.1, - cross = 0, prob.model = family$family == "binomial", - class.weights = NULL, cache = 40, tol = 0.001, shrinking = T, ...) +SL.ksvm( + Y, + X, + newX = X, + family = gaussian(), + type = NULL, + kernel = "rbfdot", + kpar = "automatic", + scaled = TRUE, + C = 1, + nu = 0.2, + epsilon = 0.1, + cross = 0, + prob.model = family$family == "binomial", + class.weights = NULL, + cache = 40, + tol = 0.001, + shrinking = TRUE, + ... +) + +\method{predict}{SL.ksvm}(object, newdata, family, coupler = "minpair", ...) } \arguments{ -\item{Y}{Outcome variable} +\item{Y}{The outcome in the training data set. Must be a numeric vector.} -\item{X}{Training dataframe} +\item{X}{The predictor variables in the training data set, usually a data.frame.} -\item{newX}{Test dataframe} +\item{newX}{The predictor variables in the validation data set. The +structure should match X.} -\item{family}{Gaussian or binomial} +\item{family}{Either \code{\link[=gaussian]{gaussian()}} or \code{\link[=binomial]{binomial()}} to +describe the error distribution. Link function information will be ignored.} \item{type}{ksvm can be used for classification , for regression, or for novelty detection. Depending on whether y is a factor or not, the default @@ -69,17 +89,19 @@ used for asymmetric class sizes. Not all factor levels have to be supplied \item{shrinking}{option whether to use the shrinking-heuristics (default: TRUE)} -\item{...}{Any additional parameters, not currently passed through.} -} -\value{ -List with predictions and the original training data & - hyperparameters. +\item{...}{For SL wrappers, other remaining arguments. For \code{write.SL.template()}, arguments passed to \code{\link[=cat]{cat()}}.} + +\item{object}{an object resulting from the fitting function.} + +\item{newdata}{a data frame of predictors for which to compute predictions.} + +\item{coupler}{Coupling method used in the multiclass case, can be one of +minpair or pkpd (see kernlab package for details). For future usage.} } \description{ Wrapper for Kernlab's support vector machine algorithm. } \examples{ - data(Boston, package = "MASS") Y = Boston$medv # Remove outcome from covariate dataframe. @@ -94,10 +116,11 @@ sl pred = predict(sl, X) summary(pred$pred) + } \references{ Hsu, C. W., Chang, C. C., & Lin, C. J. (2016). A practical guide to support -vector classification. \url{https://www.csie.ntu.edu.tw/~cjlin/papers/guide/guide.pdf} +vector classification. \url{http://www.csie.ntu.edu.tw/~cjlin/papers/guide/guide.pdf} Scholkopf, B., & Smola, A. J. (2001). Learning with kernels: support vector machines, regularization, optimization, and beyond. MIT press. @@ -110,5 +133,5 @@ package for kernel methods in R. Journal of statistical software, 11(9), } \seealso{ \code{\link{predict.SL.ksvm}} \code{\link[kernlab]{ksvm}} - \code{\link[kernlab]{predict.ksvm}} +\code{\link[kernlab]{predict.ksvm}} } diff --git a/man/SL.lda.Rd b/man/SL.lda.Rd index 23fffcb..d41b6fe 100644 --- a/man/SL.lda.Rd +++ b/man/SL.lda.Rd @@ -2,47 +2,101 @@ % Please edit documentation in R/SL.lda.R \name{SL.lda} \alias{SL.lda} -\title{SL wrapper for MASS:lda} +\alias{SL.qda} +\alias{predict.SL.lda} +\alias{predict.SL.qda} +\title{SL wrappers for \code{MASS::lda()} and \code{MASS::qda()}} \usage{ -SL.lda(Y, X, newX, family, obsWeights = rep(1, nrow(X)), id = NULL, - verbose = F, prior = as.vector(prop.table(table(Y))), method = "mle", - tol = 1e-04, CV = F, nu = 5, ...) +SL.lda( + Y, + X, + newX = X, + family = binomial(), + verbose = FALSE, + prior = as.vector(prop.table(table(Y))), + method = "mle", + tol = 1e-04, + CV = FALSE, + nu = 5, + ... +) + +SL.qda( + Y, + X, + newX = X, + family = binomial(), + verbose = FALSE, + prior = as.vector(prop.table(table(Y))), + method = "mle", + tol = 1e-04, + CV = FALSE, + nu = 5, + ... +) + +\method{predict}{SL.lda}( + object, + newdata, + prior = object$object$prior, + dimen = NULL, + method = "plug-in", + ... +) + +\method{predict}{SL.qda}( + object, + newdata, + prior = object$object$prior, + dimen = NULL, + method = "plug-in", + ... +) } \arguments{ -\item{Y}{Outcome variable} +\item{Y}{The outcome in the training data set. Must be a numeric vector.} -\item{X}{Training dataframe} +\item{X}{The predictor variables in the training data set, usually a data.frame.} -\item{newX}{Test dataframe} +\item{newX}{The predictor variables in the validation data set. The +structure should match X.} \item{family}{Binomial only, cannot be used for regression.} -\item{obsWeights}{Observation-level weights} - -\item{id}{Not supported.} - \item{verbose}{If TRUE, display additional output during execution.} \item{prior}{the prior probabilities of class membership. If unspecified, the -class proportions for the training set are used. If present, the -probabilities should be specified in the order of the factor levels.} +class proportions for the training set are used. For \code{SL.lda()} and \code{SL.qda()}, if present, the +probabilities should be specified in the order of the factor levels. For \code{predict.SL.lda()}, defaults to the +proportions in the training set or what was set in the call to \code{lda()} or \code{qda()}.} -\item{method}{"moment" for standard estimators of the mean and variance, +\item{method}{for \code{SL.lda()} and \code{SL.qda()}, "moment" for standard estimators of the mean and variance, "mle" for MLEs, "mve" to use cov.mve, or "t" for robust estimates based on -a t distribution.} +a t distribution. For \code{predict.SL.lda()} and \code{predict.SL.qda()}, this determines how the parameter estimation is handled. With "plug-in" (the default) the usual unbiased parameter estimates are used and +assumed to be correct. With "debiased" an unbiased estimator of the log +posterior probabilities is used, and with "predictive" the parameter +estimates are integrated out using a vague prior.} \item{tol}{tolerance} -\item{CV}{If true, returns results (classes and posterior probabilities) for +\item{CV}{If \code{TRUE}, returns results (classes and posterior probabilities) for leave-one-out cross-validation. Note that if the prior is estimated, the proportions in the whole dataset are used.} -\item{nu}{degrees of freedom for method = "t".} +\item{nu}{degrees of freedom for \code{method = "t"}.} + +\item{...}{For SL wrappers, other remaining arguments. For \code{write.SL.template()}, arguments passed to \code{\link[=cat]{cat()}}.} + +\item{object}{an object resulting from the fitting function.} + +\item{newdata}{a data frame of predictors for which to compute predictions.} -\item{...}{Any additional arguments, not currently used.} +\item{dimen}{the dimension of the space to be used. If this is less than +\code{min(p, ng-1)}, only the first dimen discriminant components are used (except +for \code{method="predictive"}), and only those dimensions are returned in x.} } \description{ -Linear discriminant analysis, used for classification. +Linear and quadratic discriminant analysis, used for classification. } \examples{ @@ -55,7 +109,8 @@ set.seed(1) # Use only 2 CV folds to speed up example. sl = SuperLearner(Y, X, family = binomial(), cvControl = list(V = 2), - SL.library = c("SL.mean", "SL.lda")) + SL.library = c("SL.mean", "SL.lda", + "SL.qda")) sl pred = predict(sl, X) @@ -68,5 +123,5 @@ to Statistical Learning (Vol. 6). New York: Springer. Section 4.4. } \seealso{ \code{\link{predict.SL.lda}} \code{\link[MASS]{lda}} - \code{\link[MASS]{predict.lda}} \code{\link{SL.qda}} +\code{\link[MASS]{predict.lda}} \code{\link{SL.qda}} } diff --git a/man/SL.lm.Rd b/man/SL.lm.Rd deleted file mode 100644 index 058d911..0000000 --- a/man/SL.lm.Rd +++ /dev/null @@ -1,50 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/SL.lm.R -\name{SL.lm} -\alias{SL.lm} -\title{Wrapper for lm} -\usage{ -SL.lm(Y, X, newX, family, obsWeights, model = TRUE, ...) -} -\arguments{ -\item{Y}{Outcome variable} - -\item{X}{Training dataframe} - -\item{newX}{Test dataframe} - -\item{family}{Gaussian or binomial} - -\item{obsWeights}{Observation-level weights} - -\item{model}{Whether to save model.matrix of data in fit object. Set to FALSE -to save memory.} - -\item{...}{Any remaining arguments, not used.} -} -\description{ -Wrapper for OLS via lm(), which may be faster than glm(). -} -\examples{ - -data(Boston, package = "MASS") -Y = Boston$medv -# Remove outcome from covariate dataframe. -X = Boston[, -14] - -set.seed(1) - -sl = SuperLearner(Y, X, family = gaussian(), - SL.library = c("SL.mean", "SL.lm")) - -print(sl) - -} -\references{ -Fox, J. (2015). Applied regression analysis and generalized linear models. -Sage Publications. -} -\seealso{ -\code{\link{predict.SL.lm}} \code{\link[stats]{lm}} - \code{\link[stats]{predict.lm}} \code{\link{SL.speedlm}} -} diff --git a/man/SL.method.Rd b/man/SL.method.Rd new file mode 100644 index 0000000..898d7d6 --- /dev/null +++ b/man/SL.method.Rd @@ -0,0 +1,88 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/method.R +\name{method} +\alias{method} +\alias{method.template} +\alias{write.method.template} +\alias{method.NNLS} +\alias{method.NNLS2} +\alias{method.NNloglik} +\alias{method.CC_LS} +\alias{method.CC_nloglik} +\alias{method.AUC} +\title{Methods to estimate the coefficients for the SuperLearner} +\usage{ +method.template() + +write.method.template(file = "", ...) + +method.NNLS() + +method.NNLS2() + +method.NNloglik() + +method.CC_LS() + +method.CC_nloglik() + +method.AUC( + nlopt_method = NULL, + optim_method = "L-BFGS-B", + bounds = c(0, Inf), + normalize = TRUE +) +} +\arguments{ +\item{file}{A connection, or a character string naming a file to print to. +Passed to \code{\link{cat}}.} + +\item{\dots}{Additional arguments passed to \code{\link{cat}}.} + +\item{nlopt_method}{Either \code{optim_method} or \code{nlopt_method} must +be provided, the other must be \code{NULL}} + +\item{optim_method}{Passed to the \code{optim} call method. See +\code{\link{optim}} for details.} + +\item{bounds}{Bounds for parameter estimates} + +\item{normalize}{Logical. Should the parameters be normalized to sum up to 1} +} +\value{ +A list containing 3 elements: +\item{require}{ A character vector +listing any required packages. Use \code{NULL} if no additional packages are +required } +\item{computeCoef}{ A function. The arguments are: \code{Z}, +\code{Y}, \code{libraryNames}, \code{obsWeights}, \code{control}, +\code{verbose}. The value is a list with two items: \code{cvRisk} and +\code{coef}. This function computes the coefficients of the super learner. +As the super learner minimizes the cross-validated risk, the loss function +information is contained in this function as well as the model to combine +the algorithms in \code{SL.library}. } +\item{computePred}{ A function. The +arguments are: \code{predY}, \code{coef}, \code{control}. The value is a +numeric vector with the super learner predicted values. } +} +\description{ +These functions contain the information on the loss function and the model +to combine algorithms. +} +\details{ +A \code{SuperLearner} method must be a list (or a function to create a list) +with exactly 3 elements. The 3 elements must be named \code{require}, +\code{computeCoef} and \code{computePred}. +} +\examples{ +write.method.template(file = '') + + +} +\seealso{ +\code{\link[=SuperLearner]{SuperLearner()}} +} +\author{ +Eric C Polley \email{Polley.Eric@mayo.edu} +} +\keyword{utilities} diff --git a/man/SL.qda.Rd b/man/SL.qda.Rd deleted file mode 100644 index 9c80024..0000000 --- a/man/SL.qda.Rd +++ /dev/null @@ -1,73 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/SL.qda.R -\name{SL.qda} -\alias{SL.qda} -\title{SL wrapper for MASS:qda} -\usage{ -SL.qda(Y, X, newX, family, obsWeights = rep(1, nrow(X)), verbose = F, - id = NULL, prior = as.vector(prop.table(table(Y))), method = "mle", - tol = 1e-04, CV = F, nu = 5, ...) -} -\arguments{ -\item{Y}{Outcome variable} - -\item{X}{Training dataframe} - -\item{newX}{Test dataframe} - -\item{family}{Binomial only, cannot be used for regression.} - -\item{obsWeights}{Observation-level weights} - -\item{verbose}{If TRUE, display additional output during execution.} - -\item{id}{Not supported.} - -\item{prior}{the prior probabilities of class membership. If unspecified, the -class proportions for the training set are used. If present, the -probabilities should be specified in the order of the factor levels.} - -\item{method}{"moment" for standard estimators of the mean and variance, -"mle" for MLEs, "mve" to use cov.mve, or "t" for robust estimates based on -a t distribution.} - -\item{tol}{tolerance} - -\item{CV}{If true, returns results (classes and posterior probabilities) for -leave-one-out cross-validation. Note that if the prior is estimated, the -proportions in the whole dataset are used.} - -\item{nu}{degrees of freedom for method = "t".} - -\item{...}{Any additional arguments, not currently used.} -} -\description{ -Quadratic discriminant analysis, used for classification. -} -\examples{ - -data(Boston, package = "MASS") -Y = as.numeric(Boston$medv > 23) -# Remove outcome from covariate dataframe. -X = Boston[, -14] - -set.seed(1) - -# Use only 2 CV folds to speed up example. -sl = SuperLearner(Y, X, family = binomial(), cvControl = list(V = 2), - SL.library = c("SL.mean", "SL.qda")) -sl - -pred = predict(sl, X) -summary(pred$pred) - - -} -\references{ -James, G., Witten, D., Hastie, T., & Tibshirani, R. (2013). An Introduction -to Statistical Learning (Vol. 6). New York: Springer. Section 4.4. -} -\seealso{ -\code{\link{predict.SL.qda}} \code{\link[MASS]{qda}} - \code{\link[MASS]{predict.qda}} \code{\link{SL.lda}} -} diff --git a/man/SL.ranger.Rd b/man/SL.ranger.Rd index 8e27994..5ba1f3c 100644 --- a/man/SL.ranger.Rd +++ b/man/SL.ranger.Rd @@ -2,25 +2,41 @@ % Please edit documentation in R/SL.ranger.R \name{SL.ranger} \alias{SL.ranger} +\alias{predict.SL.ranger} \title{SL wrapper for ranger} \usage{ -SL.ranger(Y, X, newX, family, obsWeights, num.trees = 500, - mtry = floor(sqrt(ncol(X))), write.forest = TRUE, +SL.ranger( + Y, + X, + newX = X, + family = gaussian(), + obsWeights = NULL, + num.trees = 500, + mtry = floor(sqrt(ncol(X))), + write.forest = TRUE, probability = family$family == "binomial", - min.node.size = ifelse(family$family == "gaussian", 5, 1), replace = TRUE, - sample.fraction = ifelse(replace, 1, 0.632), num.threads = 1, - verbose = T, ...) + min.node.size = ifelse(family$family == "gaussian", 5, 1), + replace = TRUE, + sample.fraction = ifelse(replace, 1, 0.632), + num.threads = 1, + verbose = FALSE, + ... +) + +\method{predict}{SL.ranger}(object, newdata, num.threads = 1, verbose = object$verbose, ...) } \arguments{ -\item{Y}{Outcome variable} +\item{Y}{The outcome in the training data set. Must be a numeric vector.} -\item{X}{Training dataframe} +\item{X}{The predictor variables in the training data set, usually a data.frame.} -\item{newX}{Test dataframe} +\item{newX}{The predictor variables in the validation data set. The +structure should match X.} -\item{family}{Gaussian or binomial} +\item{family}{Either \code{\link[=gaussian]{gaussian()}} or \code{\link[=binomial]{binomial()}} to +describe the error distribution. Link function information will be ignored.} -\item{obsWeights}{Observation-level weights} +\item{obsWeights}{Optional observation weights.} \item{num.trees}{Number of trees.} @@ -42,14 +58,19 @@ sampling with replacement and 0.632 for sampling without replacement.} \item{num.threads}{Number of threads to use.} -\item{verbose}{If TRUE, display additional output during execution.} +\item{verbose}{If \code{TRUE}, display additional output during execution.} -\item{...}{Any additional arguments, not currently used.} +\item{...}{For SL wrappers, other remaining arguments. For \code{write.SL.template()}, arguments passed to \code{\link[=cat]{cat()}}.} + +\item{object}{an object resulting from the fitting function.} + +\item{newdata}{a data frame of predictors for which to compute predictions.} } \description{ Ranger is a fast implementation of Random Forest (Breiman 2001) - or recursive partitioning, particularly suited for high dimensional data. - +or recursive partitioning, particularly suited for high dimensional data. +} +\details{ Extending code by Eric Polley from the SuperLearnerExtra package. } \examples{ @@ -79,5 +100,5 @@ Software, in press. http://arxiv.org/abs/1508.04409. } \seealso{ \code{\link{SL.ranger}} \code{\link[ranger]{ranger}} - \code{\link[ranger]{predict.ranger}} +\code{\link[ranger]{predict.ranger}} } diff --git a/man/SL.speedglm.Rd b/man/SL.speedglm.Rd index b818038..00c150d 100644 --- a/man/SL.speedglm.Rd +++ b/man/SL.speedglm.Rd @@ -2,30 +2,54 @@ % Please edit documentation in R/SL.speedglm.R \name{SL.speedglm} \alias{SL.speedglm} -\title{Wrapper for speedglm} +\alias{SL.speedlm} +\alias{predict.SL.speedglm} +\alias{predict.SL.speedlm} +\title{Wrapper for \code{speedglm}} \usage{ -SL.speedglm(Y, X, newX, family, obsWeights, maxit = 25, k = 2, ...) +SL.speedglm( + Y, + X, + newX = X, + family = gaussian(), + obsWeights = NULL, + maxit = 25, + k = 2, + ... +) + +SL.speedlm(Y, X, newX = X, family = gaussian(), obsWeights = NULL, ...) + +\method{predict}{SL.speedglm}(object, newdata, ...) + +\method{predict}{SL.speedlm}(object, newdata, ...) } \arguments{ -\item{Y}{Outcome variable} +\item{Y}{The outcome in the training data set. Must be a numeric vector.} -\item{X}{Training dataframe} +\item{X}{The predictor variables in the training data set, usually a data.frame.} -\item{newX}{Test dataframe} +\item{newX}{The predictor variables in the validation data set. The +structure should match X.} -\item{family}{Gaussian or binomial} +\item{family}{Either \code{\link[=gaussian]{gaussian()}} or \code{\link[=binomial]{binomial()}} to +describe the error distribution. Link function information will be ignored.} -\item{obsWeights}{Observation-level weights} +\item{obsWeights}{Optional observation weights.} \item{maxit}{Maximum number of iterations before stopping.} \item{k}{numeric, the penalty per parameter to be used; the default k = 2 is the classical AIC.} -\item{...}{Any remaining arguments, not used.} +\item{...}{For SL wrappers, other remaining arguments. For \code{write.SL.template()}, arguments passed to \code{\link[=cat]{cat()}}.} + +\item{object}{an object resulting from the fitting function.} + +\item{newdata}{a data frame of predictors for which to compute predictions.} } \description{ -Speedglm is a fast version of glm() +\code{speedglm::speedglm()} and \code{speedglm::speedlm()} are fast versions of \code{glm()} and \code{lm()}, respectively. } \references{ Enea, M. A. R. C. O. (2013). Fitting linear models and generalized linear @@ -34,5 +58,5 @@ Large Datasets: book of short papers, 411-414. } \seealso{ \code{\link{predict.SL.speedglm}} \code{\link[speedglm]{speedglm}} - \code{\link[speedglm]{predict.speedglm}} +\code{\link[speedglm]{predict.speedglm}} } diff --git a/man/SL.speedlm.Rd b/man/SL.speedlm.Rd deleted file mode 100644 index 06967d5..0000000 --- a/man/SL.speedlm.Rd +++ /dev/null @@ -1,33 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/SL.speedlm.R -\name{SL.speedlm} -\alias{SL.speedlm} -\title{Wrapper for speedlm} -\usage{ -SL.speedlm(Y, X, newX, family, obsWeights, ...) -} -\arguments{ -\item{Y}{Outcome variable} - -\item{X}{Training dataframe} - -\item{newX}{Test dataframe} - -\item{family}{Gaussian or binomial} - -\item{obsWeights}{Observation-level weights} - -\item{...}{Any remaining arguments, not used.} -} -\description{ -Speedlm is a fast version of lm() -} -\references{ -Enea, M. A. R. C. O. (2013). Fitting linear models and generalized linear -models with large data sets in R. Statistical Methods for the Analysis of -Large Datasets: book of short papers, 411-414. -} -\seealso{ -\code{\link{predict.SL.speedlm}} \code{\link[speedglm]{speedlm}} - \code{\link[speedglm]{predict.speedlm}} \code{\link{SL.speedglm}} -} diff --git a/man/SL.template.Rd b/man/SL.template.Rd new file mode 100644 index 0000000..2261b6c --- /dev/null +++ b/man/SL.template.Rd @@ -0,0 +1,71 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/SL.template.R +\name{SL.template} +\alias{SL.template} +\alias{SL.caret.rpart} +\alias{SL.gbm} +\alias{SL.ipredbagg} +\alias{SL.knn} +\alias{SL.leekasso} +\alias{SL.loess} +\alias{SL.logreg} +\alias{SL.nnet} +\alias{SL.nnls} +\alias{SL.polymars} +\alias{SL.randomForest} +\alias{SL.ridge} +\alias{SL.rpart} +\alias{SL.rpartPrune} +\alias{SL.step} +\alias{SL.step.forward} +\alias{SL.step.interaction} +\alias{SL.stepAIC} +\alias{SL.svm} +\alias{write.SL.template} +\title{Wrapper functions for prediction algorithms in SuperLearner} +\usage{ +SL.template(Y, X, newX, family, obsWeights, id, ...) + +write.SL.template(file = "", ...) +} +\arguments{ +\item{Y}{The outcome in the training data set. Must be a numeric vector.} + +\item{X}{The predictor variables in the training data set, usually a data.frame.} + +\item{newX}{The predictor variables in the validation data set. The +structure should match X.} + +\item{family}{Either \code{\link[=gaussian]{gaussian()}} or \code{\link[=binomial]{binomial()}} to +describe the error distribution. Link function information will be ignored.} + +\item{obsWeights}{Optional observation weights.} + +\item{id}{Optional cluster identification variable.} + +\item{\dots}{For SL wrappers, other remaining arguments. For \code{write.SL.template()}, arguments passed to \code{\link[=cat]{cat()}}.} + +\item{file}{A connection, or a character string naming a file to print to. +Passed to \code{\link[=cat]{cat()}}.} +} +\value{ +SL wrappers return a list with two elements: +\item{pred}{ The predicted values for the +rows in \code{newX}. } +\item{fit}{ A list. Contains all objects necessary to +get predictions for new observations from specific algorithm. } +} +\description{ +Template function for SuperLearner prediction wrappers and built in options. +} +\examples{ +write.SL.template(file = '') + +} +\seealso{ +\code{\link{SuperLearner}} +} +\author{ +Eric C Polley \email{epolley@uchicago.edu} +} +\keyword{utilities} diff --git a/man/SL.xgboost.Rd b/man/SL.xgboost.Rd index 73b695b..5c8e519 100644 --- a/man/SL.xgboost.Rd +++ b/man/SL.xgboost.Rd @@ -2,26 +2,40 @@ % Please edit documentation in R/SL.xgboost.R \name{SL.xgboost} \alias{SL.xgboost} -\title{XGBoost SuperLearner wrapper} +\alias{predict.SL.xgboost} +\title{SL wrapper for XGBoost} \usage{ -SL.xgboost(Y, X, newX, family, obsWeights, id, ntrees = 1000, max_depth = 4, - shrinkage = 0.1, minobspernode = 10, params = list(), nthread = 1, - verbose = 0, save_period = NULL, ...) +SL.xgboost( + Y, + X, + newX = X, + family = gaussian(), + obsWeights = NULL, + ntrees = 1000, + max_depth = 4, + shrinkage = 0.1, + minobspernode = 10, + params = list(), + nthread = 1, + verbose = 0, + save_period = NULL, + ... +) + +\method{predict}{SL.xgboost}(object, newdata, ...) } \arguments{ -\item{Y}{Outcome variable} - -\item{X}{Covariate dataframe} +\item{Y}{The outcome in the training data set. Must be a numeric vector.} -\item{newX}{Optional dataframe to predict the outcome} +\item{X}{The predictor variables in the training data set, usually a data.frame.} -\item{family}{"gaussian" for regression, "binomial" for binary -classification, "multinomial" for multiple classification (not yet supported).} +\item{newX}{The predictor variables in the validation data set. The +structure should match X.} -\item{obsWeights}{Optional observation-level weights (supported but not tested)} +\item{family}{Either \code{\link[=gaussian]{gaussian()}} or \code{\link[=binomial]{binomial()}} to +describe the error distribution. Link function information will be ignored.} -\item{id}{Optional id to group observations from the same unit (not used -currently).} +\item{obsWeights}{Optional observation weights.} \item{ntrees}{How many trees to fit. Low numbers may underfit but high numbers may overfit, depending also on the shrinkage.} @@ -36,7 +50,7 @@ overfitting.} no more splitting will occur.} \item{params}{Many other parameters can be customized. See -\url{https://xgboost.readthedocs.io/en/latest/parameter.html}} +\url{http://xgboost.readthedocs.io/en/latest/parameter.html}} \item{nthread}{How many threads (cores) should xgboost use. Generally we want to keep this to 1 so that XGBoost does not compete with SuperLearner @@ -49,9 +63,13 @@ disk during processing. If NULL does not save model, and if 0 saves model at the end.} \item{...}{Any remaining arguments (not supported though).} + +\item{object}{an object resulting from the fitting function.} + +\item{newdata}{a data frame of predictors for which to compute predictions.} } \description{ -Supports the Extreme Gradient Boosting package for SuperLearnering, which is +Supports the Extreme Gradient Boosting package for SuperLearning, which is a variant of gradient boosted machines (GBM). } \details{ @@ -62,5 +80,8 @@ on cross-validated performance. If you run into errors please first try installing the latest version of XGBoost from drat as described here: -\url{https://xgboost.readthedocs.io/en/latest/build.html} +\url{http://xgboost.readthedocs.io/en/latest/build.html} +} +\seealso{ +\code{\link[=create.SL.xgboost]{create.SL.xgboost()}} to create new xgboost wrappers with different parameters. } diff --git a/man/SampleSplitSuperLearner.Rd b/man/SampleSplitSuperLearner.Rd index 43e40a0..b74c44a 100644 --- a/man/SampleSplitSuperLearner.Rd +++ b/man/SampleSplitSuperLearner.Rd @@ -1,119 +1,151 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/SampleSplitSuperLearner.R \name{SampleSplitSuperLearner} \alias{SampleSplitSuperLearner} - -\title{Super Learner Prediction Function} -\description{A Prediction Function for the Super Learner. The \code{SuperLearner} function takes a training set pair (X,Y) and returns the predicted values based on a validation set. SampleSplitSuperLearner uses sample split validation whereas SuperLearner uses V-fold cross-validation.} - +\title{Sampling-Splitting SuperLearner Prediction Function} \usage{ -SampleSplitSuperLearner(Y, X, newX = NULL, family = gaussian(), SL.library, - method = "method.NNLS", id = NULL, verbose = FALSE, - control = list(), split = 0.8, obsWeights = NULL) +SampleSplitSuperLearner( + Y, + X, + newX = NULL, + family = gaussian(), + SL.library, + method = "method.NNLS", + id = NULL, + verbose = FALSE, + control = list(), + split = 0.8, + obsWeights = NULL +) } - \arguments{ - \item{Y}{ -The outcome in the training data set. Must be a numeric vector. -} - \item{X}{ -The predictor variables in the training data set, usually a data.frame. -} - \item{newX}{ -The predictor variables in the validation data set. The structure should match X. If missing, uses X for newX. -} - \item{SL.library}{ -Either a character vector of prediction algorithms or a list containing character vectors. See details below for examples on the structure. A list of functions included in the SuperLearner package can be found with \code{listWrappers()}.} - \item{verbose}{ -logical; TRUE for printing progress during the computation (helpful for debugging). -} - \item{family}{ -Currently allows \code{gaussian} or \code{binomial} to describe the error distribution. Link function information will be ignored and should be contained in the method argument below. -} - \item{method}{ -A list (or a function to create a list) containing details on estimating the coefficients for the super learner and the model to combine the individual algorithms in the library. See \code{?method.template} for details. Currently, the built in options are either "method.NNLS" (the default), "method.NNLS2", "method.NNloglik", "method.CC_LS", or "method.CC_nloglik". NNLS and NNLS2 are non-negative least squares based on the Lawson-Hanson algorithm and the dual method of Goldfarb and Idnani, respectively. NNLS and NNLS2 will work for both gaussian and binomial outcomes. NNloglik is a non-negative binomial likelihood maximization using the BFGS quasi-Newton optimization method. NN* methods are normalized so weights sum to one. CC_LS uses Goldfarb and Idnani's quadratic programming algorithm to calculate the best convex combination of weights to minimize the squared error loss. CC_nloglik calculates the convex combination of weights that minimize the negative binomial log likelihood on the logistic scale using the sequential quadratic programming algorithm. -} - \item{id}{ -Optional cluster identification variable. For the cross-validation splits, \code{id} forces observations in the same cluster to be in the same validation fold. \code{id} is passed to the prediction and screening algorithms in SL.library, but be sure to check the individual wrappers as many of them ignore the information. -} - \item{obsWeights}{ -Optional observation weights variable. As with \code{id} above, \code{obsWeights} is passed to the prediction and screening algorithms, but many of the built in wrappers ignore (or can't use) the information. If you are using observation weights, make sure the library you specify uses the information. -} - \item{control}{ -A list of parameters to control the estimation process. Parameters include \code{saveFitLibrary} and \code{trimLogit}. See \code{\link{SuperLearner.control}} for details. -} - \item{split}{ -Either a single value between 0 and 1 indicating the fraction of the samples for the training split. A value of 0.8 will randomly assign 80 percent of the samples to the training split and the other 20 percent to the validation split. Alternatively, split can be a numeric vector with the row numbers of \code{X} corresponding to the validation split. All other rows not in the vector will be considered in the training split. -} -} +\item{Y}{The outcome in the training data set. Must be a numeric vector.} -\details{ - \code{SuperLearner} fits the super learner prediction algorithm. The weights for each algorithm in \code{SL.library} is estimated, along with the fit of each algorithm. +\item{X}{The predictor variables in the training data set, usually a +data.frame.} + +\item{newX}{The predictor variables in the validation data set. The +structure should match X. If missing, uses X for newX.} + +\item{family}{Currently allows \code{gaussian} or \code{binomial} to +describe the error distribution. Link function information will be ignored +and should be contained in the method argument below.} + +\item{SL.library}{Either a character vector of prediction algorithms or a +list containing character vectors. See details below for examples on the +structure. A list of functions included in the SuperLearner package can be +found with \code{listWrappers()}.} + +\item{method}{A list (or a function to create a list) containing details on +estimating the coefficients for the super learner and the model to combine +the individual algorithms in the library. See \code{?method.template} for +details. Currently, the built in options are either "method.NNLS" (the +default), "method.NNLS2", "method.NNloglik", "method.CC_LS", +"method.CC_nloglik", or "method.AUC". NNLS and NNLS2 are non-negative least +squares based on the Lawson-Hanson algorithm and the dual method of Goldfarb +and Idnani, respectively. NNLS and NNLS2 will work for both gaussian and +binomial outcomes. NNloglik is a non-negative binomial likelihood +maximization using the BFGS quasi-Newton optimization method. NN* methods +are normalized so weights sum to one. CC_LS uses Goldfarb and Idnani's +quadratic programming algorithm to calculate the best convex combination of +weights to minimize the squared error loss. CC_nloglik calculates the convex +combination of weights that minimize the negative binomial log likelihood on +the logistic scale using the sequential quadratic programming algorithm. +AUC, which only works for binary outcomes, uses the Nelder-Mead method via +the optim function to minimize rank loss (equivalent to maximizing AUC).} -The prescreen algorithms. These algorithms first rank the variables in \code{X} based on either a univariate regression p-value of the \code{randomForest} variable importance. A subset of the variables in \code{X} is selected based on a pre-defined cut-off. With this subset of the X variables, the algorithms in \code{SL.library} are then fit. +\item{id}{Optional cluster identification variable. For the cross-validation +splits, \code{id} forces observations in the same cluster to be in the same +validation fold. \code{id} is passed to the prediction and screening +algorithms in SL.library, but be sure to check the individual wrappers as +many of them ignore the information.} -The SuperLearner package contains a few prediction and screening algorithm wrappers. The full list of wrappers can be viewed with \code{listWrappers()}. The design of the SuperLearner package is such that the user can easily add their own wrappers. We also maintain a website with additional examples of wrapper functions at \url{https://github.com/ecpolley/SuperLearnerExtra}. +\item{verbose}{logical; TRUE for printing progress during the computation +(helpful for debugging).} + +\item{control}{A list of parameters to control the estimation process. +Parameters include \code{saveFitLibrary} and \code{trimLogit}. See +\code{\link{SuperLearner.control}} for details.} + +\item{split}{Either a single value between 0 and 1 indicating the fraction +of the samples for the training split. A value of 0.8 will randomly assign +80 percent of the samples to the training split and the other 20 percent to +the validation split. Alternatively, split can be a numeric vector with the +row numbers of \code{X} corresponding to the validation split. All other +rows not in the vector will be considered in the training split.} + +\item{obsWeights}{Optional observation weights variable. As with \code{id} +above, \code{obsWeights} is passed to the prediction and screening +algorithms, but many of the built in wrappers ignore (or can't use) the +information. If you are using observation weights, make sure the library you +specify uses the information.} } \value{ - \item{call}{ -The matched call. -} - \item{libraryNames}{ -A character vector with the names of the algorithms in the library. The format is 'predictionAlgorithm_screeningAlgorithm' with '_All' used to denote the prediction algorithm run on all variables in X. -} - \item{SL.library}{ -Returns \code{SL.library} in the same format as the argument with the same name above. -} - \item{SL.predict}{ -The predicted values from the super learner for the rows in \code{newX}. -} - \item{coef}{ -Coefficients for the super learner. -} - \item{library.predict}{ -A matrix with the predicted values from each algorithm in \code{SL.library} for the rows in \code{newX}. -} - \item{Z}{ -The Z matrix (the cross-validated predicted values for each algorithm in \code{SL.library}). -} - \item{cvRisk}{ -A numeric vector with the V-fold cross-validated risk estimate for each algorithm in \code{SL.library}. Note that this does not contain the CV risk estimate for the SuperLearner, only the individual algorithms in the library. -} - \item{family}{ -Returns the \code{family} value from above -} - \item{fitLibrary}{ -A list with the fitted objects for each algorithm in \code{SL.library} on the full training data set. -} - \item{varNames}{ -A character vector with the names of the variables in \code{X}. -} - \item{validRows}{ -A list containing the row numbers for the V-fold cross-validation step. -} - \item{method}{ -A list with the method functions. -} - \item{whichScreen}{ -A logical matrix indicating which variables passed each screening algorithm. -} - \item{control}{ -The \code{control} list. -} - \item{split}{ -The \code{split} value. -} - \item{errorsInCVLibrary}{ -A logical vector indicating if any algorithms experienced an error within the CV step. -} - \item{errorsInLibrary}{ -A logical vector indicating if any algorithms experienced an error on the full data. -} +\item{call}{ The matched call. } +\item{libraryNames}{ A character +vector with the names of the algorithms in the library. The format is +'predictionAlgorithm_screeningAlgorithm' with '_All' used to denote the +prediction algorithm run on all variables in X. } +\item{SL.library}{ Returns \code{SL.library} in the same format as the argument with the same name +above. } +\item{SL.predict}{ The predicted values from the super learner for +the rows in \code{newX}. } +\item{coef}{ Coefficients for the super learner. +} +\item{library.predict}{ A matrix with the predicted values from each +algorithm in \code{SL.library} for the rows in \code{newX}. } +\item{Z}{ The +Z matrix (the cross-validated predicted values for each algorithm in +\code{SL.library}). } +\item{cvRisk}{ A numeric vector with the V-fold +cross-validated risk estimate for each algorithm in \code{SL.library}. Note +that this does not contain the CV risk estimate for the SuperLearner, only +the individual algorithms in the library. } +\item{family}{ Returns the +\code{family} value from above } +\item{fitLibrary}{ A list with the fitted +objects for each algorithm in \code{SL.library} on the full training data +set. } +\item{varNames}{ A character vector with the names of the variables +in \code{X}. } +\item{validRows}{ A list containing the row numbers for the +V-fold cross-validation step. } +\item{method}{ A list with the method +functions. } +\item{whichScreen}{ A logical matrix indicating which variables +passed each screening algorithm. } +\item{control}{ The \code{control} list. +} \item{split}{ The \code{split} value. } +\item{errorsInCVLibrary}{ A +logical vector indicating if any algorithms experienced an error within the +CV step. } +\item{errorsInLibrary}{ A logical vector indicating if any +algorithms experienced an error on the full data. } +} +\description{ +A Prediction Function for the SuperLearner. The \code{SuperLearner()} +function takes a training set pair (X, Y) and returns the predicted values +based on a validation set. \code{SampleSplitSuperLearner()} uses sample split +validation whereas \code{\link[=SuperLearner]{SuperLearner()}} uses V-fold cross-validation. } +\details{ +\code{SuperLearner()} fits the super learner prediction algorithm. The +weights for each algorithm in \code{SL.library} is estimated, along with the +fit of each algorithm. -\references{ - van der Laan, M. J., Polley, E. C. and Hubbard, A. E. (2008) Super Learner, \emph{Statistical Applications of Genetics and Molecular Biology}, \bold{6}, article 25. -} -\author{ Eric C Polley \email{epolley@uchicago.edu} } +\emph{The prescreen algorithms} These algorithms first rank the variables in +\code{X} based on either a univariate regression p-value of the +\code{randomForest} variable importance. A subset of the variables in +\code{X} is selected based on a pre-defined cut-off. With this subset of +the X variables, the algorithms in \code{SL.library} are then fit. +The SuperLearner package contains a few prediction and screening algorithm +wrappers. The full list of wrappers can be viewed with +\code{listWrappers()}. The design of the \pkg{SuperLearner} package is such that +the user can easily add their own wrappers. We also maintain a website with +additional examples of wrapper functions at +\url{https://github.com/ecpolley/SuperLearnerExtra}. +} \examples{ \dontrun{ ## simulate data @@ -137,15 +169,21 @@ newY <- newX[, 1] + sqrt(abs(newX[, 2] * newX[, 3])) + newX[, 2] - # generate Library and run Super Learner SL.library <- c("SL.glm", "SL.randomForest", "SL.gam", "SL.polymars", "SL.mean") + test <- SampleSplitSuperLearner(Y = Y, X = X, newX = newX, SL.library = SL.library, - verbose = TRUE, method = "method.NNLS") + verbose = TRUE, method = "method.NNLS") + test # library with screening -SL.library <- list(c("SL.glmnet", "All"), c("SL.glm", "screen.randomForest", - "All", "screen.SIS"), "SL.randomForest", c("SL.polymars", "All"), "SL.mean") -test <- SuperLearner(Y = Y, X = X, newX = newX, SL.library = SL.library, - verbose = TRUE, method = "method.NNLS") +SL.library <- list(c("SL.glmnet", "All"), + c("SL.glm", "screen.randomForest", "All", "screen.SIS"), + "SL.randomForest", + c("SL.polymars", "All"), + "SL.mean") + +test <- SampleSplitSuperLearner(Y = Y, X = X, newX = newX, SL.library = SL.library, + verbose = TRUE, method = "method.NNLS") test # binary outcome @@ -153,16 +191,22 @@ set.seed(1) N <- 200 X <- matrix(rnorm(N*10), N, 10) X <- as.data.frame(X) -Y <- rbinom(N, 1, plogis(.2*X[, 1] + .1*X[, 2] - .2*X[, 3] + +Y <- rbinom(N, 1, plogis(.2*X[, 1] + .1*X[, 2] - .2*X[, 3] + .1*X[, 3]*X[, 4] - .2*abs(X[, 4]))) SL.library <- c("SL.glmnet", "SL.glm", "SL.knn", "SL.gam", "SL.mean") # least squares loss function -test.NNLS <- SampleSplitSuperLearner(Y = Y, X = X, SL.library = SL.library, - verbose = TRUE, method = "method.NNLS", family = binomial()) +test.NNLS <- SampleSplitSuperLearner(Y = Y, X = X, SL.library = SL.library, + verbose = TRUE, method = "method.NNLS", + family = binomial()) test.NNLS } } +\references{ +van der Laan, M. J., Polley, E. C. and Hubbard, A. E. (2008) Super Learner, \emph{Statistical Applications of Genetics and Molecular Biology}, \bold{6}, article 25. +} +\author{ +Eric C Polley \email{epolley@uchicago.edu} +} \keyword{models} - diff --git a/man/SuperLearner-package.Rd b/man/SuperLearner-package.Rd new file mode 100644 index 0000000..8f43a0c --- /dev/null +++ b/man/SuperLearner-package.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/SuperLearner-package.R +\docType{package} +\name{SuperLearner-package} +\alias{SuperLearner-package} +\title{SuperLearner: Super Learner Prediction} +\description{ +Implements the super learner prediction method and contains a library of prediction algorithms to be used in the super learner. +} +\seealso{ +Useful links: +\itemize{ + \item \url{https://github.com/ecpolley/SuperLearner} + \item Report bugs at \url{https://github.com/ecpolley/SuperLearner/issues} +} + +} +\author{ +\strong{Maintainer}: Eric Polley \email{epolley@uchicago.edu} + +Authors: +\itemize{ + \item Erin LeDell + \item Chris Kennedy + \item Mark van der Laan [thesis advisor] +} + +Other contributors: +\itemize{ + \item Sam Lendle [contributor] +} + +} +\keyword{internal} diff --git a/man/SuperLearner.CV.control.Rd b/man/SuperLearner.CV.control.Rd index c07295d..e64d25b 100644 --- a/man/SuperLearner.CV.control.Rd +++ b/man/SuperLearner.CV.control.Rd @@ -1,34 +1,36 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/control.R \name{SuperLearner.CV.control} \alias{SuperLearner.CV.control} - -\title{ -Control parameters for the cross validation steps in \code{SuperLearner} -} -\description{ -Control parameters for the cross validation steps in \code{SuperLearner} -} +\title{Control parameters for the cross validation steps in \code{SuperLearner()}} \usage{ -SuperLearner.CV.control(V = 10L, stratifyCV = FALSE, shuffle = TRUE, - validRows = NULL) +SuperLearner.CV.control( + V = 10L, + stratifyCV = FALSE, + shuffle = TRUE, + validRows = NULL +) } - \arguments{ - \item{V}{ -Integer. Number of splits for the V-fold cross-validation step. The default is 10. In most cases, between 10 and 20 splits works well. -} - \item{stratifyCV}{ -Logical. Should the data splits be stratified by a binary response? Attempts to maintain the same ratio in each training and validation sample. -} - \item{shuffle}{ -Logical. Should the rows of \code{X} be shuffled before creating the splits. -} - \item{validRows}{ -A List. Use this to pass pre-specified rows for the sample splits. The length of the list should be \code{V} and each entry in the list should contain a vector with the row numbers of the corresponding validation sample. -} -} +\item{V}{Integer. Number of splits for the V-fold cross-validation step. The +default is 10. In most cases, between 10 and 20 splits works well.} + +\item{stratifyCV}{Logical. Should the data splits be stratified by a binary +response? Attempts to maintain the same ratio in each training and +validation sample.} + +\item{shuffle}{Logical. Should the rows of \code{X} be shuffled before +creating the splits.} +\item{validRows}{A List. Use this to pass pre-specified rows for the sample +splits. The length of the list should be \code{V} and each entry in the list +should contain a vector with the row numbers of the corresponding validation +sample.} +} \value{ -A list containing the control parameters +A list containing the control parameters. } - -\keyword{utilities} \ No newline at end of file +\description{ +Control parameters for the cross validation steps in \code{SuperLearner()} +} +\keyword{utilities} diff --git a/man/SuperLearner.Rd b/man/SuperLearner.Rd index d44e4ea..f4fcf48 100644 --- a/man/SuperLearner.Rd +++ b/man/SuperLearner.Rd @@ -1,138 +1,160 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/SuperLearner.R \name{SuperLearner} \alias{SuperLearner} -\alias{mcSuperLearner} -\alias{snowSuperLearner} -\alias{print.SuperLearner} -\alias{coef.SuperLearner} - \title{Super Learner Prediction Function} -\description{A Prediction Function for the Super Learner. The \code{SuperLearner} function takes a training set pair (X,Y) and returns the predicted values based on a validation set.} - \usage{ -SuperLearner(Y, X, newX = NULL, family = gaussian(), SL.library, - method = "method.NNLS", id = NULL, verbose = FALSE, - control = list(), cvControl = list(), obsWeights = NULL, env = parent.frame()) +SuperLearner( + Y, + X, + newX = NULL, + family = gaussian(), + SL.library, + method = "method.NNLS", + id = NULL, + verbose = FALSE, + control = list(), + cvControl = list(), + obsWeights = NULL, + env = parent.frame() +) } - \arguments{ - \item{Y}{ -The outcome in the training data set. Must be a numeric vector. -} - \item{X}{ -The predictor variables in the training data set, usually a data.frame. -} - \item{newX}{ -The predictor variables in the validation data set. The structure should match X. If missing, uses X for newX. -} - \item{SL.library}{ -Either a character vector of prediction algorithms or a list containing character vectors. See details below for examples on the structure. A list of functions included in the SuperLearner package can be found with \code{listWrappers()}.} - \item{verbose}{ -logical; TRUE for printing progress during the computation (helpful for debugging). -} - \item{family}{ -Currently allows \code{gaussian} or \code{binomial} to describe the error distribution. Link function information will be ignored and should be contained in the method argument below. -} - \item{method}{ -A list (or a function to create a list) containing details on estimating the coefficients for the super learner and the model to combine the individual algorithms in the library. See \code{?method.template} for details. Currently, the built in options are either "method.NNLS" (the default), "method.NNLS2", "method.NNloglik", "method.CC_LS", "method.CC_nloglik", or "method.AUC". NNLS and NNLS2 are non-negative least squares based on the Lawson-Hanson algorithm and the dual method of Goldfarb and Idnani, respectively. NNLS and NNLS2 will work for both gaussian and binomial outcomes. NNloglik is a non-negative binomial likelihood maximization using the BFGS quasi-Newton optimization method. NN* methods are normalized so weights sum to one. CC_LS uses Goldfarb and Idnani's quadratic programming algorithm to calculate the best convex combination of weights to minimize the squared error loss. CC_nloglik calculates the convex combination of weights that minimize the negative binomial log likelihood on the logistic scale using the sequential quadratic programming algorithm. AUC, which only works for binary outcomes, uses the Nelder-Mead method via the optim function to minimize rank loss (equivalent to maximizing AUC). -} - \item{id}{ -Optional cluster identification variable. For the cross-validation splits, \code{id} forces observations in the same cluster to be in the same validation fold. \code{id} is passed to the prediction and screening algorithms in SL.library, but be sure to check the individual wrappers as many of them ignore the information. -} - \item{obsWeights}{ -Optional observation weights variable. As with \code{id} above, \code{obsWeights} is passed to the prediction and screening algorithms, but many of the built in wrappers ignore (or can't use) the information. If you are using observation weights, make sure the library you specify uses the information. -} - \item{control}{ -A list of parameters to control the estimation process. Parameters include \code{saveFitLibrary} and \code{trimLogit}. See \code{\link{SuperLearner.control}} for details. -} - \item{cvControl}{ -A list of parameters to control the cross-validation process. Parameters include \code{V}, \code{stratifyCV}, \code{shuffle} and \code{validRows}. See \code{\link{SuperLearner.CV.control}} for details. -} - - \item{env}{ - Environment containing the learner functions. Defaults to the calling environment. - } -} - -\details{ - \code{SuperLearner} fits the super learner prediction algorithm. The weights for each algorithm in \code{SL.library} is estimated, along with the fit of each algorithm. - -The prescreen algorithms. These algorithms first rank the variables in \code{X} based on either a univariate regression p-value of the \code{randomForest} variable importance. A subset of the variables in \code{X} is selected based on a pre-defined cut-off. With this subset of the X variables, the algorithms in \code{SL.library} are then fit. - -The SuperLearner package contains a few prediction and screening algorithm wrappers. The full list of wrappers can be viewed with \code{listWrappers()}. The design of the SuperLearner package is such that the user can easily add their own wrappers. We also maintain a website with additional examples of wrapper functions at \url{https://github.com/ecpolley/SuperLearnerExtra}. +\item{Y}{The outcome in the training data set. Must be a numeric vector.} + +\item{X}{The predictor variables in the training data set, usually a +data.frame.} + +\item{newX}{The predictor variables in the validation data set. The +structure should match X. If missing, uses X for newX.} + +\item{family}{Currently allows \code{gaussian} or \code{binomial} to +describe the error distribution. Link function information will be ignored +and should be contained in the method argument below.} + +\item{SL.library}{Either a character vector of prediction algorithms or a +list containing character vectors. See details below for examples on the +structure. A list of functions included in the SuperLearner package can be +found with \code{listWrappers()}.} + +\item{method}{A list (or a function to create a list) containing details on +estimating the coefficients for the super learner and the model to combine +the individual algorithms in the library. See \code{?method.template} for +details. Currently, the built in options are either "method.NNLS" (the +default), "method.NNLS2", "method.NNloglik", "method.CC_LS", +"method.CC_nloglik", or "method.AUC". NNLS and NNLS2 are non-negative least +squares based on the Lawson-Hanson algorithm and the dual method of Goldfarb +and Idnani, respectively. NNLS and NNLS2 will work for both gaussian and +binomial outcomes. NNloglik is a non-negative binomial likelihood +maximization using the BFGS quasi-Newton optimization method. NN* methods +are normalized so weights sum to one. CC_LS uses Goldfarb and Idnani's +quadratic programming algorithm to calculate the best convex combination of +weights to minimize the squared error loss. CC_nloglik calculates the convex +combination of weights that minimize the negative binomial log likelihood on +the logistic scale using the sequential quadratic programming algorithm. +AUC, which only works for binary outcomes, uses the Nelder-Mead method via +the optim function to minimize rank loss (equivalent to maximizing AUC).} + +\item{id}{Optional cluster identification variable. For the cross-validation +splits, \code{id} forces observations in the same cluster to be in the same +validation fold. \code{id} is passed to the prediction and screening +algorithms in SL.library, but be sure to check the individual wrappers as +many of them ignore the information.} + +\item{verbose}{logical; TRUE for printing progress during the computation +(helpful for debugging).} + +\item{control}{A list of parameters to control the estimation process. +Parameters include \code{saveFitLibrary} and \code{trimLogit}. See +\code{\link{SuperLearner.control}} for details.} + +\item{cvControl}{A list of parameters to control the cross-validation +process. Parameters include \code{V}, \code{stratifyCV}, \code{shuffle} and +\code{validRows}. See \code{\link{SuperLearner.CV.control}} for details.} + +\item{obsWeights}{Optional observation weights variable. As with \code{id} +above, \code{obsWeights} is passed to the prediction and screening +algorithms, but many of the built in wrappers ignore (or can't use) the +information. If you are using observation weights, make sure the library you +specify uses the information.} + +\item{env}{Environment containing the learner functions. Defaults to the +calling environment.} } \value{ - \item{call}{ -The matched call. -} - \item{libraryNames}{ -A character vector with the names of the algorithms in the library. The format is 'predictionAlgorithm_screeningAlgorithm' with '_All' used to denote the prediction algorithm run on all variables in X. -} - \item{SL.library}{ -Returns \code{SL.library} in the same format as the argument with the same name above. -} - \item{SL.predict}{ -The predicted values from the super learner for the rows in \code{newX}. -} - \item{coef}{ -Coefficients for the super learner. -} - \item{library.predict}{ -A matrix with the predicted values from each algorithm in \code{SL.library} for the rows in \code{newX}. -} - \item{Z}{ -The Z matrix (the cross-validated predicted values for each algorithm in \code{SL.library}). -} - \item{cvRisk}{ -A numeric vector with the V-fold cross-validated risk estimate for each algorithm in \code{SL.library}. Note that this does not contain the CV risk estimate for the SuperLearner, only the individual algorithms in the library. +\item{call}{ The matched call. } +\item{libraryNames}{ A character +vector with the names of the algorithms in the library. The format is +'predictionAlgorithm_screeningAlgorithm' with '_All' used to denote the +prediction algorithm run on all variables in X. } +\item{SL.library}{ Returns \code{SL.library} in the same format as the argument with the same name +above. } +\item{SL.predict}{ The predicted values from the super learner for +the rows in \code{newX}. } +\item{coef}{ Coefficients for the super learner.} +\item{library.predict}{ A matrix with the predicted values from each +algorithm in \code{SL.library} for the rows in \code{newX}. } +\item{Z}{ The +Z matrix (the cross-validated predicted values for each algorithm in +\code{SL.library}). } +\item{cvRisk}{ A numeric vector with the V-fold +cross-validated risk estimate for each algorithm in \code{SL.library}. Note +that this does not contain the CV risk estimate for the SuperLearner, only +the individual algorithms in the library. } +\item{family}{ Returns the +\code{family} value from above } +\item{fitLibrary}{ A list with the fitted +objects for each algorithm in \code{SL.library} on the full training data +set. } +\item{cvFitLibrary}{ A list with fitted objects for each algorithm in +\code{SL.library} on each of \code{V} different training data sets. } +\item{varNames}{ A character vector with the names of the variables in +\code{X}. } +\item{validRows}{ A list containing the row numbers for the +V-fold cross-validation step. } +\item{method}{ A list with the method +functions. } +\item{whichScreen}{ A logical matrix indicating which variables +passed each screening algorithm. } +\item{control}{ The \code{control} list. +} +\item{cvControl}{ The \code{cvControl} list. } +\item{errorsInCVLibrary}{ A +logical vector indicating if any algorithms experienced an error within the +CV step. } +\item{errorsInLibrary}{ A logical vector indicating if any +algorithms experienced an error on the full data. } +\item{env}{ Environment +passed into function which will be searched to find the learner functions. +Defaults to the calling environment. } +\item{times}{ A list that contains +the execution time of the SuperLearner, plus separate times for model +fitting and prediction. } +} +\description{ +\code{SuperLearner()} takes a training set pair (X,Y) and returns the predicted values +based on a validation set. } - \item{family}{ -Returns the \code{family} value from above -} - \item{fitLibrary}{ -A list with the fitted objects for each algorithm in \code{SL.library} on the full training data set. -} - \item{cvFitLibrary}{ -A list with fitted objects for each algorithm in \code{SL.library} on each of -\code{V} different training data sets. -} - \item{varNames}{ -A character vector with the names of the variables in \code{X}. -} - \item{validRows}{ -A list containing the row numbers for the V-fold cross-validation step. -} - \item{method}{ -A list with the method functions. -} - \item{whichScreen}{ -A logical matrix indicating which variables passed each screening algorithm. -} - \item{control}{ -The \code{control} list. -} - \item{cvControl}{ -The \code{cvControl} list. -} - \item{errorsInCVLibrary}{ -A logical vector indicating if any algorithms experienced an error within the CV step. -} - \item{errorsInLibrary}{ -A logical vector indicating if any algorithms experienced an error on the full data. -} - \item{env}{ - Environment passed into function which will be searched to find the learner functions. Defaults to the calling environment. - } - \item{times}{ - A list that contains the execution time of the SuperLearner, plus separate times for model fitting and prediction. - } -} - -\references{ - van der Laan, M. J., Polley, E. C. and Hubbard, A. E. (2008) Super Learner, \emph{Statistical Applications of Genetics and Molecular Biology}, \bold{6}, article 25. +\details{ +\code{SuperLearner()} fits the super learner prediction algorithm. The +weights for each algorithm in \code{SL.library} is estimated, along with the +fit of each algorithm. + +\emph{The prescreen algorithms} These algorithms first rank the variables in +\code{X} based on either a univariate regression p-value of the +\code{randomForest} variable importance. A subset of the variables in +\code{X} is selected based on a pre-defined cut-off. With this subset of +the X variables, the algorithms in \code{SL.library} are then fit. + +The SuperLearner package contains a few prediction and screening algorithm +wrappers. The full list of wrappers can be viewed with +\code{listWrappers()}. The design of the \pkg{SuperLearner} package is such that +the user can easily add their own wrappers. We also maintain a website with +additional examples of wrapper functions at +\url{https://github.com/ecpolley/SuperLearnerExtra}. } -\author{ Eric C Polley \email{epolley@uchicago.edu} } - \examples{ + \dontrun{ ## simulate data set.seed(23432) @@ -205,16 +227,16 @@ corrmat[4, ] = sqrt(truerho) corrmat[4, 4] = 1 cholmat <- chol(corrmat) x <- matrix(rnorm(n*p, mean=0, sd=1), n, p) -x <- x %*% cholmat -feta <- x[, 1:4] %*% b +x <- x \%*\% cholmat +feta <- x[, 1:4] \%*\% b fprob <- exp(feta) / (1 + exp(feta)) y <- rbinom(n, 1, fprob) # test m <- 10000 newx <- matrix(rnorm(m*p, mean=0, sd=1), m, p) -newx <- newx %*% cholmat -newfeta <- newx[, 1:4] %*% b +newx <- newx \%*\% cholmat +newfeta <- newx[, 1:4] \%*\% b newfprob <- exp(newfeta) / (1 + exp(newfeta)) newy <- rbinom(m, 1, newfprob) @@ -269,7 +291,7 @@ library(parallel) cl <- makeCluster(2, type = "PSOCK") # can use different types here clusterSetRNGStream(cl, iseed = 2343) # make SL functions available on the clusters, use assignment to avoid printing -foo <- clusterEvalQ(cl, library(SuperLearner)) +foo <- clusterEvalQ(cl, library(SuperLearner)) testSNOW <- snowSuperLearner(cluster = cl, Y = Y, X = X, newX = newX, SL.library = SL.library, method = "method.NNLS") testSNOW @@ -285,8 +307,8 @@ environment(my.SL.wrapper) <-asNamespace("SuperLearner") cl <- makeCluster(2, type = "PSOCK") # can use different types here clusterSetRNGStream(cl, iseed = 2343) -# make SL functions available on the clusters, use assignment to avoid printing -foo <- clusterEvalQ(cl, library(SuperLearner)) +# make SL functions available on the clusters, use assignment to avoid printing +foo <- clusterEvalQ(cl, library(SuperLearner)) clusterExport(cl, c("my.SL.wrapper")) # copy the function to all clusters testSNOW <- snowSuperLearner(cluster = cl, Y = Y, X = X, newX = newX, SL.library = c("SL.glm", "SL.mean", "my.SL.wrapper"), method = "method.NNLS") @@ -301,13 +323,21 @@ replicate(5, system.time(mcSuperLearner(Y = Y, X = X, newX = newX, SL.library = SL.library, method = "method.NNLS"))) cl <- makeCluster(2, type = 'PSOCK') -# make SL functions available on the clusters, use assignment to avoid printing -foo <- clusterEvalQ(cl, library(SuperLearner)) +# make SL functions available on the clusters, use assignment to avoid printing +foo <- clusterEvalQ(cl, library(SuperLearner)) replicate(5, system.time(snowSuperLearner(cl, Y = Y, X = X, newX = newX, SL.library = SL.library, method = "method.NNLS"))) stopCluster(cl) } } +\references{ +van der Laan, M. J., Polley, E. C. and Hubbard, A. E. (2008) Super Learner, \emph{Statistical Applications of Genetics and Molecular Biology}, \bold{6}, article 25. +} +\seealso{ +\code{\link[=CV.SuperLearner]{CV.SuperLearner()}}, \code{\link[=SampleSplitSuperLearner]{SampleSplitSuperLearner()}} +} +\author{ +Eric C Polley \email{epolley@uchicago.edu} +} \keyword{models} - diff --git a/man/SuperLearner.control.Rd b/man/SuperLearner.control.Rd index ed74d2e..e309b20 100644 --- a/man/SuperLearner.control.Rd +++ b/man/SuperLearner.control.Rd @@ -1,29 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/control.R \name{SuperLearner.control} \alias{SuperLearner.control} - -\title{ -Control parameters for the SuperLearner -} -\description{ -Control parameters for the \code{SuperLearner}} +\title{Control parameters for \code{SuperLearner()}} \usage{ -SuperLearner.control(saveFitLibrary = TRUE, saveCVFitLibrary = FALSE, trimLogit = 0.001) +SuperLearner.control( + saveFitLibrary = TRUE, + saveCVFitLibrary = FALSE, + trimLogit = 0.001 +) } - \arguments{ - \item{saveFitLibrary}{ -Logical. Should the fit for each algorithm be saved in the output from \code{SuperLearner}. -} - \item{saveCVFitLibrary}{ -Logical. Should cross-validated fits for each algorithm be saved in the output from \code{SuperLearner}. -} - \item{trimLogit}{ -number between 0.0 and 0.5. What level to truncate the logit transformation to maintain a bounded loss function when using the NNloglik method. -} -} +\item{saveFitLibrary}{Logical. Should the fit for each algorithm be saved in +the output from \code{SuperLearner()}.} + +\item{saveCVFitLibrary}{Logical. Should cross-validated fits for each +algorithm be saved in the output from \code{SuperLearner()}.} +\item{trimLogit}{number between 0.0 and 0.5. What level to truncate the +logit transformation to maintain a bounded loss function when using the +NNloglik method.} +} \value{ A list containing the control parameters. } +\description{ +Control parameters for \code{SuperLearner()} +} \keyword{utilities} - diff --git a/man/SuperLearnerDocs.Rd b/man/SuperLearnerDocs.Rd new file mode 100644 index 0000000..8369c5a --- /dev/null +++ b/man/SuperLearnerDocs.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/SuperLearnerNews.R +\name{SuperLearnerDocs} +\alias{SuperLearnerDocs} +\alias{SuperLearnerNews} +\title{Show the NEWS and documentation files for the SuperLearner package} +\usage{ +SuperLearnerDocs(what = "SuperLearnerR.pdf", ...) + +SuperLearnerNews(...) +} +\arguments{ +\item{what}{specify what document to open. Currently supports the NEWS file +and the PDF files 'SuperLearner.pdf' and 'SuperLearnerR.pdf'.} + +\item{\dots}{additional arguments passed to \code{\link[utils:RShowDoc]{utils::RShowDoc()}}.} +} +\value{ +A invisible character string given the path to the SuperLearner NEWS or documentation file. +} +\description{ +Show the NEWS file of the SuperLearner package. The function is simply a +wrapper for \code{\link[utils:RShowDoc]{utils::RShowDoc()}}. +} +\keyword{utilities} diff --git a/man/SuperLearnerNews.Rd b/man/SuperLearnerNews.Rd deleted file mode 100755 index 73e91fb..0000000 --- a/man/SuperLearnerNews.Rd +++ /dev/null @@ -1,21 +0,0 @@ -\name{SuperLearnerNews} -\alias{SuperLearnerNews} -\alias{SuperLearnerDocs} - -\title{Show the NEWS file for the SuperLearner package} -\description{ -Show the NEWS file of the SuperLearner package. The function is simply a wrapper for the \code{RShowDoc} function -} -\usage{ -SuperLearnerNews(\dots) -SuperLearnerDocs(what = 'SuperLearnerR.pdf', \dots) -} - -\arguments{ - \item{\dots}{additional arguments passed to \code{RShowDoc}} - \item{what}{specify what document to open. Currently supports the NEWS file and the PDF files 'SuperLearner.pdf' and 'SuperLearnerR.pdf'.} -} -\value{ -A invisible character string given the path to the SuperLearner NEWS file -} -\keyword{utilities} diff --git a/man/create.Learner.Rd b/man/create.Learner.Rd index 8d4c106..04a6632 100644 --- a/man/create.Learner.Rd +++ b/man/create.Learner.Rd @@ -4,9 +4,15 @@ \alias{create.Learner} \title{Factory for learner wrappers} \usage{ -create.Learner(base_learner, params = list(), tune = list(), - env = parent.frame(), name_prefix = base_learner, detailed_names = F, - verbose = F) +create.Learner( + base_learner, + params = list(), + tune = list(), + env = parent.frame(), + name_prefix = base_learner, + detailed_names = F, + verbose = F +) } \arguments{ \item{base_learner}{Character string of the learner function that will be @@ -29,7 +35,7 @@ parameter configurations.} } \value{ Returns a list with expanded tuneGrid and the names of the created - functions. +functions. } \description{ Create custom learners and/or a sequence of learners with hyperparameter diff --git a/man/create.SL.xgboost.Rd b/man/create.SL.xgboost.Rd index 75d9a99..85e76e7 100644 --- a/man/create.SL.xgboost.Rd +++ b/man/create.SL.xgboost.Rd @@ -1,12 +1,16 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/SL.xgboost.R +% Please edit documentation in R/create.SL.xgboost.R \name{create.SL.xgboost} \alias{create.SL.xgboost} \title{Factory for XGBoost SL wrappers} \usage{ -create.SL.xgboost(tune = list(ntrees = c(1000), max_depth = c(4), shrinkage = - c(0.1), minobspernode = c(10)), detailed_names = F, env = .GlobalEnv, - name_prefix = "SL.xgb") +create.SL.xgboost( + tune = list(ntrees = c(1000), max_depth = c(4), shrinkage = c(0.1), minobspernode = + c(10)), + detailed_names = FALSE, + env = .GlobalEnv, + name_prefix = "SL.xgb" +) } \arguments{ \item{tune}{List of hyperparameter settings to test. If specified, each hyperparameter will need to be defined.} @@ -38,4 +42,8 @@ attach(sl_env) sl = SuperLearner(Y = Y, X = X, SL.library = xgb_grid$names) } detach(sl_env) + +} +\seealso{ +\code{\link[=SL.xgboost]{SL.xgboost()}} } diff --git a/man/listWrappers.Rd b/man/listWrappers.Rd index 3ccbf80..10f121e 100644 --- a/man/listWrappers.Rd +++ b/man/listWrappers.Rd @@ -1,34 +1,36 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/listWrappers.R \name{listWrappers} \alias{listWrappers} - -\title{ -list all wrapper functions in SuperLearner -} -\description{ -List all wrapper functions in \code{\link{SuperLearner}} package -} +\title{List all wrapper functions in SuperLearner} \usage{ listWrappers(what = "both") } - \arguments{ - \item{what}{ -What list to return. Can be \code{both} for both prediction algorithms and screening algorithms, \code{SL} for the prediction algorithms, \code{screen} for the screening algorithms, \code{method} for the estimation method details, or anything else will return a list of all (exported) functions in the \code{SuperLearner} package. Additional wrapper functions are available at \url{https://github.com/ecpolley/SuperLearnerExtra}. +\item{what}{What list to return. Can be \code{both} for both prediction +algorithms and screening algorithms, \code{SL} for the prediction +algorithms, \code{screen} for the screening algorithms, \code{method} for +the estimation method details, or anything else will return a list of all +(exported) functions in the \code{SuperLearner} package. Additional wrapper +functions are available at +\url{https://github.com/ecpolley/SuperLearnerExtra}.} } -} - \value{ -Invisible character vector with all exported functions in the SuperLearner package +Invisible character vector with all exported functions in the +\pkg{SuperLearner} package } - -\author{ Eric C Polley \email{epolley@uchicago.edu} } - -\seealso{ -\code{\link{SuperLearner}} +\description{ +List all wrapper functions in SuperLearner } \examples{ + listWrappers(what = "SL") listWrappers(what = "screen") } +\seealso{ +\code{\link{SuperLearner}} +} +\author{ +Eric C Polley \email{epolley@uchicago.edu} +} \keyword{utilities} - diff --git a/man/mcSuperLearner.Rd b/man/mcSuperLearner.Rd new file mode 100644 index 0000000..c351123 --- /dev/null +++ b/man/mcSuperLearner.Rd @@ -0,0 +1,52 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/mcSuperLearner.R +\name{mcSuperLearner} +\alias{mcSuperLearner} +\alias{snowSuperLearner} +\title{SuperLearner with multicore support} +\usage{ +mcSuperLearner( + Y, + X, + newX = NULL, + family = gaussian(), + SL.library, + method = "method.NNLS", + id = NULL, + verbose = FALSE, + control = list(), + cvControl = list(), + obsWeights = NULL, + env = parent.frame(), + mc.cores = getOption("mc.cores", 2L) +) + +snowSuperLearner( + cluster, + Y, + X, + newX = NULL, + family = gaussian(), + SL.library, + method = "method.NNLS", + id = NULL, + verbose = FALSE, + control = list(), + cvControl = list(), + obsWeights = NULL, + env = parent.frame() +) +} +\arguments{ +\item{Y, X, newX, family, SL.library, method, id, verbose, control, cvControl, obsWeights, env}{See \code{\link[=SuperLearner]{SuperLearner()}}.} + +\item{mc.cores}{The number of cores to use, i.e. at most how many child processes will be run simultaneously. Passed to \code{parallel::mclapply()}.} + +\item{cluster}{a \code{cluster} object from \code{parallel::makeCluster()}.} +} +\description{ +\code{mcSuperlearner()} and \code{snowSuperlearner()} allow for multicore parallelization for \code{\link[=SuperLearner]{SuperLearner()}}. +} +\details{ +\code{mcSuperlearner()} uses \code{parallel::mclapply()}, and \code{snowSuperlearner()} uses \code{parallel::parLapply()}. +} diff --git a/man/plot.CV.SuperLearner.Rd b/man/plot.CV.SuperLearner.Rd index 3c84a99..7dd92b7 100644 --- a/man/plot.CV.SuperLearner.Rd +++ b/man/plot.CV.SuperLearner.Rd @@ -1,45 +1,44 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plot.CV.SuperLearner.R \name{plot.CV.SuperLearner} \alias{plot.CV.SuperLearner} - -\title{ -Graphical display of the V-fold CV risk estimates -} -\description{ -The function plots the V-fold cross-validated risk estimates for the super learner, the discrete super learner and each algorithm in the library. By default the estimates will be sorted and include an asymptotic 95\% confidence interval. -} +\title{Graphical display of the V-fold CV risk estimates} \usage{ \method{plot}{CV.SuperLearner}(x, package = "ggplot2", constant = qnorm(0.975), sort = TRUE, ...) } -%- maybe also 'usage' for other objects documented here. \arguments{ - \item{x}{ -The output from \code{CV.SuperLearner}. -} - \item{package}{ -Either "ggplot2" or "lattice". The package selected must be available. -} - \item{constant}{ -A numeric value. The confidence interval is defined as p +/- constant * se, where p is the point estimate and se is the standard error. The default is the quantile of the standard normal corresponding to a 95\% CI. -} - \item{sort}{ -Logical. Should the rows in the plot be sorted from the smallest to the largest point estimate. If FALSE, then the order is super learner, discrete super learner, then the estimators in \code{SL.library}. +\item{x}{The output from \code{CV.SuperLearner()}.} + +\item{package}{Either \code{"ggplot2"} or \code{"lattice"}. The package selected must be +available.} + +\item{constant}{A numeric value. The confidence interval is defined as \verb{p +/- constant * se}, where \code{p} is the point estimate and \code{se} is the standard error. The default is the quantile of the standard normal corresponding to a 95\% CI.} + +\item{sort}{Logical. Should the rows in the plot be sorted from the smallest +to the largest point estimate. If \code{FALSE}, then the order is super learner, +discrete super learner, then the estimators in \code{SL.library}.} + +\item{\dots}{Additional arguments for \code{summary.CV.SuperLearner()}.} } - \item{\dots}{ -Additional arguments for \code{summary.CV.SuperLearner} +\value{ +Returns the plot (either a ggplot2 object (class \code{ggplot}) or a +lattice object (class \code{trellis})) } +\description{ +The function plots the V-fold cross-validated risk estimates for the super +learner, the discrete super learner and each algorithm in the library. By +default the estimates will be sorted and include an asymptotic 95\% +confidence interval. } \details{ -see \link{summary.CV.SuperLearner} for details on how the estimates are computed -} -\value{ -Returns the plot (either a ggplot2 object (class \code{ggplot}) or a lattice object (class \code{trellis})) +See \code{\link[=summary.CV.SuperLearner]{summary.CV.SuperLearner()}} for details on how the estimates are +computed } - -\author{ Eric C Polley \email{epolley@uchicago.edu} } - \seealso{ -\code{\link{summary.CV.SuperLearner}} and \code{\link{CV.SuperLearner}} +\code{\link{summary.CV.SuperLearner}} and +\code{\link{CV.SuperLearner}} +} +\author{ +Eric C Polley \email{epolley@uchicago.edu} } - \keyword{plot} - diff --git a/man/predict.SL.bartMachine.Rd b/man/predict.SL.bartMachine.Rd deleted file mode 100644 index db169ce..0000000 --- a/man/predict.SL.bartMachine.Rd +++ /dev/null @@ -1,26 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/SL.bartMachine.R -\name{predict.SL.bartMachine} -\alias{predict.SL.bartMachine} -\title{bartMachine prediction} -\usage{ -\method{predict}{SL.bartMachine}(object, newdata, family, X = NULL, - Y = NULL, ...) -} -\arguments{ -\item{object}{SuperLearner object} - -\item{newdata}{Dataframe to predict the outcome} - -\item{family}{"gaussian" for regression, "binomial" for binary -classification. (Not used)} - -\item{X}{Covariate dataframe (not used)} - -\item{Y}{Outcome variable (not used)} - -\item{...}{Additional arguments (not used)} -} -\description{ -bartMachine prediction -} diff --git a/man/predict.SL.biglasso.Rd b/man/predict.SL.biglasso.Rd deleted file mode 100644 index 2a48674..0000000 --- a/man/predict.SL.biglasso.Rd +++ /dev/null @@ -1,22 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/SL.biglasso.R -\name{predict.SL.biglasso} -\alias{predict.SL.biglasso} -\title{Prediction wrapper for SL.biglasso} -\usage{ -\method{predict}{SL.biglasso}(object, newdata, ...) -} -\arguments{ -\item{object}{SL.kernlab object} - -\item{newdata}{Dataframe to generate predictions} - -\item{...}{Unused additional arguments} -} -\description{ -Prediction wrapper for SL.biglasso objects. -} -\seealso{ -\code{\link{SL.biglasso}} \code{\link[biglasso]{biglasso}} - \code{\link[biglasso]{predict.biglasso}} -} diff --git a/man/predict.SL.glm.Rd b/man/predict.SL.glm.Rd deleted file mode 100644 index 45a59a6..0000000 --- a/man/predict.SL.glm.Rd +++ /dev/null @@ -1,22 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/SL.glm.R -\name{predict.SL.glm} -\alias{predict.SL.glm} -\title{Prediction for SL.glm} -\usage{ -\method{predict}{SL.glm}(object, newdata, ...) -} -\arguments{ -\item{object}{SL.glm object} - -\item{newdata}{Dataframe to generate predictions} - -\item{...}{Unused additional arguments} -} -\description{ -Prediction for SL.glm -} -\seealso{ -\code{\link{SL.glm}} \code{\link[stats]{glm}} - \code{\link[stats]{predict.glm}} \code{\link{SL.speedglm}} -} diff --git a/man/predict.SL.glmnet.Rd b/man/predict.SL.glmnet.Rd deleted file mode 100644 index ed3a20f..0000000 --- a/man/predict.SL.glmnet.Rd +++ /dev/null @@ -1,28 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/SL.glmnet.R -\name{predict.SL.glmnet} -\alias{predict.SL.glmnet} -\title{Prediction for an SL.glmnet object} -\usage{ -\method{predict}{SL.glmnet}(object, newdata, remove_extra_cols = T, - add_missing_cols = T, ...) -} -\arguments{ -\item{object}{Result object from SL.glmnet} - -\item{newdata}{Dataframe or matrix that will generate predictions.} - -\item{remove_extra_cols}{Remove any extra columns in the new data that were -not part of the original model.} - -\item{add_missing_cols}{Add any columns from original data that do not exist -in the new data, and set values to 0.} - -\item{...}{Any additional arguments (not used).} -} -\description{ -Prediction for the glmnet wrapper. -} -\seealso{ -\code{\link{SL.glmnet}} -} diff --git a/man/predict.SL.kernelKnn.Rd b/man/predict.SL.kernelKnn.Rd deleted file mode 100644 index 088f4bb..0000000 --- a/man/predict.SL.kernelKnn.Rd +++ /dev/null @@ -1,18 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/SL.kernelKnn.R -\name{predict.SL.kernelKnn} -\alias{predict.SL.kernelKnn} -\title{Prediction for SL.kernelKnn} -\usage{ -\method{predict}{SL.kernelKnn}(object, newdata, ...) -} -\arguments{ -\item{object}{SL.kernelKnn object} - -\item{newdata}{Dataframe to generate predictions} - -\item{...}{Unused additional arguments} -} -\description{ -Prediction for SL.kernelKnn -} diff --git a/man/predict.SL.ksvm.Rd b/man/predict.SL.ksvm.Rd deleted file mode 100644 index 57dc8c6..0000000 --- a/man/predict.SL.ksvm.Rd +++ /dev/null @@ -1,26 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/SL.ksvm.R -\name{predict.SL.ksvm} -\alias{predict.SL.ksvm} -\title{Prediction for SL.ksvm} -\usage{ -\method{predict}{SL.ksvm}(object, newdata, family, coupler = "minpair", ...) -} -\arguments{ -\item{object}{SL.kernlab object} - -\item{newdata}{Dataframe to generate predictions} - -\item{family}{Gaussian or binomial} - -\item{coupler}{Coupling method used in the multiclass case, can be one of -minpair or pkpd (see kernlab package for details). For future usage.} - -\item{...}{Unused additional arguments} -} -\description{ -Prediction for SL.ksvm -} -\seealso{ -\code{\link{SL.ksvm}} \code{\link[kernlab]{ksvm}} \code{\link[kernlab]{predict.ksvm}} -} diff --git a/man/predict.SL.lda.Rd b/man/predict.SL.lda.Rd deleted file mode 100644 index 97e8938..0000000 --- a/man/predict.SL.lda.Rd +++ /dev/null @@ -1,36 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/SL.lda.R -\name{predict.SL.lda} -\alias{predict.SL.lda} -\title{Prediction wrapper for SL.lda} -\usage{ -\method{predict}{SL.lda}(object, newdata, prior = object$object$prior, - dimen = NULL, method = "plug-in", ...) -} -\arguments{ -\item{object}{SL.lda object} - -\item{newdata}{Dataframe to generate predictions} - -\item{prior}{The prior probabilities of the classes, by default the -proportions in the training set or what was set in the call to lda.} - -\item{dimen}{the dimension of the space to be used. If this is less than -min(p, ng-1), only the first dimen discriminant components are used (except -for method="predictive"), and only those dimensions are returned in x.} - -\item{method}{This determines how the parameter estimation is handled. With -"plug-in" (the default) the usual unbiased parameter estimates are used and -assumed to be correct. With "debiased" an unbiased estimator of the log -posterior probabilities is used, and with "predictive" the parameter -estimates are integrated out using a vague prior.} - -\item{...}{Unused additional arguments} -} -\description{ -Prediction wrapper for SL.lda -} -\seealso{ -\code{\link{SL.lda}} \code{\link[MASS]{lda}} - \code{\link[MASS]{predict.lda}} -} diff --git a/man/predict.SL.lm.Rd b/man/predict.SL.lm.Rd deleted file mode 100644 index 4d72af8..0000000 --- a/man/predict.SL.lm.Rd +++ /dev/null @@ -1,22 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/SL.lm.R -\name{predict.SL.lm} -\alias{predict.SL.lm} -\title{Prediction for SL.lm} -\usage{ -\method{predict}{SL.lm}(object, newdata, ...) -} -\arguments{ -\item{object}{SL.lm object} - -\item{newdata}{Dataframe to generate predictions} - -\item{...}{Unused additional arguments} -} -\description{ -Prediction for SL.lm -} -\seealso{ -\code{\link{SL.lm}} \code{\link[stats]{lm}} - \code{\link[stats]{predict.lm}} \code{\link{SL.speedlm}} -} diff --git a/man/predict.SL.qda.Rd b/man/predict.SL.qda.Rd deleted file mode 100644 index 6f9133e..0000000 --- a/man/predict.SL.qda.Rd +++ /dev/null @@ -1,36 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/SL.qda.R -\name{predict.SL.qda} -\alias{predict.SL.qda} -\title{Prediction wrapper for SL.qda} -\usage{ -\method{predict}{SL.qda}(object, newdata, prior = object$object$prior, - dimen = NULL, method = "plug-in", ...) -} -\arguments{ -\item{object}{SL.lda object} - -\item{newdata}{Dataframe to generate predictions} - -\item{prior}{The prior probabilities of the classes, by default the -proportions in the training set or what was set in the call to lda.} - -\item{dimen}{the dimension of the space to be used. If this is less than -min(p, ng-1), only the first dimen discriminant components are used (except -for method="predictive"), and only those dimensions are returned in x.} - -\item{method}{This determines how the parameter estimation is handled. With -"plug-in" (the default) the usual unbiased parameter estimates are used and -assumed to be correct. With "debiased" an unbiased estimator of the log -posterior probabilities is used, and with "predictive" the parameter -estimates are integrated out using a vague prior.} - -\item{...}{Unused additional arguments} -} -\description{ -Prediction wrapper for SL.qda -} -\seealso{ -\code{\link{SL.qda}} \code{\link[MASS]{qda}} - \code{\link[MASS]{predict.qda}} -} diff --git a/man/predict.SL.ranger.Rd b/man/predict.SL.ranger.Rd deleted file mode 100644 index baa7dcc..0000000 --- a/man/predict.SL.ranger.Rd +++ /dev/null @@ -1,29 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/SL.ranger.R -\name{predict.SL.ranger} -\alias{predict.SL.ranger} -\title{Prediction wrapper for ranger random forests} -\usage{ -\method{predict}{SL.ranger}(object, newdata, family, num.threads = 1, - verbose = object$verbose, ...) -} -\arguments{ -\item{object}{SL.kernlab object} - -\item{newdata}{Dataframe to generate predictions} - -\item{family}{Gaussian or binomial} - -\item{num.threads}{Number of threads used for parallelization} - -\item{verbose}{If TRUE output additional information during execution.} - -\item{...}{Unused additional arguments} -} -\description{ -Prediction wrapper for SL.ranger objects. -} -\seealso{ -\code{\link{SL.ranger}} \code{\link[ranger]{ranger}} - \code{\link[ranger]{predict.ranger}} -} diff --git a/man/predict.SL.speedglm.Rd b/man/predict.SL.speedglm.Rd deleted file mode 100644 index 0d4d512..0000000 --- a/man/predict.SL.speedglm.Rd +++ /dev/null @@ -1,22 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/SL.speedglm.R -\name{predict.SL.speedglm} -\alias{predict.SL.speedglm} -\title{Prediction for SL.speedglm} -\usage{ -\method{predict}{SL.speedglm}(object, newdata, ...) -} -\arguments{ -\item{object}{SL.speedglm object} - -\item{newdata}{Dataframe to generate predictions} - -\item{...}{Unused additional arguments} -} -\description{ -Prediction for SL.speedglm -} -\seealso{ -\code{\link{SL.speedglm}} \code{\link[speedglm]{speedglm}} - \code{\link[speedglm]{predict.speedglm}} -} diff --git a/man/predict.SL.speedlm.Rd b/man/predict.SL.speedlm.Rd deleted file mode 100644 index 2ddcd28..0000000 --- a/man/predict.SL.speedlm.Rd +++ /dev/null @@ -1,22 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/SL.speedlm.R -\name{predict.SL.speedlm} -\alias{predict.SL.speedlm} -\title{Prediction for SL.speedlm} -\usage{ -\method{predict}{SL.speedlm}(object, newdata, ...) -} -\arguments{ -\item{object}{SL.speedlm object} - -\item{newdata}{Dataframe to generate predictions} - -\item{...}{Unused additional arguments} -} -\description{ -Prediction for SL.speedlm, a fast lm() -} -\seealso{ -\code{\link{SL.speedlm}} \code{\link[speedglm]{speedlm}} - \code{\link[speedglm]{predict.speedlm}} \code{\link{SL.speedglm}} -} diff --git a/man/predict.SL.template.Rd b/man/predict.SL.template.Rd new file mode 100644 index 0000000..d374058 --- /dev/null +++ b/man/predict.SL.template.Rd @@ -0,0 +1,37 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/predict.SL.template.R +\name{predict.SL.template} +\alias{predict.SL.template} +\alias{write.predict.SL.template} +\title{Wrapper functions for computing predictions from SL learners} +\usage{ +\method{predict}{SL.template}(object, newdata, family, X = NULL, Y = NULL, ...) + +write.predict.SL.template(file = "", ...) +} +\arguments{ +\item{object}{an object resulting from the fitting function.} + +\item{newdata}{a data frame of predictors for which to compute predictions.} + +\item{family}{Either \code{\link[=gaussian]{gaussian()}} or \code{\link[=binomial]{binomial()}} to describe the error distribution. Link function information will be ignored.} + +\item{X}{The predictor variables in the training data set, usually a data.frame.} + +\item{Y}{The outcome in the training data set. Must be a numeric vector.} + +\item{\dots}{For \code{predict.SL} wrappers, other remaining arguments. For \code{write.predict.SL.template()}, arguments passed to \code{\link[=cat]{cat()}}.} + +\item{file}{A connection, or a character string naming a file to print to. +Passed to \code{\link[=cat]{cat()}}.} +} +\value{ +\code{predict.SL} wrappers return a vector of predictions for each unit in \code{newdata}. +} +\description{ +Wrapper functions for computing predictions from SL learners +} +\examples{ +write.predict.SL.template(file = '') +} +\keyword{utilities} diff --git a/man/predict.SL.xgboost.Rd b/man/predict.SL.xgboost.Rd deleted file mode 100644 index 29a2d28..0000000 --- a/man/predict.SL.xgboost.Rd +++ /dev/null @@ -1,20 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/SL.xgboost.R -\name{predict.SL.xgboost} -\alias{predict.SL.xgboost} -\title{XGBoost prediction on new data} -\usage{ -\method{predict}{SL.xgboost}(object, newdata, family, ...) -} -\arguments{ -\item{object}{Model fit object from SuperLearner} - -\item{newdata}{Dataframe that will be converted to an xgb.DMatrix} - -\item{family}{Binomial or gaussian} - -\item{...}{Any remaining arguments (not supported though).} -} -\description{ -XGBoost prediction on new data -} diff --git a/man/predict.superlearner.Rd b/man/predict.superlearner.Rd index dbcce39..9cc41dc 100644 --- a/man/predict.superlearner.Rd +++ b/man/predict.superlearner.Rd @@ -4,8 +4,7 @@ \alias{predict.SuperLearner} \title{Predict method for SuperLearner object} \usage{ -\method{predict}{SuperLearner}(object, newdata, X = NULL, Y = NULL, - onlySL = FALSE, ...) +\method{predict}{SuperLearner}(object, newdata, X = NULL, Y = NULL, onlySL = FALSE, ...) } \arguments{ \item{object}{Fitted object from \code{SuperLearner}} diff --git a/man/recombineCVSL.Rd b/man/recombineCVSL.Rd index e253306..36509ed 100644 --- a/man/recombineCVSL.Rd +++ b/man/recombineCVSL.Rd @@ -1,107 +1,116 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/recombine.R \name{recombineCVSL} \alias{recombineCVSL} - -\title{ -Recombine a CV.SuperLearner fit using a new metalearning method -} -\description{ -Function to re-compute the V-fold cross-validated risk estimate for super learner using a new metalearning method. This function takes as input an existing CV.SuperLearner fit and applies the \code{recombineSL} fit to each of the V Super Learner fits. -} +\title{Recombine a CV.SuperLearner fit using a new metalearning method} \usage{ -recombineCVSL(object, method = "method.NNloglik", verbose = FALSE, - saveAll = TRUE, parallel = "seq") +recombineCVSL( + object, + method = "method.NNloglik", + verbose = FALSE, + saveAll = TRUE, + parallel = "seq" +) } - \arguments{ - \item{object}{ -Fitted object from \code{CV.SuperLearner}. -} - \item{method}{ -A list (or a function to create a list) containing details on estimating the coefficients for the super learner and the model to combine the individual algorithms in the library. See \code{?method.template} for details. Currently, the built in options are either "method.NNLS" (the default), "method.NNLS2", "method.NNloglik", "method.CC_LS", "method.CC_nloglik", or "method.AUC". NNLS and NNLS2 are non-negative least squares based on the Lawson-Hanson algorithm and the dual method of Goldfarb and Idnani, respectively. NNLS and NNLS2 will work for both gaussian and binomial outcomes. NNloglik is a non-negative binomial likelihood maximization using the BFGS quasi-Newton optimization method. NN* methods are normalized so weights sum to one. CC_LS uses Goldfarb and Idnani's quadratic programming algorithm to calculate the best convex combination of weights to minimize the squared error loss. CC_nloglik calculates the convex combination of weights that minimize the negative binomial log likelihood on the logistic scale using the sequential quadratic programming algorithm. AUC, which only works for binary outcomes, uses the Nelder-Mead method via the optim function to minimize rank loss (equivalent to maximizing AUC). -} - \item{verbose}{ -logical; TRUE for printing progress during the computation (helpful for debugging). -} - - \item{saveAll}{ -Logical; Should the entire \code{SuperLearner} object be saved for each fold? -} - \item{parallel}{ -Options for parallel computation of the V-fold step. Use "seq" (the default) for sequential computation. \code{parallel = 'multicore'} to use \code{mclapply} for the V-fold step (but note that \code{SuperLearner()} will still be sequential). Or \code{parallel} can be the name of a snow cluster and will use \code{parLapply} for the V-fold step. For both multicore and snow, the inner \code{SuperLearner} calls will be sequential. - } -} - -\details{ - The function \code{recombineCVSL} computes the usual V-fold cross-validated risk estimate for the super learner (and all algorithms in \code{SL.library} for comparison), using a newly specified metalearning method. The weights for each algorithm in \code{SL.library} are re-estimated using the new metalearner, however the base learner fits are not regenerated, so this function saves a lot of computation time as opposed to using the \code{CV.SuperLearner} function with a new \code{method} argument. The output is identical to the output from the \code{CV.SuperLearner} function. +\item{object}{Fitted object from \code{CV.SuperLearner}.} + +\item{method}{A list (or a function to create a list) containing details on +estimating the coefficients for the super learner and the model to combine +the individual algorithms in the library. See \code{?method.template} for +details. Currently, the built in options are either \code{"method.NNLS"} (the +default), \code{"method.NNLS2"}, \code{"method.NNloglik"}, \code{"method.CC_LS"}, +\code{"method.CC_nloglik"}, or \code{"method.AUC"}. NNLS and NNLS2 are non-negative least +squares based on the Lawson-Hanson algorithm and the dual method of Goldfarb +and Idnani, respectively. NNLS and NNLS2 will work for both gaussian and +binomial outcomes. NNloglik is a non-negative binomial likelihood +maximization using the BFGS quasi-Newton optimization method. NN* methods +are normalized so weights sum to one. CC_LS uses Goldfarb and Idnani's +quadratic programming algorithm to calculate the best convex combination of +weights to minimize the squared error loss. CC_nloglik calculates the convex +combination of weights that minimize the negative binomial log likelihood on +the logistic scale using the sequential quadratic programming algorithm. +AUC, which only works for binary outcomes, uses the Nelder-Mead method via +the optim function to minimize rank loss (equivalent to maximizing AUC).} + +\item{verbose}{Logical; \code{TRUE} for printing progress during the computation +(helpful for debugging).} + +\item{saveAll}{Logical; Should the entire \code{SuperLearner} object be +saved for each fold?} + +\item{parallel}{Options for parallel computation of the V-fold step. Use +"seq" (the default) for sequential computation. \code{parallel = 'multicore'} to use \code{mclapply} for the V-fold step (but note that \code{SuperLearner()} will still be sequential). The default for \code{mclapply()} is +to check the \code{mc.cores} option, and if not set to default to 2 cores. +Be sure to set \code{options()$mc.cores} to the desired number of cores if +you don't want the default. Or \code{parallel} can be the name of a snow +cluster and will use \code{parLapply} for the V-fold step. For both +multicore and snow, the inner \code{SuperLearner} calls will be sequential.} } \value{ An object of class \code{CV.SuperLearner} (a list) with components: - \item{call}{ -The matched call. -} -\item{AllSL}{ -If \code{saveAll = TRUE}, a list with output from each call to \code{SuperLearner}, otherwise NULL. -} -\item{SL.predict}{ -The predicted values from the super learner when each particular row was part of the validation fold. -} +\item{call}{ The matched call. } +\item{AllSL}{ If \code{saveAll = TRUE}, a +list with output from each call to \code{SuperLearner}, otherwise NULL. } +\item{SL.predict}{ The predicted values from the super learner when each +particular row was part of the validation fold. } \item{discreteSL.predict}{ -The traditional cross-validated selector. Picks the algorithm with the smallest cross-validated risk (in super learner terms, gives that algorithm coefficient 1 and all others 0). -} -\item{whichDiscreteSL}{ -A list of length \code{V}. The elements in the list are the algorithm that had the smallest cross-validated risk estimate for that fold. -} -\item{library.predict}{ -A matrix with the predicted values from each algorithm in \code{SL.library}. The columns are the algorithms in \code{SL.library} and the rows represent the predicted values when that particular row was in the validation fold (i.e. not used to fit that estimator). -} -\item{coef}{ -A matrix with the coefficients for the super learner on each fold. The columns are the algorithms in \code{SL.library} the rows are the folds. -} -\item{folds}{ -A list containing the row numbers for each validation fold. -} -\item{V}{ -Number of folds for \code{CV.SuperLearner}. -} - \item{libraryNames}{ -A character vector with the names of the algorithms in the library. The format is 'predictionAlgorithm_screeningAlgorithm' with '_All' used to denote the prediction algorithm run on all variables in X. -} - \item{SL.library}{ -Returns \code{SL.library} in the same format as the argument with the same name above. -} -\item{method}{ -A list with the method functions. -} -\item{Y}{ -The outcome +The traditional cross-validated selector. Picks the algorithm with the +smallest cross-validated risk (in super learner terms, gives that algorithm +coefficient 1 and all others 0). } +\item{whichDiscreteSL}{ A list of length +\code{V}. The elements in the list are the algorithm that had the smallest +cross-validated risk estimate for that fold. } +\item{library.predict}{ A +matrix with the predicted values from each algorithm in \code{SL.library}. +The columns are the algorithms in \code{SL.library} and the rows represent +the predicted values when that particular row was in the validation fold +(i.e. not used to fit that estimator). } +\item{coef}{ A matrix with the coefficients for the super learner on each fold. The columns are the +algorithms in \code{SL.library} the rows are the folds. } +\item{folds}{ A list containing the row numbers for each validation fold. } +\item{V}{ Number of folds for \code{CV.SuperLearner}. } +\item{libraryNames}{ A character +vector with the names of the algorithms in the library. The format is +'predictionAlgorithm_screeningAlgorithm' with '_All' used to denote the +prediction algorithm run on all variables in X. } +\item{SL.library}{ Returns \code{SL.library} in the same format as the argument with the same name above. } +\item{method}{ A list with the method functions. } +\item{Y}{ The outcome } } +\description{ +Function to re-compute the V-fold cross-validated risk estimate for super +learner using a new metalearning method. This function takes as input an +existing CV.SuperLearner fit and applies the \code{recombineSL} fit to each +of the V Super Learner fits. } - - -\author{ Erin LeDell \email{ledell@berkeley.edu} } - -\seealso{ -\code{\link{recombineSL}} +\details{ +The function \code{recombineCVSL} computes the usual V-fold cross-validated +risk estimate for the super learner (and all algorithms in \code{SL.library} +for comparison), using a newly specified metalearning method. The weights for +each algorithm in \code{SL.library} are re-estimated using the new +metalearner, however the base learner fits are not regenerated, so this +function saves a lot of computation time as opposed to using the +\code{CV.SuperLearner} function with a new \code{method} argument. The +output is identical to the output from the \code{CV.SuperLearner} function. } - \examples{ \dontrun{ - # Binary outcome example adapted from SuperLearner examples set.seed(1) N <- 200 X <- matrix(rnorm(N*10), N, 10) X <- as.data.frame(X) -Y <- rbinom(N, 1, plogis(.2*X[, 1] + .1*X[, 2] - .2*X[, 3] + +Y <- rbinom(N, 1, plogis(.2*X[, 1] + .1*X[, 2] - .2*X[, 3] + .1*X[, 3]*X[, 4] - .2*abs(X[, 4]))) SL.library <- c("SL.glmnet", "SL.glm", "SL.knn", "SL.gam", "SL.mean") # least squares loss function set.seed(1) # for reproducibility -cvfit_nnls <- CV.SuperLearner(Y = Y, X = X, V = 10, SL.library = SL.library, +cvfit_nnls <- CV.SuperLearner(Y = Y, X = X, V = 10, SL.library = SL.library, verbose = TRUE, method = "method.NNLS", family = binomial()) cvfit_nnls$coef # SL.glmnet_All SL.glm_All SL.knn_All SL.gam_All SL.mean_All @@ -154,5 +163,10 @@ cvfit_nnloglik2$coef } } - +\seealso{ +\code{\link{recombineSL}} +} +\author{ +Erin LeDell \email{ledell@berkeley.edu} +} \keyword{models} diff --git a/man/recombineSL.Rd b/man/recombineSL.Rd index 668359c..c7aeb44 100644 --- a/man/recombineSL.Rd +++ b/man/recombineSL.Rd @@ -1,111 +1,117 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/recombine.R \name{recombineSL} \alias{recombineSL} - \title{Recombine a SuperLearner fit using a new metalearning method} -\description{The \code{recombineSL} function takes an existing SuperLearner fit and a new metalearning method and returns a new SuperLearner fit with updated base learner weights.} - \usage{ recombineSL(object, Y, method = "method.NNloglik", verbose = FALSE) } - \arguments{ - \item{object}{ -Fitted object from \code{SuperLearner}. -} - \item{Y}{ -The outcome in the training data set. Must be a numeric vector. -} - \item{method}{ -A list (or a function to create a list) containing details on estimating the coefficients for the super learner and the model to combine the individual algorithms in the library. See \code{?method.template} for details. Currently, the built in options are either "method.NNLS" (the default), "method.NNLS2", "method.NNloglik", "method.CC_LS", "method.CC_nloglik", or "method.AUC". NNLS and NNLS2 are non-negative least squares based on the Lawson-Hanson algorithm and the dual method of Goldfarb and Idnani, respectively. NNLS and NNLS2 will work for both gaussian and binomial outcomes. NNloglik is a non-negative binomial likelihood maximization using the BFGS quasi-Newton optimization method. NN* methods are normalized so weights sum to one. CC_LS uses Goldfarb and Idnani's quadratic programming algorithm to calculate the best convex combination of weights to minimize the squared error loss. CC_nloglik calculates the convex combination of weights that minimize the negative binomial log likelihood on the logistic scale using the sequential quadratic programming algorithm. AUC, which only works for binary outcomes, uses the Nelder-Mead method via the optim function to minimize rank loss (equivalent to maximizing AUC). -} - \item{verbose}{ -logical; TRUE for printing progress during the computation (helpful for debugging). -} -} - -\details{ - \code{recombineSL} re-fits the super learner prediction algorithm using a new metalearning method. The weights for each algorithm in \code{SL.library} are re-estimated using the new metalearner, however the base learner fits are not regenerated, so this function saves a lot of computation time as opposed to using the \code{SuperLearner} function with a new \code{method} argument. The output is identical to the output from the \code{SuperLearner} function. - +\item{object}{Fitted object from \code{SuperLearner}.} + +\item{Y}{The outcome in the training data set. Must be a numeric vector.} + +\item{method}{A list (or a function to create a list) containing details on +estimating the coefficients for the super learner and the model to combine +the individual algorithms in the library. See \code{?method.template} for +details. Currently, the built in options are either "method.NNLS" (the +default), "method.NNLS2", "method.NNloglik", "method.CC_LS", +"method.CC_nloglik", or "method.AUC". NNLS and NNLS2 are non-negative least +squares based on the Lawson-Hanson algorithm and the dual method of Goldfarb +and Idnani, respectively. NNLS and NNLS2 will work for both gaussian and +binomial outcomes. NNloglik is a non-negative binomial likelihood +maximization using the BFGS quasi-Newton optimization method. NN* methods +are normalized so weights sum to one. CC_LS uses Goldfarb and Idnani's +quadratic programming algorithm to calculate the best convex combination of +weights to minimize the squared error loss. CC_nloglik calculates the convex +combination of weights that minimize the negative binomial log likelihood on +the logistic scale using the sequential quadratic programming algorithm. +AUC, which only works for binary outcomes, uses the Nelder-Mead method via +the optim function to minimize rank loss (equivalent to maximizing AUC).} + +\item{verbose}{logical; TRUE for printing progress during the computation +(helpful for debugging).} } \value{ - \item{call}{ -The matched call. -} - \item{libraryNames}{ -A character vector with the names of the algorithms in the library. The format is 'predictionAlgorithm_screeningAlgorithm' with '_All' used to denote the prediction algorithm run on all variables in X. -} - \item{SL.library}{ -Returns \code{SL.library} in the same format as the argument with the same name above. -} - \item{SL.predict}{ -The predicted values from the super learner for the rows in \code{newX}. -} - \item{coef}{ -Coefficients for the super learner. -} - \item{library.predict}{ -A matrix with the predicted values from each algorithm in \code{SL.library} for the rows in \code{newX}. -} - \item{Z}{ -The Z matrix (the cross-validated predicted values for each algorithm in \code{SL.library}). -} - \item{cvRisk}{ -A numeric vector with the V-fold cross-validated risk estimate for each algorithm in \code{SL.library}. Note that this does not contain the CV risk estimate for the SuperLearner, only the individual algorithms in the library. -} - \item{family}{ -Returns the \code{family} value from above +\item{call}{ The matched call. } +\item{libraryNames}{ A character +vector with the names of the algorithms in the library. The format is +'predictionAlgorithm_screeningAlgorithm' with '_All' used to denote the +prediction algorithm run on all variables in X. } +\item{SL.library}{ Returns \code{SL.library} in the same format as the argument with the same name +above. } +\item{SL.predict}{ The predicted values from the super learner for +the rows in \code{newX}. } +\item{coef}{ Coefficients for the super learner.} +\item{library.predict}{ A matrix with the predicted values from each +algorithm in \code{SL.library} for the rows in \code{newX}. } +\item{Z}{ The +Z matrix (the cross-validated predicted values for each algorithm in +\code{SL.library}). } +\item{cvRisk}{ A numeric vector with the V-fold +cross-validated risk estimate for each algorithm in \code{SL.library}. Note +that this does not contain the CV risk estimate for the SuperLearner, only +the individual algorithms in the library. } +\item{family}{ Returns the +\code{family} value from above } +\item{fitLibrary}{ A list with the fitted +objects for each algorithm in \code{SL.library} on the full training data +set. } +\item{cvFitLibrary}{ A list with fitted objects for each algorithm in +\code{SL.library} on each of \code{V} different training data sets. } +\item{varNames}{ A character vector with the names of the variables in +\code{X}. } +\item{validRows}{ A list containing the row numbers for the +V-fold cross-validation step. } +\item{method}{ A list with the method +functions. } +\item{whichScreen}{ A logical matrix indicating which variables +passed each screening algorithm. } +\item{control}{ The \code{control} list. +} +\item{cvControl}{ The \code{cvControl} list. } +\item{errorsInCVLibrary}{ A +logical vector indicating if any algorithms experienced an error within the +CV step. } +\item{errorsInLibrary}{ A logical vector indicating if any +algorithms experienced an error on the full data. } +\item{env}{ Environment +passed into function which will be searched to find the learner functions. +Defaults to the calling environment. } +\item{times}{ A list that contains +the execution time of the SuperLearner, plus separate times for model +fitting and prediction. } +} +\description{ +\code{recombineSL()} takes an existing SuperLearner fit and a new +metalearning method and returns a new SuperLearner fit with updated base +learner weights. } - \item{fitLibrary}{ -A list with the fitted objects for each algorithm in \code{SL.library} on the full training data set. -} - \item{varNames}{ -A character vector with the names of the variables in \code{X}. -} - \item{validRows}{ -A list containing the row numbers for the V-fold cross-validation step. -} - \item{method}{ -A list with the method functions. -} - \item{whichScreen}{ -A logical matrix indicating which variables passed each screening algorithm. -} - \item{control}{ -The \code{control} list. -} - \item{cvControl}{ -The \code{cvControl} list. -} - \item{errorsInCVLibrary}{ -A logical vector indicating if any algorithms experienced an error within the CV step. -} - \item{errorsInLibrary}{ -A logical vector indicating if any algorithms experienced an error on the full data. -} -} - -\references{ -van der Laan, M. J., Polley, E. C. and Hubbard, A. E. (2008) Super Learner, \emph{Statistical Applications of Genetics and Molecular Biology}, \bold{6}, article 25. +\details{ +\code{recombineSL()} re-fits the super learner prediction algorithm using a +new metalearning method. The weights for each algorithm in +\code{SL.library} are re-estimated using the new metalearner, however the +base learner fits are not regenerated, so this function saves a lot of +computation time as opposed to using the \code{SuperLearner()} function with a +new \code{method} argument. The output is identical to the output from the +\code{SuperLearner()} function. } -\author{ Erin LeDell \email{ledell@berkeley.edu} } - \examples{ \dontrun{ - # Binary outcome example adapted from SuperLearner examples set.seed(1) N <- 200 X <- matrix(rnorm(N*10), N, 10) X <- as.data.frame(X) -Y <- rbinom(N, 1, plogis(.2*X[, 1] + .1*X[, 2] - .2*X[, 3] + +Y <- rbinom(N, 1, plogis(.2*X[, 1] + .1*X[, 2] - .2*X[, 3] + .1*X[, 3]*X[, 4] - .2*abs(X[, 4]))) SL.library <- c("SL.glmnet", "SL.glm", "SL.knn", "SL.gam", "SL.mean") # least squares loss function set.seed(1) # for reproducibility -fit_nnls <- SuperLearner(Y = Y, X = X, SL.library = SL.library, +fit_nnls <- SuperLearner(Y = Y, X = X, SL.library = SL.library, verbose = TRUE, method = "method.NNLS", family = binomial()) fit_nnls # Risk Coef @@ -142,6 +148,12 @@ fit_nnloglik2 # SL.mean_All 0.6904050 0.2129891 } + +} +\references{ +van der Laan, M. J., Polley, E. C. and Hubbard, A. E. (2008) Super Learner, \emph{Statistical Applications of Genetics and Molecular Biology}, \bold{6}, article 25. +} +\author{ +Erin LeDell \email{ledell@berkeley.edu} } \keyword{models} - diff --git a/man/screen.Rd b/man/screen.Rd new file mode 100644 index 0000000..ee6fd0b --- /dev/null +++ b/man/screen.Rd @@ -0,0 +1,43 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/screen.R +\name{screen} +\alias{screen} +\alias{write.screen.template} +\alias{screen.template} +\alias{All} +\alias{screen.randomForest} +\alias{screen.SIS} +\alias{screen.ttest} +\alias{screen.corP} +\alias{screen.corRank} +\alias{screen.glmnet} +\title{Screening algorithms for SuperLearner} +\usage{ +write.screen.template(file = "", ...) +} +\arguments{ +\item{file}{A connection, or a character string naming a file to print to. +Passed to \code{\link{cat}}.} + +\item{\dots}{Additional arguments passed to \code{\link{cat}}} +} +\value{ +A logical vector with the length equal to the +number of columns in \code{X}. TRUE indicates the variable (column of X) +should be included. +} +\description{ +Screening algorithms for \code{SuperLearner} to be used with +\code{SL.library}. +} +\examples{ +write.screen.template(file = '') + +} +\seealso{ +\code{\link{SuperLearner}} +} +\author{ +Eric C Polley \email{polley.eric@mayo.edu} +} +\keyword{utilities} diff --git a/man/summary.CV.SuperLearner.Rd b/man/summary.CV.SuperLearner.Rd index 9e6161d..d7e1e66 100644 --- a/man/summary.CV.SuperLearner.Rd +++ b/man/summary.CV.SuperLearner.Rd @@ -1,69 +1,52 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/summary.CV.SuperLearner.R \name{summary.CV.SuperLearner} \alias{summary.CV.SuperLearner} \alias{print.summary.CV.SuperLearner} +\title{Summary Function for Cross-Validated Super Learner} +\usage{ +\method{summary}{CV.SuperLearner}(object, obsWeights = NULL, ...) -\title{ -Summary Function for Cross-Validated Super Learner -} -\description{ -summary method for the \code{CV.SuperLearner} function +\method{print}{summary.CV.SuperLearner}(x, digits = max(2, getOption("digits") - 2), ...) } -\usage{ +\arguments{ +\item{object}{An object of class "CV.SuperLearner", the result of a call to +\code{CV.SuperLearner}.} -\method{summary}{CV.SuperLearner}(object, obsWeights = NULL, \dots) +\item{obsWeights}{Optional vector for observation weights.} -\method{print}{summary.CV.SuperLearner}(x, digits, \dots) -} +\item{\dots}{Ignored.} -\arguments{ - \item{object}{ - An object of class "CV.SuperLearner", the result of a call to \code{CV.SuperLearner}. -} - \item{x}{ - An object of class "summary.CV.SuperLearner", the result of a call to \code{summary.CV.SuperLearner}. -} - \item{obsWeights}{ - Optional vector for observation weights. - } - \item{digits}{ - The number of significant digits to use when printing. -} - \item{\dots}{ additional arguments \dots} -} -\details{ -Summary method for \code{CV.SuperLearner}. Calculates the V-fold cross-validated estimate of either the mean squared error or the -2*log(L) depending on the loss function used. +\item{x}{An object of class "summary.CV.SuperLearner", the result of a call +to \code{summary.CV.SuperLearner()}.} + +\item{digits}{The number of significant digits to use when printing.} } \value{ -\code{summary.CV.SuperLearner} returns a list with components -\item{call}{ - The function call from \code{CV.SuperLearner} +A list with components +\item{call}{ The function call from \code{CV.SuperLearner} } +\item{method}{ Describes the loss function used. Currently either least squares of negative log Likelihood. } +\item{V}{ Number of folds } +\item{Risk.SL}{ Risk estimate for the super learner } +\item{Risk.dSL}{ Risk estimate for the +discrete super learner (the cross-validation selector) } +\item{Risk.library}{ A matrix with the risk estimates for each algorithm in the library } +\item{Table}{ A table with the mean risk estimate and standard +deviation across the folds for the super learner and all algorithms in the +library } } -\item{method}{ - Describes the loss function used. Currently either least squares of negative log Likelihood. -} -\item{V}{ - Number of folds -} -\item{Risk.SL}{ - Risk estimate for the super learner -} -\item{Risk.dSL}{ - Risk estimate for the discrete super learner (the cross-validation selector) -} -\item{Risk.library}{ - A matrix with the risk estimates for each algorithm in the library -} -\item{Table}{ - A table with the mean risk estimate and standard deviation across the folds for the super learner and all algorithms in the library +\description{ +Summary method for the \code{CV.SuperLearner()} function. } +\details{ +Summary method for \code{CV.SuperLearner}. Calculates the V-fold +cross-validated estimate of either the mean squared error or the \eqn{-2 \log(L)} +depending on the loss function used. } - -\author{ Eric C Polley \email{eric.polley@nih.gov} } - \seealso{ \code{\link{CV.SuperLearner}} } - - +\author{ +Eric C Polley \email{eric.polley@nih.gov} +} \keyword{methods} - diff --git a/man/trimLogit.Rd b/man/trimLogit.Rd index 2520942..49bd12c 100644 --- a/man/trimLogit.Rd +++ b/man/trimLogit.Rd @@ -1,35 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/trimLogit.R \name{trimLogit} \alias{trimLogit} - -\title{ -truncated-probabilities logit transformation -} -\description{ -computes the logit transformation on the truncated probabilities -} +\title{Truncated-probabilities logit transformation} \usage{ trimLogit(x, trim = 1e-05) } - \arguments{ - \item{x}{ -vector of probabilities. -} - \item{trim}{ -value to truncate probabilities at. Currently symmetric truncation (trim and 1-trim). -} -} +\item{x}{vector of probabilities.} +\item{trim}{value to truncate probabilities at. Currently symmetric +truncation (trim and 1-trim).} +} \value{ -logit transformed values +The logit-transformed trimmed values +} +\description{ +Computes the logit transformation on the truncated probabilities. } - - \examples{ -x <- c(0.00000001, 0.0001, 0.001, 0.01, 0.1, 0.3, 0.7, 0.9, 0.99, +x <- c(0.00000001, 0.0001, 0.001, 0.01, 0.1, 0.3, 0.7, 0.9, 0.99, 0.999, 0.9999, 0.99999999) trimLogit(x, trim = 0.001) data.frame(Prob = x, Logit = qlogis(x), trimLogit = trimLogit(x, 0.001)) } - -\keyword{models} \ No newline at end of file +\keyword{models} diff --git a/man/write.SL.template.Rd b/man/write.SL.template.Rd deleted file mode 100644 index 2d90cfe..0000000 --- a/man/write.SL.template.Rd +++ /dev/null @@ -1,98 +0,0 @@ -\name{write.SL.template} -\alias{write.SL.template} -\alias{SL.template} -\alias{predict.SL.template} -% \alias{SL.bart} -% \alias{predict.SL.bart} -\alias{SL.bayesglm} -\alias{predict.SL.bayesglm} -\alias{SL.caret} -\alias{predict.SL.caret} -\alias{SL.caret.rpart} -%\alias{SL.cforest} -\alias{predict.SL.cforest} -% \alias{SL.DSA} -% \alias{predict.SL.DSA} -\alias{SL.earth} -\alias{predict.SL.earth} -\alias{SL.gam} -\alias{predict.SL.gam} -\alias{SL.gbm} -\alias{predict.SL.gbm} -\alias{SL.glm.interaction} -\alias{SL.ipredbagg} -\alias{predict.SL.ipredbagg} -\alias{SL.knn} -\alias{predict.SL.knn} -\alias{SL.loess} -\alias{predict.SL.loess} -\alias{SL.logreg} -\alias{predict.SL.logreg} -\alias{SL.mean} -\alias{predict.SL.mean} -\alias{SL.nnet} -\alias{predict.SL.nnet} -\alias{SL.polymars} -\alias{predict.SL.polymars} -\alias{SL.randomForest} -\alias{predict.SL.randomForest} -\alias{SL.rpart} -\alias{SL.rpartPrune} -\alias{predict.SL.rpart} -\alias{SL.step} -\alias{predict.SL.step} -\alias{SL.step.forward} -\alias{SL.step.interaction} -\alias{SL.stepAIC} -\alias{predict.SL.stepAIC} -\alias{SL.svm} -\alias{predict.SL.svm} -\alias{SL.ridge} -\alias{predict.SL.ridge} -\alias{SL.leekasso} -\alias{predict.SL.leekasso} -\alias{SL.nnls} -\alias{predict.SL.nnls} - -\title{ -Wrapper functions for prediction algorithms in SuperLearner -} -\description{ -Template function for SuperLearner prediction wrappers and built in options. -} -\usage{ -write.SL.template(file = "", ...) -} -%- maybe also 'usage' for other objects documented here. -\arguments{ - \item{file}{ - A connection, or a character string naming a file to print to. Passed to \code{\link{cat}}. -} - \item{\dots}{ - Additional arguments passed to \code{\link{cat}} -} -} -\details{ -Describe SL.* structure here -} -\value{ -A list with two elements: - \item{pred}{ -The predicted values for the rows in \code{newX}. -} - \item{fit}{ -A list. Contains all objects necessary to get predictions for new observations from specific algorithm. -} -} - -\author{ Eric C Polley \email{epolley@uchicago.edu} } - - -\seealso{ -\code{\link{SuperLearner}} -} - -\examples{ -write.SL.template(file = '') -} -\keyword{utilities} diff --git a/man/write.method.template.Rd b/man/write.method.template.Rd deleted file mode 100644 index 4c7053c..0000000 --- a/man/write.method.template.Rd +++ /dev/null @@ -1,78 +0,0 @@ -\name{write.method.template} -\alias{write.method.template} -\alias{method.template} -\alias{method.NNLS} -\alias{method.NNLS2} -\alias{method.NNloglik} -\alias{method.CC_LS} -\alias{method.CC_nloglik} -\alias{method.AUC} - - -\title{ -Method to estimate the coefficients for the super learner -} -\description{ -These functions contain the information on the loss function and the model to combine algorithms -} -\usage{ -write.method.template(file = "", ...) - -## a few built in options: -method.NNLS() -method.NNLS2() -method.NNloglik() -method.CC_LS() -method.CC_nloglik() -method.AUC(nlopt_method=NULL, optim_method="L-BFGS-B", bounds=c(0, Inf), normalize=TRUE) -} - -\arguments{ - \item{file}{ -A connection, or a character string naming a file to print to. Passed to \code{\link{cat}}. -} - \item{optim_method}{ -Passed to the \code{optim} call method. See \code{\link{optim}} for details. -} - \item{nlopt_method}{ - Either \code{optim_method} or \code{nlopt_method} must be provided, the other must be \code{NULL} -} - \item{bounds}{ - Bounds for parameter estimates -} - \item{normalize}{ - Logical. Should the parameters be normalized to sum up to 1 -} - \item{\dots}{ -Additional arguments passed to \code{\link{cat}}. -} -} - -\details{ -A \code{SuperLearner} method must be a list (or a function to create a list) with exactly 3 elements. The 3 elements must be named \code{require}, \code{computeCoef} and \code{computePred}. -} - -\value{ -A list containing 3 elements: - \item{require}{ -A character vector listing any required packages. Use \code{NULL} if no additional packages are required -} - \item{computeCoef}{ -A function. The arguments are: \code{Z}, \code{Y}, \code{libraryNames}, \code{obsWeights}, \code{control}, \code{verbose}. The value is a list with two items: \code{cvRisk} and \code{coef}. This function computes the coefficients of the super learner. As the super learner minimizes the cross-validated risk, the loss function information is contained in this function as well as the model to combine the algorithms in \code{SL.library}. -} - \item{computePred}{ -A function. The arguments are: \code{predY}, \code{coef}, \code{control}. The value is a numeric vector with the super learner predicted values. -} -} - -\author{ Eric C Polley \email{Polley.Eric@mayo.edu} } -\seealso{ -\code{\link{SuperLearner}} -} - -\examples{ -write.method.template(file = '') -} - -\keyword{utilities} - diff --git a/man/write.screen.template.Rd b/man/write.screen.template.Rd deleted file mode 100644 index 5b487ba..0000000 --- a/man/write.screen.template.Rd +++ /dev/null @@ -1,50 +0,0 @@ -\name{write.screen.template} -\alias{write.screen.template} -\alias{screen.template} -\alias{All} -\alias{screen.randomForest} -\alias{screen.SIS} -\alias{screen.ttest} -\alias{screen.corP} -\alias{screen.corRank} -\alias{screen.glmnet} - -\title{ -screening algorithms for SuperLearner -} -\description{ -Screening algorithms for \code{SuperLearner} to be used with \code{SL.library}. -} -\usage{ -write.screen.template(file = "", ...) -} - -\arguments{ - \item{file}{ - A connection, or a character string naming a file to print to. Passed to \code{\link{cat}}. -} - \item{\dots}{ -Additional arguments passed to \code{\link{cat}} -} -} - -\details{ -Explain structure of a screening algorithm here: - -} -\value{ - \item{whichVariable}{ - A logical vector with the length equal to the number of columns in \code{X}. TRUE indicates the variable (column of X) should be included. -} -} - -\author{ Eric C Polley \email{polley.eric@mayo.edu} } - -\seealso{ -\code{\link{SuperLearner}} -} - -\examples{ -write.screen.template(file = '') -} -\keyword{utilities} diff --git a/vignettes/Guide-to-SuperLearner.Rmd b/vignettes/Guide-to-SuperLearner.Rmd index 6bde3d9..06c592f 100644 --- a/vignettes/Guide-to-SuperLearner.Rmd +++ b/vignettes/Guide-to-SuperLearner.Rmd @@ -548,7 +548,7 @@ summary(cv_sl) This conveniently shows us the AUC for each algorithm without us having to calculate it manually. But we aren't getting SEs sadly. -Another important optimizer to consider is negative log likelihood, which is intended for binary outcomes and will often work better than NNLS (the default). This is specified by method = "NNloglik". +Another important optimizer to consider is negative log likelihood, which is intended for binary outcomes and will often work better than NNLS (the default). This is specified by `method = "NNloglik"`. # XGBoost hyperparameter exploration