From 272fb289c5ac5a66362a7416e6dbdbeeead5a15e Mon Sep 17 00:00:00 2001 From: karissawhiting Date: Wed, 28 Feb 2024 11:36:42 -0500 Subject: [PATCH 01/11] update gitignore clean up and document lasso functions, add new predict and brier functions, add code to run and evaluate simulations run simulations script --- .Rbuildignore | 6 +- .gitignore | 1 + R/brier2.R | 158 +++++++++++++++++++++++++++++ R/cureitlasso.R | 65 +++++++++++- R/cv.cureitlasso.R | 19 +++- R/predict.R | 183 ++++++++++++++++++++++++++++++++++ R/simulasso.R | 98 +++++++++++------- inst/evaluate-simulations.R | 192 ++++++++++++++++++++++++++++++++++++ inst/run-simulations.R | 168 +++++++++++++++++++++++++++++++ 9 files changed, 850 insertions(+), 40 deletions(-) create mode 100644 R/brier2.R create mode 100644 R/predict.R create mode 100644 inst/evaluate-simulations.R create mode 100644 inst/run-simulations.R diff --git a/.Rbuildignore b/.Rbuildignore index 241c55c..0317850 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -13,4 +13,8 @@ ^.*[.]Rproj.*$ ^\.Rproj\.user$ ^.Rhistory$ -[~] \ No newline at end of file +[~] +^R/cv.cureitlasso.R +^R/simulasso.R +^R/coxsplit.R +^R/run-simulations.R \ No newline at end of file diff --git a/.gitignore b/.gitignore index 3f26a3f..22b011a 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,7 @@ inst/doc inst/model-fits inst/cure-model-fits +inst/cureit-simulation-results/* inst/archive docs/ revdep/ diff --git a/R/brier2.R b/R/brier2.R new file mode 100644 index 0000000..5063496 --- /dev/null +++ b/R/brier2.R @@ -0,0 +1,158 @@ +# we should use this: https://www.tidymodels.org/learn/statistics/survival-metrics/ + +library(survival) +library(tidyverse) + +# Prep Data --------------------------------------------------------------- + +xi = dat$x +final_fit <- fit$fit[[fit$index$`min`[1]]][[fit$index$`min`[2]]] + +fitcure <- final_fit$fitcure +predcure <- predict(fitcure, newx = xi, + s = min(fitcure$lambda), type = "response") + +fitcox <- fi nal_fit$fitcox +predsurvexp <- predict(fitcox, newx = xi, s = min(fitcox$lambda), type = "response") + + +haz <- final_fit$haz +cumhaz <- final_fit$cumhaz + +predsurv <- exp(-cumhaz*predsurvexp) + +# Create Data frame to pass ---------------------------------------------- + +data = data.frame(dat$x) %>% + mutate(times = dat$t, + event = dat$d, + "truth_survival" = Surv(dat$t, dat$d)) + +# Predict Function (glmnet cure) --------------------------------------------- + +# Trying to get predictions in similar format as tidymodels yardstick +# Eventually will integratethis with existing predict method function + +predict_cure <- function(final_fit, + data = NULL, + eval_timepoints = NULL) { + + # Get values from observed data + survival_times = data$times + sorted_survival_times <- sort(survival_times) + sorted_event_times = sort(data$times[data$event==1]) + censoring_weights = survfit(Surv(data$times, 1 - data$event) ~ 1)$surv + + data_x <- data %>% + select(-c(times, event, truth_survival)) %>% + as.matrix() + + # Extract cumulative hazard from final fit (for event times) + haz <- final_fit$haz + cumhaz <- final_fit$cumhaz + + # default eval timepoints are observed event times if not supplied by user + if(is.null(eval_timepoints)) { + eval_timepoints = sorted_event_times + } + + + # Get Relative Cure and Cox Risk ------------------------------------------ + + # Cure probability risk prediction (independent of timepoint) + fitcure <- final_fit$fitcure + predcure <- predict(fitcure, newx = data_x, + s = min(fitcure$lambda), type = "response") + + # Cox survival probability risk prediction (independent of timepoint) + fitcox <- final_fit$fitcox + predsurvexp <- predict(fitcox, + newx = data_x, + s = min(fitcox$lambda), + type = "response") + + + + # Get Predicted probabilities at select timepoints ------------------- + + # vector for conditional survival prob for all patients at sorted_survival_times[l] + all_preds_all_tps <- list() + + # for each eval timepoint, get predicted survival values + for (l in 1:length(eval_timepoints)) { + all_preds <- data.frame("eval_timepoint" = NA, + "preds" = NA, + "pred_cure" = NA, + "preds_surv" = NA, + "weights" = NA) + + # vector for conditional survival prob for all patients at sorted_survival_times[l] + predsurv <- rep(NA,length(survival_times)) + + + if (eval_timepoints[l] >= min(sorted_event_times)){ + + ids <- which( + sorted_event_times == + max(sorted_event_times[sorted_event_times <= eval_timepoints[l]]) + ) + + # Conditional survival prob at given patient + predsurv <- exp(-cumhaz[ids]*predsurvexp) + + # inverse probability of the censoring weights + ipw <- as.numeric(survival_times > eval_timepoints[l])*censoring_weights[l] + + as.numeric(survival_times <= eval_timepoints[l])*censoring_weights + + # if no events occurred before eval time + } else if (eval_timepoints[l] < min(sorted_event_times)) { + + predsurv <- rep(1,length(survival_times)) + ipw <- 1 + + } + + preds <- 1 - predcure + predcure*predsurv + + all_preds <- tibble( + + "preds" = preds, + "predcure" = predcure, + "predsurv" = predsurv, + "weights" = ipw) %>% + mutate(id_num = 1:nrow(.)) + + names(all_preds) <- c( + "preds", + "predcure", "predsurv", "weights", + "id_num") + + x <- all_preds %>% nest(.preds = -c(id_num)) + + all_preds_all_tps[[l]] <- all_preds + + # names(all_preds) = c("pred", "pred_cure","pred_surv") + } + # all_pred_df <- tibble( + # times = data$times, + # events = data$event, + # .pred = all_preds_all_tps + # ) + return(all_pred_df) +} + +predict_df <- predict_cure(final_fit = final_fit, data = data) + + +# Brier Calculation ------------------------------------------------------- + +brier_cure <- function(predict_df, times, event) { + + eval_timepoints = unique(predict_df$eval_timepoint) + + for (l in 1:length(eval_timepoints)) { + brier[l] <- mean(I(di==0)*(1 - preds)^2/(ipw+0.001) + I(di==1)*(0 - preds)^2/(ipw+0.001)) + + + } + diff --git a/R/cureitlasso.R b/R/cureitlasso.R index 00fd83f..bc42272 100644 --- a/R/cureitlasso.R +++ b/R/cureitlasso.R @@ -1,3 +1,47 @@ + +# mu is penlanlity for cure +# lamda is penality for cox +# touch is posterior probabilty of being cured fr uncured (maybe don't need) (Entropy) + # NOTE maybe change name of alphs +# calculated linear peredictor of that iteration + + +#' Lasso for Cure Models +#' +#' @param t Survival times +#' @param d Event indicator +#' @param x data n x p matrix or data.frame +#' @param minmu.ratio minimum penalty value in the logistic model +#' @param minlambda.ratio minimum penalty value in the cox model +#' @param adaptive +#' @param length.grid +#' @param mus penalty for cure +#' @param lambdas is penalty for cox +#' @param tol posterior probability of being cured for uncured (maybe don't need) (Entropy) +#' @param maxiter +#' @param progress verbose iteration counter +#' +#' @return +#' - `all_fits` +#' - `t`, +#' - `tj` - unique event times sorted, +#' - `d`- event indicator, +#' - `x`- data n x p matrix or data.frame, +#' - `alpha0` fitcure$a0[length(fitcure$lambda)], +#' - `alpha` =alpha, +#' - `beta` =beta, +#' - `haz`=haz, +#' - `cumhaz`=cumhaz, +#' - `tau`=as.vector(tau), +#' - `predsurv`=as.vector(predsurv_iter), +#' - `predcure`=as.vector(predcure_iter), +#' - `fitcure`=fitcure, +#' - `fitcox` =fitcox +#' - `mus` - mus that were used as penalties for cure models in cross validation +#' - `lambda` - lambdas that were used as penalities in cix model cross validation +#' @export +#' +#' @examples cureitlasso <- function(t, d, x, @@ -15,6 +59,7 @@ cureitlasso <- function(t, require(glmnet) require(survival) + # initialize initial weights tau <- matrix(0.5,nrow=length(t),ncol=2) # First column: cured; second column: uncured tau[d==1,1] = 0 tau[d==1,2] = 1 @@ -24,6 +69,7 @@ cureitlasso <- function(t, # CV: Be careful for fold split! Two replicates from the same subject should always be assigned to the same fold + #this is for later integreation with tuning penalty.factor.cure=rep(1,ncol(x)) penalty.factor.cox=rep(1,ncol(x)) @@ -51,6 +97,8 @@ cureitlasso <- function(t, # # } + # So this section is finding the set of optimal hyperparemters to test ---- + # tau is uncure probability fitcure0 <- glmnet(x=do.call("rbind", rep(list(x), 2)), y=lab_class, family="binomial", @@ -72,8 +120,10 @@ cureitlasso <- function(t, lambdas <- fitcox0$lambda[idx_lambdas] } + musmax <- fitcure0$lambda[1] lambdasmax <- fitcox0$lambda[1] + # ----------------- predcure <- predict(fitcure0,newx=x,s=mus,type="response") #Prob of uncured predsurvexp <- predict(fitcox0,newx=x,s=lambdas,type="response") # exp(betaX) @@ -104,6 +154,7 @@ cureitlasso <- function(t, fit[[i]] <- list() + for (j in 1:length(lambdas)){ tau <- d + (1-d) * (predcure[,i] * predsurv[,j])/( (1 - predcure[,i]) + predcure[,i] * predsurv[,j] ) @@ -309,14 +360,24 @@ cureitlasso <- function(t, # num_alpha[i,j] <- sum(alpha!=0) # num_beta[i,j] <- sum(beta!=0) + # names(fit[i][j]) <- paste0("mu_fit_", i, "_lambda_fit_", "j") + } + + names(fit[[i]]) <- paste0("lambda_fit_", 1:length(lambdas)) } - return(list(fit=fit, + names(fit) <- paste0("mu_fit_", 1:length(mus)) + + return(list(fit = fit, # num_alpha=num_alpha, # num_beta=num_beta, mus=mus, lambdas=lambdas)) -} \ No newline at end of file +} + +# tj - is unique event times sorted +# haz + diff --git a/R/cv.cureitlasso.R b/R/cv.cureitlasso.R index 8865130..403c130 100644 --- a/R/cv.cureitlasso.R +++ b/R/cv.cureitlasso.R @@ -1,10 +1,14 @@ +# Returns: +# fit - +# arg: length.grid - + cv.cureitlasso <- function(t, d, x, minmu.ratio=0.05, # minimum penalty value in the logistic model minlambda.ratio=0.05, # minimum penalty value in the cox model adaptive=FALSE, - length.grid=10, + length.grid=10, # tells how many hyperparameters to test. default is 10 for each nfolds=5, tol=1e-2, maxiter=100, @@ -22,6 +26,7 @@ cv.cureitlasso <- function(t, require(survcomp) require(pracma) + # order data by times order_idx <- order(t) t <- t[order_idx] d <- d[order_idx] @@ -29,6 +34,7 @@ cv.cureitlasso <- function(t, if (progress) print("Fitting by EM algorithm ...") + # main model with no cross validation fit <- cureitlasso(t,d,x, minmu.ratio, minlambda.ratio, @@ -38,7 +44,7 @@ cv.cureitlasso <- function(t, lambdas=NULL, tol, maxiter, - progress) + progress = TRUE) if (progress) print("Running cross validations ...") @@ -46,6 +52,8 @@ cv.cureitlasso <- function(t, # Fold split foldid <- coxsplit(as.matrix(Surv(t,d)), nfolds) + + # grid/array of values cv_brier <- array(NA,dim=c(length.grid,length.grid,nfolds)) # Run CVs @@ -57,6 +65,7 @@ cv.cureitlasso <- function(t, if (progress) print(i) + cv.fit[[i]] <- cureitlasso(t[foldid != i], d[foldid != i], x[foldid != i,], @@ -114,6 +123,7 @@ cv.cureitlasso <- function(t, } + # calcualte trapazoidal area for brier score cv_brier[j,k,i] <- trapz(tbrier,brier) @@ -130,7 +140,7 @@ cv.cureitlasso <- function(t, cv.fit <- foreach(i = 1:nfolds) %dopar% { - source("~/Projects/Whiting-Qin-cureit/cureit/R/cureitlasso.R") + source(here::here("R/cureitlasso.R")) cureitlasso(t[foldid != i], d[foldid != i], @@ -206,6 +216,7 @@ cv.cureitlasso <- function(t, } + # summarize - take average across all folds cv_brier_mean <- apply(cv_brier,c(1,2),function(x) mean(x)) cv_brier_se <- apply(cv_brier,c(1,2),function(x) sd(x)/sqrt(nfolds)) # pheatmap::pheatmap(cv_brier_mean,cluster_cols = F,cluster_rows = F) @@ -235,4 +246,4 @@ cv.cureitlasso <- function(t, ) ) -} \ No newline at end of file +} diff --git a/R/predict.R b/R/predict.R new file mode 100644 index 0000000..087eee0 --- /dev/null +++ b/R/predict.R @@ -0,0 +1,183 @@ +#' #' Predicted survival probability by cure model +#' #' +#' #' @param object A cureit object. +#' #' @param times Numeric vector of times to obtain survival probability estimates at +#' #' @param probs Numeric vector of quantiles to obtain estimates at +#' #' @param newdata A `base::data.frame()` or `tibble::tibble()` containing all +#' #' the original predictors used to create object. Defaults to `NULL`. +#' #' @param type Output format of predicted values: "lp" (linear predictor) or "prob" (predicted probabilities). +#' #' @param brier Boolean indicator of calculating the Brier scores at specified `times`. +#' #' @param cox Boolean indicator of fitting the Cox model for the training data and calculating the Brier scores at specified `times` for `newdata`. +#' #' @inheritParams cureit +#' #' @return named list of prediction estimates +#' #' @family cureit() functions +#' #' @export +#' #' @examples +#' #' p <- cureit(surv_formula = Surv(ttdeath, death) ~ age, +#' #' cure_formula = ~ age, +#' #' data = trial) +#' #' +#' #' pred <- predict(p, times = seq(5,24,0.5), +#' #' newdata = trial[complete.cases(trial), ], brier=TRUE,cox=TRUE) +#' #' +#' #' plot(seq(5,24,0.5),pred$brier,type="S",pch=1) +#' #' lines(seq(5,24,0.5),pred$brier_cox,type="S",col="red",pch=3) +#' #' legend("topright",c("Cure model","Cox model"), +#' #' col=c("black","red"),lty=1,pch=c(1,3)) +#' #' +#' #' +#' predict.cureit <- function(object, times = NULL, probs = NULL, +#' newdata = NULL, type = c("prob", "lp"), +#' brier = FALSE, cox = FALSE, ...) { +#' # checking inputs ------------------------------------------------------------ +#' +#' type <- match.arg(type) +#' +#' if (is.null(times) + is.null(probs) != 1L) { +#' stop("Must specify one and only one of `times=` and `probs=`.", call. = FALSE) +#' } +#' if (!is.null(times) && any(times < 0)) { +#' stop("`times=` must be non-negative.", call. = FALSE) +#' } +#' if (!is.null(probs) && !all(dplyr::between(probs, 0, 1))) { +#' stop("`probs=` must be between 0 and 1.", call. = FALSE) +#' } +#' if (!(type %in% c("lp","prob"))) { +#' stop("`type=` must be one out of 'lp' or 'prob'.", call. = FALSE) +#' } +#' # if (type == "lp" & brier){ +#' # stop("`type=` must be 'prob' to calculate the Brier scores.") +#' # } +#' # if (brier & is.null(times)){ +#' # stop("Must specify one or more times to calculate the Brier scores.") +#' # } +#' # if (!brier & cox){ +#' # stop("Must let `brier=TRUE` to perform Cox regression and corresponding Brier scores.") +#' # } +#' +#' # getting predictions on the original model fit ------------------------------ +#' processed <- cureit_mold(object$surv_formula, object$cure_formula, newdata %||% object$data, +#' surv_blueprint = object$surv_blueprint, cure_blueprint = object$cure_blueprint) +#' +#' newX=processed$surv_processed$predictors +#' newZ=processed$cure_processed$predictors +#' if (is.vector(newZ)) +#' newZ = as.matrix(newZ) +#' newZ = cbind(1, newZ) +#' if (is.vector(newX)) +#' newX = as.matrix(newX) +#' +#' s0 = as.matrix(object$smcure$s[order(object$smcure$Time)], ncol = 1) +#' n = nrow(s0) +#' uncureprob = exp(object$smcure$b %*% t(newZ))/(1 + exp(object$smcure$b %*% t(newZ))) +#' +#' scure = array(0, dim = c(n, nrow(newX))) +#' t = array(0, dim = c(n, nrow(newX))) +#' spop = array(0, dim = c(n, nrow(newX))) +#' +#' #### Only support PH model for now +#' ebetaX = exp(object$smcure$beta %*% t(newX)) +#' for (i in 1:nrow(newZ)) { +#' scure[, i] = s0^ebetaX[i] +#' } +#' +#' for (i in 1:n) { +#' for (j in 1:nrow(newX)) { +#' spop[i, j] = uncureprob[j] * scure[i, j] + (1 - +#' uncureprob[j]) +#' } +#' } +#' +#' spop_prd = cbind(object$smcure$Time, spop) +#' scure_prd = cbind(object$smcure$Time, scure) +#' cure_prd = 1-uncureprob +#' +#' # Type = "Prob --------- +#' if (type == "prob"){ +#' +#' if (!is.null(times)) { +#' return(list(cured = cure_prd, +#' surv_uncured = probs_at_times(scure_prd,times), +#' surv_marginal = probs_at_times(spop_prd,times))) +#' } +#' +#' if (!is.null(probs)) { +#' return(list(cured = cure_prd, +#' surv_uncured = times_at_probs(scure_prd,probs), +#' surv_marginal = times_at_probs(spop_prd,probs))) +#' } +#' } +#' +#' # Type = "Prob --------- +#' if (type == "lp"){ +#' +#' return(list(lp_cure_model = object$smcure$b %*% t(newZ), +#' lp_surv_model = object$smcure$beta %*% t(newX))) +#' +#' } +#' +#' } +#' +#' times_at_probs <- function(matrix_pred, probs) { +#' matrix_zero <- matrix(c(0, 1), ncol = 2) +#' lst_time_risk <- +#' purrr::map( +#' probs, +#' ~ purrr::map_dbl( +#' seq_len(ncol(matrix_pred) - 1L), +#' function(i) { +#' # adding 1 to the matrix, keeping only the time column, and col of interest +#' m <- rbind(matrix_zero, matrix_pred[, c(1L, i + 1L)]) +#' +#' # return NA if the quantiles are all missing OR prob is larger than observed +#' if (isTRUE(all(is.na(m[-1, 2])) || .x > max(m[, 2]))) { +#' return(NA) +#' } +#' if (isTRUE(.x %in% m[, 2])) { +#' return(m[m[, 2] %in% .x, 1]) +#' } +#' return(m[which.min(m[, 2] >= .x), 1]) +#' } +#' ) +#' ) %>% +#' stats::setNames(paste0("prob ", probs * 100, "%")) +#' +#' # returning results ---------------------------------------------------------- +#' lst_time_risk +#' } +#' +#' +#' +#' probs_at_times <- function(matrix_pred, times) { +#' # defining times for predictions --------------------------------------------- +#' # CHECK- here should this be warning? +#' all_times <- union(0, matrix_pred[, 1]) %>% sort() +#' if (isTRUE(max(times) > max(all_times))) { +#' stringr::str_glue("`times=` cannot be larger than {max(all_times)}") %>% +#' stop(call. = FALSE) +#' } +#' times_obs <- +#' purrr::map_dbl( +#' times, +#' function(.x) { +#' if (isTRUE(.x %in% all_times)) { +#' return(all_times[all_times %in% .x]) +#' } +#' all_times[which.min(all_times <= .x) - 1L] +#' } +#' ) +#' +#' # named list of the risks, the names are the times, +#' # the values are the estimates of risk at the covar levels +#' lst_risk_time <- +#' purrr::map(seq_len(length(all_times) - 1L), ~ matrix_pred[.x, -1]) %>% +#' stats::setNames(all_times[-1]) %>% +#' dplyr::bind_cols() %>% +#' mutate(`0` = 0, .before = 1) %>% +#' as.list() %>% +#' stats::setNames(all_times) +#' +#' # extracting risks at specified time ----------------------------------------- +#' lst_risk_time[as.character(times_obs)] %>% +#' stats::setNames(paste("time", times)) +#' } diff --git a/R/simulasso.R b/R/simulasso.R index edc60c4..3d745f6 100644 --- a/R/simulasso.R +++ b/R/simulasso.R @@ -1,38 +1,70 @@ -simulasso <- function(n=100, - p=200, - id_cure=NULL, - id_cox=NULL, - coefs_cure=NULL, - coefs_cox=NULL - ){ - +#' Simulate Data +#' +#' @param n Number of patients to simulate +#' @param p Number of parameters to simulate +#' @param id_cure Which parameters to include in cure simulation +#' @param id_cox Which parameters to include in cox simulation +#' @param coefs_cure Pre-specified coefficients of cure +#' @param coefs_cox Pre-specified Coefficients of cox +#' @param n_true_p Number of "true" non-zero coefficients +#' +#' @return +#' - `t`- simulated survival times +#' - `d`- simulated event indicator +#' - `x` simulated data n x p matrix or data.frame +#' - `id_cure` - IDs of selected cure parameters +#' - `id_cure` - IDs of selected cox parameters +#' - `coefs_cure` - Coefficients of cure parameters +#' - `coefs_cox` - Coefficients of cox parameters +#' +#' @export +#' +#' @examples +#' simulasso(n = 10, p = 3) +simulasso <- function(n = 100, + p = 200, + id_cure = NULL, + id_cox = NULL, + coefs_cure = NULL, + coefs_cox = NULL, + n_true_p = 10) { require(mvtnorm) + + t <- rep(NA, length = n) + d <- rep(NA, length = n) + + # number of predictors is size + if (is.null(id_cure)) id_cure <- sample(p, size = n_true_p) + if (is.null(id_cox)) id_cox <- sample(p, size = n_true_p) + + if (is.null(coefs_cure)) coefs_cure <- runif(length(id_cure) + 1, min = 2, max = 3) + if (is.null(coefs_cox)) coefs_cox <- runif(length(id_cox), min = 2, max = 3) + + x <- mvtnorm::rmvnorm(n, mean = rep(0, p)) + + # add intercept and calculate linear predictor and probability of cure using the logistic function: - t <- rep(NA,length=n) - d <- rep(NA,length=n) - - if(is.null(id_cure)) id_cure = sample(p,size=3) - if(is.null(id_cox)) id_cox = sample(p,size=3) - - if(is.null(coefs_cure)) coefs_cure = runif(length(id_cure)+1,min=2,max=3) - if(is.null(coefs_cox)) coefs_cox = runif(length(id_cox),min=2,max=3) - - x <- mvtnorm::rmvnorm(n,mean=rep(0,p)) - puncure <- exp(cbind(1,x[,id_cure]) %*% coefs_cure)/(1+exp(cbind(1,x[,id_cure]) %*% coefs_cure)) - uncure <- rbinom(n,1,prob=puncure) - - lambda <- exp(x[,id_cox] %*% coefs_cox) + puncure <- exp(cbind(1, x[, id_cure]) %*% coefs_cure) / (1 + exp(cbind(1, x[, id_cure]) %*% coefs_cure)) + # make into event index + uncure <- rbinom(n, 1, prob = puncure) + + # linear predictor survival + lp_cox <- exp(x[, id_cox] %*% coefs_cox) + ### Should censoring time for cured subjects be generated from a same distribution as those uncured? - - for(i in 1:n){ - U <- runif(1,min=0,max=1) - t0 <- log(1-(log(1-U))/(0.1*lambda[i])) - c0 <- min(rexp(1,rate=0.1),runif(1,min=5,max=6)) - t[i] <- ifelse(uncure[i]==1, min(t0,c0), c0) - d[i] <- ifelse(uncure[i]==1, as.numeric(I(t0 <= c0)), 0) + + for (i in 1:n) { + U <- runif(1, min = 0, max = 1) + t0 <- log(1 - (log(1 - U)) / (0.1 * lp_cox[i])) + c0 <- min(rexp(1, rate = 0.1), runif(1, min = 5, max = 6)) + t[i] <- ifelse(uncure[i] == 1, min(t0, c0), c0) + d[i] <- ifelse(uncure[i] == 1, as.numeric(I(t0 <= c0)), 0) } - - dat <- list(t=t,d=d,x=x,uncure=uncure,id_cure=id_cure,id_cox=id_cox,coefs_cure=coefs_cure,coefs_cox=coefs_cox) - -} \ No newline at end of file + + dat <- list(t = t, d = d, x = x, + uncure = uncure, + id_cure = id_cure, + id_cox = id_cox, coefs_cure = coefs_cure, + coefs_cox = coefs_cox) +} diff --git a/inst/evaluate-simulations.R b/inst/evaluate-simulations.R new file mode 100644 index 0000000..83aa32e --- /dev/null +++ b/inst/evaluate-simulations.R @@ -0,0 +1,192 @@ +source(here::here("R/cv.cureitlasso.R")) +source(here::here("R/cureitlasso.R")) +source(here::here("R/simulasso.R")) +source(here::here("R/coxsplit.R")) + +# Calculate Performance Metrics on Simulations ---------------------------------- + +# * Load all relevant data ------- +load(here::here("inst", "cv_fits_50.RData")) +load(here::here("inst", "cv_fits_100.RData")) +load(here::here("inst", "sim_data_list.RData")) +load(here::here("inst", "sim_valid_data_list.RData")) + +# * Calculate Sensitivity of Variable Selection in Training Fits ------- +x <- for (i in 1:50) { + fit <- cv_fits_list[[i]] + #fit <- x$fit + dat <- sim_data_list[[i]] + + sen_cox_min <- mean(dat$id_cox %in% which(fit$fit[[fit$index$min[1]]][[fit$index$min[2]]]$beta!=0)) + print(sen_cox_min) + +} + + +hist(x) + +# Sensitivity and specificity (for both min and 1se choices of parameters) --- + +sen_cox_min <- mean(dat$id_cox %in% which(fit$fit[[fit$index$min[1]]][[fit$index$min[2]]]$beta!=0)) + + +spe_cure_min <- 1-mean(setdiff(1:200,dat$id_cure) %in% which(fit$fit[[fit$index$min[1]]][[fit$index$min[2]]]$alpha!=0)) +spe_cox_min <- 1-mean(setdiff(1:200,dat$id_cure) %in% which(fit$fit[[fit$index$min[1]]][[fit$index$min[2]]]$beta!=0)) + +sen_cure_1se <- mean(dat$id_cure %in% which(fit$fit[[fit$index$`1se`[1]]][[fit$index$`1se`[2]]]$alpha!=0)) +sen_cox_1se <- mean(dat$id_cox %in% which(fit$fit[[fit$index$`1se`[1]]][[fit$index$`1se`[2]]]$beta!=0)) +spe_cure_1se <- 1-mean(setdiff(1:200,dat$id_cure) %in% which(fit$fit[[fit$index$`1se`[1]]][[fit$index$`1se`[2]]]$alpha!=0)) +spe_cox_1se <- 1-mean(setdiff(1:200,dat$id_cure) %in% which(fit$fit[[fit$index$`1se`[1]]][[fit$index$`1se`[2]]]$beta!=0)) + + + + +# * Get it to work with yardstick ----- +library(yardstick) + +vec <- c(TRUE, TRUE, FALSE) +df <- data.frame( + pred = factor(as.integer(dat$id_cure %in% which(fit$fit[[fit$index$min[1]]][[fit$index$min[2]]]$alpha!=0))) + # Presence = as.integer(vec) # Convert logical values to integers (TRUE -> 1, FALSE -> 0) +) %>% + mutate(obs = factor(rep(1, length = nrow(.)), levels = c("0", "1"))) + +sens(df, obs, pred, event_level = "second") +sen_cure_min <- mean(dat$id_cure %in% which(fit$fit[[fit$index$min[1]]][[fit$index$min[2]]]$alpha!=0)) + + + +# * Brier Score ------------------------------------------------------------- +#' - `ti`- simulated survival times +#' - `d`- simulated event indicator +#' - `x` simulated data n x p matrix or data.frame +#' - `id_cure` - IDs of selected cure parameters +#' - `id_cure` - IDs of selected cox parameters +#' - `coefs_cure` - Coefficients of cure parameters +#' - `coefs_cox` - Coefficients of cox parameters +#' - `tj` - is unique event times sorted + + +for (i in 1:length(sim_data_list)) { + + dat <- sim_data_list[[i]] + dat_valid = sim_valid_data_list[[i]] + fit <- cv_fits_list[[i]] + + # for training set + ti <- dat$t + di <- dat$d + xi <- dat$x + # unique event times sorted + tj <- fit$fit[[fit$index$`min`[1]]][[fit$index$`min`[2]]]$tj + + # for test set + ti_valid <- dat_valid$t + di_valid <- dat_valid$d + xi_valid <- dat_valid$x + tj_valid <- sort(ti_valid[di_valid==1]) + + + # get final fit (hyperparam with min error) for cure model (fits used train data) + final_select_model = fit$fit[[fit$index$`min`[1]]][[fit$index$`min`[2]]] + fitcure <- final_select_model$fitcure + + # use this model to make predictions on data (fitted relative-risk for "cox) + predcure <- predict(fitcure, newx=xi, s=min(fitcure$lambda), type="response") + predcure_valid <- predict(fitcure, newx = xi_valid, s=min(fitcure$lambda), type="response") + + # get final fit of min coxcure model + fitcox <- final_select_model$fitcox + + # Get Data from Selected "Final" Prediction Model ----- + # use this model to make predictions on data + + # get linear predictors from cox + predsurvexp <- predict(fitcox, + newx=xi, + s = min(fitcox$lambda), + type="response") + + predsurvexp_valid <- predict(fitcox, + newx=xi_valid, + s = min(fitcox$lambda), + type="response") + + haz <- final_select_model$haz + cumhaz <- final_select_model$cumhaz + + # Get Kaplan Meier estimate of actual data + predcens <- survfit(Surv(ti,1-di)~1)$surv + tcens <- survfit(Surv(ti,1-di)~1)$time + # + # predcens <- survfit(Surv(ti,1-di)~1)$surv + # tcens <- survfit(Surv(ti,1-di)~1)$time + + # times at which to get score + tbrier <- sort(ti) + brier <- rep(NA,length(tbrier)) + + tbrier_valid <- sort(ti_valid) + brier_valid <- rep(NA,length(tbrier_valid)) + + # Train Data---- + for (l in 1:length(tbrier)){ + + predsurv <- rep(NA,length(ti)) # Conditional survival prob for all patients at tbrier[l] + + if (tbrier[l] >= min(tj)){ + + ids <- which(tj == max(tj[tj <= tbrier[l]])) + + # get predicted survival probability + predsurv <- exp(-cumhaz[ids]*predsurvexp) + + # weights + ipw <- as.numeric(ti > tbrier[l])*predcens[l] + as.numeric(ti <= tbrier[l])*predcens + + } else if (tbrier[l] < min(tj)){ + + predsurv <- rep(1,length(ti)) + ipw <- 1 + + } + + preds <- 1 - predcure + predcure*predsurv + brier[l] <- mean(I(di==0)*(1 - preds)^2/(ipw+0.001) + I(di==1)*(0 - preds)^2/(ipw+0.001)) + + } + + # Test Data---- + for (l in 1:length(tbrier)){ + + predsurv_valid <- rep(NA,length(ti_valid)) # Conditional survival prob for all patients at tbrier[l] + + if (tbrier_valid[l] >= min(tj_valid)){ + + ids_valid <- which(tj_valid == max(tj_valid[tj_valid <= tbrier_valid[l]])) + + # get predicted survival probability + predsurv_valid <- exp(-cumhaz[ids_valid]*predsurvexp_valid) + + # weights + ipw_valid <- as.numeric(ti_valid > tbrier_valid[l])*predcens[l] + as.numeric(ti_valid <= tbrier_valid[l])*predcens + + }else if (tbrier_valid[l] < min(tj_valid)){ + + predsurv_valid <- rep(1,length(ti_valid)) + ipw_valid <- 1 + + } + + preds_valid <- 1 - predcure_valid + predcure_valid*predsurv_valid + brier_valid[l] <- mean(I(di_valid==0)*(1 - preds_valie)^2/(ipw_valid + 0.001) + I(di_valid==1)*(0 - preds_valid^2/(ipw_valid+0.001)) + + + + all_brier = cbind.data.frame("tbrier" = tbrier, "brier" = brier, "brier_valid" = brier_valid) + return(all_brier) + # plot(tbrier,brier,type="S") + + + } +} diff --git a/inst/run-simulations.R b/inst/run-simulations.R new file mode 100644 index 0000000..35275d5 --- /dev/null +++ b/inst/run-simulations.R @@ -0,0 +1,168 @@ +source(here::here("R/cv.cureitlasso.R")) +source(here::here("R/cureitlasso.R")) +source(here::here("R/simulasso.R")) +source(here::here("R/coxsplit.R")) + +# Potentially change sample sizes and number of parameters +# We could try simulating scenarios with overlap between cure and cox + +# Simulate Cure Data -------------------------------------------------------- + +# Test- Try simulating one train and test set + +dat <- simulasso() # training data + + + +dat_valid <- simulasso(id_cure = dat$id_cure, + id_cox = dat$id_cox, + coefs_cure = dat$coefs_cure, + coefs_cox = dat$coefs_cox) + + +setdiff(dat$id_cox, dat_valid$id_cox) + + +# * Training- Simulate n Data Sets -------------------------------------------------------- + +sim_data_list <- list() + +for (i in 1:100) { + sim_data_list[[i]] = simulasso() +} + +sim_data_list[[1]]$id_cure + +# * Validation - Simulate n Validation Data --------------------------------------------------- + +sim_valid_data_list <- list() + +for (i in 1:100) { + dat <- sim_data_list[[i]] + sim_valid_data_list[[i]] = simulasso(id_cure = dat$id_cure, + id_cox = dat$id_cox, + coefs_cure = dat$coefs_cure, + coefs_cox = dat$coefs_cox) +} + +sim_valid_data_list[[1]]$id_cure + +# save(sim_data_list, file = here::here("inst", "sim_data_list.RData")) +# save(sim_valid_data_list, file = here::here("inst", "sim_valid_data_list.RData")) + +# Fit Lasso ------------------------------------------------------------ + +# test- A Single Lasso Fit + +dat <- sim_data_list[[1]] +test_single_run <- cureitlasso( + t = dat$t, + d = dat$d, + x = dat$x, + minmu.ratio=0.05, # minimum penalty value in the logistic model + minlambda.ratio=0.05, # minimum penalty value in the cox model + adaptive=FALSE, + length.grid=8, + tol=1e-2, + maxiter=100, + progress=FALSE) + +# Run Cross Validation ---------------------------------------------------- + +# * Test a Single CV Fit ------ +# To get a sense of what is in the output + +dat_valid <- sim_data_list[[1]] + +test_single_cv <- cv.cureitlasso(t = dat_valid$t, + d = dat_valid$d, + x = dat_valid$x, + minmu.ratio=0.05, # minimum penalty value in the logistic model + minlambda.ratio=0.05, # minimum penalty value in the cox model + adaptive=FALSE, + length.grid=10, + + tol=1e-2, + maxiter=100, + + nfolds=5, + ncore=5, + seed=NULL, + progress=FALSE) + +test_list <- list() +test_list[[1]] <- test_single_cv + +# * 100 Runs ---- + +cv_fits_list <- list() +start_time <- Sys.time() + +for (i in 1:100) { + + dat_valid <- sim_data_list[[i]] + + cv_fits_list[[i]] <- cv.cureitlasso(t = dat_valid$t, + d = dat_valid$d, + x = dat_valid$x, + minmu.ratio=0.05, # minimum penalty value in the logistic model + minlambda.ratio=0.05, # minimum penalty value in the cox model + adaptive=FALSE, + length.grid=10, + + tol=1e-2, + maxiter=100, + + nfolds=5, + ncore=5, + seed=NULL, + progress=FALSE + ) + + +} + +# 8.670569 mins for 3 data sets +end_time <- Sys.time() +end_time - start_time + + +save(cv_fits_list, file = here::here("inst", "cv_fits_100.RData")) + +# * 50 Runs ---- + +cv_fits_list2 <- list() + +start_time <- Sys.time() + +for (i in 1:50) { + + dat_valid <- sim_data_list[[i]] + + cv_fits_list2[[i]] <- cv.cureitlasso(t = dat_valid$t, + d = dat_valid$d, + x = dat_valid$x, + minmu.ratio=0.05, # minimum penalty value in the logistic model + minlambda.ratio=0.05, # minimum penalty value in the cox model + adaptive=FALSE, + length.grid=10, + + tol=1e-2, + maxiter=100, + + nfolds=5, + ncore=5, + seed=NULL, + progress=FALSE + ) + + +} + +# 2.369856 hours for 50 fits +end_time <- Sys.time() +end_time - start_time + + +save(cv_fits_list2, file = here::here("inst", "cv_fits_50.RData")) + From ba5773eb43736cd3d26743ee8f253a16ab92d40b Mon Sep 17 00:00:00 2001 From: karissawhiting Date: Wed, 28 Feb 2024 17:46:16 -0500 Subject: [PATCH 02/11] brier scores --- R/brier2.R | 158 --------------------- R/cureit_lasso_predict_eval.R | 260 ++++++++++++++++++++++++++++++++++ 2 files changed, 260 insertions(+), 158 deletions(-) delete mode 100644 R/brier2.R create mode 100644 R/cureit_lasso_predict_eval.R diff --git a/R/brier2.R b/R/brier2.R deleted file mode 100644 index 5063496..0000000 --- a/R/brier2.R +++ /dev/null @@ -1,158 +0,0 @@ -# we should use this: https://www.tidymodels.org/learn/statistics/survival-metrics/ - -library(survival) -library(tidyverse) - -# Prep Data --------------------------------------------------------------- - -xi = dat$x -final_fit <- fit$fit[[fit$index$`min`[1]]][[fit$index$`min`[2]]] - -fitcure <- final_fit$fitcure -predcure <- predict(fitcure, newx = xi, - s = min(fitcure$lambda), type = "response") - -fitcox <- fi nal_fit$fitcox -predsurvexp <- predict(fitcox, newx = xi, s = min(fitcox$lambda), type = "response") - - -haz <- final_fit$haz -cumhaz <- final_fit$cumhaz - -predsurv <- exp(-cumhaz*predsurvexp) - -# Create Data frame to pass ---------------------------------------------- - -data = data.frame(dat$x) %>% - mutate(times = dat$t, - event = dat$d, - "truth_survival" = Surv(dat$t, dat$d)) - -# Predict Function (glmnet cure) --------------------------------------------- - -# Trying to get predictions in similar format as tidymodels yardstick -# Eventually will integratethis with existing predict method function - -predict_cure <- function(final_fit, - data = NULL, - eval_timepoints = NULL) { - - # Get values from observed data - survival_times = data$times - sorted_survival_times <- sort(survival_times) - sorted_event_times = sort(data$times[data$event==1]) - censoring_weights = survfit(Surv(data$times, 1 - data$event) ~ 1)$surv - - data_x <- data %>% - select(-c(times, event, truth_survival)) %>% - as.matrix() - - # Extract cumulative hazard from final fit (for event times) - haz <- final_fit$haz - cumhaz <- final_fit$cumhaz - - # default eval timepoints are observed event times if not supplied by user - if(is.null(eval_timepoints)) { - eval_timepoints = sorted_event_times - } - - - # Get Relative Cure and Cox Risk ------------------------------------------ - - # Cure probability risk prediction (independent of timepoint) - fitcure <- final_fit$fitcure - predcure <- predict(fitcure, newx = data_x, - s = min(fitcure$lambda), type = "response") - - # Cox survival probability risk prediction (independent of timepoint) - fitcox <- final_fit$fitcox - predsurvexp <- predict(fitcox, - newx = data_x, - s = min(fitcox$lambda), - type = "response") - - - - # Get Predicted probabilities at select timepoints ------------------- - - # vector for conditional survival prob for all patients at sorted_survival_times[l] - all_preds_all_tps <- list() - - # for each eval timepoint, get predicted survival values - for (l in 1:length(eval_timepoints)) { - all_preds <- data.frame("eval_timepoint" = NA, - "preds" = NA, - "pred_cure" = NA, - "preds_surv" = NA, - "weights" = NA) - - # vector for conditional survival prob for all patients at sorted_survival_times[l] - predsurv <- rep(NA,length(survival_times)) - - - if (eval_timepoints[l] >= min(sorted_event_times)){ - - ids <- which( - sorted_event_times == - max(sorted_event_times[sorted_event_times <= eval_timepoints[l]]) - ) - - # Conditional survival prob at given patient - predsurv <- exp(-cumhaz[ids]*predsurvexp) - - # inverse probability of the censoring weights - ipw <- as.numeric(survival_times > eval_timepoints[l])*censoring_weights[l] + - as.numeric(survival_times <= eval_timepoints[l])*censoring_weights - - # if no events occurred before eval time - } else if (eval_timepoints[l] < min(sorted_event_times)) { - - predsurv <- rep(1,length(survival_times)) - ipw <- 1 - - } - - preds <- 1 - predcure + predcure*predsurv - - all_preds <- tibble( - - "preds" = preds, - "predcure" = predcure, - "predsurv" = predsurv, - "weights" = ipw) %>% - mutate(id_num = 1:nrow(.)) - - names(all_preds) <- c( - "preds", - "predcure", "predsurv", "weights", - "id_num") - - x <- all_preds %>% nest(.preds = -c(id_num)) - - all_preds_all_tps[[l]] <- all_preds - - # names(all_preds) = c("pred", "pred_cure","pred_surv") - } - # all_pred_df <- tibble( - # times = data$times, - # events = data$event, - # .pred = all_preds_all_tps - # ) - return(all_pred_df) -} - -predict_df <- predict_cure(final_fit = final_fit, data = data) - - -# Brier Calculation ------------------------------------------------------- - -brier_cure <- function(predict_df, times, event) { - - eval_timepoints = unique(predict_df$eval_timepoint) - - for (l in 1:length(eval_timepoints)) { - brier[l] <- mean(I(di==0)*(1 - preds)^2/(ipw+0.001) + I(di==1)*(0 - preds)^2/(ipw+0.001)) - - - } - diff --git a/R/cureit_lasso_predict_eval.R b/R/cureit_lasso_predict_eval.R new file mode 100644 index 0000000..e2ff1f0 --- /dev/null +++ b/R/cureit_lasso_predict_eval.R @@ -0,0 +1,260 @@ +# we should use this: https://www.tidymodels.org/learn/statistics/survival-metrics/ + +library(survival) +library(tidyverse) +library(glmnet) + +# Prep Data --------------------------------------------------------------- + +load(here::here("inst", "cureit-simulation-results", "sim_data_list.RData")) +load(here::here("inst", "cureit-simulation-results", "sim_valid_data_list.RData")) +dat <- sim_data_list[[1]] + +load(here::here("inst","cureit-simulation-results", "cv_fits_100.RData")) +fit <- cv_fits_list[[1]] +final_fit <- fit$fit[[fit$index$`min`[1]]][[fit$index$min[2]]] + +# Create Data frame to pass + +data = data.frame(dat$x) %>% + mutate(times = dat$t, + event = dat$d, + "truth_survival" = Surv(dat$t, dat$d)) + +# Predict Function (glmnet cure) --------------------------------------------- + +# Trying to get predictions in similar format as tidymodels yardstick +# Eventually will integrate this with existing predict method function + +predict_cure <- function(final_fit, + data = NULL, + eval_timepoints = NULL) { + + # Get values from observed data + survival_times = data$times + sorted_survival_times <- sort(survival_times) + sorted_event_times = sort(data$times[data$event==1]) + censoring_weights = survfit(Surv(data$times, 1 - data$event) ~ 1)$surv + + data_x <- data %>% + select(-c(times, event, truth_survival)) %>% + as.matrix() + + + # Extract cumulative hazard from final fit (for event times) + haz <- final_fit$haz + cumhaz <- final_fit$cumhaz + + # default eval timepoints are observed event times if not supplied by user + if(is.null(eval_timepoints)) { + eval_timepoints = sorted_survival_times + } + + + # Get Relative Cure and Cox Risk (independent of timepoint) ------------------- + + # Cure probability risk prediction + fitcure <- final_fit$fitcure + predcure <- predict(fitcure, newx = data_x, + s = min(fitcure$lambda), type = "response") + + # Cox survival probability risk prediction + fitcox <- final_fit$fitcox + predsurvexp <- predict(fitcox, + newx = data_x, + s = min(fitcox$lambda), + type = "response") + + + + # Get Predicted probabilities at Selected Timepoints ------------------- + + all_preds_all_tps <- list() + + # for each eval timepoint, get predicted survival values + for (l in 1:length(eval_timepoints)) { + all_preds <- data.frame("eval_timepoint" = NA, + "preds" = NA, + "pred_cure" = NA, + "preds_surv" = NA, + "weights" = NA) + + # vector for conditional survival prob for all patients at sorted_survival_times[l] + #predsurv <- rep(NA,length(survival_times)) + + + if (eval_timepoints[l] >= min(sorted_event_times)){ + + # maximum event time that is less than timepoint we are evaluating + ids <- which( + sorted_event_times == + max(sorted_event_times[sorted_event_times <= eval_timepoints[l]]) + ) + + # Conditional survival prob for all patients at given time + predsurv <- exp(-cumhaz[ids]*predsurvexp) + + # inverse probability of the censoring weights + ipw <- as.numeric(survival_times > eval_timepoints[l])*censoring_weights[l] + + as.numeric(survival_times <= eval_timepoints[l])*censoring_weights + + # if no events occurred before eval time + } else if (eval_timepoints[l] < min(sorted_event_times)) { + + predsurv <- rep(1,length(survival_times)) + ipw <- 1 + + } + + preds <- 1 - predcure + predcure*predsurv + + all_preds <- tibble( + + ".eval_time" = eval_timepoints[l], + ".pred_survival" = preds, + ".predcure" = predcure, + ".predsurv" = predsurv, + ".weight_censored_raw" = censoring_weights[l], + ".weight_censored" = ipw) %>% + mutate(id = 1:nrow(.)) + # + # names(all_preds) <- c( + # "preds", + # "predcure", "predsurv", "weights", + # "id_num") + + all_preds_nest <- all_preds %>% nest(.preds = -c(id)) + + all_preds_all_tps[[l]] <- all_preds + + # names(all_preds) = c("pred", "pred_cure","pred_surv") + } + + + all_pred_df <- all_preds_all_tps %>% + do.call("rbind", .) + + all_pred_df <- all_pred_df %>% + nest(.pred = -c(id)) %>% + bind_cols(., "times" = data$times, + "event" = data$event) + + + return(all_pred_df) +} + +predict_df <- predict_cure(final_fit = final_fit, data = data) + +# quick brier +x <- c() +for (i in 1:10) { + + time_1 <- predict_df_long$.eval_time[[i]] + + predict_df_long_1 <- predict_df_long %>% + filter(.eval_time == time_1) + + x[i] <- mean(I(predict_df_long_1$event == 0)*(1 - predict_df_long_1$.pred_survival)^2/(predict_df_long_1$.weight_censored + 0.001) + + I(predict_df_long_1$event ==1)*(0 - predict_df_long_1$.pred_survival)^2/(predict_df_long_1$.weight_censored + 0.001)) +} + +# Brier With Yardstick ------------------------------------------------------- +library(yardstick) + +brier_scores <- + predict_df %>% + mutate(truth = Surv(times, event)) %>% + brier_survival(truth = truth, .pred) + + +# Brier Function ------------------------------------------------------- + +brier_cure <- function(predict_df, truth = times, eval_timepoints = NULL) { + + + # Get values from observed data + survival_times = predict_df$times + sorted_survival_times <- sort(survival_times) + sorted_event_times = sort(data$times[data$event==1]) + + + predict_df_long <- predict_df %>% + unnest(everything()) + + predict_df_nest_time <- predict_df_long %>% + nest(data = -.eval_time) + + + map(predict_df_nest_time$data, function(x) { + mean(I(x$event == 0)*(1 - x$.pred_survival)^2/(x$.weight_censored + 0.001) + I(x$event == 1)*(0 - x$.pred_survival)^2/(x$.weight_censored + 0.001)) + + }) + for (l in 1:length(eval_timepoints)) { + brier[l] <- mean(I(di==0)*(1 - preds)^2/(ipw+0.001) + I(di==1)*(0 - preds)^2/(ipw+0.001)) + + } +} + + + +# Former Brier Calculation --------------------------------------------------- + +calc_brier_old <- function(dat, fit) { + # ti <- dat_valid$t + # di <- dat_valid$d + # xi <- dat_valid$x + ti <- dat$t + di <- dat$d + xi <- dat$x + tj <- fit$fit[[fit$index$`min`[1]]][[fit$index$`min`[2]]]$tj + final_fit <- fit$fit[[fit$index$`min`[1]]][[fit$index$`min`[2]]] + + fitcure <- final_fit$fitcure + predcure <- predict(fitcure,newx=xi,s=min(fitcure$lambda),type="response") + fitcox <- final_fit$fitcox + predsurvexp <- predict(fitcox,newx=xi,s=min(fitcox$lambda),type="response") + haz <- final_fit$haz + cumhaz <- final_fit$cumhaz + + predcens <- survfit(Surv(ti,1-di)~1)$surv + tcens <- survfit(Surv(ti,1-di)~1)$time + + tbrier <- sort(ti) + brier <- rep(NA,length(tbrier)) + ipw <- rep(NA,length(tbrier)) + + for (l in 1:length(tbrier)){ + + predsurv <- rep(NA,length(ti)) # Conditional survival prob for all patients at tbrier[l] + + if (tbrier[l] >= min(tj)){ + + ids <- which(tj == max(tj[tj <= tbrier[l]])) + predsurv <- exp(-cumhaz[ids]*predsurvexp) + ipw <- as.numeric(ti > tbrier[l])*predcens[l] + as.numeric(ti <= tbrier[l])*predcens + + }else if (tbrier[l] < min(tj)){ + + predsurv <- rep(1,length(ti)) + ipw <- 1 + + } + + preds <- 1 - predcure + predcure*predsurv + brier[l] <- mean(I(di==0)*(1 - preds)^2/(ipw+0.001) + I(di==1)*(0 - preds)^2/(ipw+0.001)) + #ipw[l] <- ipw + # print(ipw) + } + + preds <- cbind.data.frame(tbrier, brier) + return(preds) + # plot(tbrier,brier,type="S") + +} + +x <- calc_brier_old(sim_data_list[[1]], cv_fits_list[[1]]) +x_val <- calc_brier_old(sim_valid_data_list[[1]], cv_fits_list[[1]]) + +for (i in 1:length(cv_fits_list)) { + +} From a1218db4ec1823af94fc954891cc9cbb3a19ce83 Mon Sep 17 00:00:00 2001 From: karissawhiting Date: Wed, 28 Feb 2024 18:01:28 -0500 Subject: [PATCH 03/11] updates --- R/cureit_lasso_predict_eval.R | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/R/cureit_lasso_predict_eval.R b/R/cureit_lasso_predict_eval.R index e2ff1f0..4a7b2a0 100644 --- a/R/cureit_lasso_predict_eval.R +++ b/R/cureit_lasso_predict_eval.R @@ -8,9 +8,9 @@ library(glmnet) load(here::here("inst", "cureit-simulation-results", "sim_data_list.RData")) load(here::here("inst", "cureit-simulation-results", "sim_valid_data_list.RData")) -dat <- sim_data_list[[1]] - load(here::here("inst","cureit-simulation-results", "cv_fits_100.RData")) + +dat <- sim_data_list[[1]] fit <- cv_fits_list[[1]] final_fit <- fit$fit[[fit$index$`min`[1]]][[fit$index$min[2]]] @@ -117,17 +117,11 @@ predict_cure <- function(final_fit, ".weight_censored_raw" = censoring_weights[l], ".weight_censored" = ipw) %>% mutate(id = 1:nrow(.)) - # - # names(all_preds) <- c( - # "preds", - # "predcure", "predsurv", "weights", - # "id_num") - + all_preds_nest <- all_preds %>% nest(.preds = -c(id)) all_preds_all_tps[[l]] <- all_preds - # names(all_preds) = c("pred", "pred_cure","pred_surv") } @@ -145,7 +139,7 @@ predict_cure <- function(final_fit, predict_df <- predict_cure(final_fit = final_fit, data = data) -# quick brier +# quick brier --- x <- c() for (i in 1:10) { @@ -161,6 +155,7 @@ for (i in 1:10) { # Brier With Yardstick ------------------------------------------------------- library(yardstick) +# Doesn't match- weights are wrong brier_scores <- predict_df %>% mutate(truth = Surv(times, event)) %>% @@ -197,7 +192,7 @@ brier_cure <- function(predict_df, truth = times, eval_timepoints = NULL) { -# Former Brier Calculation --------------------------------------------------- +# Simple Former Brier Calculation --------------------------------------------------- calc_brier_old <- function(dat, fit) { # ti <- dat_valid$t @@ -255,6 +250,10 @@ calc_brier_old <- function(dat, fit) { x <- calc_brier_old(sim_data_list[[1]], cv_fits_list[[1]]) x_val <- calc_brier_old(sim_valid_data_list[[1]], cv_fits_list[[1]]) -for (i in 1:length(cv_fits_list)) { - +all_brier <- list() + +for (i in 1:10) { + all_brier[[i]] <- calc_brier_old(sim_data_list[[i]], fit = cv_fits_list[[i]]) } + + From dcc6e13bc6e7e2561b18d675f7b4784d31eb1bdf Mon Sep 17 00:00:00 2001 From: karissawhiting Date: Thu, 29 Feb 2024 16:41:06 -0500 Subject: [PATCH 04/11] small code changes --- R/cureit_lasso_predict_eval.R | 71 ++++++++++++++++++++++++++++------- R/cv.cureitlasso.R | 3 +- inst/evaluate-simulations.R | 13 +++++-- inst/run-simulations.R | 17 ++++++--- inst/testcode.R | 48 +++++++++++++++++++++++ 5 files changed, 129 insertions(+), 23 deletions(-) create mode 100644 inst/testcode.R diff --git a/R/cureit_lasso_predict_eval.R b/R/cureit_lasso_predict_eval.R index 4a7b2a0..764f703 100644 --- a/R/cureit_lasso_predict_eval.R +++ b/R/cureit_lasso_predict_eval.R @@ -21,22 +21,47 @@ data = data.frame(dat$x) %>% event = dat$d, "truth_survival" = Surv(dat$t, dat$d)) +# * Calculate Sensitivity of Variable Selection in Training Fits ------- +spe_cox_min <- c() +sen_cox_min <- c() + +x <- for (i in 1:100) { + fit <- cv_fits_list[[i]] + #fit <- x$fit + dat <- sim_data_list[[i]] + + spe_cox_min[[i]] <- 1-mean(setdiff(1:200,dat$id_cure) %in% which(fit$fit[[fit$index$min[1]]][[fit$index$min[2]]]$beta!=0)) + sen_cox_min[[i]] <- mean(dat$id_cox %in% which(fit$fit[[fit$index$min[1]]][[fit$index$min[2]]]$beta!=0)) + # spe_cox_min <- 1-mean(setdiff(1:200,dat$id_cure) %in% which(fit$fit[[fit$index$min[1]]][[fit$index$min[2]]]$beta!=0)) + + # print(sen_cox_min) + +} + + +hist(unlist(sen_cox_min)) +hist(unlist(spe_cox_min)) + +# Check Variable Selection on Standard Lasso +# will add foldid (see new version of cv.lassocure()) +cv.glmnet(x = dat$x, y = Surv(dat$t, dat$d), family = "cox") + # Predict Function (glmnet cure) --------------------------------------------- # Trying to get predictions in similar format as tidymodels yardstick # Eventually will integrate this with existing predict method function predict_cure <- function(final_fit, - data = NULL, + new_data = NULL, eval_timepoints = NULL) { # Get values from observed data - survival_times = data$times + survival_times = new_data$times sorted_survival_times <- sort(survival_times) - sorted_event_times = sort(data$times[data$event==1]) - censoring_weights = survfit(Surv(data$times, 1 - data$event) ~ 1)$surv + sorted_event_times = sort(new_data$times[new_data$event==1]) + censoring_weights = survfit(Surv(new_data$times, 1 - new_data$event) ~ 1)$surv - data_x <- data %>% + data_x <- new_data %>% select(-c(times, event, truth_survival)) %>% as.matrix() @@ -93,7 +118,8 @@ predict_cure <- function(final_fit, # Conditional survival prob for all patients at given time predsurv <- exp(-cumhaz[ids]*predsurvexp) - + + # HERE- check this - tru predict of survfit # inverse probability of the censoring weights ipw <- as.numeric(survival_times > eval_timepoints[l])*censoring_weights[l] + as.numeric(survival_times <= eval_timepoints[l])*censoring_weights @@ -130,14 +156,13 @@ predict_cure <- function(final_fit, all_pred_df <- all_pred_df %>% nest(.pred = -c(id)) %>% - bind_cols(., "times" = data$times, - "event" = data$event) + bind_cols(., "event" = new_data$event) return(all_pred_df) } -predict_df <- predict_cure(final_fit = final_fit, data = data) +predict_df <- predict_cure(final_fit = final_fit, new_data = data) # quick brier --- x <- c() @@ -193,27 +218,36 @@ brier_cure <- function(predict_df, truth = times, eval_timepoints = NULL) { # Simple Former Brier Calculation --------------------------------------------------- +library(glmnet) +library(tidyverse) +library(survival) -calc_brier_old <- function(dat, fit) { +calc_brier_old <- function(new_data, fit) { # ti <- dat_valid$t # di <- dat_valid$d # xi <- dat_valid$x - ti <- dat$t - di <- dat$d - xi <- dat$x + ti <- new_data$t + di <- new_data$d + xi <- new_data$x + + # tj from train data tj <- fit$fit[[fit$index$`min`[1]]][[fit$index$`min`[2]]]$tj final_fit <- fit$fit[[fit$index$`min`[1]]][[fit$index$`min`[2]]] fitcure <- final_fit$fitcure predcure <- predict(fitcure,newx=xi,s=min(fitcure$lambda),type="response") + fitcox <- final_fit$fitcox predsurvexp <- predict(fitcox,newx=xi,s=min(fitcox$lambda),type="response") + haz <- final_fit$haz cumhaz <- final_fit$cumhaz + # censoring weights from test data predcens <- survfit(Surv(ti,1-di)~1)$surv tcens <- survfit(Surv(ti,1-di)~1)$time + # tbrier is evaluation timepoints (default to all timepoints in test data) tbrier <- sort(ti) brier <- rep(NA,length(tbrier)) ipw <- rep(NA,length(tbrier)) @@ -250,10 +284,19 @@ calc_brier_old <- function(dat, fit) { x <- calc_brier_old(sim_data_list[[1]], cv_fits_list[[1]]) x_val <- calc_brier_old(sim_valid_data_list[[1]], cv_fits_list[[1]]) +plot(x$tbrier,x$brier,type="S") +plot(x_val$tbrier,x_val$brier,type="S") + + all_brier <- list() for (i in 1:10) { - all_brier[[i]] <- calc_brier_old(sim_data_list[[i]], fit = cv_fits_list[[i]]) + all_brier[[i]] <- calc_brier_old( new_data = sim_data_list[[i]], fit = cv_fits_list[[i]]) } +all_brier2 <- all_brier %>% + do.call("rbind", .) +x <- all_brier2 %>% group_by(round(tbrier, 1)) %>% + summarize(mean = mean(brier)) +plot(x$`round(tbrier, 1)`, x$mean,type="S") diff --git a/R/cv.cureitlasso.R b/R/cv.cureitlasso.R index 403c130..89cb1ac 100644 --- a/R/cv.cureitlasso.R +++ b/R/cv.cureitlasso.R @@ -242,7 +242,8 @@ cv.cureitlasso <- function(t, cv_brier_mean = cv_brier_mean, cv_brier_se = cv_brier_se, index = list(min=idxmin, - `1se`= idx1se) + `1se`= idx1se), + foldid = foldid ) ) diff --git a/inst/evaluate-simulations.R b/inst/evaluate-simulations.R index 83aa32e..15abbdb 100644 --- a/inst/evaluate-simulations.R +++ b/inst/evaluate-simulations.R @@ -12,18 +12,25 @@ load(here::here("inst", "sim_data_list.RData")) load(here::here("inst", "sim_valid_data_list.RData")) # * Calculate Sensitivity of Variable Selection in Training Fits ------- +spe_cox_min <- c() +sen_cox_min <- c() + x <- for (i in 1:50) { fit <- cv_fits_list[[i]] #fit <- x$fit dat <- sim_data_list[[i]] - sen_cox_min <- mean(dat$id_cox %in% which(fit$fit[[fit$index$min[1]]][[fit$index$min[2]]]$beta!=0)) - print(sen_cox_min) + spe_cox_min[[i]] <- 1-mean(setdiff(1:200,dat$id_cure) %in% which(fit$fit[[fit$index$min[1]]][[fit$index$min[2]]]$beta!=0)) + sen_cox_min[[i]] <- mean(dat$id_cox %in% which(fit$fit[[fit$index$min[1]]][[fit$index$min[2]]]$beta!=0)) + # spe_cox_min <- 1-mean(setdiff(1:200,dat$id_cure) %in% which(fit$fit[[fit$index$min[1]]][[fit$index$min[2]]]$beta!=0)) + + # print(sen_cox_min) } -hist(x) +hist(unlist(sen_cox_min)) +hist(unlist(sen_cox_min)) # Sensitivity and specificity (for both min and 1se choices of parameters) --- diff --git a/inst/run-simulations.R b/inst/run-simulations.R index 35275d5..11f7ace 100644 --- a/inst/run-simulations.R +++ b/inst/run-simulations.R @@ -10,6 +10,7 @@ source(here::here("R/coxsplit.R")) # Test- Try simulating one train and test set + dat <- simulasso() # training data @@ -24,8 +25,8 @@ setdiff(dat$id_cox, dat_valid$id_cox) # * Training- Simulate n Data Sets -------------------------------------------------------- - sim_data_list <- list() +set.seed(999) for (i in 1:100) { sim_data_list[[i]] = simulasso() @@ -45,10 +46,16 @@ for (i in 1:100) { coefs_cox = dat$coefs_cox) } +sim_data_list[[1]]$id_cure sim_valid_data_list[[1]]$id_cure -# save(sim_data_list, file = here::here("inst", "sim_data_list.RData")) -# save(sim_valid_data_list, file = here::here("inst", "sim_valid_data_list.RData")) +save(sim_data_list, file = here::here("inst", + "cureit-simulation-results", + "sim_data_list_v2.RData")) + +save(sim_valid_data_list, file = here::here("inst", + "cureit-simulation-results", + "sim_valid_data_list_v2.RData")) # Fit Lasso ------------------------------------------------------------ @@ -98,7 +105,7 @@ test_list[[1]] <- test_single_cv cv_fits_list <- list() start_time <- Sys.time() -for (i in 1:100) { +for (i in 1:50) { dat_valid <- sim_data_list[[i]] @@ -127,7 +134,7 @@ end_time <- Sys.time() end_time - start_time -save(cv_fits_list, file = here::here("inst", "cv_fits_100.RData")) +save(cv_fits_list, file = here::here("inst", "cureit-simulation-results", "cv_fits_50_v2.RData")) # * 50 Runs ---- diff --git a/inst/testcode.R b/inst/testcode.R new file mode 100644 index 0000000..4ecfc25 --- /dev/null +++ b/inst/testcode.R @@ -0,0 +1,48 @@ +#### Evaluate the calibration by Brier score for the validation data +#### Line 79 - 123: Brier score for the cureit model + +# ti <- dat_valid$t +# di <- dat_valid$d +# xi <- dat_valid$x +ti <- dat$t +di <- dat$d +xi <- dat$x +tj <- fit$fit[[fit$index$`min`[1]]][[fit$index$`min`[2]]]$tj + +fitcure <- fit$fit[[fit$index$`min`[1]]][[fit$index$`min`[2]]]$fitcure +predcure <- predict(fitcure,newx=xi,s=min(fitcure$lambda),type="response") +fitcox <- fit$fit[[fit$index$`min`[1]]][[fit$index$`min`[2]]]$fitcox +predsurvexp <- predict(fitcox,newx=xi,s=min(fitcox$lambda),type="response") +haz <- fit$fit[[fit$index$`min`[1]]][[fit$index$`min`[2]]]$haz +cumhaz <- fit$fit[[fit$index$`min`[1]]][[fit$index$`min`[2]]]$cumhaz + +predcens <- survfit(Surv(ti,1-di)~1)$surv +tcens <- survfit(Surv(ti,1-di)~1)$time + +tbrier <- sort(ti) +brier <- rep(NA,length(tbrier)) +ipw <- rep(NA,length(tbrier)) +for (l in 1:length(tbrier)){ + + predsurv <- rep(NA,length(ti)) # Conditional survival prob for all patients at tbrier[l] + + if (tbrier[l] >= min(tj)){ + + ids <- which(tj == max(tj[tj <= tbrier[l]])) + predsurv <- exp(-cumhaz[ids]*predsurvexp) + ipw <- as.numeric(ti > tbrier[l])*predcens[l] + as.numeric(ti <= tbrier[l])*predcens + + }else if (tbrier[l] < min(tj)){ + + predsurv <- rep(1,length(ti)) + ipw <- 1 + + } + + ipw[l] <- ipw + preds <- 1 - predcure + predcure*predsurv + brier[l] <- mean(I(di==0)*(1 - preds)^2/(ipw+0.001) + I(di==1)*(0 - preds)^2/(ipw+0.001)) + +} + +plot(tbrier,brier,type="S") \ No newline at end of file From 286b16999255c1e083f3e69139dcde7d69099ddb Mon Sep 17 00:00:00 2001 From: karissawhiting Date: Fri, 15 Mar 2024 14:27:31 -0400 Subject: [PATCH 05/11] small updates to get survfit for censoring weights --- R/cureit_lasso_predict_eval.R | 58 +++++++++++++++++++++++++++++++---- 1 file changed, 52 insertions(+), 6 deletions(-) diff --git a/R/cureit_lasso_predict_eval.R b/R/cureit_lasso_predict_eval.R index 764f703..d954009 100644 --- a/R/cureit_lasso_predict_eval.R +++ b/R/cureit_lasso_predict_eval.R @@ -59,7 +59,11 @@ predict_cure <- function(final_fit, survival_times = new_data$times sorted_survival_times <- sort(survival_times) sorted_event_times = sort(new_data$times[new_data$event==1]) - censoring_weights = survfit(Surv(new_data$times, 1 - new_data$event) ~ 1)$surv + + # This may need to be eval timepoints.... + censoring_survfit = survfit(Surv(new_data$times, 1 - new_data$event) ~ 1) + censoring_weights = summary(censoring_survfit, times = survival_times) + censoring_weights = censoring_weights$surv data_x <- new_data %>% select(-c(times, event, truth_survival)) %>% @@ -111,19 +115,24 @@ predict_cure <- function(final_fit, if (eval_timepoints[l] >= min(sorted_event_times)){ # maximum event time that is less than timepoint we are evaluating + ids <- which( sorted_event_times == max(sorted_event_times[sorted_event_times <= eval_timepoints[l]]) ) - # Conditional survival prob for all patients at given time + # Conditional survival prob for all patients at given time- + # get cumhaz for time closest to eval timepoint predsurv <- exp(-cumhaz[ids]*predsurvexp) # HERE- check this - tru predict of survfit # inverse probability of the censoring weights + ipw <- as.numeric(survival_times > eval_timepoints[l])*censoring_weights[l] + as.numeric(survival_times <= eval_timepoints[l])*censoring_weights + ipw = 1/ipw + # if no events occurred before eval time } else if (eval_timepoints[l] < min(sorted_event_times)) { @@ -156,13 +165,16 @@ predict_cure <- function(final_fit, all_pred_df <- all_pred_df %>% nest(.pred = -c(id)) %>% - bind_cols(., "event" = new_data$event) + bind_cols(., "event" = new_data$event, + "times" = new_data$times) return(all_pred_df) } predict_df <- predict_cure(final_fit = final_fit, new_data = data) +predict_df_long <- predict_df %>% + unnest(everything()) # quick brier --- x <- c() @@ -177,6 +189,7 @@ for (i in 1:10) { I(predict_df_long_1$event ==1)*(0 - predict_df_long_1$.pred_survival)^2/(predict_df_long_1$.weight_censored + 0.001)) } +x # Brier With Yardstick ------------------------------------------------------- library(yardstick) @@ -244,8 +257,11 @@ calc_brier_old <- function(new_data, fit) { cumhaz <- final_fit$cumhaz # censoring weights from test data - predcens <- survfit(Surv(ti,1-di)~1)$surv - tcens <- survfit(Surv(ti,1-di)~1)$time + sf <- survfit(Surv(ti,1-di)~1) + sf_pred <- summary(sf, time = ti) + + predcens <- sf_pred$surv + tcens <- sf_pred$time # tbrier is evaluation timepoints (default to all timepoints in test data) tbrier <- sort(ti) @@ -284,15 +300,42 @@ calc_brier_old <- function(new_data, fit) { x <- calc_brier_old(sim_data_list[[1]], cv_fits_list[[1]]) x_val <- calc_brier_old(sim_valid_data_list[[1]], cv_fits_list[[1]]) +x plot(x$tbrier,x$brier,type="S") plot(x_val$tbrier,x_val$brier,type="S") all_brier <- list() -for (i in 1:10) { +for (i in 1:length(sim_data_list)) { all_brier[[i]] <- calc_brier_old( new_data = sim_data_list[[i]], fit = cv_fits_list[[i]]) } + +all_brier_valid <- list() + +for (i in 1:length(sim_valid_data_list)) { + all_brier_valid[[i]] <- calc_brier_old( new_data = sim_valid_data_list[[i]], + fit = cv_fits_list[[i]]) +} + +x <- all_brier_valid[[1]] +x_valid <- all_brier[[1]] + +ggplot(x, aes(x = tbrier,y = brier)) + geom_line( aes(x = tbrier,y = brier)) + + geom_line(data = x_valid, aes(x =tbrier, y = brier), color = "blue") + +plot(x$tbrier,x$brier,type="S") +line(x_valid$tbrier,x_valid$brier,type="S") + + +traps <- c() + +for (i in 1:length(all_brier)) { + traps[i] <- trapz(all_brier[[i]]$tbrier,all_brier[[i]]$brier) + +} + + all_brier2 <- all_brier %>% do.call("rbind", .) @@ -300,3 +343,6 @@ x <- all_brier2 %>% group_by(round(tbrier, 1)) %>% summarize(mean = mean(brier)) plot(x$`round(tbrier, 1)`, x$mean,type="S") + + + From d518c5aa1d29318c322faa0293cb2c5f5f5817a8 Mon Sep 17 00:00:00 2001 From: karissawhiting Date: Thu, 28 Mar 2024 09:46:33 -0400 Subject: [PATCH 06/11] update predict function --- R/cureit_lasso_predict_eval.R | 121 +---------------------------- R/explore-brier.R | 110 ++++++++++++++++++++++++++ inst/predict_cure_2024-03-15.R | 137 +++++++++++++++++++++++++++++++++ 3 files changed, 249 insertions(+), 119 deletions(-) create mode 100644 R/explore-brier.R create mode 100644 inst/predict_cure_2024-03-15.R diff --git a/R/cureit_lasso_predict_eval.R b/R/cureit_lasso_predict_eval.R index d954009..61043cb 100644 --- a/R/cureit_lasso_predict_eval.R +++ b/R/cureit_lasso_predict_eval.R @@ -51,131 +51,14 @@ cv.glmnet(x = dat$x, y = Surv(dat$t, dat$d), family = "cox") # Trying to get predictions in similar format as tidymodels yardstick # Eventually will integrate this with existing predict method function -predict_cure <- function(final_fit, - new_data = NULL, - eval_timepoints = NULL) { - - # Get values from observed data - survival_times = new_data$times - sorted_survival_times <- sort(survival_times) - sorted_event_times = sort(new_data$times[new_data$event==1]) - - # This may need to be eval timepoints.... - censoring_survfit = survfit(Surv(new_data$times, 1 - new_data$event) ~ 1) - censoring_weights = summary(censoring_survfit, times = survival_times) - censoring_weights = censoring_weights$surv - - data_x <- new_data %>% - select(-c(times, event, truth_survival)) %>% - as.matrix() - - - # Extract cumulative hazard from final fit (for event times) - haz <- final_fit$haz - cumhaz <- final_fit$cumhaz - - # default eval timepoints are observed event times if not supplied by user - if(is.null(eval_timepoints)) { - eval_timepoints = sorted_survival_times - } - - - # Get Relative Cure and Cox Risk (independent of timepoint) ------------------- - - # Cure probability risk prediction - fitcure <- final_fit$fitcure - predcure <- predict(fitcure, newx = data_x, - s = min(fitcure$lambda), type = "response") - - # Cox survival probability risk prediction - fitcox <- final_fit$fitcox - predsurvexp <- predict(fitcox, - newx = data_x, - s = min(fitcox$lambda), - type = "response") - - # Get Predicted probabilities at Selected Timepoints ------------------- - - all_preds_all_tps <- list() - - # for each eval timepoint, get predicted survival values - for (l in 1:length(eval_timepoints)) { - all_preds <- data.frame("eval_timepoint" = NA, - "preds" = NA, - "pred_cure" = NA, - "preds_surv" = NA, - "weights" = NA) - - # vector for conditional survival prob for all patients at sorted_survival_times[l] - #predsurv <- rep(NA,length(survival_times)) - - - if (eval_timepoints[l] >= min(sorted_event_times)){ - - # maximum event time that is less than timepoint we are evaluating - - ids <- which( - sorted_event_times == - max(sorted_event_times[sorted_event_times <= eval_timepoints[l]]) - ) - - # Conditional survival prob for all patients at given time- - # get cumhaz for time closest to eval timepoint - predsurv <- exp(-cumhaz[ids]*predsurvexp) - - # HERE- check this - tru predict of survfit - # inverse probability of the censoring weights - - ipw <- as.numeric(survival_times > eval_timepoints[l])*censoring_weights[l] + - as.numeric(survival_times <= eval_timepoints[l])*censoring_weights - - ipw = 1/ipw - - # if no events occurred before eval time - } else if (eval_timepoints[l] < min(sorted_event_times)) { - - predsurv <- rep(1,length(survival_times)) - ipw <- 1 - - } - - preds <- 1 - predcure + predcure*predsurv - - all_preds <- tibble( - - ".eval_time" = eval_timepoints[l], - ".pred_survival" = preds, - ".predcure" = predcure, - ".predsurv" = predsurv, - ".weight_censored_raw" = censoring_weights[l], - ".weight_censored" = ipw) %>% - mutate(id = 1:nrow(.)) - - all_preds_nest <- all_preds %>% nest(.preds = -c(id)) - - all_preds_all_tps[[l]] <- all_preds - - } - - - all_pred_df <- all_preds_all_tps %>% - do.call("rbind", .) - - all_pred_df <- all_pred_df %>% - nest(.pred = -c(id)) %>% - bind_cols(., "event" = new_data$event, - "times" = new_data$times) - - - return(all_pred_df) -} - predict_df <- predict_cure(final_fit = final_fit, new_data = data) predict_df_long <- predict_df %>% unnest(everything()) + + # quick brier --- x <- c() for (i in 1:10) { diff --git a/R/explore-brier.R b/R/explore-brier.R new file mode 100644 index 0000000..be3623b --- /dev/null +++ b/R/explore-brier.R @@ -0,0 +1,110 @@ +library(tidymodels) +library(censored) +tidymodels_prefer() + + +# Try Tidymodels Example -------------------------------------------------- + + +# Fit Lung Model --------------------------------------------------------- + +data(cancer, package="survival") +data(lung_surv) +#lung <- lung %>% drop_na() + +lung <- lung %>% + mutate(status = case_when( + status == 1 ~ 0, + status == 2 ~ 1 + )) + + + +ph_spec <- + proportional_hazards() %>% + set_engine("survival") %>% + set_mode("censored regression") + +ph_spec + +set.seed(1) +ph_fit <- ph_spec %>% fit(Surv(time, status) ~ ., data = lung) +ph_fit + +# * Predict Lung Model ----------- + +# Make predictions +p <- predict( + ph_fit, + lung, + type = "survival", + eval_time = lung$time +) %>% + mutate(surv_truth = Surv(lung$time, lung$status)) + +# add weights +p_weights <- .censoring_weights_graf(ph_fit, predictions = p) + +# calc brier +brier_scores_lung <- + p_weights %>% + brier_survival(truth = surv_truth, .pred) + + +# Try our code on their data ---------------------------------------------- + + +eval_time_test <- 300 + +new_data = lung +survival_times <- lung$time +sorted_survival_times <- sort(survival_times) + +survival_status <- lung$status +sorted_event_times = sort(survival_times[survival_status==1]) + +censoring_survfit = survfit(Surv(survival_times, 1 - survival_status) ~ 1) +censoring_weights = summary(censoring_survfit, times = survival_times) +censoring_weights = censoring_weights$surv + +all_weights = bind_cols(censoring_weights_time, censoring_weights) +names(all_weights) <- c("weights_time", "weights_surv") + +eval_timepoint = 300 + +# * Inverse Probability Censoring Weights --------- + +# If no time points in new data less than lowest event time in model fit data (most cases): +if (eval_timepoint >= min(sorted_event_times)) { + + # get maximum event time that is less than time point we are evaluating + closest_event_time <- max( + sorted_event_times[sorted_event_times <= eval_timepoint]) + + index_closest_event_time <- which(sorted_event_times == closest_event_time) + + # Get predicted survival by using cumulative hazard at closest previous event time + predsurv <- exp(-cumhaz[index_closest_event_time]*predsurvexp) + + # CHECK - Probability of the censoring at evaluation time point (or should it be at previous index closest event time??) + censoring_weight_at_eval_time <- censoring_weights[timepoint_index] + + ipw <- + as.numeric(survival_times > eval_timepoint)*censoring_weight_at_eval_time + + as.numeric(survival_times <= eval_timepoint)*censoring_weights + + ipw = 1/ipw + + # if no events occurred before eval time +} else if (eval_timepoint < min(sorted_event_times)) { + + predsurv <- rep(1,length(survival_times)) + ipw <- 1 + +} + +cbind(survival_times, ipw) %>% View() +# Compare ------ +lung_surv_long <- lung_surv %>% unnest(everything()) + +lung_surv_long <- lung_surv_long %>% filter(.eval_time == eval_time_test) diff --git a/inst/predict_cure_2024-03-15.R b/inst/predict_cure_2024-03-15.R new file mode 100644 index 0000000..64d684b --- /dev/null +++ b/inst/predict_cure_2024-03-15.R @@ -0,0 +1,137 @@ + +predict_cure <- function(final_fit, + new_data = NULL, + eval_timepoints = NULL) { + + # Values from Observed Data & Model Fit --------------------------------- + + survival_times = new_data$times + events = new_data$event + sorted_survival_times <- sort(survival_times) + sorted_event_times = sort(new_data$times[new_data$event==1]) + + # Extract cumulative hazard from final fit (for event times) + haz <- final_fit$haz + cumhaz <- final_fit$cumhaz + + # Set up Data + data_x <- new_data %>% + select(-c(times, event, truth_survival)) %>% + as.matrix() + + # Evaluation Time Points -------------------------------------------------- + # Default evaluation time points are observed event times if not supplied by user + + all_eval_timepoints <- eval_timepoints %||% sorted_survival_times + + # Censoring Weights -------------------------------------------------------- + + censoring_survfit = survfit(Surv(survival_times, 1 - events) ~ 1) + + # CHECK- or censoring weights at all all respective survival times? + # censoring_weights = summary(censoring_survfit, times = all_eval_timepoints) + censoring_weights = summary(censoring_survfit, times = survival_times)$surv + + + # Relative Risk for Cure and Cox Risk ------------------------------------ + # (independent of time point) + + # * Cure probability risk prediction ------ + fitcure <- final_fit$fitcure + predcure <- predict(fitcure, newx = data_x, + s = min(fitcure$lambda), type = "response") + + # * Cox survival probability risk prediction ------ + fitcox <- final_fit$fitcox + predsurvexp <- predict(fitcox, + newx = data_x, + s = min(fitcox$lambda), + type = "response") + + + + # Predicted Probabilities at Evaluated Time points ------------------------- + # (dependent on time point- need IPCW) + + all_preds_all_tps <- list() + + # Evaluated in all patients, for each eval time point (iteration on time point) + for (timepoint_index in 1:length(all_eval_timepoints)) { + + # timepoint we are evaluating + eval_timepoint <- all_eval_timepoints[timepoint_index] + + + # * Inverse Probability Censoring Weights --------- + + # If no time points in new data less than lowest event time in model fit data (most cases): + if (eval_timepoint >= min(sorted_event_times)) { + + # get maximum event time that is less than time point we are evaluating + closest_event_time <- max( + sorted_event_times[sorted_event_times <= eval_timepoint]) + + index_closest_event_time <- which(sorted_event_times == closest_event_time) + + # Get predicted survival by using cumulative hazard at closest previous event time + predsurv <- exp(-cumhaz[index_closest_event_time]*predsurvexp) + + # CHECK - Probability of the censoring at evaluation time point (or should it be at previous index closest event time??) + # censoring_weight_at_eval_time <- censoring_weights[timepoint_index] + censoring_weight_at_eval_time <- censoring_weights[index_closest_event_time] + + ipw <- + # if observed time is greater than event time, use censoring weight at evaluation time (or time closest) + (as.numeric(survival_times > eval_timepoint)*censoring_weight_at_eval_time) + + + # if observed time is less than or equal to eval time, use censoring weight at it's own observed time + as.numeric(survival_times <= eval_timepoint)*censoring_weights + + ipw = 1/ipw + + # if no events occurred before eval time + } else if (eval_timepoint < min(sorted_event_times)) { + + predsurv <- rep(1,length(survival_times)) + ipw <- 1 + + } + + preds <- 1 - predcure + predcure*predsurv + + all_preds <- tibble( + + ".eval_time" = eval_timepoint, + ".pred_survival" = preds, + ".pred_curerate" = predcure, + ".pred_survportion" = predsurv, + ".weight_censored_raw" = censoring_weights[timepoint_index], + ".weight_censored" = ipw) %>% + mutate(id = 1:nrow(.)) + + all_preds_nest <- all_preds %>% nest(.preds = -c(id)) + + all_preds_all_tps[[timepoint_index]] <- all_preds + + } + + all_pred_df <- all_preds_all_tps %>% + do.call("rbind", .) + + all_pred_df <- all_pred_df %>% + nest(.pred = -c(id)) %>% + bind_cols(., "event" = new_data$event, + "times" = new_data$times) + + return(all_pred_df) + +} + + +# Try Function ------------------------------------------------------------ +# +# predict_df <- predict_cure(final_fit = final_fit, new_data = data) +# predict_df_long <- predict_df %>% +# unnest(everything()) + + From a81cc085b833ac05f1619f321a5238800f1b473b Mon Sep 17 00:00:00 2001 From: karissawhiting Date: Thu, 6 Jun 2024 09:09:58 -0400 Subject: [PATCH 07/11] update to predict --- R/cureit_lasso_predict_eval.R | 153 +++++++++++++++++++++++++++++++-- R/explore-brier.R | 85 +++++++++++++++++- inst/predict_cure_2024-03-15.R | 2 +- inst/sim-briers-exploration.R | 115 +++++++++++++++++++++++++ 4 files changed, 346 insertions(+), 9 deletions(-) create mode 100644 inst/sim-briers-exploration.R diff --git a/R/cureit_lasso_predict_eval.R b/R/cureit_lasso_predict_eval.R index 61043cb..334e006 100644 --- a/R/cureit_lasso_predict_eval.R +++ b/R/cureit_lasso_predict_eval.R @@ -59,30 +59,52 @@ predict_df_long <- predict_df %>% -# quick brier --- +# quick brier --------- x <- c() -for (i in 1:10) { +for (i in 1:100) { time_1 <- predict_df_long$.eval_time[[i]] predict_df_long_1 <- predict_df_long %>% filter(.eval_time == time_1) - x[i] <- mean(I(predict_df_long_1$event == 0)*(1 - predict_df_long_1$.pred_survival)^2/(predict_df_long_1$.weight_censored + 0.001) + - I(predict_df_long_1$event ==1)*(0 - predict_df_long_1$.pred_survival)^2/(predict_df_long_1$.weight_censored + 0.001)) + x[i] <- mean(I(predict_df_long_1$times > time_1)*(1 - predict_df_long_1$.pred_survival)^2*(predict_df_long_1$.weight_censored) + + I(predict_df_long_1$event == 1 & predict_df_long_1$times <= time_1)* + (0 - predict_df_long_1$.pred_survival)^2*(predict_df_long_1$.weight_censored)) + + # OLD ----- + # x[i] <- mean(I(predict_df_long_1$event == 0)*(1 - predict_df_long_1$.pred_survival)^2/(predict_df_long_1$.weight_censored + 0.001) + + # I(predict_df_long_1$event ==1)*(0 - predict_df_long_1$.pred_survival)^2/(predict_df_long_1$.weight_censored + 0.001)) + # + # OLD ---- + # brier[l] <- mean(I(di==0)*(1 - preds)^2/(ipw+0.001) + + # I(di==1)*(0 - preds)^2/(ipw+0.001)) + # + # NEW ----- + # brier[l] <- mean(I(ti > tbrier[l])*(1 - preds)^2/(ipw+0.001) + + # I(di==1 & ti <= tbrier[l])*(0 - preds)^2/(ipw+0.001)) + } -x +times <- unique(predict_df_long$.eval_time) +brier <- x + +brier_scores_manual <- bind_cols("times" = times, "brier" = brier) + # Brier With Yardstick ------------------------------------------------------- library(yardstick) # Doesn't match- weights are wrong -brier_scores <- +brier_scores_yardstick <- predict_df %>% mutate(truth = Surv(times, event)) %>% brier_survival(truth = truth, .pred) +all_brier <- brier_scores_yardstick %>% + select(times = .eval_time, brier_yardstick = .estimate) %>% + left_join(brier_scores_manual) + # Brier Function ------------------------------------------------------- brier_cure <- function(predict_df, truth = times, eval_timepoints = NULL) { @@ -105,13 +127,132 @@ brier_cure <- function(predict_df, truth = times, eval_timepoints = NULL) { mean(I(x$event == 0)*(1 - x$.pred_survival)^2/(x$.weight_censored + 0.001) + I(x$event == 1)*(0 - x$.pred_survival)^2/(x$.weight_censored + 0.001)) }) + + for (l in 1:length(eval_timepoints)) { + brier[l] <- mean(I(di==0)*(1 - preds)^2/(ipw+0.001) + I(di==1)*(0 - preds)^2/(ipw+0.001)) + + + } +} + + +brier_cure_update <- function(predict_df, truth = times, eval_timepoints = NULL) { + + + # Get values from observed data + survival_times = predict_df$times + sorted_survival_times <- sort(survival_times) + sorted_event_times = sort(data$times[data$event==1]) + + + predict_df_long <- predict_df %>% + unnest(everything()) + + predict_df_nest_time <- predict_df_long %>% + nest(data = -.eval_time) + + + map(predict_df_nest_time$data, function(x) { + mean(I(x$event == 0)*(1 - x$.pred_survival)^2/(x$.weight_censored + 0.001) + I(x$event == 1)*(0 - x$.pred_survival)^2/(x$.weight_censored + 0.001)) + + }) + for (l in 1:length(eval_timepoints)) { brier[l] <- mean(I(di==0)*(1 - preds)^2/(ipw+0.001) + I(di==1)*(0 - preds)^2/(ipw+0.001)) + + + } +} + +predict_df_long <- predict_df %>% + unnest(everything()) + +predict_df_nest_time <- predict_df_long %>% + nest(data = -.eval_time) + +predict_df_timepoint = predict_df_nest_time[10,] %>% + unnest(everything()) + +brier_for_each_eval <- function(predict_df_timepoint) { + + x = mean(I(predict_df_timepoint$times > predict_df_timepoint$.eval_time )*(1 - predict_df_timepoint$.pred_survival)^2/(predict_df_timepoint$.weight_censored+0.001) + + I(predict_df_timepoint$event == 1 & predict_df_timepoint$times <= predict_df_timepoint$.eval_time )* + (0 - predict_df_timepoint$.pred_survival)^2/(predict_df_timepoint$.weight_censored+0.001)) + x } + + +brier_for_each_eval <- function(predict_df_timepoint) { + + + x = mean( + I(predict_df_timepoint$times > predict_df_timepoint$.eval_time)*(1 - predict_df_timepoint$.pred_survival)^2/(predict_df_timepoint$.weight_censored+0.001) + + + I(predict_df_timepoint$event == 1 & predict_df_timepoint$times <= predict_df_timepoint$.eval_time )* + (0 - predict_df_timepoint$.pred_survival)^2/(predict_df_timepoint$.weight_censored+0.001) + ) + x/100 +} + +brier_for_each_eval2 <- function(predict_df_timepoint) { + + category1 = I(predict_df_timepoint$event == 1 & predict_df_timepoint$times <= predict_df_timepoint$.eval_time ) + category2 = I(predict_df_timepoint$times > predict_df_timepoint$.eval_time) + + x = mean( + category1*(0 - predict_df_timepoint$.pred_survival)^2/(predict_df_timepoint$.weight_censored) + + category2 *(1 - predict_df_timepoint$.pred_survival)^2/(predict_df_timepoint$.weight_censored) + + ) + x + } + +brier_for_each_eval(predict_df_timepoint) + +# Compare to yardstick ---- + +brier_survival_impl <- function(truth, + estimate, + censoring_weights, + case_weights, + eval_time) { + surv_time <- yardstick:::.extract_surv_time(truth) + surv_status <- yardstick:::.extract_surv_status(truth) + + if (!is.null(case_weights)) { + case_weights <- vec_cast(case_weights, to = double()) + norm_const <- sum(case_weights) + } else { + case_weights <- rep(1, length(estimate)) + norm_const <- sum(!survival::is.na.Surv(truth)) + } + category_1 <- surv_time <= eval_time & surv_status == 1 + category_2 <- surv_time > eval_time + + # (0 - estimate) ^ 2 == estimate ^ 2 + res <- (category_1 * estimate^2 * censoring_weights) + + (category_2 * (1 - estimate)^2 * censoring_weights) + + res <- res * case_weights + res <- sum(res, na.rm = TRUE) + res / norm_const +} +predict_df_timepoint2 <- predict_df_timepoint %>% + mutate(surv_obj = Surv(predict_df_timepoint$times, predict_df_timepoint$event)) +library(yardstick) +brier_survival_impl( + truth = predict_df_timepoint2$surv_obj, + + estimate = predict_df_timepoint2$.pred_survival, + censoring_weights = predict_df_timepoint2$.weight_censored, + case_weights = NULL, + eval_time = predict_df_timepoint2$.eval_time) + +brier_for_each_eval(predict_df_timepoint) # Simple Former Brier Calculation --------------------------------------------------- library(glmnet) diff --git a/R/explore-brier.R b/R/explore-brier.R index be3623b..ab42001 100644 --- a/R/explore-brier.R +++ b/R/explore-brier.R @@ -31,6 +31,15 @@ set.seed(1) ph_fit <- ph_spec %>% fit(Surv(time, status) ~ ., data = lung) ph_fit +cox <- coxph(Surv(time, status) ~ ., data = lung) +cox +cumhaz_all<- survfit(cox) +cumhaz <- bind_cols(time = cumhaz_all$time, surv = cumhaz_all$surv) + +#cumhaz$surv[which.min(abs(cumhaz$time - closest_event_time))] + +#cumhaz$surv[cumhaz$time[] + # * Predict Lung Model ----------- # Make predictions @@ -43,16 +52,61 @@ p <- predict( mutate(surv_truth = Surv(lung$time, lung$status)) # add weights -p_weights <- .censoring_weights_graf(ph_fit, predictions = p) +p_weights <- .censoring_weights_graf(ph_fit, predictions = p) %>% + mutate(pat_id = 1:nrow(.)) + +p_weights_long <- p_weights %>% + unnest(cols = .pred) # calc brier brier_scores_lung <- p_weights %>% brier_survival(truth = surv_truth, .pred) - +brier_scores_lung # Try our code on their data ---------------------------------------------- +# +# predict_df_long <- predict_df %>% +# unnest(everything()) + +predict_df_nest_time <- p_weights_long %>% + nest(data = -.eval_time) + +predict_df_timepoint = predict_df_nest_time[10,] %>% + unnest(everything()) + +brier_for_each_eval <- function(predict_df_timepoint) { + + + x = mean(I(predict_df_timepoint$times > predict_df_timepoint$.eval_time )*(1 - predict_df_timepoint$.pred_survival)^2/(predict_df_timepoint$.weight_censored+0.001) + + I(predict_df_timepoint$event == 1 & predict_df_timepoint$times <= predict_df_timepoint$.eval_time )* + (0 - predict_df_timepoint$.pred_survival)^2/(predict_df_timepoint$.weight_censored+0.001)) + x +} + + + + +lung2 <- lung %>% + rename("times" = time, + "event" = status) +cox <- coxph(Surv(times, event) ~ ., data = lung2) +cox +cumhaz_all<- survfit(cox) +cumhaz <- bind_cols(time = cumhaz_all$time, surv = cumhaz_all$surv) + + +lung2 <- lung2 %>% + mutate(truth_survival = Surv(times, event)) + +predict_cure(final_fit = cox, new_data = lung2, eval_timepoints = lung2$times) + + + + +# Misc -------------------------------------------------------------------- + eval_time_test <- 300 @@ -104,7 +158,34 @@ if (eval_timepoint >= min(sorted_event_times)) { } cbind(survival_times, ipw) %>% View() + # Compare ------ lung_surv_long <- lung_surv %>% unnest(everything()) lung_surv_long <- lung_surv_long %>% filter(.eval_time == eval_time_test) + +#lung %>% select(time, status) %>% distinct() %>% nrow() + + +# Here- New --------------------------------------------------------------------- +# ti & di is censored patient - If censored time is after eval time () ti > tbrier[l] +# if event, and event time earlier than evaluation time - (0 - preds)^2/(ipw+0.001)) + +# 1) If the observed time is a censoring time and that is before the evaluation time, the data point should make no contribution to the performance metric (their "category 3"). These values have a missing value for their probability estimate (and also for their weight column). +# ^ not included, ensure they are NAs in predict df + +# 2) I(ti <= tbrier[l] & di == 1)*(0 - preds)^2/(ipw+0.001)) +# Old version ---- +# brier[l] <- mean(I(ti > tbrier[l] & di==0)* +# (1 - preds)^2/(ipw+0.001) + +# +# I(ti <= tbrier[l] & di == 1)*(0 - preds)^2/(ipw+0.001)) + +# Updated Version --- + + +brier[l] <- mean(I(ti > tbrier[l])* + (1 - preds)^2/(ipw+0.001) + + + I(ti <= tbrier[l] & di == 1)*(0 - preds)^2/(ipw+0.001)) + diff --git a/inst/predict_cure_2024-03-15.R b/inst/predict_cure_2024-03-15.R index 64d684b..4502c57 100644 --- a/inst/predict_cure_2024-03-15.R +++ b/inst/predict_cure_2024-03-15.R @@ -87,7 +87,7 @@ predict_cure <- function(final_fit, # if observed time is less than or equal to eval time, use censoring weight at it's own observed time as.numeric(survival_times <= eval_timepoint)*censoring_weights - ipw = 1/ipw + ipw = 1/(ipw+ 0.001) # if no events occurred before eval time } else if (eval_timepoint < min(sorted_event_times)) { diff --git a/inst/sim-briers-exploration.R b/inst/sim-briers-exploration.R new file mode 100644 index 0000000..bc51536 --- /dev/null +++ b/inst/sim-briers-exploration.R @@ -0,0 +1,115 @@ +# we should use this: https://www.tidymodels.org/learn/statistics/survival-metrics/ + +library(survival) +library(tidyverse) +library(glmnet) + +# Prep Data --------------------------------------------------------------- + +load(here::here("inst", "cureit-simulation-results", "sim_data_list.RData")) +load(here::here("inst", "cureit-simulation-results", "sim_valid_data_list.RData")) +load(here::here("inst","cureit-simulation-results", "cv_fits_100.RData")) + +dat <- sim_data_list[[1]] +fit <- cv_fits_list[[1]] +final_fit <- fit$fit[[fit$index$`min`[1]]][[fit$index$min[2]]] + +get_brier_for_each_data <- function(dat, fit) { + + final_fit <- fit$fit[[fit$index$`min`[1]]][[fit$index$min[2]]] + + # Create Data frame to pass ----- + + data = data.frame(dat$x) %>% + mutate(times = dat$t, + event = dat$d, + "truth_survival" = Surv(dat$t, dat$d)) + + # Predict Function (glmnet cure) --------------------------------------------- + + predict_df <- predict_cure(final_fit = final_fit, new_data = data) + predict_df_long <- predict_df %>% + unnest(everything()) + + + brier_scores_yardstick <- + predict_df %>% + mutate(truth = Surv(times, event)) %>% + brier_survival(truth = truth, .pred) + + return(brier_scores_yardstick) + +} + +# estimates greater than 1 +b <- get_brier_for_each_data(dat = sim_data_list[[3]], fit = cv_fits_list[[3]]) + + +res <- list() +for (i in 1:10) { + + res[[i]] <- get_brier_for_each_data(sim_data_list[[i]], cv_fits_list[[i]]) +} + +ggplot(res[[3]], aes(x = .eval_time, .estimate)) + geom_point() + + + + +# quick brier --------- +dat <- sim_data_list[[3]] +fit <- cv_fits_list[[3]] +final_fit <- fit$fit[[fit$index$`min`[1]]][[fit$index$min[2]]] + +# Create Data frame to pass ----- + +data = data.frame(dat$x) %>% + mutate(times = dat$t, + event = dat$d, + "truth_survival" = Surv(dat$t, dat$d)) + +predict_df <- predict_cure(final_fit = final_fit, new_data = data) +predict_df_long <- predict_df %>% + unnest(everything()) + + +x <- c() +for (i in 1:100) { + + time_1 <- predict_df_long$.eval_time[[i]] + + predict_df_long_1 <- predict_df_long %>% + filter(.eval_time == time_1) + + x[i] <- mean(I(predict_df_long_1$times > time_1)*(1 - predict_df_long_1$.pred_survival)^2*(predict_df_long_1$.weight_censored) + + I(predict_df_long_1$event == 1 & predict_df_long_1$times <= time_1)* + (0 - predict_df_long_1$.pred_survival)^2*(predict_df_long_1$.weight_censored)) + + # OLD ----- + # x[i] <- mean(I(predict_df_long_1$event == 0)*(1 - predict_df_long_1$.pred_survival)^2/(predict_df_long_1$.weight_censored + 0.001) + + # I(predict_df_long_1$event ==1)*(0 - predict_df_long_1$.pred_survival)^2/(predict_df_long_1$.weight_censored + 0.001)) + # + # OLD ---- + # brier[l] <- mean(I(di==0)*(1 - preds)^2/(ipw+0.001) + + # I(di==1)*(0 - preds)^2/(ipw+0.001)) + # + # NEW ----- + # brier[l] <- mean(I(ti > tbrier[l])*(1 - preds)^2/(ipw+0.001) + + # I(di==1 & ti <= tbrier[l])*(0 - preds)^2/(ipw+0.001)) + +} + +times <- unique(predict_df_long$.eval_time) +brier <- x + +brier_scores_manual <- bind_cols("times" = times, "brier" = brier) + +brier_scores_yardstick <- + predict_df %>% + mutate(truth = Surv(times, event)) %>% + brier_survival(truth = truth, .pred) + +# Brier With Yardstick ------------------------------------------------------- +library(yardstick) + + From 990c7e3ec521c48cabf461eb762c8e731f8caaea Mon Sep 17 00:00:00 2001 From: karissawhiting Date: Thu, 6 Jun 2024 09:49:36 -0400 Subject: [PATCH 08/11] organize scripts --- {R => inst}/cureit_lasso_predict_eval.R | 0 {R => inst}/explore-brier.R | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename {R => inst}/cureit_lasso_predict_eval.R (100%) rename {R => inst}/explore-brier.R (100%) diff --git a/R/cureit_lasso_predict_eval.R b/inst/cureit_lasso_predict_eval.R similarity index 100% rename from R/cureit_lasso_predict_eval.R rename to inst/cureit_lasso_predict_eval.R diff --git a/R/explore-brier.R b/inst/explore-brier.R similarity index 100% rename from R/explore-brier.R rename to inst/explore-brier.R From 2c6b541e9f8f3ba7d9ea4ee4fec94c50cf5edf79 Mon Sep 17 00:00:00 2001 From: karissawhiting Date: Thu, 6 Jun 2024 10:49:46 -0400 Subject: [PATCH 09/11] organize scripts --- R/cv.cureitlasso.R | 2 +- {inst => R}/predict_cure_2024-03-15.R | 36 +++- ...compare-predict-brier-with-yardstick_P1.R} | 5 +- ...compare-predict-brier-with-yardstick_P2.R} | 171 ++++++++++++--- ...mpare-predict-brier-with-yardstick_P3.R.R} | 57 ----- inst/evaluate-simulations.R | 199 ------------------ inst/inst-README.md | 15 ++ inst/testcode.R | 48 ----- 8 files changed, 191 insertions(+), 342 deletions(-) rename {inst => R}/predict_cure_2024-03-15.R (67%) rename inst/{explore-brier.R => compare-predict-brier-with-yardstick_P1.R} (98%) rename inst/{sim-briers-exploration.R => compare-predict-brier-with-yardstick_P2.R} (53%) rename inst/{cureit_lasso_predict_eval.R => compare-predict-brier-with-yardstick_P3.R.R} (86%) delete mode 100644 inst/evaluate-simulations.R create mode 100644 inst/inst-README.md delete mode 100644 inst/testcode.R diff --git a/R/cv.cureitlasso.R b/R/cv.cureitlasso.R index 2bdd53c..3ef2447 100644 --- a/R/cv.cureitlasso.R +++ b/R/cv.cureitlasso.R @@ -246,6 +246,6 @@ cv.cureitlasso <- function(t, `1se`= idx1se), foldid = foldid ) - ) + } diff --git a/inst/predict_cure_2024-03-15.R b/R/predict_cure_2024-03-15.R similarity index 67% rename from inst/predict_cure_2024-03-15.R rename to R/predict_cure_2024-03-15.R index 4502c57..7d0c721 100644 --- a/inst/predict_cure_2024-03-15.R +++ b/R/predict_cure_2024-03-15.R @@ -1,4 +1,38 @@ - +#' Predict Cure and Survival Probabilities +#' +#' This function predicts cure and survival probabilities for data based on a final fitted model. +#' It calculates predicted survival probabilities at specified evaluation time points, accounting for censoring. +#' Output is consistent with tidymodels predict guidelines and can be passed into yardstick functions like `brier()` +#' +#' @param final_fit A list containing the fitted cure and survival models. It should have elements `fitcure`, `fitcox`, `haz`, and `cumhaz`. +#' @param new_data A data frame containing the new observations to predict on. It should include columns `times` (observed survival times), `event` (event indicator, 1 for event, 0 for censoring), and other covariates used for prediction. +#' @param eval_timepoints Optional. A vector of time points at which to evaluate survival probabilities. If NULL, the function uses the observed event times from `new_data`. +#' +#' @return A data frame containing the predicted survival probabilities, cure rates, and other related metrics for each observation at the specified evaluation time points. The returned data frame includes the following columns: +#' \itemize{ +#' \item \code{id}: Identifier for each observation. +#' \item \code{.eval_time}: Evaluation time point. +#' \item \code{.pred_survival}: Predicted survival probability at the evaluation time point. +#' \item \code{.pred_curerate}: Predicted cure rate for each observation. +#' \item \code{.pred_survportion}: Component of the survival prediction that excludes the cure component. +#' \item \code{.weight_censored_raw}: Raw censoring weight at each evaluation time point. +#' \item \code{.weight_censored}: Inverse probability weight for censoring. +#' \item \code{event}: Event indicator from the new data. +#' \item \code{times}: Observed survival times from the new data. +#' } +#' +#' @examples +#' \dontrun{ +#' # Assuming `final_fit` is a fitted model list and `new_data` is a data frame with appropriate structure: +#' predict_df <- predict_cure(final_fit = final_fit, new_data = new_data) +#' print(predict_df) +#' } +#' +#' @importFrom dplyr select mutate bind_cols +#' @importFrom tidyr nest unnest +#' @importFrom tibble tibble +#' +#' @export predict_cure <- function(final_fit, new_data = NULL, eval_timepoints = NULL) { diff --git a/inst/explore-brier.R b/inst/compare-predict-brier-with-yardstick_P1.R similarity index 98% rename from inst/explore-brier.R rename to inst/compare-predict-brier-with-yardstick_P1.R index ab42001..db759a3 100644 --- a/inst/explore-brier.R +++ b/inst/compare-predict-brier-with-yardstick_P1.R @@ -6,10 +6,11 @@ tidymodels_prefer() # Try Tidymodels Example -------------------------------------------------- -# Fit Lung Model --------------------------------------------------------- +# * Fit Lung Model -------- data(cancer, package="survival") data(lung_surv) + #lung <- lung %>% drop_na() lung <- lung %>% @@ -19,7 +20,6 @@ lung <- lung %>% )) - ph_spec <- proportional_hazards() %>% set_engine("survival") %>% @@ -64,6 +64,7 @@ brier_scores_lung <- brier_survival(truth = surv_truth, .pred) brier_scores_lung + # Try our code on their data ---------------------------------------------- # diff --git a/inst/sim-briers-exploration.R b/inst/compare-predict-brier-with-yardstick_P2.R similarity index 53% rename from inst/sim-briers-exploration.R rename to inst/compare-predict-brier-with-yardstick_P2.R index bc51536..d281510 100644 --- a/inst/sim-briers-exploration.R +++ b/inst/compare-predict-brier-with-yardstick_P2.R @@ -3,29 +3,29 @@ library(survival) library(tidyverse) library(glmnet) +library(yardstick) -# Prep Data --------------------------------------------------------------- +# Load Simulation Data --------------------------------------------------------------- load(here::here("inst", "cureit-simulation-results", "sim_data_list.RData")) load(here::here("inst", "cureit-simulation-results", "sim_valid_data_list.RData")) load(here::here("inst","cureit-simulation-results", "cv_fits_100.RData")) -dat <- sim_data_list[[1]] -fit <- cv_fits_list[[1]] -final_fit <- fit$fit[[fit$index$`min`[1]]][[fit$index$min[2]]] + +# Function to Get Brier on Each Sim Data ---------------------------------- get_brier_for_each_data <- function(dat, fit) { - + final_fit <- fit$fit[[fit$index$`min`[1]]][[fit$index$min[2]]] - # Create Data frame to pass ----- + # * Create Data frame to pass ----- data = data.frame(dat$x) %>% mutate(times = dat$t, event = dat$d, "truth_survival" = Surv(dat$t, dat$d)) - # Predict Function (glmnet cure) --------------------------------------------- + # * Run new Predict Function (glmnet cure) ------ predict_df <- predict_cure(final_fit = final_fit, new_data = data) predict_df_long <- predict_df %>% @@ -41,7 +41,125 @@ get_brier_for_each_data <- function(dat, fit) { } -# estimates greater than 1 + +# Manual Brier Score Calc (From Teng) ------------------------------------- + +# * Version 1 ----- + +for (i in 1:100) { + + time_1 <- predict_df_long$.eval_time[[i]] + + predict_df_long_1 <- predict_df_long %>% + filter(.eval_time == time_1) + + x[i] <- mean(I(predict_df_long_1$times > time_1)*(1 - predict_df_long_1$.pred_survival)^2*(predict_df_long_1$.weight_censored) + + I(predict_df_long_1$event == 1 & predict_df_long_1$times <= time_1)* + (0 - predict_df_long_1$.pred_survival)^2*(predict_df_long_1$.weight_censored)) + + # OLD ----- + # x[i] <- mean(I(predict_df_long_1$event == 0)*(1 - predict_df_long_1$.pred_survival)^2/(predict_df_long_1$.weight_censored + 0.001) + + # I(predict_df_long_1$event ==1)*(0 - predict_df_long_1$.pred_survival)^2/(predict_df_long_1$.weight_censored + 0.001)) + # + # OLD ---- + # brier[l] <- mean(I(di==0)*(1 - preds)^2/(ipw+0.001) + + # I(di==1)*(0 - preds)^2/(ipw+0.001)) + # + # NEW ?----- + # brier[l] <- mean(I(ti > tbrier[l])*(1 - preds)^2/(ipw+0.001) + + # I(di==1 & ti <= tbrier[l])*(0 - preds)^2/(ipw+0.001)) + +} + + + +# * Version 2 ----- + +brier[l] <- mean(I(ti > tbrier[l])* + (1 - preds)^2/(ipw+0.001) + + + I(ti <= tbrier[l] & di == 1)*(0 - preds)^2/(ipw+0.001)) + + +# * Version 3 ----- + +brier_cure <- function(predict_df, truth = times, eval_timepoints = NULL) { + + + # Get values from observed data + survival_times = predict_df$times + sorted_survival_times <- sort(survival_times) + sorted_event_times = sort(data$times[data$event==1]) + + + predict_df_long <- predict_df %>% + unnest(everything()) + + predict_df_nest_time <- predict_df_long %>% + nest(data = -.eval_time) + + + map(predict_df_nest_time$data, function(x) { + mean(I(x$event == 0)*(1 - x$.pred_survival)^2/(x$.weight_censored + 0.001) + I(x$event == 1)*(0 - x$.pred_survival)^2/(x$.weight_censored + 0.001)) + + }) + + for (l in 1:length(eval_timepoints)) { + brier[l] <- mean(I(di==0)*(1 - preds)^2/(ipw+0.001) + I(di==1)*(0 - preds)^2/(ipw+0.001)) + + + } +} + +# * Version 4 ----- +brier_cure_update <- function(predict_df, truth = times, eval_timepoints = NULL) { + + + # Get values from observed data + survival_times = predict_df$times + sorted_survival_times <- sort(survival_times) + sorted_event_times = sort(data$times[data$event==1]) + + + predict_df_long <- predict_df %>% + unnest(everything()) + + predict_df_nest_time <- predict_df_long %>% + nest(data = -.eval_time) + + + map(predict_df_nest_time$data, function(x) { + mean(I(x$event == 0)*(1 - x$.pred_survival)^2/(x$.weight_censored + 0.001) + I(x$event == 1)*(0 - x$.pred_survival)^2/(x$.weight_censored + 0.001)) + + }) + + for (l in 1:length(eval_timepoints)) { + brier[l] <- mean(I(di==0)*(1 - preds)^2/(ipw+0.001) + I(di==1)*(0 - preds)^2/(ipw+0.001)) + + + } +} + + +# Example on Single Data Set ------------------------------------------------- + +# Grab one data set as an example +dat <- sim_data_list[[1]] +fit <- cv_fits_list[[1]] +final_fit <- fit$fit[[fit$index$`min`[1]]][[fit$index$min[2]]] + +# Run predict functon +predict_df <- predict_cure(final_fit = final_fit, new_data = data) +predict_df_long <- predict_df %>% + unnest(everything()) + +# Get yardstick values +brier_scores_yardstick <- + predict_df %>% + mutate(truth = Surv(times, event)) %>% + brier_survival(truth = truth, .pred) + + b <- get_brier_for_each_data(dat = sim_data_list[[3]], fit = cv_fits_list[[3]]) @@ -56,6 +174,8 @@ ggplot(res[[3]], aes(x = .eval_time, .estimate)) + geom_point() +# Compare to Our Manual Brier Calc ---------------------------------------- + # quick brier --------- dat <- sim_data_list[[3]] fit <- cv_fits_list[[3]] @@ -73,31 +193,11 @@ predict_df_long <- predict_df %>% unnest(everything()) + +# Brier With Yardstick ------------------------------------------------------- +library(yardstick) x <- c() -for (i in 1:100) { - - time_1 <- predict_df_long$.eval_time[[i]] - - predict_df_long_1 <- predict_df_long %>% - filter(.eval_time == time_1) - - x[i] <- mean(I(predict_df_long_1$times > time_1)*(1 - predict_df_long_1$.pred_survival)^2*(predict_df_long_1$.weight_censored) + - I(predict_df_long_1$event == 1 & predict_df_long_1$times <= time_1)* - (0 - predict_df_long_1$.pred_survival)^2*(predict_df_long_1$.weight_censored)) - - # OLD ----- - # x[i] <- mean(I(predict_df_long_1$event == 0)*(1 - predict_df_long_1$.pred_survival)^2/(predict_df_long_1$.weight_censored + 0.001) + - # I(predict_df_long_1$event ==1)*(0 - predict_df_long_1$.pred_survival)^2/(predict_df_long_1$.weight_censored + 0.001)) - # - # OLD ---- - # brier[l] <- mean(I(di==0)*(1 - preds)^2/(ipw+0.001) + - # I(di==1)*(0 - preds)^2/(ipw+0.001)) - # - # NEW ----- - # brier[l] <- mean(I(ti > tbrier[l])*(1 - preds)^2/(ipw+0.001) + - # I(di==1 & ti <= tbrier[l])*(0 - preds)^2/(ipw+0.001)) - -} + times <- unique(predict_df_long$.eval_time) brier <- x @@ -109,7 +209,10 @@ brier_scores_yardstick <- mutate(truth = Surv(times, event)) %>% brier_survival(truth = truth, .pred) -# Brier With Yardstick ------------------------------------------------------- -library(yardstick) + + + +# Misc Code --------------------------------------------------------------- + diff --git a/inst/cureit_lasso_predict_eval.R b/inst/compare-predict-brier-with-yardstick_P3.R.R similarity index 86% rename from inst/cureit_lasso_predict_eval.R rename to inst/compare-predict-brier-with-yardstick_P3.R.R index 334e006..e3fede4 100644 --- a/inst/cureit_lasso_predict_eval.R +++ b/inst/compare-predict-brier-with-yardstick_P3.R.R @@ -107,63 +107,6 @@ all_brier <- brier_scores_yardstick %>% # Brier Function ------------------------------------------------------- -brier_cure <- function(predict_df, truth = times, eval_timepoints = NULL) { - - - # Get values from observed data - survival_times = predict_df$times - sorted_survival_times <- sort(survival_times) - sorted_event_times = sort(data$times[data$event==1]) - - - predict_df_long <- predict_df %>% - unnest(everything()) - - predict_df_nest_time <- predict_df_long %>% - nest(data = -.eval_time) - - - map(predict_df_nest_time$data, function(x) { - mean(I(x$event == 0)*(1 - x$.pred_survival)^2/(x$.weight_censored + 0.001) + I(x$event == 1)*(0 - x$.pred_survival)^2/(x$.weight_censored + 0.001)) - - }) - - for (l in 1:length(eval_timepoints)) { - brier[l] <- mean(I(di==0)*(1 - preds)^2/(ipw+0.001) + I(di==1)*(0 - preds)^2/(ipw+0.001)) - - - } -} - - -brier_cure_update <- function(predict_df, truth = times, eval_timepoints = NULL) { - - - # Get values from observed data - survival_times = predict_df$times - sorted_survival_times <- sort(survival_times) - sorted_event_times = sort(data$times[data$event==1]) - - - predict_df_long <- predict_df %>% - unnest(everything()) - - predict_df_nest_time <- predict_df_long %>% - nest(data = -.eval_time) - - - map(predict_df_nest_time$data, function(x) { - mean(I(x$event == 0)*(1 - x$.pred_survival)^2/(x$.weight_censored + 0.001) + I(x$event == 1)*(0 - x$.pred_survival)^2/(x$.weight_censored + 0.001)) - - }) - - for (l in 1:length(eval_timepoints)) { - brier[l] <- mean(I(di==0)*(1 - preds)^2/(ipw+0.001) + I(di==1)*(0 - preds)^2/(ipw+0.001)) - - - } -} - predict_df_long <- predict_df %>% unnest(everything()) diff --git a/inst/evaluate-simulations.R b/inst/evaluate-simulations.R deleted file mode 100644 index 15abbdb..0000000 --- a/inst/evaluate-simulations.R +++ /dev/null @@ -1,199 +0,0 @@ -source(here::here("R/cv.cureitlasso.R")) -source(here::here("R/cureitlasso.R")) -source(here::here("R/simulasso.R")) -source(here::here("R/coxsplit.R")) - -# Calculate Performance Metrics on Simulations ---------------------------------- - -# * Load all relevant data ------- -load(here::here("inst", "cv_fits_50.RData")) -load(here::here("inst", "cv_fits_100.RData")) -load(here::here("inst", "sim_data_list.RData")) -load(here::here("inst", "sim_valid_data_list.RData")) - -# * Calculate Sensitivity of Variable Selection in Training Fits ------- -spe_cox_min <- c() -sen_cox_min <- c() - -x <- for (i in 1:50) { - fit <- cv_fits_list[[i]] - #fit <- x$fit - dat <- sim_data_list[[i]] - - spe_cox_min[[i]] <- 1-mean(setdiff(1:200,dat$id_cure) %in% which(fit$fit[[fit$index$min[1]]][[fit$index$min[2]]]$beta!=0)) - sen_cox_min[[i]] <- mean(dat$id_cox %in% which(fit$fit[[fit$index$min[1]]][[fit$index$min[2]]]$beta!=0)) - # spe_cox_min <- 1-mean(setdiff(1:200,dat$id_cure) %in% which(fit$fit[[fit$index$min[1]]][[fit$index$min[2]]]$beta!=0)) - - # print(sen_cox_min) - -} - - -hist(unlist(sen_cox_min)) -hist(unlist(sen_cox_min)) - -# Sensitivity and specificity (for both min and 1se choices of parameters) --- - -sen_cox_min <- mean(dat$id_cox %in% which(fit$fit[[fit$index$min[1]]][[fit$index$min[2]]]$beta!=0)) - - -spe_cure_min <- 1-mean(setdiff(1:200,dat$id_cure) %in% which(fit$fit[[fit$index$min[1]]][[fit$index$min[2]]]$alpha!=0)) -spe_cox_min <- 1-mean(setdiff(1:200,dat$id_cure) %in% which(fit$fit[[fit$index$min[1]]][[fit$index$min[2]]]$beta!=0)) - -sen_cure_1se <- mean(dat$id_cure %in% which(fit$fit[[fit$index$`1se`[1]]][[fit$index$`1se`[2]]]$alpha!=0)) -sen_cox_1se <- mean(dat$id_cox %in% which(fit$fit[[fit$index$`1se`[1]]][[fit$index$`1se`[2]]]$beta!=0)) -spe_cure_1se <- 1-mean(setdiff(1:200,dat$id_cure) %in% which(fit$fit[[fit$index$`1se`[1]]][[fit$index$`1se`[2]]]$alpha!=0)) -spe_cox_1se <- 1-mean(setdiff(1:200,dat$id_cure) %in% which(fit$fit[[fit$index$`1se`[1]]][[fit$index$`1se`[2]]]$beta!=0)) - - - - -# * Get it to work with yardstick ----- -library(yardstick) - -vec <- c(TRUE, TRUE, FALSE) -df <- data.frame( - pred = factor(as.integer(dat$id_cure %in% which(fit$fit[[fit$index$min[1]]][[fit$index$min[2]]]$alpha!=0))) - # Presence = as.integer(vec) # Convert logical values to integers (TRUE -> 1, FALSE -> 0) -) %>% - mutate(obs = factor(rep(1, length = nrow(.)), levels = c("0", "1"))) - -sens(df, obs, pred, event_level = "second") -sen_cure_min <- mean(dat$id_cure %in% which(fit$fit[[fit$index$min[1]]][[fit$index$min[2]]]$alpha!=0)) - - - -# * Brier Score ------------------------------------------------------------- -#' - `ti`- simulated survival times -#' - `d`- simulated event indicator -#' - `x` simulated data n x p matrix or data.frame -#' - `id_cure` - IDs of selected cure parameters -#' - `id_cure` - IDs of selected cox parameters -#' - `coefs_cure` - Coefficients of cure parameters -#' - `coefs_cox` - Coefficients of cox parameters -#' - `tj` - is unique event times sorted - - -for (i in 1:length(sim_data_list)) { - - dat <- sim_data_list[[i]] - dat_valid = sim_valid_data_list[[i]] - fit <- cv_fits_list[[i]] - - # for training set - ti <- dat$t - di <- dat$d - xi <- dat$x - # unique event times sorted - tj <- fit$fit[[fit$index$`min`[1]]][[fit$index$`min`[2]]]$tj - - # for test set - ti_valid <- dat_valid$t - di_valid <- dat_valid$d - xi_valid <- dat_valid$x - tj_valid <- sort(ti_valid[di_valid==1]) - - - # get final fit (hyperparam with min error) for cure model (fits used train data) - final_select_model = fit$fit[[fit$index$`min`[1]]][[fit$index$`min`[2]]] - fitcure <- final_select_model$fitcure - - # use this model to make predictions on data (fitted relative-risk for "cox) - predcure <- predict(fitcure, newx=xi, s=min(fitcure$lambda), type="response") - predcure_valid <- predict(fitcure, newx = xi_valid, s=min(fitcure$lambda), type="response") - - # get final fit of min coxcure model - fitcox <- final_select_model$fitcox - - # Get Data from Selected "Final" Prediction Model ----- - # use this model to make predictions on data - - # get linear predictors from cox - predsurvexp <- predict(fitcox, - newx=xi, - s = min(fitcox$lambda), - type="response") - - predsurvexp_valid <- predict(fitcox, - newx=xi_valid, - s = min(fitcox$lambda), - type="response") - - haz <- final_select_model$haz - cumhaz <- final_select_model$cumhaz - - # Get Kaplan Meier estimate of actual data - predcens <- survfit(Surv(ti,1-di)~1)$surv - tcens <- survfit(Surv(ti,1-di)~1)$time - # - # predcens <- survfit(Surv(ti,1-di)~1)$surv - # tcens <- survfit(Surv(ti,1-di)~1)$time - - # times at which to get score - tbrier <- sort(ti) - brier <- rep(NA,length(tbrier)) - - tbrier_valid <- sort(ti_valid) - brier_valid <- rep(NA,length(tbrier_valid)) - - # Train Data---- - for (l in 1:length(tbrier)){ - - predsurv <- rep(NA,length(ti)) # Conditional survival prob for all patients at tbrier[l] - - if (tbrier[l] >= min(tj)){ - - ids <- which(tj == max(tj[tj <= tbrier[l]])) - - # get predicted survival probability - predsurv <- exp(-cumhaz[ids]*predsurvexp) - - # weights - ipw <- as.numeric(ti > tbrier[l])*predcens[l] + as.numeric(ti <= tbrier[l])*predcens - - } else if (tbrier[l] < min(tj)){ - - predsurv <- rep(1,length(ti)) - ipw <- 1 - - } - - preds <- 1 - predcure + predcure*predsurv - brier[l] <- mean(I(di==0)*(1 - preds)^2/(ipw+0.001) + I(di==1)*(0 - preds)^2/(ipw+0.001)) - - } - - # Test Data---- - for (l in 1:length(tbrier)){ - - predsurv_valid <- rep(NA,length(ti_valid)) # Conditional survival prob for all patients at tbrier[l] - - if (tbrier_valid[l] >= min(tj_valid)){ - - ids_valid <- which(tj_valid == max(tj_valid[tj_valid <= tbrier_valid[l]])) - - # get predicted survival probability - predsurv_valid <- exp(-cumhaz[ids_valid]*predsurvexp_valid) - - # weights - ipw_valid <- as.numeric(ti_valid > tbrier_valid[l])*predcens[l] + as.numeric(ti_valid <= tbrier_valid[l])*predcens - - }else if (tbrier_valid[l] < min(tj_valid)){ - - predsurv_valid <- rep(1,length(ti_valid)) - ipw_valid <- 1 - - } - - preds_valid <- 1 - predcure_valid + predcure_valid*predsurv_valid - brier_valid[l] <- mean(I(di_valid==0)*(1 - preds_valie)^2/(ipw_valid + 0.001) + I(di_valid==1)*(0 - preds_valid^2/(ipw_valid+0.001)) - - - - all_brier = cbind.data.frame("tbrier" = tbrier, "brier" = brier, "brier_valid" = brier_valid) - return(all_brier) - # plot(tbrier,brier,type="S") - - - } -} diff --git a/inst/inst-README.md b/inst/inst-README.md new file mode 100644 index 0000000..396daaf --- /dev/null +++ b/inst/inst-README.md @@ -0,0 +1,15 @@ +# inst + +- +- `compare-predict-brier-with-yardstick_P2.R` - compare yardstick prediction and brier using our simulation data. Using new predict_cure() compare Teng's manual brier score calc to yardstick brier score calc + + +- `compare-predict-brier-with-yardstick_P1.R` - compare yardstick prediction and brier using their example data + +- `compare-predict-brier-with-yardstick_P1.R` - misc comparison code (need to review this) + +- `run-simulations.R` - produce simulated data for lasso cure models + +- `archive` + - `evaluate-simulations.R` - old code to calculate brier score on simulations (modified from Teng) + - `testcode.R` - old code to calculate brier score \ No newline at end of file diff --git a/inst/testcode.R b/inst/testcode.R deleted file mode 100644 index 4ecfc25..0000000 --- a/inst/testcode.R +++ /dev/null @@ -1,48 +0,0 @@ -#### Evaluate the calibration by Brier score for the validation data -#### Line 79 - 123: Brier score for the cureit model - -# ti <- dat_valid$t -# di <- dat_valid$d -# xi <- dat_valid$x -ti <- dat$t -di <- dat$d -xi <- dat$x -tj <- fit$fit[[fit$index$`min`[1]]][[fit$index$`min`[2]]]$tj - -fitcure <- fit$fit[[fit$index$`min`[1]]][[fit$index$`min`[2]]]$fitcure -predcure <- predict(fitcure,newx=xi,s=min(fitcure$lambda),type="response") -fitcox <- fit$fit[[fit$index$`min`[1]]][[fit$index$`min`[2]]]$fitcox -predsurvexp <- predict(fitcox,newx=xi,s=min(fitcox$lambda),type="response") -haz <- fit$fit[[fit$index$`min`[1]]][[fit$index$`min`[2]]]$haz -cumhaz <- fit$fit[[fit$index$`min`[1]]][[fit$index$`min`[2]]]$cumhaz - -predcens <- survfit(Surv(ti,1-di)~1)$surv -tcens <- survfit(Surv(ti,1-di)~1)$time - -tbrier <- sort(ti) -brier <- rep(NA,length(tbrier)) -ipw <- rep(NA,length(tbrier)) -for (l in 1:length(tbrier)){ - - predsurv <- rep(NA,length(ti)) # Conditional survival prob for all patients at tbrier[l] - - if (tbrier[l] >= min(tj)){ - - ids <- which(tj == max(tj[tj <= tbrier[l]])) - predsurv <- exp(-cumhaz[ids]*predsurvexp) - ipw <- as.numeric(ti > tbrier[l])*predcens[l] + as.numeric(ti <= tbrier[l])*predcens - - }else if (tbrier[l] < min(tj)){ - - predsurv <- rep(1,length(ti)) - ipw <- 1 - - } - - ipw[l] <- ipw - preds <- 1 - predcure + predcure*predsurv - brier[l] <- mean(I(di==0)*(1 - preds)^2/(ipw+0.001) + I(di==1)*(0 - preds)^2/(ipw+0.001)) - -} - -plot(tbrier,brier,type="S") \ No newline at end of file From 232e13428e5fb09aff6553351e8878056ae72dab Mon Sep 17 00:00:00 2001 From: karissawhiting Date: Thu, 6 Jun 2024 10:53:20 -0400 Subject: [PATCH 10/11] update info on scripts --- inst/inst-README.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/inst/inst-README.md b/inst/inst-README.md index 396daaf..7cf4a72 100644 --- a/inst/inst-README.md +++ b/inst/inst-README.md @@ -1,6 +1,6 @@ # inst -- + - `compare-predict-brier-with-yardstick_P2.R` - compare yardstick prediction and brier using our simulation data. Using new predict_cure() compare Teng's manual brier score calc to yardstick brier score calc @@ -10,6 +10,9 @@ - `run-simulations.R` - produce simulated data for lasso cure models +- `nomogram-custom-edits.R` - custom edits to nomogram for cureit paper + - `archive` - `evaluate-simulations.R` - old code to calculate brier score on simulations (modified from Teng) - - `testcode.R` - old code to calculate brier score \ No newline at end of file + - `testcode.R` - old code to calculate brier score + From e693f4ca871cbe08509581da18e26ca1e604c897 Mon Sep 17 00:00:00 2001 From: karissawhiting Date: Thu, 6 Jun 2024 11:00:14 -0400 Subject: [PATCH 11/11] organize --- .../compare-predict-brier-with-yardstick_P2.R | 63 ++++++++++++------- ...compare-predict-brier-with-yardstick_P3.R} | 0 2 files changed, 39 insertions(+), 24 deletions(-) rename inst/{compare-predict-brier-with-yardstick_P3.R.R => compare-predict-brier-with-yardstick_P3.R} (100%) diff --git a/inst/compare-predict-brier-with-yardstick_P2.R b/inst/compare-predict-brier-with-yardstick_P2.R index d281510..de5c1f3 100644 --- a/inst/compare-predict-brier-with-yardstick_P2.R +++ b/inst/compare-predict-brier-with-yardstick_P2.R @@ -46,6 +46,7 @@ get_brier_for_each_data <- function(dat, fit) { # * Version 1 ----- +# iterates over eval time for (i in 1:100) { time_1 <- predict_df_long$.eval_time[[i]] @@ -57,15 +58,15 @@ for (i in 1:100) { I(predict_df_long_1$event == 1 & predict_df_long_1$times <= time_1)* (0 - predict_df_long_1$.pred_survival)^2*(predict_df_long_1$.weight_censored)) - # OLD ----- + # OLD -- # x[i] <- mean(I(predict_df_long_1$event == 0)*(1 - predict_df_long_1$.pred_survival)^2/(predict_df_long_1$.weight_censored + 0.001) + # I(predict_df_long_1$event ==1)*(0 - predict_df_long_1$.pred_survival)^2/(predict_df_long_1$.weight_censored + 0.001)) # - # OLD ---- + # OLD -- # brier[l] <- mean(I(di==0)*(1 - preds)^2/(ipw+0.001) + # I(di==1)*(0 - preds)^2/(ipw+0.001)) # - # NEW ?----- + # NEW ?-- # brier[l] <- mean(I(ti > tbrier[l])*(1 - preds)^2/(ipw+0.001) + # I(di==1 & ti <= tbrier[l])*(0 - preds)^2/(ipw+0.001)) @@ -75,6 +76,7 @@ for (i in 1:100) { # * Version 2 ----- +# simple version of above- I think I can delete brier[l] <- mean(I(ti > tbrier[l])* (1 - preds)^2/(ipw+0.001) + @@ -82,8 +84,7 @@ brier[l] <- mean(I(ti > tbrier[l])* # * Version 3 ----- - -brier_cure <- function(predict_df, truth = times, eval_timepoints = NULL) { +brier_cure_update <- function(predict_df, truth = times, eval_timepoints = NULL) { # Get values from observed data @@ -111,36 +112,46 @@ brier_cure <- function(predict_df, truth = times, eval_timepoints = NULL) { } } -# * Version 4 ----- -brier_cure_update <- function(predict_df, truth = times, eval_timepoints = NULL) { +# * Version 4 ---- + +brier_for_each_eval <- function(predict_df_timepoint) { - # Get values from observed data - survival_times = predict_df$times - sorted_survival_times <- sort(survival_times) - sorted_event_times = sort(data$times[data$event==1]) + x = mean(I(predict_df_timepoint$times > predict_df_timepoint$.eval_time )*(1 - predict_df_timepoint$.pred_survival)^2/(predict_df_timepoint$.weight_censored+0.001) + + I(predict_df_timepoint$event == 1 & predict_df_timepoint$times <= predict_df_timepoint$.eval_time )* + (0 - predict_df_timepoint$.pred_survival)^2/(predict_df_timepoint$.weight_censored+0.001)) + x +} + + +brier_for_each_eval <- function(predict_df_timepoint) { - predict_df_long <- predict_df %>% - unnest(everything()) - - predict_df_nest_time <- predict_df_long %>% - nest(data = -.eval_time) + x = mean( + I(predict_df_timepoint$times > predict_df_timepoint$.eval_time)*(1 - predict_df_timepoint$.pred_survival)^2/(predict_df_timepoint$.weight_censored+0.001) + + + I(predict_df_timepoint$event == 1 & predict_df_timepoint$times <= predict_df_timepoint$.eval_time )* + (0 - predict_df_timepoint$.pred_survival)^2/(predict_df_timepoint$.weight_censored+0.001) + ) + x/100 +} + +brier_for_each_eval2 <- function(predict_df_timepoint) { + category1 = I(predict_df_timepoint$event == 1 & predict_df_timepoint$times <= predict_df_timepoint$.eval_time ) + category2 = I(predict_df_timepoint$times > predict_df_timepoint$.eval_time) - map(predict_df_nest_time$data, function(x) { - mean(I(x$event == 0)*(1 - x$.pred_survival)^2/(x$.weight_censored + 0.001) + I(x$event == 1)*(0 - x$.pred_survival)^2/(x$.weight_censored + 0.001)) + x = mean( + category1*(0 - predict_df_timepoint$.pred_survival)^2/(predict_df_timepoint$.weight_censored) + + category2 *(1 - predict_df_timepoint$.pred_survival)^2/(predict_df_timepoint$.weight_censored) - }) + ) + x - for (l in 1:length(eval_timepoints)) { - brier[l] <- mean(I(di==0)*(1 - preds)^2/(ipw+0.001) + I(di==1)*(0 - preds)^2/(ipw+0.001)) - - - } } + # Example on Single Data Set ------------------------------------------------- # Grab one data set as an example @@ -150,6 +161,7 @@ final_fit <- fit$fit[[fit$index$`min`[1]]][[fit$index$min[2]]] # Run predict functon predict_df <- predict_cure(final_fit = final_fit, new_data = data) + predict_df_long <- predict_df %>% unnest(everything()) @@ -160,6 +172,9 @@ brier_scores_yardstick <- brier_survival(truth = truth, .pred) +# Example on Multiple Data ------------------------------------------------ + + b <- get_brier_for_each_data(dat = sim_data_list[[3]], fit = cv_fits_list[[3]]) diff --git a/inst/compare-predict-brier-with-yardstick_P3.R.R b/inst/compare-predict-brier-with-yardstick_P3.R similarity index 100% rename from inst/compare-predict-brier-with-yardstick_P3.R.R rename to inst/compare-predict-brier-with-yardstick_P3.R