diff --git a/.Rbuildignore b/.Rbuildignore index 241c55c..0317850 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -13,4 +13,8 @@ ^.*[.]Rproj.*$ ^\.Rproj\.user$ ^.Rhistory$ -[~] \ No newline at end of file +[~] +^R/cv.cureitlasso.R +^R/simulasso.R +^R/coxsplit.R +^R/run-simulations.R \ No newline at end of file diff --git a/.gitignore b/.gitignore index 3f26a3f..22b011a 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,7 @@ inst/doc inst/model-fits inst/cure-model-fits +inst/cureit-simulation-results/* inst/archive docs/ revdep/ diff --git a/R/cureitlasso.R b/R/cureitlasso.R index ba82871..a4f391f 100644 --- a/R/cureitlasso.R +++ b/R/cureitlasso.R @@ -1,3 +1,47 @@ + +# mu is penlanlity for cure +# lamda is penality for cox +# touch is posterior probabilty of being cured fr uncured (maybe don't need) (Entropy) + # NOTE maybe change name of alphs +# calculated linear peredictor of that iteration + + +#' Lasso for Cure Models +#' +#' @param t Survival times +#' @param d Event indicator +#' @param x data n x p matrix or data.frame +#' @param minmu.ratio minimum penalty value in the logistic model +#' @param minlambda.ratio minimum penalty value in the cox model +#' @param adaptive +#' @param length.grid +#' @param mus penalty for cure +#' @param lambdas is penalty for cox +#' @param tol posterior probability of being cured for uncured (maybe don't need) (Entropy) +#' @param maxiter +#' @param progress verbose iteration counter +#' +#' @return +#' - `all_fits` +#' - `t`, +#' - `tj` - unique event times sorted, +#' - `d`- event indicator, +#' - `x`- data n x p matrix or data.frame, +#' - `alpha0` fitcure$a0[length(fitcure$lambda)], +#' - `alpha` =alpha, +#' - `beta` =beta, +#' - `haz`=haz, +#' - `cumhaz`=cumhaz, +#' - `tau`=as.vector(tau), +#' - `predsurv`=as.vector(predsurv_iter), +#' - `predcure`=as.vector(predcure_iter), +#' - `fitcure`=fitcure, +#' - `fitcox` =fitcox +#' - `mus` - mus that were used as penalties for cure models in cross validation +#' - `lambda` - lambdas that were used as penalities in cix model cross validation +#' @export +#' +#' @examples cureitlasso <- function(t, d, x, @@ -15,6 +59,7 @@ cureitlasso <- function(t, require(glmnet) require(survival) + # initialize initial weights tau <- matrix(0.5,nrow=length(t),ncol=2) # First column: cured; second column: uncured tau[d==1,1] = 0 tau[d==1,2] = 1 @@ -24,6 +69,7 @@ cureitlasso <- function(t, # CV: Be careful for fold split! Two replicates from the same subject should always be assigned to the same fold + #this is for later integreation with tuning penalty.factor.cure=rep(1,ncol(x)) penalty.factor.cox=rep(1,ncol(x)) @@ -51,6 +97,8 @@ cureitlasso <- function(t, # # } + # So this section is finding the set of optimal hyperparemters to test ---- + # tau is uncure probability fitcure0 <- glmnet(x=do.call("rbind", rep(list(x), 2)), y=lab_class, family="binomial", @@ -72,8 +120,10 @@ cureitlasso <- function(t, lambdas <- fitcox0$lambda[idx_lambdas] } + musmax <- fitcure0$lambda[1] lambdasmax <- fitcox0$lambda[1] + # ----------------- predcure <- predict(fitcure0,newx=x,s=mus,type="response") #Prob of uncured predsurvexp <- predict(fitcox0,newx=x,s=lambdas,type="response") # exp(betaX) @@ -114,6 +164,7 @@ cureitlasso <- function(t, fit[[i]] <- list() + for (j in 1:length(lambdas)){ tau <- d + (1-d) * (predcure[,i] * predsurv[,j])/( (1 - predcure[,i]) + predcure[,i] * predsurv[,j] ) @@ -319,14 +370,24 @@ cureitlasso <- function(t, # num_alpha[i,j] <- sum(alpha!=0) # num_beta[i,j] <- sum(beta!=0) + # names(fit[i][j]) <- paste0("mu_fit_", i, "_lambda_fit_", "j") + } + + names(fit[[i]]) <- paste0("lambda_fit_", 1:length(lambdas)) } - return(list(fit=fit, + names(fit) <- paste0("mu_fit_", 1:length(mus)) + + return(list(fit = fit, # num_alpha=num_alpha, # num_beta=num_beta, mus=mus, lambdas=lambdas)) -} \ No newline at end of file +} + +# tj - is unique event times sorted +# haz + diff --git a/R/cv.cureitlasso.R b/R/cv.cureitlasso.R index 991f3ba..90e418f 100644 --- a/R/cv.cureitlasso.R +++ b/R/cv.cureitlasso.R @@ -1,10 +1,14 @@ +# Returns: +# fit - +# arg: length.grid - + cv.cureitlasso <- function(t, d, x, minmu.ratio=0.05, # minimum penalty value in the logistic model minlambda.ratio=0.05, # minimum penalty value in the cox model adaptive=FALSE, - length.grid=10, + length.grid=10, # tells how many hyperparameters to test. default is 10 for each nfolds=5, tol=1e-2, maxiter=100, @@ -22,6 +26,7 @@ cv.cureitlasso <- function(t, require(survcomp) require(pracma) + # order data by times order_idx <- order(t) t <- t[order_idx] d <- d[order_idx] @@ -29,6 +34,7 @@ cv.cureitlasso <- function(t, if (progress) print("Fitting by EM algorithm ...") + # main model with no cross validation fit <- cureitlasso(t,d,x, minmu.ratio, minlambda.ratio, @@ -38,7 +44,7 @@ cv.cureitlasso <- function(t, lambdas=NULL, tol, maxiter, - progress) + progress = TRUE) if (progress) print("Running cross validations ...") @@ -46,6 +52,8 @@ cv.cureitlasso <- function(t, # Fold split foldid <- coxsplit(as.matrix(Surv(t,d)), nfolds) + + # grid/array of values cv_brier <- array(NA,dim=c(length.grid,length.grid,nfolds)) # Run CVs @@ -57,6 +65,7 @@ cv.cureitlasso <- function(t, if (progress) print(i) + cv.fit[[i]] <- cureitlasso(t[foldid != i], d[foldid != i], x[foldid != i,], @@ -114,6 +123,7 @@ cv.cureitlasso <- function(t, } + # calcualte trapazoidal area for brier score cv_brier[j,k,i] <- trapz(tbrier,brier) @@ -130,7 +140,7 @@ cv.cureitlasso <- function(t, cv.fit <- foreach(i = 1:nfolds) %dopar% { - source("~/Projects/Whiting-Qin-cureit/cureit/R/cureitlasso.R") + source(here::here("R/cureitlasso.R")) cureitlasso(t[foldid != i], d[foldid != i], @@ -206,6 +216,7 @@ cv.cureitlasso <- function(t, } + # summarize - take average across all folds cv_brier_mean <- apply(cv_brier,c(1,2),function(x) mean(x)) cv_brier_se <- apply(cv_brier,c(1,2),function(x) sd(x)/sqrt(nfolds)) # pheatmap::pheatmap(cv_brier_mean,cluster_cols = F,cluster_rows = F) @@ -231,11 +242,12 @@ cv.cureitlasso <- function(t, cv_brier_mean = cv_brier_mean, cv_brier_se = cv_brier_se, index = list(min=idxmin, + `1se`= idx1se), `1se`= idx1se), foldid = foldid, selected = list(min=selectedmin, `1se`=selected1se) ) - ) -} \ No newline at end of file + +} diff --git a/R/predict.R b/R/predict.R new file mode 100644 index 0000000..087eee0 --- /dev/null +++ b/R/predict.R @@ -0,0 +1,183 @@ +#' #' Predicted survival probability by cure model +#' #' +#' #' @param object A cureit object. +#' #' @param times Numeric vector of times to obtain survival probability estimates at +#' #' @param probs Numeric vector of quantiles to obtain estimates at +#' #' @param newdata A `base::data.frame()` or `tibble::tibble()` containing all +#' #' the original predictors used to create object. Defaults to `NULL`. +#' #' @param type Output format of predicted values: "lp" (linear predictor) or "prob" (predicted probabilities). +#' #' @param brier Boolean indicator of calculating the Brier scores at specified `times`. +#' #' @param cox Boolean indicator of fitting the Cox model for the training data and calculating the Brier scores at specified `times` for `newdata`. +#' #' @inheritParams cureit +#' #' @return named list of prediction estimates +#' #' @family cureit() functions +#' #' @export +#' #' @examples +#' #' p <- cureit(surv_formula = Surv(ttdeath, death) ~ age, +#' #' cure_formula = ~ age, +#' #' data = trial) +#' #' +#' #' pred <- predict(p, times = seq(5,24,0.5), +#' #' newdata = trial[complete.cases(trial), ], brier=TRUE,cox=TRUE) +#' #' +#' #' plot(seq(5,24,0.5),pred$brier,type="S",pch=1) +#' #' lines(seq(5,24,0.5),pred$brier_cox,type="S",col="red",pch=3) +#' #' legend("topright",c("Cure model","Cox model"), +#' #' col=c("black","red"),lty=1,pch=c(1,3)) +#' #' +#' #' +#' predict.cureit <- function(object, times = NULL, probs = NULL, +#' newdata = NULL, type = c("prob", "lp"), +#' brier = FALSE, cox = FALSE, ...) { +#' # checking inputs ------------------------------------------------------------ +#' +#' type <- match.arg(type) +#' +#' if (is.null(times) + is.null(probs) != 1L) { +#' stop("Must specify one and only one of `times=` and `probs=`.", call. = FALSE) +#' } +#' if (!is.null(times) && any(times < 0)) { +#' stop("`times=` must be non-negative.", call. = FALSE) +#' } +#' if (!is.null(probs) && !all(dplyr::between(probs, 0, 1))) { +#' stop("`probs=` must be between 0 and 1.", call. = FALSE) +#' } +#' if (!(type %in% c("lp","prob"))) { +#' stop("`type=` must be one out of 'lp' or 'prob'.", call. = FALSE) +#' } +#' # if (type == "lp" & brier){ +#' # stop("`type=` must be 'prob' to calculate the Brier scores.") +#' # } +#' # if (brier & is.null(times)){ +#' # stop("Must specify one or more times to calculate the Brier scores.") +#' # } +#' # if (!brier & cox){ +#' # stop("Must let `brier=TRUE` to perform Cox regression and corresponding Brier scores.") +#' # } +#' +#' # getting predictions on the original model fit ------------------------------ +#' processed <- cureit_mold(object$surv_formula, object$cure_formula, newdata %||% object$data, +#' surv_blueprint = object$surv_blueprint, cure_blueprint = object$cure_blueprint) +#' +#' newX=processed$surv_processed$predictors +#' newZ=processed$cure_processed$predictors +#' if (is.vector(newZ)) +#' newZ = as.matrix(newZ) +#' newZ = cbind(1, newZ) +#' if (is.vector(newX)) +#' newX = as.matrix(newX) +#' +#' s0 = as.matrix(object$smcure$s[order(object$smcure$Time)], ncol = 1) +#' n = nrow(s0) +#' uncureprob = exp(object$smcure$b %*% t(newZ))/(1 + exp(object$smcure$b %*% t(newZ))) +#' +#' scure = array(0, dim = c(n, nrow(newX))) +#' t = array(0, dim = c(n, nrow(newX))) +#' spop = array(0, dim = c(n, nrow(newX))) +#' +#' #### Only support PH model for now +#' ebetaX = exp(object$smcure$beta %*% t(newX)) +#' for (i in 1:nrow(newZ)) { +#' scure[, i] = s0^ebetaX[i] +#' } +#' +#' for (i in 1:n) { +#' for (j in 1:nrow(newX)) { +#' spop[i, j] = uncureprob[j] * scure[i, j] + (1 - +#' uncureprob[j]) +#' } +#' } +#' +#' spop_prd = cbind(object$smcure$Time, spop) +#' scure_prd = cbind(object$smcure$Time, scure) +#' cure_prd = 1-uncureprob +#' +#' # Type = "Prob --------- +#' if (type == "prob"){ +#' +#' if (!is.null(times)) { +#' return(list(cured = cure_prd, +#' surv_uncured = probs_at_times(scure_prd,times), +#' surv_marginal = probs_at_times(spop_prd,times))) +#' } +#' +#' if (!is.null(probs)) { +#' return(list(cured = cure_prd, +#' surv_uncured = times_at_probs(scure_prd,probs), +#' surv_marginal = times_at_probs(spop_prd,probs))) +#' } +#' } +#' +#' # Type = "Prob --------- +#' if (type == "lp"){ +#' +#' return(list(lp_cure_model = object$smcure$b %*% t(newZ), +#' lp_surv_model = object$smcure$beta %*% t(newX))) +#' +#' } +#' +#' } +#' +#' times_at_probs <- function(matrix_pred, probs) { +#' matrix_zero <- matrix(c(0, 1), ncol = 2) +#' lst_time_risk <- +#' purrr::map( +#' probs, +#' ~ purrr::map_dbl( +#' seq_len(ncol(matrix_pred) - 1L), +#' function(i) { +#' # adding 1 to the matrix, keeping only the time column, and col of interest +#' m <- rbind(matrix_zero, matrix_pred[, c(1L, i + 1L)]) +#' +#' # return NA if the quantiles are all missing OR prob is larger than observed +#' if (isTRUE(all(is.na(m[-1, 2])) || .x > max(m[, 2]))) { +#' return(NA) +#' } +#' if (isTRUE(.x %in% m[, 2])) { +#' return(m[m[, 2] %in% .x, 1]) +#' } +#' return(m[which.min(m[, 2] >= .x), 1]) +#' } +#' ) +#' ) %>% +#' stats::setNames(paste0("prob ", probs * 100, "%")) +#' +#' # returning results ---------------------------------------------------------- +#' lst_time_risk +#' } +#' +#' +#' +#' probs_at_times <- function(matrix_pred, times) { +#' # defining times for predictions --------------------------------------------- +#' # CHECK- here should this be warning? +#' all_times <- union(0, matrix_pred[, 1]) %>% sort() +#' if (isTRUE(max(times) > max(all_times))) { +#' stringr::str_glue("`times=` cannot be larger than {max(all_times)}") %>% +#' stop(call. = FALSE) +#' } +#' times_obs <- +#' purrr::map_dbl( +#' times, +#' function(.x) { +#' if (isTRUE(.x %in% all_times)) { +#' return(all_times[all_times %in% .x]) +#' } +#' all_times[which.min(all_times <= .x) - 1L] +#' } +#' ) +#' +#' # named list of the risks, the names are the times, +#' # the values are the estimates of risk at the covar levels +#' lst_risk_time <- +#' purrr::map(seq_len(length(all_times) - 1L), ~ matrix_pred[.x, -1]) %>% +#' stats::setNames(all_times[-1]) %>% +#' dplyr::bind_cols() %>% +#' mutate(`0` = 0, .before = 1) %>% +#' as.list() %>% +#' stats::setNames(all_times) +#' +#' # extracting risks at specified time ----------------------------------------- +#' lst_risk_time[as.character(times_obs)] %>% +#' stats::setNames(paste("time", times)) +#' } diff --git a/R/predict_cure_2024-03-15.R b/R/predict_cure_2024-03-15.R new file mode 100644 index 0000000..7d0c721 --- /dev/null +++ b/R/predict_cure_2024-03-15.R @@ -0,0 +1,171 @@ +#' Predict Cure and Survival Probabilities +#' +#' This function predicts cure and survival probabilities for data based on a final fitted model. +#' It calculates predicted survival probabilities at specified evaluation time points, accounting for censoring. +#' Output is consistent with tidymodels predict guidelines and can be passed into yardstick functions like `brier()` +#' +#' @param final_fit A list containing the fitted cure and survival models. It should have elements `fitcure`, `fitcox`, `haz`, and `cumhaz`. +#' @param new_data A data frame containing the new observations to predict on. It should include columns `times` (observed survival times), `event` (event indicator, 1 for event, 0 for censoring), and other covariates used for prediction. +#' @param eval_timepoints Optional. A vector of time points at which to evaluate survival probabilities. If NULL, the function uses the observed event times from `new_data`. +#' +#' @return A data frame containing the predicted survival probabilities, cure rates, and other related metrics for each observation at the specified evaluation time points. The returned data frame includes the following columns: +#' \itemize{ +#' \item \code{id}: Identifier for each observation. +#' \item \code{.eval_time}: Evaluation time point. +#' \item \code{.pred_survival}: Predicted survival probability at the evaluation time point. +#' \item \code{.pred_curerate}: Predicted cure rate for each observation. +#' \item \code{.pred_survportion}: Component of the survival prediction that excludes the cure component. +#' \item \code{.weight_censored_raw}: Raw censoring weight at each evaluation time point. +#' \item \code{.weight_censored}: Inverse probability weight for censoring. +#' \item \code{event}: Event indicator from the new data. +#' \item \code{times}: Observed survival times from the new data. +#' } +#' +#' @examples +#' \dontrun{ +#' # Assuming `final_fit` is a fitted model list and `new_data` is a data frame with appropriate structure: +#' predict_df <- predict_cure(final_fit = final_fit, new_data = new_data) +#' print(predict_df) +#' } +#' +#' @importFrom dplyr select mutate bind_cols +#' @importFrom tidyr nest unnest +#' @importFrom tibble tibble +#' +#' @export +predict_cure <- function(final_fit, + new_data = NULL, + eval_timepoints = NULL) { + + # Values from Observed Data & Model Fit --------------------------------- + + survival_times = new_data$times + events = new_data$event + sorted_survival_times <- sort(survival_times) + sorted_event_times = sort(new_data$times[new_data$event==1]) + + # Extract cumulative hazard from final fit (for event times) + haz <- final_fit$haz + cumhaz <- final_fit$cumhaz + + # Set up Data + data_x <- new_data %>% + select(-c(times, event, truth_survival)) %>% + as.matrix() + + # Evaluation Time Points -------------------------------------------------- + # Default evaluation time points are observed event times if not supplied by user + + all_eval_timepoints <- eval_timepoints %||% sorted_survival_times + + # Censoring Weights -------------------------------------------------------- + + censoring_survfit = survfit(Surv(survival_times, 1 - events) ~ 1) + + # CHECK- or censoring weights at all all respective survival times? + # censoring_weights = summary(censoring_survfit, times = all_eval_timepoints) + censoring_weights = summary(censoring_survfit, times = survival_times)$surv + + + # Relative Risk for Cure and Cox Risk ------------------------------------ + # (independent of time point) + + # * Cure probability risk prediction ------ + fitcure <- final_fit$fitcure + predcure <- predict(fitcure, newx = data_x, + s = min(fitcure$lambda), type = "response") + + # * Cox survival probability risk prediction ------ + fitcox <- final_fit$fitcox + predsurvexp <- predict(fitcox, + newx = data_x, + s = min(fitcox$lambda), + type = "response") + + + + # Predicted Probabilities at Evaluated Time points ------------------------- + # (dependent on time point- need IPCW) + + all_preds_all_tps <- list() + + # Evaluated in all patients, for each eval time point (iteration on time point) + for (timepoint_index in 1:length(all_eval_timepoints)) { + + # timepoint we are evaluating + eval_timepoint <- all_eval_timepoints[timepoint_index] + + + # * Inverse Probability Censoring Weights --------- + + # If no time points in new data less than lowest event time in model fit data (most cases): + if (eval_timepoint >= min(sorted_event_times)) { + + # get maximum event time that is less than time point we are evaluating + closest_event_time <- max( + sorted_event_times[sorted_event_times <= eval_timepoint]) + + index_closest_event_time <- which(sorted_event_times == closest_event_time) + + # Get predicted survival by using cumulative hazard at closest previous event time + predsurv <- exp(-cumhaz[index_closest_event_time]*predsurvexp) + + # CHECK - Probability of the censoring at evaluation time point (or should it be at previous index closest event time??) + # censoring_weight_at_eval_time <- censoring_weights[timepoint_index] + censoring_weight_at_eval_time <- censoring_weights[index_closest_event_time] + + ipw <- + # if observed time is greater than event time, use censoring weight at evaluation time (or time closest) + (as.numeric(survival_times > eval_timepoint)*censoring_weight_at_eval_time) + + + # if observed time is less than or equal to eval time, use censoring weight at it's own observed time + as.numeric(survival_times <= eval_timepoint)*censoring_weights + + ipw = 1/(ipw+ 0.001) + + # if no events occurred before eval time + } else if (eval_timepoint < min(sorted_event_times)) { + + predsurv <- rep(1,length(survival_times)) + ipw <- 1 + + } + + preds <- 1 - predcure + predcure*predsurv + + all_preds <- tibble( + + ".eval_time" = eval_timepoint, + ".pred_survival" = preds, + ".pred_curerate" = predcure, + ".pred_survportion" = predsurv, + ".weight_censored_raw" = censoring_weights[timepoint_index], + ".weight_censored" = ipw) %>% + mutate(id = 1:nrow(.)) + + all_preds_nest <- all_preds %>% nest(.preds = -c(id)) + + all_preds_all_tps[[timepoint_index]] <- all_preds + + } + + all_pred_df <- all_preds_all_tps %>% + do.call("rbind", .) + + all_pred_df <- all_pred_df %>% + nest(.pred = -c(id)) %>% + bind_cols(., "event" = new_data$event, + "times" = new_data$times) + + return(all_pred_df) + +} + + +# Try Function ------------------------------------------------------------ +# +# predict_df <- predict_cure(final_fit = final_fit, new_data = data) +# predict_df_long <- predict_df %>% +# unnest(everything()) + + diff --git a/R/simulasso.R b/R/simulasso.R index 91d6998..53ae132 100644 --- a/R/simulasso.R +++ b/R/simulasso.R @@ -1,28 +1,57 @@ -simulasso <- function(n=100, - p=200, - id_cure=NULL, - id_cox=NULL, - coefs_cure=NULL, - coefs_cox=NULL - ){ - +#' Simulate Data +#' +#' @param n Number of patients to simulate +#' @param p Number of parameters to simulate +#' @param id_cure Which parameters to include in cure simulation +#' @param id_cox Which parameters to include in cox simulation +#' @param coefs_cure Pre-specified coefficients of cure +#' @param coefs_cox Pre-specified Coefficients of cox +#' @param n_true_p Number of "true" non-zero coefficients +#' +#' @return +#' - `t`- simulated survival times +#' - `d`- simulated event indicator +#' - `x` simulated data n x p matrix or data.frame +#' - `id_cure` - IDs of selected cure parameters +#' - `id_cure` - IDs of selected cox parameters +#' - `coefs_cure` - Coefficients of cure parameters +#' - `coefs_cox` - Coefficients of cox parameters +#' +#' @export +#' +#' @examples +#' simulasso(n = 10, p = 3) +simulasso <- function(n = 100, + p = 200, + id_cure = NULL, + id_cox = NULL, + coefs_cure = NULL, + coefs_cox = NULL, + n_true_p = 10) { require(mvtnorm) + + t <- rep(NA, length = n) + d <- rep(NA, length = n) + + # number of predictors is size + if (is.null(id_cure)) id_cure <- sample(p, size = n_true_p) + if (is.null(id_cox)) id_cox <- sample(p, size = n_true_p) + + if (is.null(coefs_cure)) coefs_cure <- runif(length(id_cure) + 1, min = 2, max = 3) + if (is.null(coefs_cox)) coefs_cox <- runif(length(id_cox), min = 2, max = 3) + + x <- mvtnorm::rmvnorm(n, mean = rep(0, p)) + + # add intercept and calculate linear predictor and probability of cure using the logistic function: - t <- rep(NA,length=n) - d <- rep(NA,length=n) - - if(is.null(id_cure)) id_cure = sample(p,size=3) - if(is.null(id_cox)) id_cox = sample(p,size=3) - - if(is.null(coefs_cure)) coefs_cure = runif(length(id_cure)+1,min=2,max=3) - if(is.null(coefs_cox)) coefs_cox = runif(length(id_cox),min=2,max=3) - - x <- mvtnorm::rmvnorm(n,mean=rep(0,p)) - puncure <- exp(cbind(1,x[,id_cure]) %*% coefs_cure)/(1+exp(cbind(1,x[,id_cure]) %*% coefs_cure)) - uncure <- rbinom(n,1,prob=puncure) - - lambda <- exp(x[,id_cox] %*% coefs_cox) + puncure <- exp(cbind(1, x[, id_cure]) %*% coefs_cure) / (1 + exp(cbind(1, x[, id_cure]) %*% coefs_cure)) + # make into event index + uncure <- rbinom(n, 1, prob = puncure) + + # linear predictor survival + lp_cox <- exp(x[, id_cox] %*% coefs_cox) + ### Should censoring time for cured subjects be generated from a same distribution as those uncured? for(i in 1:n){ @@ -31,8 +60,12 @@ simulasso <- function(n=100, c0 <- min(rexp(1,rate=1/30),runif(1,min=20,max=40)) t[i] <- ifelse(uncure[i]==1, min(t0,c0), c0) d[i] <- ifelse(uncure[i]==1, as.numeric(I(t0 <= c0)), 0) + } - - dat <- list(t=t,d=d,x=x,uncure=uncure,id_cure=id_cure,id_cox=id_cox,coefs_cure=coefs_cure,coefs_cox=coefs_cox) - -} \ No newline at end of file + + dat <- list(t = t, d = d, x = x, + uncure = uncure, + id_cure = id_cure, + id_cox = id_cox, coefs_cure = coefs_cure, + coefs_cox = coefs_cox) +} diff --git a/inst/compare-predict-brier-with-yardstick_P1.R b/inst/compare-predict-brier-with-yardstick_P1.R new file mode 100644 index 0000000..db759a3 --- /dev/null +++ b/inst/compare-predict-brier-with-yardstick_P1.R @@ -0,0 +1,192 @@ +library(tidymodels) +library(censored) +tidymodels_prefer() + + +# Try Tidymodels Example -------------------------------------------------- + + +# * Fit Lung Model -------- + +data(cancer, package="survival") +data(lung_surv) + +#lung <- lung %>% drop_na() + +lung <- lung %>% + mutate(status = case_when( + status == 1 ~ 0, + status == 2 ~ 1 + )) + + +ph_spec <- + proportional_hazards() %>% + set_engine("survival") %>% + set_mode("censored regression") + +ph_spec + +set.seed(1) +ph_fit <- ph_spec %>% fit(Surv(time, status) ~ ., data = lung) +ph_fit + +cox <- coxph(Surv(time, status) ~ ., data = lung) +cox +cumhaz_all<- survfit(cox) +cumhaz <- bind_cols(time = cumhaz_all$time, surv = cumhaz_all$surv) + +#cumhaz$surv[which.min(abs(cumhaz$time - closest_event_time))] + +#cumhaz$surv[cumhaz$time[] + +# * Predict Lung Model ----------- + +# Make predictions +p <- predict( + ph_fit, + lung, + type = "survival", + eval_time = lung$time +) %>% + mutate(surv_truth = Surv(lung$time, lung$status)) + +# add weights +p_weights <- .censoring_weights_graf(ph_fit, predictions = p) %>% + mutate(pat_id = 1:nrow(.)) + +p_weights_long <- p_weights %>% + unnest(cols = .pred) + +# calc brier +brier_scores_lung <- + p_weights %>% + brier_survival(truth = surv_truth, .pred) + +brier_scores_lung + +# Try our code on their data ---------------------------------------------- + +# +# predict_df_long <- predict_df %>% +# unnest(everything()) + +predict_df_nest_time <- p_weights_long %>% + nest(data = -.eval_time) + +predict_df_timepoint = predict_df_nest_time[10,] %>% + unnest(everything()) + +brier_for_each_eval <- function(predict_df_timepoint) { + + + x = mean(I(predict_df_timepoint$times > predict_df_timepoint$.eval_time )*(1 - predict_df_timepoint$.pred_survival)^2/(predict_df_timepoint$.weight_censored+0.001) + + I(predict_df_timepoint$event == 1 & predict_df_timepoint$times <= predict_df_timepoint$.eval_time )* + (0 - predict_df_timepoint$.pred_survival)^2/(predict_df_timepoint$.weight_censored+0.001)) + x +} + + + + +lung2 <- lung %>% + rename("times" = time, + "event" = status) +cox <- coxph(Surv(times, event) ~ ., data = lung2) +cox +cumhaz_all<- survfit(cox) +cumhaz <- bind_cols(time = cumhaz_all$time, surv = cumhaz_all$surv) + + +lung2 <- lung2 %>% + mutate(truth_survival = Surv(times, event)) + +predict_cure(final_fit = cox, new_data = lung2, eval_timepoints = lung2$times) + + + + +# Misc -------------------------------------------------------------------- + + +eval_time_test <- 300 + +new_data = lung +survival_times <- lung$time +sorted_survival_times <- sort(survival_times) + +survival_status <- lung$status +sorted_event_times = sort(survival_times[survival_status==1]) + +censoring_survfit = survfit(Surv(survival_times, 1 - survival_status) ~ 1) +censoring_weights = summary(censoring_survfit, times = survival_times) +censoring_weights = censoring_weights$surv + +all_weights = bind_cols(censoring_weights_time, censoring_weights) +names(all_weights) <- c("weights_time", "weights_surv") + +eval_timepoint = 300 + +# * Inverse Probability Censoring Weights --------- + +# If no time points in new data less than lowest event time in model fit data (most cases): +if (eval_timepoint >= min(sorted_event_times)) { + + # get maximum event time that is less than time point we are evaluating + closest_event_time <- max( + sorted_event_times[sorted_event_times <= eval_timepoint]) + + index_closest_event_time <- which(sorted_event_times == closest_event_time) + + # Get predicted survival by using cumulative hazard at closest previous event time + predsurv <- exp(-cumhaz[index_closest_event_time]*predsurvexp) + + # CHECK - Probability of the censoring at evaluation time point (or should it be at previous index closest event time??) + censoring_weight_at_eval_time <- censoring_weights[timepoint_index] + + ipw <- + as.numeric(survival_times > eval_timepoint)*censoring_weight_at_eval_time + + as.numeric(survival_times <= eval_timepoint)*censoring_weights + + ipw = 1/ipw + + # if no events occurred before eval time +} else if (eval_timepoint < min(sorted_event_times)) { + + predsurv <- rep(1,length(survival_times)) + ipw <- 1 + +} + +cbind(survival_times, ipw) %>% View() + +# Compare ------ +lung_surv_long <- lung_surv %>% unnest(everything()) + +lung_surv_long <- lung_surv_long %>% filter(.eval_time == eval_time_test) + +#lung %>% select(time, status) %>% distinct() %>% nrow() + + +# Here- New --------------------------------------------------------------------- +# ti & di is censored patient - If censored time is after eval time () ti > tbrier[l] +# if event, and event time earlier than evaluation time - (0 - preds)^2/(ipw+0.001)) + +# 1) If the observed time is a censoring time and that is before the evaluation time, the data point should make no contribution to the performance metric (their "category 3"). These values have a missing value for their probability estimate (and also for their weight column). +# ^ not included, ensure they are NAs in predict df + +# 2) I(ti <= tbrier[l] & di == 1)*(0 - preds)^2/(ipw+0.001)) +# Old version ---- +# brier[l] <- mean(I(ti > tbrier[l] & di==0)* +# (1 - preds)^2/(ipw+0.001) + +# +# I(ti <= tbrier[l] & di == 1)*(0 - preds)^2/(ipw+0.001)) + +# Updated Version --- + + +brier[l] <- mean(I(ti > tbrier[l])* + (1 - preds)^2/(ipw+0.001) + + + I(ti <= tbrier[l] & di == 1)*(0 - preds)^2/(ipw+0.001)) + diff --git a/inst/compare-predict-brier-with-yardstick_P2.R b/inst/compare-predict-brier-with-yardstick_P2.R new file mode 100644 index 0000000..de5c1f3 --- /dev/null +++ b/inst/compare-predict-brier-with-yardstick_P2.R @@ -0,0 +1,233 @@ +# we should use this: https://www.tidymodels.org/learn/statistics/survival-metrics/ + +library(survival) +library(tidyverse) +library(glmnet) +library(yardstick) + +# Load Simulation Data --------------------------------------------------------------- + +load(here::here("inst", "cureit-simulation-results", "sim_data_list.RData")) +load(here::here("inst", "cureit-simulation-results", "sim_valid_data_list.RData")) +load(here::here("inst","cureit-simulation-results", "cv_fits_100.RData")) + + +# Function to Get Brier on Each Sim Data ---------------------------------- + +get_brier_for_each_data <- function(dat, fit) { + + final_fit <- fit$fit[[fit$index$`min`[1]]][[fit$index$min[2]]] + + # * Create Data frame to pass ----- + + data = data.frame(dat$x) %>% + mutate(times = dat$t, + event = dat$d, + "truth_survival" = Surv(dat$t, dat$d)) + + # * Run new Predict Function (glmnet cure) ------ + + predict_df <- predict_cure(final_fit = final_fit, new_data = data) + predict_df_long <- predict_df %>% + unnest(everything()) + + + brier_scores_yardstick <- + predict_df %>% + mutate(truth = Surv(times, event)) %>% + brier_survival(truth = truth, .pred) + + return(brier_scores_yardstick) + +} + + +# Manual Brier Score Calc (From Teng) ------------------------------------- + +# * Version 1 ----- + +# iterates over eval time +for (i in 1:100) { + + time_1 <- predict_df_long$.eval_time[[i]] + + predict_df_long_1 <- predict_df_long %>% + filter(.eval_time == time_1) + + x[i] <- mean(I(predict_df_long_1$times > time_1)*(1 - predict_df_long_1$.pred_survival)^2*(predict_df_long_1$.weight_censored) + + I(predict_df_long_1$event == 1 & predict_df_long_1$times <= time_1)* + (0 - predict_df_long_1$.pred_survival)^2*(predict_df_long_1$.weight_censored)) + + # OLD -- + # x[i] <- mean(I(predict_df_long_1$event == 0)*(1 - predict_df_long_1$.pred_survival)^2/(predict_df_long_1$.weight_censored + 0.001) + + # I(predict_df_long_1$event ==1)*(0 - predict_df_long_1$.pred_survival)^2/(predict_df_long_1$.weight_censored + 0.001)) + # + # OLD -- + # brier[l] <- mean(I(di==0)*(1 - preds)^2/(ipw+0.001) + + # I(di==1)*(0 - preds)^2/(ipw+0.001)) + # + # NEW ?-- + # brier[l] <- mean(I(ti > tbrier[l])*(1 - preds)^2/(ipw+0.001) + + # I(di==1 & ti <= tbrier[l])*(0 - preds)^2/(ipw+0.001)) + +} + + + +# * Version 2 ----- + +# simple version of above- I think I can delete +brier[l] <- mean(I(ti > tbrier[l])* + (1 - preds)^2/(ipw+0.001) + + + I(ti <= tbrier[l] & di == 1)*(0 - preds)^2/(ipw+0.001)) + + +# * Version 3 ----- +brier_cure_update <- function(predict_df, truth = times, eval_timepoints = NULL) { + + + # Get values from observed data + survival_times = predict_df$times + sorted_survival_times <- sort(survival_times) + sorted_event_times = sort(data$times[data$event==1]) + + + predict_df_long <- predict_df %>% + unnest(everything()) + + predict_df_nest_time <- predict_df_long %>% + nest(data = -.eval_time) + + + map(predict_df_nest_time$data, function(x) { + mean(I(x$event == 0)*(1 - x$.pred_survival)^2/(x$.weight_censored + 0.001) + I(x$event == 1)*(0 - x$.pred_survival)^2/(x$.weight_censored + 0.001)) + + }) + + for (l in 1:length(eval_timepoints)) { + brier[l] <- mean(I(di==0)*(1 - preds)^2/(ipw+0.001) + I(di==1)*(0 - preds)^2/(ipw+0.001)) + + + } +} + +# * Version 4 ---- + +brier_for_each_eval <- function(predict_df_timepoint) { + + + x = mean(I(predict_df_timepoint$times > predict_df_timepoint$.eval_time )*(1 - predict_df_timepoint$.pred_survival)^2/(predict_df_timepoint$.weight_censored+0.001) + + I(predict_df_timepoint$event == 1 & predict_df_timepoint$times <= predict_df_timepoint$.eval_time )* + (0 - predict_df_timepoint$.pred_survival)^2/(predict_df_timepoint$.weight_censored+0.001)) + x +} + + +brier_for_each_eval <- function(predict_df_timepoint) { + + + x = mean( + I(predict_df_timepoint$times > predict_df_timepoint$.eval_time)*(1 - predict_df_timepoint$.pred_survival)^2/(predict_df_timepoint$.weight_censored+0.001) + + + I(predict_df_timepoint$event == 1 & predict_df_timepoint$times <= predict_df_timepoint$.eval_time )* + (0 - predict_df_timepoint$.pred_survival)^2/(predict_df_timepoint$.weight_censored+0.001) + ) + x/100 +} + +brier_for_each_eval2 <- function(predict_df_timepoint) { + + category1 = I(predict_df_timepoint$event == 1 & predict_df_timepoint$times <= predict_df_timepoint$.eval_time ) + category2 = I(predict_df_timepoint$times > predict_df_timepoint$.eval_time) + + x = mean( + category1*(0 - predict_df_timepoint$.pred_survival)^2/(predict_df_timepoint$.weight_censored) + + category2 *(1 - predict_df_timepoint$.pred_survival)^2/(predict_df_timepoint$.weight_censored) + + ) + x + +} + + + +# Example on Single Data Set ------------------------------------------------- + +# Grab one data set as an example +dat <- sim_data_list[[1]] +fit <- cv_fits_list[[1]] +final_fit <- fit$fit[[fit$index$`min`[1]]][[fit$index$min[2]]] + +# Run predict functon +predict_df <- predict_cure(final_fit = final_fit, new_data = data) + +predict_df_long <- predict_df %>% + unnest(everything()) + +# Get yardstick values +brier_scores_yardstick <- + predict_df %>% + mutate(truth = Surv(times, event)) %>% + brier_survival(truth = truth, .pred) + + +# Example on Multiple Data ------------------------------------------------ + + +b <- get_brier_for_each_data(dat = sim_data_list[[3]], fit = cv_fits_list[[3]]) + + +res <- list() +for (i in 1:10) { + + res[[i]] <- get_brier_for_each_data(sim_data_list[[i]], cv_fits_list[[i]]) +} + +ggplot(res[[3]], aes(x = .eval_time, .estimate)) + geom_point() + + + + +# Compare to Our Manual Brier Calc ---------------------------------------- + +# quick brier --------- +dat <- sim_data_list[[3]] +fit <- cv_fits_list[[3]] +final_fit <- fit$fit[[fit$index$`min`[1]]][[fit$index$min[2]]] + +# Create Data frame to pass ----- + +data = data.frame(dat$x) %>% + mutate(times = dat$t, + event = dat$d, + "truth_survival" = Surv(dat$t, dat$d)) + +predict_df <- predict_cure(final_fit = final_fit, new_data = data) +predict_df_long <- predict_df %>% + unnest(everything()) + + + +# Brier With Yardstick ------------------------------------------------------- +library(yardstick) +x <- c() + + +times <- unique(predict_df_long$.eval_time) +brier <- x + +brier_scores_manual <- bind_cols("times" = times, "brier" = brier) + +brier_scores_yardstick <- + predict_df %>% + mutate(truth = Surv(times, event)) %>% + brier_survival(truth = truth, .pred) + + + + + + +# Misc Code --------------------------------------------------------------- + diff --git a/inst/compare-predict-brier-with-yardstick_P3.R b/inst/compare-predict-brier-with-yardstick_P3.R new file mode 100644 index 0000000..e3fede4 --- /dev/null +++ b/inst/compare-predict-brier-with-yardstick_P3.R @@ -0,0 +1,315 @@ +# we should use this: https://www.tidymodels.org/learn/statistics/survival-metrics/ + +library(survival) +library(tidyverse) +library(glmnet) + +# Prep Data --------------------------------------------------------------- + +load(here::here("inst", "cureit-simulation-results", "sim_data_list.RData")) +load(here::here("inst", "cureit-simulation-results", "sim_valid_data_list.RData")) +load(here::here("inst","cureit-simulation-results", "cv_fits_100.RData")) + +dat <- sim_data_list[[1]] +fit <- cv_fits_list[[1]] +final_fit <- fit$fit[[fit$index$`min`[1]]][[fit$index$min[2]]] + +# Create Data frame to pass + +data = data.frame(dat$x) %>% + mutate(times = dat$t, + event = dat$d, + "truth_survival" = Surv(dat$t, dat$d)) + +# * Calculate Sensitivity of Variable Selection in Training Fits ------- +spe_cox_min <- c() +sen_cox_min <- c() + +x <- for (i in 1:100) { + fit <- cv_fits_list[[i]] + #fit <- x$fit + dat <- sim_data_list[[i]] + + spe_cox_min[[i]] <- 1-mean(setdiff(1:200,dat$id_cure) %in% which(fit$fit[[fit$index$min[1]]][[fit$index$min[2]]]$beta!=0)) + sen_cox_min[[i]] <- mean(dat$id_cox %in% which(fit$fit[[fit$index$min[1]]][[fit$index$min[2]]]$beta!=0)) + # spe_cox_min <- 1-mean(setdiff(1:200,dat$id_cure) %in% which(fit$fit[[fit$index$min[1]]][[fit$index$min[2]]]$beta!=0)) + + # print(sen_cox_min) + +} + + +hist(unlist(sen_cox_min)) +hist(unlist(spe_cox_min)) + +# Check Variable Selection on Standard Lasso +# will add foldid (see new version of cv.lassocure()) +cv.glmnet(x = dat$x, y = Surv(dat$t, dat$d), family = "cox") + +# Predict Function (glmnet cure) --------------------------------------------- + +# Trying to get predictions in similar format as tidymodels yardstick +# Eventually will integrate this with existing predict method function + + + +predict_df <- predict_cure(final_fit = final_fit, new_data = data) +predict_df_long <- predict_df %>% + unnest(everything()) + + + +# quick brier --------- +x <- c() +for (i in 1:100) { + + time_1 <- predict_df_long$.eval_time[[i]] + + predict_df_long_1 <- predict_df_long %>% + filter(.eval_time == time_1) + + x[i] <- mean(I(predict_df_long_1$times > time_1)*(1 - predict_df_long_1$.pred_survival)^2*(predict_df_long_1$.weight_censored) + + I(predict_df_long_1$event == 1 & predict_df_long_1$times <= time_1)* + (0 - predict_df_long_1$.pred_survival)^2*(predict_df_long_1$.weight_censored)) + + # OLD ----- + # x[i] <- mean(I(predict_df_long_1$event == 0)*(1 - predict_df_long_1$.pred_survival)^2/(predict_df_long_1$.weight_censored + 0.001) + + # I(predict_df_long_1$event ==1)*(0 - predict_df_long_1$.pred_survival)^2/(predict_df_long_1$.weight_censored + 0.001)) + # + # OLD ---- + # brier[l] <- mean(I(di==0)*(1 - preds)^2/(ipw+0.001) + + # I(di==1)*(0 - preds)^2/(ipw+0.001)) + # + # NEW ----- + # brier[l] <- mean(I(ti > tbrier[l])*(1 - preds)^2/(ipw+0.001) + + # I(di==1 & ti <= tbrier[l])*(0 - preds)^2/(ipw+0.001)) + +} + +times <- unique(predict_df_long$.eval_time) +brier <- x + +brier_scores_manual <- bind_cols("times" = times, "brier" = brier) + +# Brier With Yardstick ------------------------------------------------------- +library(yardstick) + +# Doesn't match- weights are wrong +brier_scores_yardstick <- + predict_df %>% + mutate(truth = Surv(times, event)) %>% + brier_survival(truth = truth, .pred) + + +all_brier <- brier_scores_yardstick %>% + select(times = .eval_time, brier_yardstick = .estimate) %>% + left_join(brier_scores_manual) + +# Brier Function ------------------------------------------------------- + +predict_df_long <- predict_df %>% + unnest(everything()) + +predict_df_nest_time <- predict_df_long %>% + nest(data = -.eval_time) + +predict_df_timepoint = predict_df_nest_time[10,] %>% + unnest(everything()) + +brier_for_each_eval <- function(predict_df_timepoint) { + + + x = mean(I(predict_df_timepoint$times > predict_df_timepoint$.eval_time )*(1 - predict_df_timepoint$.pred_survival)^2/(predict_df_timepoint$.weight_censored+0.001) + + I(predict_df_timepoint$event == 1 & predict_df_timepoint$times <= predict_df_timepoint$.eval_time )* + (0 - predict_df_timepoint$.pred_survival)^2/(predict_df_timepoint$.weight_censored+0.001)) + x + } + + +brier_for_each_eval <- function(predict_df_timepoint) { + + + x = mean( + I(predict_df_timepoint$times > predict_df_timepoint$.eval_time)*(1 - predict_df_timepoint$.pred_survival)^2/(predict_df_timepoint$.weight_censored+0.001) + + + I(predict_df_timepoint$event == 1 & predict_df_timepoint$times <= predict_df_timepoint$.eval_time )* + (0 - predict_df_timepoint$.pred_survival)^2/(predict_df_timepoint$.weight_censored+0.001) + ) + x/100 +} + +brier_for_each_eval2 <- function(predict_df_timepoint) { + + category1 = I(predict_df_timepoint$event == 1 & predict_df_timepoint$times <= predict_df_timepoint$.eval_time ) + category2 = I(predict_df_timepoint$times > predict_df_timepoint$.eval_time) + + x = mean( + category1*(0 - predict_df_timepoint$.pred_survival)^2/(predict_df_timepoint$.weight_censored) + + category2 *(1 - predict_df_timepoint$.pred_survival)^2/(predict_df_timepoint$.weight_censored) + + ) + x + +} + +brier_for_each_eval(predict_df_timepoint) + +# Compare to yardstick ---- + +brier_survival_impl <- function(truth, + estimate, + censoring_weights, + case_weights, + eval_time) { + surv_time <- yardstick:::.extract_surv_time(truth) + surv_status <- yardstick:::.extract_surv_status(truth) + + if (!is.null(case_weights)) { + case_weights <- vec_cast(case_weights, to = double()) + norm_const <- sum(case_weights) + } else { + case_weights <- rep(1, length(estimate)) + norm_const <- sum(!survival::is.na.Surv(truth)) + } + + category_1 <- surv_time <= eval_time & surv_status == 1 + category_2 <- surv_time > eval_time + + # (0 - estimate) ^ 2 == estimate ^ 2 + res <- (category_1 * estimate^2 * censoring_weights) + + (category_2 * (1 - estimate)^2 * censoring_weights) + + res <- res * case_weights + res <- sum(res, na.rm = TRUE) + res / norm_const +} +predict_df_timepoint2 <- predict_df_timepoint %>% + mutate(surv_obj = Surv(predict_df_timepoint$times, predict_df_timepoint$event)) + +library(yardstick) +brier_survival_impl( + truth = predict_df_timepoint2$surv_obj, + + estimate = predict_df_timepoint2$.pred_survival, + censoring_weights = predict_df_timepoint2$.weight_censored, + case_weights = NULL, + eval_time = predict_df_timepoint2$.eval_time) + +brier_for_each_eval(predict_df_timepoint) + +# Simple Former Brier Calculation --------------------------------------------------- +library(glmnet) +library(tidyverse) +library(survival) + +calc_brier_old <- function(new_data, fit) { + # ti <- dat_valid$t + # di <- dat_valid$d + # xi <- dat_valid$x + ti <- new_data$t + di <- new_data$d + xi <- new_data$x + + # tj from train data + tj <- fit$fit[[fit$index$`min`[1]]][[fit$index$`min`[2]]]$tj + final_fit <- fit$fit[[fit$index$`min`[1]]][[fit$index$`min`[2]]] + + fitcure <- final_fit$fitcure + predcure <- predict(fitcure,newx=xi,s=min(fitcure$lambda),type="response") + + fitcox <- final_fit$fitcox + predsurvexp <- predict(fitcox,newx=xi,s=min(fitcox$lambda),type="response") + + haz <- final_fit$haz + cumhaz <- final_fit$cumhaz + + # censoring weights from test data + sf <- survfit(Surv(ti,1-di)~1) + sf_pred <- summary(sf, time = ti) + + predcens <- sf_pred$surv + tcens <- sf_pred$time + + # tbrier is evaluation timepoints (default to all timepoints in test data) + tbrier <- sort(ti) + brier <- rep(NA,length(tbrier)) + ipw <- rep(NA,length(tbrier)) + + for (l in 1:length(tbrier)){ + + predsurv <- rep(NA,length(ti)) # Conditional survival prob for all patients at tbrier[l] + + if (tbrier[l] >= min(tj)){ + + ids <- which(tj == max(tj[tj <= tbrier[l]])) + predsurv <- exp(-cumhaz[ids]*predsurvexp) + ipw <- as.numeric(ti > tbrier[l])*predcens[l] + as.numeric(ti <= tbrier[l])*predcens + + }else if (tbrier[l] < min(tj)){ + + predsurv <- rep(1,length(ti)) + ipw <- 1 + + } + + preds <- 1 - predcure + predcure*predsurv + brier[l] <- mean(I(di==0)*(1 - preds)^2/(ipw+0.001) + I(di==1)*(0 - preds)^2/(ipw+0.001)) + #ipw[l] <- ipw + # print(ipw) + } + + preds <- cbind.data.frame(tbrier, brier) + return(preds) + # plot(tbrier,brier,type="S") + +} + +x <- calc_brier_old(sim_data_list[[1]], cv_fits_list[[1]]) +x_val <- calc_brier_old(sim_valid_data_list[[1]], cv_fits_list[[1]]) + +x +plot(x$tbrier,x$brier,type="S") +plot(x_val$tbrier,x_val$brier,type="S") + + +all_brier <- list() + +for (i in 1:length(sim_data_list)) { + all_brier[[i]] <- calc_brier_old( new_data = sim_data_list[[i]], fit = cv_fits_list[[i]]) +} + +all_brier_valid <- list() + +for (i in 1:length(sim_valid_data_list)) { + all_brier_valid[[i]] <- calc_brier_old( new_data = sim_valid_data_list[[i]], + fit = cv_fits_list[[i]]) +} + +x <- all_brier_valid[[1]] +x_valid <- all_brier[[1]] + +ggplot(x, aes(x = tbrier,y = brier)) + geom_line( aes(x = tbrier,y = brier)) + + geom_line(data = x_valid, aes(x =tbrier, y = brier), color = "blue") + +plot(x$tbrier,x$brier,type="S") +line(x_valid$tbrier,x_valid$brier,type="S") + + +traps <- c() + +for (i in 1:length(all_brier)) { + traps[i] <- trapz(all_brier[[i]]$tbrier,all_brier[[i]]$brier) + +} + + +all_brier2 <- all_brier %>% + do.call("rbind", .) + +x <- all_brier2 %>% group_by(round(tbrier, 1)) %>% + summarize(mean = mean(brier)) + +plot(x$`round(tbrier, 1)`, x$mean,type="S") + + + diff --git a/inst/inst-README.md b/inst/inst-README.md new file mode 100644 index 0000000..7cf4a72 --- /dev/null +++ b/inst/inst-README.md @@ -0,0 +1,18 @@ +# inst + + +- `compare-predict-brier-with-yardstick_P2.R` - compare yardstick prediction and brier using our simulation data. Using new predict_cure() compare Teng's manual brier score calc to yardstick brier score calc + + +- `compare-predict-brier-with-yardstick_P1.R` - compare yardstick prediction and brier using their example data + +- `compare-predict-brier-with-yardstick_P1.R` - misc comparison code (need to review this) + +- `run-simulations.R` - produce simulated data for lasso cure models + +- `nomogram-custom-edits.R` - custom edits to nomogram for cureit paper + +- `archive` + - `evaluate-simulations.R` - old code to calculate brier score on simulations (modified from Teng) + - `testcode.R` - old code to calculate brier score + diff --git a/inst/run-simulations.R b/inst/run-simulations.R new file mode 100644 index 0000000..11f7ace --- /dev/null +++ b/inst/run-simulations.R @@ -0,0 +1,175 @@ +source(here::here("R/cv.cureitlasso.R")) +source(here::here("R/cureitlasso.R")) +source(here::here("R/simulasso.R")) +source(here::here("R/coxsplit.R")) + +# Potentially change sample sizes and number of parameters +# We could try simulating scenarios with overlap between cure and cox + +# Simulate Cure Data -------------------------------------------------------- + +# Test- Try simulating one train and test set + + +dat <- simulasso() # training data + + + +dat_valid <- simulasso(id_cure = dat$id_cure, + id_cox = dat$id_cox, + coefs_cure = dat$coefs_cure, + coefs_cox = dat$coefs_cox) + + +setdiff(dat$id_cox, dat_valid$id_cox) + + +# * Training- Simulate n Data Sets -------------------------------------------------------- +sim_data_list <- list() +set.seed(999) + +for (i in 1:100) { + sim_data_list[[i]] = simulasso() +} + +sim_data_list[[1]]$id_cure + +# * Validation - Simulate n Validation Data --------------------------------------------------- + +sim_valid_data_list <- list() + +for (i in 1:100) { + dat <- sim_data_list[[i]] + sim_valid_data_list[[i]] = simulasso(id_cure = dat$id_cure, + id_cox = dat$id_cox, + coefs_cure = dat$coefs_cure, + coefs_cox = dat$coefs_cox) +} + +sim_data_list[[1]]$id_cure +sim_valid_data_list[[1]]$id_cure + +save(sim_data_list, file = here::here("inst", + "cureit-simulation-results", + "sim_data_list_v2.RData")) + +save(sim_valid_data_list, file = here::here("inst", + "cureit-simulation-results", + "sim_valid_data_list_v2.RData")) + +# Fit Lasso ------------------------------------------------------------ + +# test- A Single Lasso Fit + +dat <- sim_data_list[[1]] +test_single_run <- cureitlasso( + t = dat$t, + d = dat$d, + x = dat$x, + minmu.ratio=0.05, # minimum penalty value in the logistic model + minlambda.ratio=0.05, # minimum penalty value in the cox model + adaptive=FALSE, + length.grid=8, + tol=1e-2, + maxiter=100, + progress=FALSE) + +# Run Cross Validation ---------------------------------------------------- + +# * Test a Single CV Fit ------ +# To get a sense of what is in the output + +dat_valid <- sim_data_list[[1]] + +test_single_cv <- cv.cureitlasso(t = dat_valid$t, + d = dat_valid$d, + x = dat_valid$x, + minmu.ratio=0.05, # minimum penalty value in the logistic model + minlambda.ratio=0.05, # minimum penalty value in the cox model + adaptive=FALSE, + length.grid=10, + + tol=1e-2, + maxiter=100, + + nfolds=5, + ncore=5, + seed=NULL, + progress=FALSE) + +test_list <- list() +test_list[[1]] <- test_single_cv + +# * 100 Runs ---- + +cv_fits_list <- list() +start_time <- Sys.time() + +for (i in 1:50) { + + dat_valid <- sim_data_list[[i]] + + cv_fits_list[[i]] <- cv.cureitlasso(t = dat_valid$t, + d = dat_valid$d, + x = dat_valid$x, + minmu.ratio=0.05, # minimum penalty value in the logistic model + minlambda.ratio=0.05, # minimum penalty value in the cox model + adaptive=FALSE, + length.grid=10, + + tol=1e-2, + maxiter=100, + + nfolds=5, + ncore=5, + seed=NULL, + progress=FALSE + ) + + +} + +# 8.670569 mins for 3 data sets +end_time <- Sys.time() +end_time - start_time + + +save(cv_fits_list, file = here::here("inst", "cureit-simulation-results", "cv_fits_50_v2.RData")) + +# * 50 Runs ---- + +cv_fits_list2 <- list() + +start_time <- Sys.time() + +for (i in 1:50) { + + dat_valid <- sim_data_list[[i]] + + cv_fits_list2[[i]] <- cv.cureitlasso(t = dat_valid$t, + d = dat_valid$d, + x = dat_valid$x, + minmu.ratio=0.05, # minimum penalty value in the logistic model + minlambda.ratio=0.05, # minimum penalty value in the cox model + adaptive=FALSE, + length.grid=10, + + tol=1e-2, + maxiter=100, + + nfolds=5, + ncore=5, + seed=NULL, + progress=FALSE + ) + + +} + +# 2.369856 hours for 50 fits +end_time <- Sys.time() +end_time - start_time + + +save(cv_fits_list2, file = here::here("inst", "cv_fits_50.RData")) +