-
Notifications
You must be signed in to change notification settings - Fork 7
Expand file tree
/
Copy pathrun_pipeline.R
More file actions
181 lines (138 loc) · 5.96 KB
/
run_pipeline.R
File metadata and controls
181 lines (138 loc) · 5.96 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
# run_pipeline.R
#
# Main orchestrator script for the integration benchmark pipeline.

# --- 1. Preamble ---
# Attach required packages without their startup chatter.
suppressPackageStartupMessages({
  library(optparse)
  library(yaml)
})

# Permit large globals to be exported to parallel (future) workers (~16 GB).
options(future.globals.maxSize = 16000 * 1024^2)

# Source every pipeline script in R/ so its functions are available here.
# The installer and this runner itself are deliberately excluded.
excluded_scripts <- c("install_packages.R", "run_pipeline.R")
for (script_path in list.files("R", pattern = "\\.R$", full.names = TRUE)) {
  if (!(basename(script_path) %in% excluded_scripts)) {
    source(script_path)
  }
}

# Ensure the utility helpers are loaded (re-sourcing is harmless).
source("R/00_utils.R")
source("R/00_clust_utils.R")

# --- 2. Command-Line Argument Parsing ---
cli_parser <- OptionParser(option_list = list(
  make_option(c("-c", "--config"), type = "character", default = "config.yaml",
              help = "Path to the YAML configuration file", metavar = "character")
))
config_path <- parse_args(cli_parser)$config
# --- 3. Main Pipeline Function ---

#' Run the full integration benchmark pipeline.
#'
#' Executes, in order: BPCells conversion, preprocessing, per-method
#' integration + metrics, method ranking, final plots, leiden clustering
#' optimization (grid search, stability, scoring), final-object creation,
#' T-cell annotation, scanpy export, and cohort summarisation. All results
#' are written under the directories named in the config.
#'
#' @param config_path Path to the YAML configuration file.
#' @return Invisibly `NULL`; called for its side effects (files on disk, logs).
run_pipeline <- function(config_path) {
  # --- Setup ---
  log_message("=== Starting Integration Benchmark Pipeline ===")
  if (!file.exists(config_path)) {
    stop("Configuration file not found: ", config_path, call. = FALSE)
  }
  config <- yaml::read_yaml(config_path)

  # Set seed for reproducibility
  set.seed(config$seed)

  # Create output directories (quiet if they already exist)
  for (out_dir in c(config$paths$results_dir, config$paths$figures_dir,
                    config$paths$tmp_dir)) {
    dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)
  }

  # --- Pipeline Steps ---
  # 10_process_seurat_runs is the script run to process individual sequencing
  # runs; it is skipped for the purposes of the integration pipeline.

  # Step 1: Process raw Seurat objects to on-disk BPCells format.
  # This includes filtering and subsetting per-sample.
  bpcells_dir <- process_to_bpcells(config_path)

  # Step 2: Load BPCells data, find HVGs, merge, and preprocess (Scale/PCA).
  # (To resume here: bpcells_dir <- file.path(config$paths$results_dir, "01_bpcells_data"))
  preprocessed_obj_path <- preprocess_bpcells_data(bpcells_dir, config_path)

  # Step 3: Run integration, analysis, and metrics for each method.
  methods_to_run <- config$methods$run
  log_message("=== Starting Analysis for ", length(methods_to_run), " Methods ===")
  for (method in methods_to_run) {
    log_message(paste0(">>> Processing Method: ", toupper(method), " <<<"))
    # 4a. Integration — dispatch to the integrate_<method>() function
    # sourced from R/ in the preamble.
    integration_func <- get(paste0("integrate_", method))
    integrated_obj_path <- integration_func(preprocessed_obj_path, config_path)
    # 4b. Post-integration graph/UMAP
    analyzed_obj_path <- generate_graph_umap(integrated_obj_path, method, config_path)
    # 4c. Batch metrics
    calculate_batch_metrics(analyzed_obj_path, method, config_path)
    # 4d. Label metrics
    calculate_label_metrics(analyzed_obj_path, method, config_path)
    log_message(paste0(">>> Finished Processing: ", toupper(method), " <<<"))
  }

  # Step 5: Rank methods based on all collected metrics.
  # rank_methods() returns the path to a CSV ordered best-first; read it and
  # take the top method, excluding scanvi.
  log_message("=== Aggregating Metrics and Ranking Methods ===")
  ranking_csv_path <- rank_methods(config_path)
  method_df <- read.csv(ranking_csv_path)
  best_method <- method_df$method[method_df$method != "scanvi"][1]
  log_message("Best integration method selected:", best_method)

  # Step 6: Generate final summary plots.
  log_message("=== Generating Final Plots ===")
  generate_plots(config_path)

  log_message("=== Running Clustering Optimization on Best Method ===")
  # Locate the integrated object produced by the best method.
  best_integrated_obj_path <- file.path(
    config$paths$results_dir, "04_integrated_data",
    paste0(best_method, ".rds")
  )
  # Dimension range shared by the grid search and stability steps.
  cluster_dims <- seq(config$clustering$dims[[1]], config$clustering$dims[[2]])

  # Step 7: Grid search for leiden clustering.
  # (Original call had a trailing comma, which passes a missing argument.)
  run_cluster_param_grid(
    best_method = best_method,
    seurat_rds_path = best_integrated_obj_path,
    dims = cluster_dims,
    graph_name = config$clustering$graph_name,
    resolutions = config$clustering$grid$resolutions,
    k_params = config$clustering$grid$k_params
  )

  # Step 8: Assessing leiden cluster stability.
  run_cluster_stability(
    seurat_rds_path = best_integrated_obj_path,
    dims = cluster_dims,
    subsample_frac = config$clustering$stability$subsample_frac,
    n_repeats = config$clustering$stability$n_repeats
  )

  # Step 9: Scoring leiden clusters; returns the winning (k, resolution) pair.
  cluster_params <- combine_and_score(
    w_sil = config$clustering$scoring_weights$w_sil,
    w_mod = config$clustering$scoring_weights$w_mod,
    w_conn = config$clustering$scoring_weights$w_conn,
    w_stab = config$clustering$scoring_weights$w_stab,
    singleton_penalty = config$clustering$scoring_weights$singleton_penalty
  )

  # Step 10: Creating Final Object on FULL Data.
  make_final_object(
    cfg = config,
    best_method = best_method,
    k = cluster_params$k,
    resolution = cluster_params$resolution
  )

  # Step 11: Cell Annotation.
  # BUG FIX: the original referenced an undefined variable `cfg` here
  # (`cfg` was only an argument name above); use `config`.
  final_obj_path <- file.path(config$paths$results_dir, "FINAL_integrated_object.rds")
  results <- run_tcell_annotation(
    obj = readRDS(final_obj_path),
    markers_yaml = "tcell_markers.yaml",
    cluster_col = "leiden.cluster",
    output_dir = file.path(config$paths$results_dir, "08_annotations"),
    save_object = FALSE,
    create_plots = TRUE
  )
  # Persist the annotated object back over the final object on disk.
  saveRDS(results$object, final_obj_path)

  # Step 12: Export to scanpy/scirpy.
  export_to_scanpy()

  # Step 13: Automatically update README and summary tables, and record the
  # session for reproducibility.
  summarise_cohort()
  writeLines(capture.output(sessionInfo()), "./summary/sessionInfo.txt")
  log_message("=== Pipeline Finished Successfully ===")

  invisible(NULL)
}
# --- 4. Execute Pipeline ---
# Record start time (start_timer()/stop_timer() are presumably defined in the
# sourced R/00_utils.R helpers — confirm there)
pipeline_timer <- start_timer()
# Run the main function with the config path parsed from the command line
run_pipeline(config_path)
# Print total elapsed time
stop_timer(pipeline_timer, "Total pipeline execution")