-
Notifications
You must be signed in to change notification settings - Fork 7
Expand file tree
/
Copy pathrun_pipeline.R
More file actions
181 lines (138 loc) · 5.96 KB
/
run_pipeline.R
File metadata and controls
181 lines (138 loc) · 5.96 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
# run_pipeline.R
#
# Main orchestrator script for the integration benchmark pipeline.

# --- 1. Preamble ---
# Attach required packages without their startup chatter.
suppressPackageStartupMessages({
  library(optparse)
  library(yaml)
})

# Permit large globals to be exported to parallel (future) workers (~16 GB).
options(future.globals.maxSize = 16000 * 1024^2)

# Source every pipeline script in R/ so its functions are available here.
# The installer and this runner itself are deliberately excluded.
excluded_scripts <- c("install_packages.R", "run_pipeline.R")
for (script_path in list.files("R", pattern = "\\.R$", full.names = TRUE)) {
  if (!(basename(script_path) %in% excluded_scripts)) {
    source(script_path)
  }
}

# Ensure the utility helpers are loaded (re-sourcing is harmless).
source("R/00_utils.R")
source("R/00_clust_utils.R")

# --- 2. Command-Line Argument Parsing ---
cli_parser <- OptionParser(option_list = list(
  make_option(c("-c", "--config"), type = "character", default = "config.yaml",
              help = "Path to the YAML configuration file", metavar = "character")
))
config_path <- parse_args(cli_parser)$config
# --- 3. Main Pipeline Function ---

#' Run the full integration benchmark pipeline.
#'
#' Executes, in order: BPCells conversion, preprocessing, per-method
#' integration + metrics, method ranking, final plots, leiden clustering
#' optimization (grid search, stability, scoring), final-object creation,
#' T-cell annotation, scanpy export, and cohort summarisation. All results
#' are written under the directories named in the config.
#'
#' @param config_path Path to the YAML configuration file.
#' @return Invisibly `NULL`; called for its side effects (files on disk, logs).
run_pipeline <- function(config_path) {
  # --- Setup ---
  log_message("=== Starting Integration Benchmark Pipeline ===")
  if (!file.exists(config_path)) {
    stop("Configuration file not found: ", config_path, call. = FALSE)
  }
  config <- yaml::read_yaml(config_path)

  # Set seed for reproducibility
  set.seed(config$seed)

  # Create output directories (quiet if they already exist)
  for (out_dir in c(config$paths$results_dir, config$paths$figures_dir,
                    config$paths$tmp_dir)) {
    dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)
  }

  # --- Pipeline Steps ---
  # 10_process_seurat_runs is the script run to process individual sequencing
  # runs; it is skipped for the purposes of the integration pipeline.

  # Step 1: Process raw Seurat objects to on-disk BPCells format.
  # This includes filtering and subsetting per-sample.
  bpcells_dir <- process_to_bpcells(config_path)

  # Step 2: Load BPCells data, find HVGs, merge, and preprocess (Scale/PCA).
  # (To resume here: bpcells_dir <- file.path(config$paths$results_dir, "01_bpcells_data"))
  preprocessed_obj_path <- preprocess_bpcells_data(bpcells_dir, config_path)

  # Step 3: Run integration, analysis, and metrics for each method.
  methods_to_run <- config$methods$run
  log_message("=== Starting Analysis for ", length(methods_to_run), " Methods ===")
  for (method in methods_to_run) {
    log_message(paste0(">>> Processing Method: ", toupper(method), " <<<"))
    # 4a. Integration — dispatch to the integrate_<method>() function
    # sourced from R/ in the preamble.
    integration_func <- get(paste0("integrate_", method))
    integrated_obj_path <- integration_func(preprocessed_obj_path, config_path)
    # 4b. Post-integration graph/UMAP
    analyzed_obj_path <- generate_graph_umap(integrated_obj_path, method, config_path)
    # 4c. Batch metrics
    calculate_batch_metrics(analyzed_obj_path, method, config_path)
    # 4d. Label metrics
    calculate_label_metrics(analyzed_obj_path, method, config_path)
    log_message(paste0(">>> Finished Processing: ", toupper(method), " <<<"))
  }

  # Step 5: Rank methods based on all collected metrics.
  # rank_methods() returns the path to a CSV ordered best-first; read it and
  # take the top method, excluding scanvi.
  log_message("=== Aggregating Metrics and Ranking Methods ===")
  ranking_csv_path <- rank_methods(config_path)
  method_df <- read.csv(ranking_csv_path)
  best_method <- method_df$method[method_df$method != "scanvi"][1]
  log_message("Best integration method selected:", best_method)

  # Step 6: Generate final summary plots.
  log_message("=== Generating Final Plots ===")
  generate_plots(config_path)

  log_message("=== Running Clustering Optimization on Best Method ===")
  # Locate the integrated object produced by the best method.
  best_integrated_obj_path <- file.path(
    config$paths$results_dir, "04_integrated_data",
    paste0(best_method, ".rds")
  )
  # Dimension range shared by the grid search and stability steps.
  cluster_dims <- seq(config$clustering$dims[[1]], config$clustering$dims[[2]])

  # Step 7: Grid search for leiden clustering.
  # (Original call had a trailing comma, which passes a missing argument.)
  run_cluster_param_grid(
    best_method = best_method,
    seurat_rds_path = best_integrated_obj_path,
    dims = cluster_dims,
    graph_name = config$clustering$graph_name,
    resolutions = config$clustering$grid$resolutions,
    k_params = config$clustering$grid$k_params
  )

  # Step 8: Assessing leiden cluster stability.
  run_cluster_stability(
    seurat_rds_path = best_integrated_obj_path,
    dims = cluster_dims,
    subsample_frac = config$clustering$stability$subsample_frac,
    n_repeats = config$clustering$stability$n_repeats
  )

  # Step 9: Scoring leiden clusters; returns the winning (k, resolution) pair.
  cluster_params <- combine_and_score(
    w_sil = config$clustering$scoring_weights$w_sil,
    w_mod = config$clustering$scoring_weights$w_mod,
    w_conn = config$clustering$scoring_weights$w_conn,
    w_stab = config$clustering$scoring_weights$w_stab,
    singleton_penalty = config$clustering$scoring_weights$singleton_penalty
  )

  # Step 10: Creating Final Object on FULL Data.
  make_final_object(
    cfg = config,
    best_method = best_method,
    k = cluster_params$k,
    resolution = cluster_params$resolution
  )

  # Step 11: Cell Annotation.
  # BUG FIX: the original referenced an undefined variable `cfg` here
  # (`cfg` was only an argument name above); use `config`.
  final_obj_path <- file.path(config$paths$results_dir, "FINAL_integrated_object.rds")
  results <- run_tcell_annotation(
    obj = readRDS(final_obj_path),
    markers_yaml = "tcell_markers.yaml",
    cluster_col = "leiden.cluster",
    output_dir = file.path(config$paths$results_dir, "08_annotations"),
    save_object = FALSE,
    create_plots = TRUE
  )
  # Persist the annotated object back over the final object on disk.
  saveRDS(results$object, final_obj_path)

  # Step 12: Export to scanpy/scirpy.
  export_to_scanpy()

  # Step 13: Automatically update README and summary tables, and record the
  # session for reproducibility.
  summarise_cohort()
  writeLines(capture.output(sessionInfo()), "./summary/sessionInfo.txt")
  log_message("=== Pipeline Finished Successfully ===")

  invisible(NULL)
}
# --- 4. Execute Pipeline ---
# Record start time (start_timer()/stop_timer() are presumably defined in the
# sourced R/00_utils.R helpers — confirm there)
pipeline_timer <- start_timer()
# Run the main function with the config path parsed from the command line
run_pipeline(config_path)
# Print total elapsed time
stop_timer(pipeline_timer, "Total pipeline execution")