---
# config.yaml
# 1. Paths and File Settings
# Input/output locations used by the pipeline. Every value is a "./"-prefixed
# relative path; directory entries end with a trailing slash.
paths:
  seurat_objects: "./data/processedData/seuratObjects/"  # Path to individual Seurat objects
  sequencing_runs: "./data/sequencingRuns/"
  results_dir: "./results/"
  figures_dir: "./figs/"
  tmp_dir: "./tmp/"  # For intermediate files like .h5ad
  log_file: "./results/pipeline.log"  # Currently located inside ./results/
# 2. QC for original processing
# Per-cell quality-control thresholds and toggles.
# NOTE(review): key names suggest min_* are lower bounds and max_*_pct are upper
# bounds given in percent; whether comparisons are inclusive is decided by the
# consuming code — confirm there.
qc:
  min_features: 100
  min_counts: 0
  max_mito_pct: 10
  max_ribo_pct: 60
  # NOTE(review): high_feature_sd presumably only takes effect when
  # high_feature_sd_enable is true — confirm against the consumer.
  high_feature_sd_enable: true
  high_feature_sd: 2.5
# 3. Filtering (applied per-object before BPCells conversion)
filtering:
  run: true
  filters:
    # Each item is a filter condition applied to the metadata.
    # Fields: column, type, values
    # `type` can be 'in' (keep values in list) or 'equals' (keep value).
    # As in the entries below, `values` is a list for 'in' and a single
    # scalar for 'equals'.
    - column: "predicted.celltype.l1"
      type: "in"
      values: ["CD4 T", "CD8 T", "other T", "NK"]
    - column: "db.class"
      type: "equals"
      values: "singlet"
    # Add more filters as needed, e.g., to filter by 'CTaa' if required.
    # NOTE(review): 'not_na' is not one of the types documented above —
    # confirm the consumer supports it before enabling this entry.
    # - column: "CTaa"
    #   type: "not_na"
    #   values: null
# 4. Subsetting and Sketching (applied per-object after filtering)
subset:
  run: true
  # NOTE(review): the section header says "per-object" while this value is
  # described as a total — confirm whether the cap applies to each object or
  # to all objects combined.
  n_cells_total: 50000  # Total number of cells to subset/sketch to
  method: "stratified"  # "stratified" or "geometric"
  # Settings for stratified sampling
  stratified_sampling:
    strata_cols: ["tissue", "orig.ident", "predicted.celltype.l1"]
    min_cells_per_stratum: 10
# 5. Preprocessing
preprocess:
  # Count of variable features to use (presumably the number selected during
  # preprocessing — confirm against the consumer).
  n_variable_features: 3000
# 6. Integration Methods: settings for each
# `run` lists the methods to execute; every name in `run` has a same-named
# settings mapping below.
methods:
  run: ["harmony", "fastmnn", "scvi", "scanvi"]  # Which methods to run

  harmony:
    group_by_vars: ["orig.ident"]  # Variables to harmonize
    # theta and lambda each hold one entry, matching the single entry in
    # group_by_vars (presumably one value per variable — confirm).
    theta: [2]  # Diversity clustering penalty parameter
    lambda: [1]  # Ridge regression penalty parameter
    max_iter_harmony: 10
    n_dims: 30  # Number of Harmony components

  fastmnn:
    workers: 8
    n_dims: 40
    k: 20
    ndist: 3
    batch_var: "orig.ident"
    hnsw_M: 16
    hnsw_ef_search: 50

  scvi:
    use_var_highly_variable: true  # honor var['highly_variable'] if present
    hvg_n: null  # null = do NOT recompute or cap; use your list as-is
    # NOTE(review): scanpy's documented HVG flavors are 'seurat', 'cell_ranger',
    # 'seurat_v3' and 'seurat_v3_paper'; confirm the consumer maps "seurat_v5"
    # to something valid before relying on this fallback.
    hvg_flavor: "seurat_v5"  # if we must compute HVGs and scanpy is installed
    n_layers: 2
    n_hidden: 128
    n_latent: 30
    max_epochs: 200
    learning_rate: 0.001
    batch_size: 64
    precision: "32-true"  # "16-mixed", "bf16-mixed", or omit for CPU/older GPUs
    accelerator: "cpu"  # "auto"|"gpu"|"cpu"|"mps"
    devices: 1
    early_stopping: true
    check_val_every_n_epoch: 5

  # Mirrors the scvi settings, plus the two label-related keys.
  scanvi:
    use_var_highly_variable: true  # honor var['highly_variable'] if present
    hvg_n: null  # null = do NOT recompute or cap; use your list as-is
    # NOTE(review): same caveat as scvi.hvg_flavor — "seurat_v5" is not a
    # documented scanpy flavor; confirm the consumer handles it.
    hvg_flavor: "seurat_v5"
    unlabeled_category: "Unknown"  # Value for cells with no label
    labels_key: "Monaco.labels"
    n_layers: 2
    n_hidden: 128
    n_latent: 30
    max_epochs: 200
    learning_rate: 0.001
    batch_size: 64
    precision: "32-true"
    accelerator: "cpu"
    devices: 1
    early_stopping: true
    check_val_every_n_epoch: 5
# 7. Post-integration Clustering and UMAP
post_integration:
  n_neighbors: 30
  leiden_resolution: 1.0
  n_dims_use: 30  # Number of dimensions from each method to use for UMAP/clustering
  # Metadata column name used as labels (presumably the cell-type annotation
  # for downstream evaluation — confirm against the consumer).
  labels: "predicted.celltype.l2"
# 8. Metrics
# Metric names grouped into two families: `batch` and `label`.
# NOTE(review): presumably these feed the batch-correction and
# label-conservation scores weighted in the ranking section — confirm.
metrics:
  batch: ["asw_batch", "ilisi", "kbet_batch"]
  label: ["nmi", "ari", "clisi", "kbet_label"]
# 9. Ranking
# The two weights below sum to 1.0 (0.6 + 0.4).
ranking:
  w_batch: 0.6  # Weight for the batch correction score
  w_label: 0.4  # Weight for the label conservation score
# 10. Plotting
plotting:
  umap_dot_size: 0.1
  # One entry per grouping used for UMAP plots (presumably metadata column
  # names — confirm "batch" exists as a column in the plotted object).
  umap_group_by: ["batch", "predicted.celltype.l1"]
  palette: "viridis"  # Color palette for plots
# 11. Reproducibility
# Single integer seed; which random number generators consume it is decided
# by the pipeline code.
seed: 42
# 12. Memory Handling
memory:
  num_workers: 0
  pin_memory: false
  min_cells_per_batch: 3
  # NOTE(review): presumably, when drop_layers_except is true, every layer not
  # listed in keep_layers is dropped — confirm against the consumer.
  keep_layers: ["counts"]
  drop_raw: true
  drop_obsm: true
  drop_obsp: true
  drop_varm: true
  drop_layers_except: true
# 13. Clustering
clustering:
  dims: [1, 30]  # inclusive PCA dims used
  graph_name: "KNN"
  # Parameter grid: 10 resolutions x 4 k values.
  grid:
    resolutions: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0]
    k_params: [20, 30, 40, 50]
    weights: null  # reserved for future (none/SCTransform, etc.)
  stability:
    subsample_frac: 0.8
    n_repeats: 25
  # The four w_* weights sum to 1.0 (0.35 + 0.25 + 0.15 + 0.25).
  scoring_weights:
    w_sil: 0.35
    w_mod: 0.25
    w_conn: 0.15
    w_stab: 0.25
    # NOTE(review): the original nesting of singleton_penalty could not be
    # determined; it is placed under scoring_weights here — confirm whether the
    # consumer reads it from scoring_weights or directly from clustering.
    singleton_penalty: 0.05
# Parallelism
# NOTE(review): treated as a top-level section; confirm the consumer does not
# expect this mapping nested under `clustering`.
parallel:
  nworkers: 8  # used by the new clustering steps