utility/config.yaml at main · ncborcherding/utility · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
# config.yaml

# 1. Paths and File Settings
# Filesystem locations for pipeline inputs and outputs. Every value is a
# relative path; what they are resolved against (working directory vs. this
# file's location) is decided by the consuming code — confirm before moving files.
paths:
  seurat_objects: "./data/processedData/seuratObjects/" # Path to individual Seurat objects
  sequencing_runs: "./data/sequencingRuns/" # Sequencing-run directory (assumed from the key name)
  results_dir: "./results/" # Results directory
  figures_dir: "./figs/" # Figures directory
  tmp_dir: "./tmp/" # For intermediate files like .h5ad
  # Written as a literal path under ./results/ rather than derived from
  # results_dir, so update both together if the results location changes.
  log_file: "./results/pipeline.log"

# 2. QC for original processing
# QC thresholds. Units and cutoff direction (keep >= min_*, keep <= max_*) are
# implied by the key names only — confirm against the QC code.
qc:
  min_features: 100 # Lower bound on detected features
  min_counts: 0 # Lower bound on counts; 0 presumably makes this a no-op — confirm
  max_mito_pct: 10 # Upper bound on mitochondrial percentage
  max_ribo_pct: 60 # Upper bound on ribosomal percentage
  high_feature_sd_enable: true # Toggle for the high_feature_sd cutoff below
  high_feature_sd: 2.5 # Presumably a cutoff in standard deviations of feature count — TODO confirm

# 3. Filtering (applied per-object before BPCells conversion)
# Metadata filters; per the section header they are applied per object before
# BPCells conversion.
filtering:
  run: true # Switch for this step (presumably false skips all filters — confirm)
  filters:
    # One list item per filter condition on a metadata column.
    # Keys: column, type, values.
    # Documented types: 'in' (keep values in list) and 'equals' (keep value).
    - column: "predicted.celltype.l1"
      type: "in"
      values:
        - "CD4 T"
        - "CD8 T"
        - "other T"
        - "NK"
    - column: "db.class"
      type: "equals"
      values: "singlet"
    # Add more filters as needed, e.g., to filter by 'CTaa' if required.
    # NOTE(review): 'not_na' is not among the documented types above — confirm
    # the pipeline supports it before enabling this example.
    # - column: "CTaa"
    #   type: "not_na"
    #   values: null

# 4. Subsetting and Sketching (applied per-object after filtering)
subset:
  run: true # Switch for this step
  # Total number of cells to subset/sketch to.
  # NOTE(review): the section header says this step is applied per object —
  # confirm whether 50000 is a per-object or an overall target.
  n_cells_total: 50000
  # "stratified" or "geometric". Only stratified settings are defined in this
  # section; there is no geometric-specific block here.
  method: "stratified"

  # Settings for stratified sampling
  stratified_sampling:
    # Metadata columns whose combinations define the strata.
    strata_cols:
      - "tissue"
      - "orig.ident"
      - "predicted.celltype.l1"
    min_cells_per_stratum: 10 # Presumably a per-stratum floor on sampled cells — TODO confirm

# 5. Preprocessing
preprocess:
  n_variable_features: 3000 # Number of variable features to select (assumed from the key name — confirm)

# 6. Integration Methods: settings for each
methods:
  # Which methods to run. Each name listed has a same-named settings block
  # under `methods`. One entry per line so a method can be toggled by
  # commenting out its line.
  run:
    - "harmony"
    - "fastmnn"
    - "scvi"
    - "scanvi"

  # Harmony settings. theta and lambda are one-element lists, matching the
  # single entry in group_by_vars — presumably one value per variable; confirm.
  harmony:
    group_by_vars: ["orig.ident"] # Variables to harmonize
    theta: [2] # Diversity clustering penalty parameter
    lambda: [1] # Ridge regression penalty parameter
    max_iter_harmony: 10 # Maximum Harmony iterations (assumed from the key name)
    n_dims: 30 # Number of Harmony components

  # fastMNN settings. Key meanings below are inferred from their names unless
  # stated otherwise — confirm against the integration code.
  fastmnn:
    workers: 8 # Parallel workers for this method
    n_dims: 40 # Number of dimensions; note post_integration.n_dims_use is 30
    k: 20 # Presumably the number of nearest neighbours for the MNN search
    ndist: 3 # NOTE(review): meaning not evident here — compare batchelor::fastMNN `ndist`
    batch_var: "orig.ident" # Metadata column identifying the batch
    hnsw_M: 16 # Presumably the HNSW index parameter M
    hnsw_ef_search: 50 # Presumably the HNSW ef_search parameter

  # scVI settings.
  scvi:
    # -- Highly variable gene (HVG) handling --
    use_var_highly_variable: true # honor var['highly_variable'] if present
    hvg_n: null # null = do NOT recompute or cap; use your list as-is
    # Used only if we must compute HVGs and scanpy is installed.
    # NOTE(review): scanpy documents the flavors "seurat", "cell_ranger",
    # "seurat_v3" and "seurat_v3_paper"; confirm how the pipeline handles
    # "seurat_v5" before relying on this fallback.
    hvg_flavor: "seurat_v5"
    # -- Network size --
    n_layers: 2
    n_hidden: 128
    n_latent: 30
    # -- Training --
    max_epochs: 200
    learning_rate: 0.001
    batch_size: 64
    # "16-mixed", "bf16-mixed", or omit for CPU/older GPUs
    precision: "32-true"
    # "auto"|"gpu"|"cpu"|"mps"
    accelerator: "cpu"
    devices: 1
    early_stopping: true
    check_val_every_n_epoch: 5

  # scANVI settings. Mirrors the scvi block, plus the label-related keys.
  scanvi:
    # -- Highly variable gene (HVG) handling --
    use_var_highly_variable: true # honor var['highly_variable'] if present
    hvg_n: null # null = do NOT recompute or cap; use your list as-is
    # NOTE(review): scanpy documents the flavors "seurat", "cell_ranger",
    # "seurat_v3" and "seurat_v3_paper"; confirm how the pipeline handles
    # "seurat_v5".
    hvg_flavor: "seurat_v5"
    # -- Labels --
    # Value for cells with no label
    unlabeled_category: "Unknown"
    # Metadata column holding the labels (assumed from the key name — confirm).
    labels_key: "Monaco.labels"
    # -- Network size --
    n_latent: 30
    n_layers: 2
    n_hidden: 128
    # -- Training --
    max_epochs: 200
    learning_rate: 0.001
    batch_size: 64
    precision: "32-true"
    accelerator: "cpu"
    devices: 1
    early_stopping: true
    check_val_every_n_epoch: 5

# 7. Post-integration Clustering and UMAP
post_integration:
  n_neighbors: 30 # Neighbours for the graph (assumed from the key name)
  leiden_resolution: 1.0 # Resolution for Leiden clustering (assumed from the key name)
  # harmony, scvi and scanvi are configured for 30 dims/latents but fastmnn for
  # 40, so presumably only a subset of its dimensions is used — confirm.
  n_dims_use: 30 # Number of dimensions from each method to use for UMAP/clustering
  # NOTE(review): methods.scanvi.labels_key uses "Monaco.labels" while this
  # uses "predicted.celltype.l2" — confirm the difference is intended.
  labels: "predicted.celltype.l2"

# 8. Metrics
# Metric names grouped into a "batch" family and a "label" family; presumably
# these feed ranking.w_batch and ranking.w_label respectively — confirm.
metrics:
  batch:
    - "asw_batch"
    - "ilisi"
    - "kbet_batch"
  label:
    - "nmi"
    - "ari"
    - "clisi"
    - "kbet_label"

# 9. Ranking
# The two weights sum to 1.0. Whether the ranking code normalises them is not
# visible from this file, so keep the sum at 1.0 when retuning.
ranking:
  w_batch: 0.6 # Weight for the batch correction score
  w_label: 0.4 # Weight for the label conservation score

# 10. Plotting
plotting:
  umap_dot_size: 0.1
  # Groupings to colour UMAPs by.
  # NOTE(review): "batch" is not used as a column name anywhere else in this
  # file (the methods use "orig.ident") — confirm the pipeline creates it.
  umap_group_by:
    - "batch"
    - "predicted.celltype.l1"
  # Color palette for plots
  palette: "viridis"

# 11. Reproducibility
seed: 42 # Top-level key (not nested in a section); presumably shared by all steps — confirm

# 12. Memory Handling
# Memory-related settings. num_workers/pin_memory look like data-loader
# options and raw/obsm/obsp/varm/layers look like AnnData slots — both are
# inferred from the key names; confirm against the consuming code.
memory:
  num_workers: 0
  pin_memory: false
  min_cells_per_batch: 3 # NOTE(review): presumably batches with fewer cells are excluded — confirm
  keep_layers: ["counts"] # Layers to retain; presumably consulted when drop_layers_except is true
  drop_raw: true
  drop_obsm: true
  drop_obsp: true
  drop_varm: true
  drop_layers_except: true # Presumably: drop every layer not listed in keep_layers — confirm

# 13. Clustering
# Clustering parameter search. The grid below spans 10 resolutions x 4 k
# values (40 combinations); how the stability repeats multiply that cost
# depends on the clustering code.
clustering:
  dims: [1, 30] # inclusive PCA dims used
  graph_name: "KNN" # Name of the neighbour graph (assumed from the key name)
  grid:
    resolutions: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0]
    k_params: [20, 30, 40, 50]
    weights: null # reserved for future (none/SCTransform, etc.)
  stability:
    subsample_frac: 0.8 # Fraction of cells per subsample (assumed from the key name)
    n_repeats: 25 # Number of subsampling repeats (assumed from the key name)
  scoring_weights:
    # The four w_* weights sum to 1.0; singleton_penalty is a separate term.
    # What each abbreviation scores (sil/mod/conn/stab) is defined by the
    # clustering code — confirm there.
    w_sil: 0.35
    w_mod: 0.25
    w_conn: 0.15
    w_stab: 0.25
    singleton_penalty: 0.05
  parallel:
    nworkers: 8 # used by the new clustering steps