diff --git a/code/reference_data/rss_ld_sketch.ipynb b/code/reference_data/rss_ld_sketch.ipynb index 5c266023..53b626c5 100644 --- a/code/reference_data/rss_ld_sketch.ipynb +++ b/code/reference_data/rss_ld_sketch.ipynb @@ -10,30 +10,30 @@ "# RSS LD Sketch Pipeline\n", "\n", "## Overview\n", - "This pipeline generates a **stochastic genotype sample** U from whole-genome sequencing VCF files and stores it as a PLINK2 pgen file for use as an LD reference panel.\n", "\n", - "**Terminology:**\n", - "- **Stochastic genotype sample**: U = WT G, shape (B × p). B pseudo-individuals × p variants. This is what we store in the pgen file (after scaling to $[0,2]$).\n", - "- **LD sketch** (computed downstream, never stored here): R̂ = UTU / B, computed by PLINK2 or SuSiE-RSS when LD is requested from the pgen.\n", + "This pipeline generates a stochastic genotype sample **U = WᵀG** from whole-genome sequencing VCF files and stores it as a PLINK2 pgen file for use as an LD reference panel with SuSiE-RSS fine-mapping.\n", "\n", - "**Steps:**\n", - "1. Generate projection matrix W once from total cohort sample size n (saved as `.npy`, shared across all chromosomes)\n", - "2. Read LD block BED file and emit one coordinate file per block (`process_block_1`)\n", - "3. For each block in parallel: load VCF, compute U = WTG, scale to [0,2], write compressed dosage file + allele frequencies — no pgen yet (`process_block_2`)\n", - "4. For each chromosome: convert per-block dosage files to pgen, merge into one per-chromosome pgen, clean up all intermediate block files (`merge_chrom`)\n", + "**Key idea:** Rather than storing the full genotype matrix G (n × p, ~530 MB/block), we compute U = WᵀG (B × p, ~320 MB/block) using a random projection matrix W ~ N(0, 1/√n). The approximate LD matrix R̂ = UᵀU / B ≈ R = GᵀG / n by the Johnson-Lindenstrauss lemma. 
G is never stored.\n", "\n", - "**Matrix convention:**\n", - "- G : (n × p) — individuals × variants\n", - "- W : (n × B) — individuals × pseudo-individuals, W ~ N(0, 1/√n)\n", - "- U : (B × p) — stochastic genotype sample = WT G\n", + "**Matrix dimensions:**\n", + "- G : (n × p) — n individuals × p variants (~8,000 per block)\n", + "- W : (n × B) — projection matrix, W ~ N(0, 1/√n), generated once per cohort\n", + "- U : (B × p) — stochastic genotype sample = WᵀG, stored in pgen\n", + "- R̂ : (p × p) — approximate LD matrix, computed on-the-fly by SuSiE-RSS from U\n", + "\n", + "**Pipeline steps:**\n", + "1. `generate_W` — generate W once from cohort sample size n (saved as `.npy`, shared across all chromosomes)\n", + "2. `process_block_1` — read LD block BED file, emit one coordinate file per block\n", + "3. `process_block_2` — for each block in parallel: load VCF, compute U = WᵀG, scale to [0,2], write compressed dosage + allele frequencies\n", + "4. `merge_chrom` — for each chromosome: convert per-block dosage files to sorted pgen, merge into one per-chromosome pgen, clean up all intermediates\n", "\n", "## Input Files\n", "\n", "| File | Description |\n", "|------|-------------|\n", - "| VCF (.bgz) | Per-chromosome VCF files with tabix index. If the cohort contains mixed ancestries, provide `--sample-list` to restrict to one source population. |\n", - "| `sample_list.txt` | Optional. One sample ID per line. If provided, only these samples are loaded from the VCF. |\n", - "| `EUR_LD_blocks.bed` | LD block BED file (0-based half-open, e.g. Berisa & Pickrell EUR blocks). |\n", + "| VCF (.bgz) | Per-chromosome VCF files with tabix index |\n", + "| `sample_list.txt` | Optional. One sample ID per line. Leave empty to use all samples. |\n", + "| `EUR_LD_blocks.bed` | LD block BED file (0-based half-open, e.g. 
Berisa & Pickrell EUR blocks) |\n", "\n", "## Parameters\n", "\n", @@ -41,64 +41,55 @@ "\n", "| Parameter | Default | Description |\n", "|-----------|---------|-------------|\n", - "| `plink2_bin` | `plink2` | Path to plink2 binary, or just `plink2` if on `$PATH` |\n", + "| `plink2_bin` | `plink2` | Path to plink2 binary |\n", + "| `walltime` | `24:00:00` | Per-task walltime |\n", + "| `mem` | `32G` | Per-task memory |\n", + "| `numThreads` | `8` | Per-task CPU cores |\n", "\n", - "**`generate_W` workflow parameters:**\n", + "**`generate_W` parameters:**\n", "\n", "| Parameter | Default | Description |\n", "|-----------|---------|-------------|\n", - "| `n_samples` | required | Total number of individuals in the cohort (determines W shape) |\n", + "| `n_samples` | required | Total number of individuals in the cohort |\n", "| `output_dir` | required | Output directory for `W_B{B}.npy` |\n", - "| `B` | `10000` | Number of pseudo-individuals (sketch dimension) |\n", - "| `seed` | `123` | Random seed for W generation |\n", + "| `B` | `10000` | Sketch dimension (number of pseudo-individuals) |\n", + "| `seed` | `123` | Random seed |\n", "\n", - "**`process_block` workflow parameters:**\n", + "**`process_block` parameters:**\n", "\n", "| Parameter | Default | Description |\n", "|-----------|---------|-------------|\n", - "| `vcf_base` | required | Directory containing VCF (.bgz) files, one or more per chromosome |\n", - "| `vcf_prefix` | required | Shared filename prefix of all VCF files (everything before `chr{N}:` or `chr{N}.`) |\n", - "| `ld_block_file` | required | Path to LD block BED file (e.g. 
`EUR_LD_blocks.bed`) |\n", - "| `chrom` | `0` | Chromosome number (1–22), or 0 for all chromosomes |\n", + "| `vcf_base` | required | Directory containing VCF (.bgz) files |\n", + "| `vcf_prefix` | required | Shared filename prefix of all VCF files |\n", + "| `ld_block_file` | required | Path to LD block BED file |\n", + "| `chrom` | `0` | Chromosome 1–22, or 0 for all chromosomes |\n", "| `cohort_id` | `ADSP.R5.EUR` | Cohort prefix for output filenames |\n", "| `output_dir` | required | Base output directory |\n", "| `W_matrix` | required | Path to W `.npy` output from `generate_W` |\n", - "| `B` | `10000` | Number of pseudo-individuals — must match W |\n", - "| `sample_list` | `\"\"` | Optional path to a file of sample IDs (one per line) to subset from the VCF. Leave empty to use all samples. |\n", + "| `B` | `10000` | Must match W |\n", + "| `sample_list` | `\"\"` | Optional path to sample ID file |\n", "| `maf_min` | `0.0005` | Minimum minor allele frequency |\n", "| `mac_min` | `5` | Minimum minor allele count |\n", "| `msng_min` | `0.05` | Maximum missingness rate per variant |\n", "\n", - "**`merge_chrom` workflow parameters:**\n", + "**`merge_chrom` parameters:**\n", "\n", "| Parameter | Default | Description |\n", "|-----------|---------|-------------|\n", - "| `chrom` | `0` | Chromosome to merge (1–22), or 0 for all |\n", + "| `chrom` | `0` | Chromosome 1–22, or 0 for all chromosomes |\n", "| `output_dir` | required | Base output directory (same as `process_block`) |\n", - "| `cohort_id` | `ADSP.R5.EUR` | Must match the `process_block` cohort_id used |\n", - "\n", - "**Global parameters (shared across all workflows):**\n", - "\n", - "| Parameter | Default | Description |\n", - "|-----------|---------|-------------|\n", - "| `walltime` | `24:00:00` | Per-task walltime |\n", - "| `mem` | `32G` | Per-task memory |\n", - "| `numThreads` | `8` | Per-task CPU cores |\n", + "| `cohort_id` | `ADSP.R5.EUR` | Must match `process_block` cohort_id |\n", "\n", "## 
Output Files\n", "\n", "| File | Description |\n", "|------|-------------|\n", - "| `W_B{B}.npy` | Projection matrix W, shape (n × B). Output of `generate_W`, input to `process_block`. |\n", - "| `{cohort_id}.chr{N}_stocld_panel.pgen` | **Final output.** Per-chromosome stochastic genotype sample U as PLINK2 binary |\n", - "| `{cohort_id}.chr{N}_stocld_panel.pvar` | Per-chromosome variant information |\n", - "| `{cohort_id}.chr{N}_stocld_panel.psam` | Pseudo-individual IDs S1…S{B} |\n", - "| `{cohort_id}.chr{N}_stocld_panel.afreq` | Per-chromosome allele frequencies (OBS_CT = 2 × n) |\n", - "| *(cleaned up after merge)* | Per-block `.dosage.gz`, `.map`, `.afreq`, `.meta` intermediate files |\n", - "\n", - "**Timing:** ~7 hours for all 22 chromosomes with -J 32 (blocks parallelized), plus ~1 hour for merge_chrom\n", - "\n", - "## Minimal Working Example\n" + "| `W_B{B}.npy` | Projection matrix W, shape (n × B). Output of `generate_W`. |\n", + "| `{cohort_id}.chr{N}.pgen` | Final output — per-chromosome stochastic genotype sample U as PLINK2 binary |\n", + "| `{cohort_id}.chr{N}.pvar` | Per-chromosome variant information |\n", + "| `{cohort_id}.chr{N}.psam` | Pseudo-individual IDs S1…S{B} |\n", + "| `{cohort_id}.chr{N}.afreq` | Per-chromosome allele frequencies (OBS_CT = 2 × n) |\n", + "| *(cleaned up after merge)* | Per-block `.dosage.gz`, `.map`, `.afreq`, `.meta` intermediate files |\n" ] }, { @@ -106,7 +97,7 @@ "id": "3547538e-ef0f-49d3-aa45-208a278f5d3c", "metadata": {}, "source": [ - "### Step 0: Generate W once — provide your VCF directory, prefix, and LD block file" + "### Step 0 — Generate W (run once per cohort)\n" ] }, { @@ -119,10 +110,10 @@ "outputs": [], "source": [ "sos run pipeline/rss_ld_sketch.ipynb generate_W \\\n", - " --n-samples 2000 \\\n", + " --n-samples 16571 \\\n", " --output-dir output/rss_ld_sketch \\\n", " --B 10000 \\\n", - " --seed 123" + " --seed 123\n" ] }, { @@ -130,7 +121,7 @@ "id": "e2559a2e-8bdd-443b-980f-26fe7f728674", "metadata": 
{}, "source": [ - "### Step 1: Process all blocks → compressed dosage files\n" + "### Step 1+2 — Process all blocks → compressed dosage files\n" ] }, { @@ -142,14 +133,14 @@ }, "outputs": [], "source": [ - "# Process all blocks for one chromosome\n", + "# Process one chromosome\n", "sos run pipeline/rss_ld_sketch.ipynb process_block \\\n", " --vcf-base /path/to/vcfs/ \\\n", " --vcf-prefix your.prefix. \\\n", " --ld-block-file /path/to/EUR_LD_blocks.bed \\\n", " --chrom 22 \\\n", " --output-dir output/rss_ld_sketch \\\n", - " --W-matrix output/rss_ld_sketch/W_B10000.npy\n" + " --W-matrix output/rss_ld_sketch/W_B10000.npy " ] }, { @@ -166,7 +157,8 @@ " --ld-block-file /restricted/projectnb/casa/oaolayin/ROSMAP_NIA_geno/EUR_LD_blocks.bed \\\n", " --chrom 0 \\\n", " --output-dir output/rss_ld_sketch \\\n", - " --W-matrix output/rss_ld_sketch/W_B10000.npy" + " --W-matrix output/rss_ld_sketch/W_B10000.npy \n", + "\n" ] }, { @@ -174,7 +166,7 @@ "id": "19fdaf69-ae0d-45fb-b7a1-dc5a03ff9a51", "metadata": {}, "source": [ - "#### Step 2: Merge per-block dosage files into per-chromosome pgen\n" + "### Step 3 — Merge per-block dosage files into per-chromosome pgen\n" ] }, { @@ -186,7 +178,8 @@ "source": [ "sos run pipeline/rss_ld_sketch.ipynb merge_chrom \\\n", " --output-dir output/rss_ld_sketch \\\n", - " --chrom 0" + " --chrom 0\n", + "\n" ] }, { @@ -281,10 +274,10 @@ " Must match cohort-id used in process_block.\n", " --output-dir VAL (as str, required)\n", " Output per chromosome (final, block intermediates cleaned up):\n", - " {cohort_id}.chr{N}_stocld_panel.pgen\n", - " {cohort_id}.chr{N}_stocld_panel.pvar\n", - " {cohort_id}.chr{N}_stocld_panel.psam\n", - " {cohort_id}.chr{N}_stocld_panel.afreq\n", + " {cohort_id}.chr{N}.pgen\n", + " {cohort_id}.chr{N}.pvar\n", + " {cohort_id}.chr{N}.psam\n", + " {cohort_id}.chr{N}.afreq\n", "```\n" ] }, @@ -295,7 +288,7 @@ "kernel": "SoS" }, "source": [ - "# Setup and Global Parameters" + "# Pipeline Implementation\n" ] }, { @@ -310,14 
+303,11 @@ "[global]\n", "parameter: cwd = path(\"output\")\n", "parameter: job_size = 1\n", - "parameter: walltime = \"4h\"\n", - "parameter: mem = \"32G\"\n", - "parameter: numThreads = 8\n", + "parameter: walltime = \"24:00:00\"\n", + "parameter: mem = \"32G\"\n", + "parameter: numThreads = 8\n", "parameter: container = \"\"\n", - "parameter: plink2_bin = \"plink2\"\n", - "parameter: walltime = \"24:00:00\"\n", - "parameter: mem = \"32G\"\n", - "parameter: numThreads = 8\n", + "parameter: plink2_bin = \"plink2\"\n", "\n", "import re\n", "from sos.utils import expand_size\n", @@ -337,7 +327,9 @@ "kernel": "SoS" }, "source": [ - "## `generate_W`" + "## `generate_W`\n", + "\n", + "Generates the random projection matrix W ~ N(0, 1/√n), shape (n × B). Run once per cohort. W is shared across all chromosomes.\n" ] }, { @@ -357,10 +349,10 @@ "# n_samples is passed directly as a parameter; no VCF reading is needed.\n", "# All 22 chromosomes reuse the same W so that per-chromosome stochastic\n", "# genotype samples can be arithmetically merged for meta-analysis.\n", - "parameter: n_samples = int\n", - "parameter: output_dir = str\n", - "parameter: B = 10000\n", - "parameter: seed = 123\n", + "parameter: n_samples = int\n", + "parameter: output_dir = str\n", + "parameter: B = 10000\n", + "parameter: seed = 123\n", "\n", "import os\n", "input: []\n", @@ -396,7 +388,9 @@ "kernel": "SoS" }, "source": [ - "## `process_block`\n" + "## `process_block`\n", + "\n", + "Two-step workflow. `process_block_1` reads the BED file and emits one coordinate file per LD block. `process_block_2` reads the VCF for each block, computes U = WᵀG, scales to [0,2], and writes compressed dosage + allele frequency files. G is never stored.\n" ] }, { @@ -458,7 +452,9 @@ "id": "a1b2c3d4-0031-4031-a031-000000000031", "metadata": {}, "source": [ - "## `process_block_2`\n" + "## `process_block_2`\n", + "\n", + "Per-block parallel task. 
Reads VCF, computes U = WᵀG in chunks, scales to [0,2], writes `.dosage.gz`, `.map`, `.afreq`, `.meta`. Output is intermediate — cleaned up by `merge_chrom`.\n" ] }, { @@ -683,7 +679,9 @@ "id": "a1b2c3d4-0040-4040-a040-000000000040", "metadata": {}, "source": [ - "## `merge_chrom`\n" + "## `merge_chrom`\n", + "\n", + "For each chromosome: converts per-block `.dosage.gz` files to sorted pgen (two-pass: import then `--sort-vars`), merges all blocks into one per-chromosome pgen using `plink2 --pmerge-list`, concatenates allele frequencies, and removes all per-block intermediate directories. Output: `{cohort_id}.chr{N}.{pgen,pvar,psam,afreq}`.\n" ] }, { @@ -698,98 +696,102 @@ "[merge_chrom]\n", "# Merge all per-block dosage files for a chromosome into one per-chrom pgen.\n", "# Steps per chromosome:\n", - "# 1. For each block: plink2 --import-dosage → per-block pgen (temp)\n", + "# 1. For each block: plink2 --import-dosage → unsorted pgen, then --sort-vars → sorted pgen\n", "# 2. plink2 --pmerge-list → one per-chrom pgen\n", - "# 3. Clean up all per-block dirs\n", + "# 3. Clean up all intermediates including -merge.* files\n", "#\n", "# Run after process_block completes (or run together: sos run ... 
process_block merge_chrom)\n", - "parameter: chrom = 0\n", - "parameter: output_dir = str\n", - "parameter: cohort_id = \"ADSP.R5.EUR\"\n", + "parameter: chrom = 0\n", + "parameter: output_dir = str\n", + "parameter: cohort_id = \"ADSP.R5.EUR\"\n", "\n", "import os, glob\n", "\n", "def _chroms_to_process(output_dir, chrom_filter):\n", " if chrom_filter != 0:\n", " return [f\"chr{chrom_filter}\"]\n", - " found = sorted(set(\n", + " return sorted(set(\n", " os.path.basename(d)\n", " for d in glob.glob(os.path.join(output_dir, \"chr*\"))\n", " if os.path.isdir(d)\n", " ))\n", - " return found\n", "\n", "_chroms = _chroms_to_process(output_dir, chrom)\n", - "print(f\" Merging {len(_chroms)} chromosome(s): {', '.join(_chroms)}\")\n", "\n", - "input: []\n", - "for_each: {\"_chrom\": _chroms}\n", - "task: trunk_workers = 1, trunk_size = 1, walltime = walltime, mem = mem, cores = numThreads\n", - "bash: expand = \"${ }\"\n", + "input: []\n", + "output: [f\"{output_dir}/{c}/{cohort_id}.{c}.pgen\" for c in _chroms]\n", "\n", + "for _chrom in _chroms:\n", + " bash(f\"\"\"\n", " set -euo pipefail\n", - " chrom=\"${_chrom}\"\n", - " cohort_id=\"${cohort_id}\"\n", - " output_dir=\"${output_dir}\"\n", - " plink2_bin=\"${plink2_bin}\"\n", - " chrom_dir=\"${output_dir}/${chrom}\"\n", - " final_prefix=\"${chrom_dir}/${cohort_id}.${chrom}_stocld_panel\"\n", - " merge_list=\"${chrom_dir}/${cohort_id}.${chrom}_pmerge_list.txt\"\n", - " psam_path=\"${chrom_dir}/${cohort_id}.${chrom}_stocld_panel.psam\"\n", - "\n", - " echo \"[merge_chrom] ${chrom}: starting\"\n", - "\n", - " # ── Step 1: Convert each block dosage.gz → temp per-block pgen ──\n", - " > \"${merge_list}\"\n", - " for dosage_gz in \"${chrom_dir}\"/*/*.dosage.gz; do\n", - " block_dir=$(dirname \"${dosage_gz}\")\n", - " block_tag=$(basename \"${block_dir}\")\n", - " prefix=\"${block_dir}/${cohort_id}.${block_tag}_tmp\"\n", - " map_file=\"${block_dir}/${cohort_id}.${block_tag}.map\"\n", - " 
psam_file=\"${block_dir}/${cohort_id}.${block_tag}.psam\"\n", - "\n", - " # Write .psam if not present (B pseudo-individuals)\n", - " B=$(grep \"^B=\" \"${block_dir}/${cohort_id}.${block_tag}.meta\" | cut -d= -f2)\n", - " if [ ! -f \"${psam_file}\" ]; then\n", - " echo \"#FID IID\" > \"${psam_file}\"\n", - " for i in $(seq 1 ${B}); do echo \"S${i} S${i}\"; done >> \"${psam_file}\"\n", - " fi\n", - "\n", - " echo \" importing ${block_tag} ($(zcat ${dosage_gz} | wc -l) variants)\"\n", - " \"${plink2_bin}\" --import-dosage \"${dosage_gz}\" format=1 noheader --psam \"${psam_file}\" --map \"${map_file}\" --make-pgen --out \"${prefix}\" --silent\n", - "\n", - " echo \"${prefix}\" >> \"${merge_list}\"\n", + " shopt -s nullglob\n", + "\n", + " chrom_dir=\"{output_dir}/{_chrom}\"\n", + " final_prefix=\"${{chrom_dir}}/{cohort_id}.{_chrom}\"\n", + " merge_list=\"${{chrom_dir}}/{cohort_id}.{_chrom}_pmerge_list.txt\"\n", + "\n", + " # ── Step 1: Convert each block dosage.gz → sorted per-block pgen (two-pass) ──\n", + " > \"${{merge_list}}\"\n", + " files=(\"${{chrom_dir}}\"/*/*.dosage.gz)\n", + " if [ ${{#files[@]}} -eq 0 ]; then\n", + " exit 1\n", + " fi\n", + " for dosage_gz in \"${{files[@]}}\"; do\n", + " block_dir=$(dirname \"${{dosage_gz}}\")\n", + " block_tag=$(basename \"${{block_dir}}\")\n", + " prefix=\"${{block_dir}}/{cohort_id}.${{block_tag}}_tmp\"\n", + " map_file=\"${{block_dir}}/{cohort_id}.${{block_tag}}.map\"\n", + " psam_file=\"${{block_dir}}/{cohort_id}.${{block_tag}}.psam\"\n", + " meta_file=\"${{block_dir}}/{cohort_id}.${{block_tag}}.meta\"\n", + " B=$(grep \"^B=\" \"${{meta_file}}\" | cut -d= -f2)\n", + " printf '#FID\\tIID\\n' > \"${{psam_file}}\"\n", + " for i in $(seq 1 ${{B}}); do\n", + " printf 'S%d\\tS%d\\n' ${{i}} ${{i}} >> \"${{psam_file}}\"\n", + " done\n", + " # Pass 1: import dosage → unsorted pgen\n", + " \"{plink2_bin}\" \\\n", + " --import-dosage \"${{dosage_gz}}\" format=1 noheader \\\n", + " --psam \"${{psam_file}}\" \\\n", + " --map 
\"${{map_file}}\" \\\n", + " --make-pgen \\\n", + " --out \"${{prefix}}_unsorted\" \\\n", + " --silent\n", + " # Pass 2: sort variants\n", + " \"{plink2_bin}\" \\\n", + " --pfile \"${{prefix}}_unsorted\" \\\n", + " --make-pgen \\\n", + " --sort-vars \\\n", + " --out \"${{prefix}}\" \\\n", + " --silent\n", + " rm -f \"${{prefix}}_unsorted.pgen\" \"${{prefix}}_unsorted.pvar\" \"${{prefix}}_unsorted.psam\"\n", + " echo \"${{prefix}}\" >> \"${{merge_list}}\"\n", " done\n", "\n", - " # ── Step 2: Write shared .psam for the merged file ──\n", - " B=$(grep \"^B=\" $(ls \"${chrom_dir}\"/*/*.meta | head -1) | cut -d= -f2)\n", - " echo \"#FID IID\" > \"${psam_path}\"\n", - " for i in $(seq 1 ${B}); do echo \"S${i} S${i}\"; done >> \"${psam_path}\"\n", - "\n", - " # ── Step 3: Merge all per-block pgens → one per-chrom pgen ──\n", - " n_blocks=$(wc -l < \"${merge_list}\")\n", - " echo \" merging ${n_blocks} blocks → ${final_prefix}\"\n", - " \"${plink2_bin}\" --pmerge-list \"${merge_list}\" pfile --psam \"${psam_path}\" --make-pgen --out \"${final_prefix}\"\n", + " # ── Step 2: Merge all per-block pgens → one per-chrom pgen ──\n", + " \"{plink2_bin}\" \\\n", + " --pmerge-list \"${{merge_list}}\" pfile \\\n", + " --make-pgen \\\n", + " --sort-vars \\\n", + " --out \"${{final_prefix}}\"\n", "\n", - " # ── Step 4: Concatenate .afreq files ──\n", - " afreq_out=\"${final_prefix}.afreq\"\n", + " # ── Step 3: Concatenate .afreq ──\n", " first=1\n", - " for f in \"${chrom_dir}\"/*/*.afreq; do\n", - " if [ \"${first}\" -eq 1 ]; then\n", - " cat \"${f}\" > \"${afreq_out}\"\n", + " for f in \"${{chrom_dir}}\"/*/*.afreq; do\n", + " if [ \"${{first}}\" -eq 1 ]; then\n", + " cat \"${{f}}\" > \"${{final_prefix}}.afreq\"\n", " first=0\n", " else\n", - " tail -n +2 \"${f}\" >> \"${afreq_out}\"\n", + " tail -n +2 \"${{f}}\" >> \"${{final_prefix}}.afreq\"\n", " fi\n", " done\n", - " echo \" afreq written: ${afreq_out}\"\n", "\n", - " # ── Step 5: Clean up per-block intermediate files ──\n", - " 
rm -f \"${merge_list}\" \"${psam_path}\"\n", - " for block_dir in \"${chrom_dir}\"/*/; do\n", - " rm -rf \"${block_dir}\"\n", + " # ── Step 4: Cleanup — remove merge intermediates and per-block dirs ──\n", + " rm -f \"${{merge_list}}\"\n", + " rm -f \"${{final_prefix}}-merge.pgen\" \"${{final_prefix}}-merge.pvar\" \"${{final_prefix}}-merge.psam\"\n", + " for block_dir in \"${{chrom_dir}}\"/*/; do\n", + " rm -rf \"${{block_dir}}\"\n", " done\n", - " echo \"[merge_chrom] ${chrom}: done → ${final_prefix}.{pgen,pvar,psam,afreq}\"\n" + " \"\"\")" ] } ],