Merge pull request #1310 from al4225/main

gaow · web-flow · commit 8588e590ff03 · 2026-04-09T13:50:20.000-04:00
Add trans_geno_chromosome parameter and improve trans-QTL pipeline
diff --git a/code/association_scan/TensorQTL/TensorQTL.ipynb b/code/association_scan/TensorQTL/TensorQTL.ipynb
@@ -1318,6 +1318,9 @@
     "parameter: chromosome = []\n",
     "# Minor allele count cutoff\n",
     "parameter: MAC = 0\n",
+    "# Specify genotype chromosome for trans analysis (e.g. --trans-geno-chromosome 5)\n",
+    "# When set, trans step loads this genotype chrom instead of the one from input_files\n",
+    "parameter: trans_geno_chromosome = \"\"\n",
     "\n",
     "# Specify the cis window for the up and downstream radius to analyze around the region of interest in units of bp\n",
     "# This parameter will be set to zero if `customized_cis_windows` is provided.\n",
@@ -1422,7 +1425,8 @@
     "    input_chroms = [x[0] for x in input_files]\n",
     "    input_files = [x[1:] for x in input_files]\n",
     "    if len(name) == 0:\n",
-    "        name = f'{path(input_files[0][0]):bnn}' if len(input_files) == 1 else f'{path(input_files[0][0]):bnnn}'\n"
+    "        name = f'{path(input_files[0][0]):bnn}' if len(input_files) == 1 else f'{path(input_files[0][0]):bnnn}'\n",
+    ""
    ]
   },
   {
@@ -1869,7 +1873,7 @@
     "parameter: pval = 0.0\n",
     "\n",
     "input: input_files, group_by = len(input_files[0]), group_with = \"input_chroms\"\n",
-    "output: nominal = f'{cwd:a}/{_input[0]:bnn}{\"_%s\" % input_chroms[_index] if input_chroms[_index] != 0 else \"\"}.trans_qtl{\"_p_%.0e\" % pval if pval > 0.0 else \"\"}.pairs.tsv.gz'\n",
+    "output: nominal = f'{cwd:a}/{_input[0]:bnn}{\"_chr%s\" % input_chroms[_index] if input_chroms[_index] != 0 else \"\"}{\"_geno_chr%s\" % trans_geno_chromosome if trans_geno_chromosome else \"\"}.trans_qtl{\"_p_%.0e\" % pval if pval > 0.0 else \"\"}.pairs.tsv.gz'\n",
     "\n",
     "task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'\n",
     "python: expand= \"$[ ]\", stderr = f'{_output[0]}.stderr', stdout = f'{_output[0]}.stdout',container =container\n",
@@ -1896,7 +1900,7 @@
     "\n",
     "    ## Analyze only the regions listed\n",
     "    if $[region_list.is_file()]:\n",
-    "        region = pd.read_csv(\"$[region_list:a]\")\n",
+    "        region = pd.read_csv(\"$[region_list:a]\", comment=\"#\", header=None, sep=\"\\t\")\n",
     "        phenotype_column = 1 if len(region.columns) == 1 else $[region_list_phenotype_column]\n",
     "        keep_region = region.iloc[:, phenotype_column-1].astype(str).str.strip().to_list()\n",
     "        phenotype_df = phenotype_df[phenotype_df.index.isin(keep_region)]\n",
@@ -1948,7 +1952,39 @@
     "            covariates_df = covariates_df[keep_cols]\n",
     "       \n",
     "\n",
+    "    ## If trans_geno_chromosome is specified, load that genotype chrom instead\n",
+    "    trans_geno_chrom = \"$[trans_geno_chromosome]\"\n",
+    "    if trans_geno_chrom:\n",
+    "        trans_geno_chrom = trans_geno_chrom.replace(\"chr\", \"\")\n",
+    "        geno_file_str = \"$[genotype_file:a]\"\n",
+    "        if geno_file_str.endswith(\".bed\") or geno_file_str.endswith(\".pgen\"):\n",
+    "            # Single plink file: load as-is, will filter variants by chrom after loading\n",
+    "            print(f\"Trans mode: will filter genotype to chr{trans_geno_chrom} after loading\")\n",
+    "        else:\n",
+    "            # Manifest file: find the plink file for the specified chromosome\n",
+    "            geno_list_df = pd.read_csv(geno_file_str, sep=\"\\t\")\n",
+    "            geno_list_df[\"#id\"] = [x.replace(\"chr\",\"\") for x in geno_list_df[\"#id\"].astype(str)]\n",
+    "            geno_row = geno_list_df[geno_list_df[\"#id\"] == trans_geno_chrom]\n",
+    "            if len(geno_row) == 0:\n",
+    "                raise ValueError(f\"No genotype file found for chromosome {trans_geno_chrom}\")\n",
+    "            geno_plink = geno_row[\"#path\"].values[0]\n",
+    "            if geno_plink.endswith(\".bed\"):\n",
+    "                geno_plink = geno_plink[:-4]\n",
+    "            elif geno_plink.endswith(\".pgen\"):\n",
+    "                geno_plink = geno_plink[:-5]\n",
+    "            plink_prefix_path = geno_plink\n",
+    "            print(f\"Trans mode: overriding genotype to chr{trans_geno_chrom} from {plink_prefix_path}\")\n",
+    "\n",
     "    genotype_df, variant_df = genotypeio.load_genotypes(plink_prefix_path, dosages = True) \n",
+    "\n",
+    "    ## Filter genotype to specified chromosome if trans_geno_chrom is set\n",
+    "    if trans_geno_chrom:\n",
+    "        chrom_filter = variant_df[\"chrom\"].astype(str).str.replace(\"chr\", \"\") == trans_geno_chrom\n",
+    "        if chrom_filter.sum() == 0:\n",
+    "            raise ValueError(f\"No variants found for chromosome {trans_geno_chrom} in genotype data\")\n",
+    "        variant_df = variant_df[chrom_filter]\n",
+    "        genotype_df = genotype_df.loc[variant_df.index]\n",
+    "        print(f\"Trans mode: filtered to {len(variant_df)} variants on chr{trans_geno_chrom}\")\n",
     "    ## use custom sample list to subset the covariates data\n",
     "    if $[keep_sample.is_file()]:\n",
     "        sample_list = pd.read_csv(\"$[keep_sample:a]\", comment=\"#\", header=None, names=[\"sample_id\"], sep=\"\\t\")\n",
@@ -2178,7 +2214,8 @@
     "\n",
     "bash: expand= \"$[ ]\", stderr = f'{_output[\"nominal\"]:nnn}.stderr', stdout = f'{_output[\"nominal\"]:nnn}.stdout', container = container\n",
     "        bgzip --compress-level 9 $[_output[\"nominal\"]:n]\n",
-    "        tabix -S 1 -s 1 -b 2 -e 2 $[_output[\"nominal\"]]\n"
+    "        tabix -S 1 -s 1 -b 2 -e 2 $[_output[\"nominal\"]]\n",
+    ""
    ]
   }
  ],
@@ -2217,4 +2254,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 4
-}
+}