Skip to content

Commit 8588e59

Browse files
authored
Merge pull request #1310 from al4225/main
Add trans_geno_chromosome parameter and improve trans-QTL pipeline
2 parents 4bf2ca6 + f642709 commit 8588e59

1 file changed

Lines changed: 42 additions & 5 deletions

File tree

code/association_scan/TensorQTL/TensorQTL.ipynb

Lines changed: 42 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1318,6 +1318,9 @@
13181318
"parameter: chromosome = []\n",
13191319
"# Minor allele count cutoff\n",
13201320
"parameter: MAC = 0\n",
1321+
"# Specify genotype chromosome for trans analysis (e.g. --trans-geno-chromosome 5)\n",
1322+
"# When set, trans step loads this genotype chrom instead of the one from input_files\n",
1323+
"parameter: trans_geno_chromosome = \"\"\n",
13211324
"\n",
13221325
"# Specify the cis window for the up and downstream radius to analyze around the region of interest in units of bp\n",
13231326
"# This parameter will be set to zero if `customized_cis_windows` is provided.\n",
@@ -1422,7 +1425,8 @@
14221425
" input_chroms = [x[0] for x in input_files]\n",
14231426
" input_files = [x[1:] for x in input_files]\n",
14241427
" if len(name) == 0:\n",
1425-
" name = f'{path(input_files[0][0]):bnn}' if len(input_files) == 1 else f'{path(input_files[0][0]):bnnn}'\n"
1428+
" name = f'{path(input_files[0][0]):bnn}' if len(input_files) == 1 else f'{path(input_files[0][0]):bnnn}'\n",
1429+
""
14261430
]
14271431
},
14281432
{
@@ -1869,7 +1873,7 @@
18691873
"parameter: pval = 0.0\n",
18701874
"\n",
18711875
"input: input_files, group_by = len(input_files[0]), group_with = \"input_chroms\"\n",
1872-
"output: nominal = f'{cwd:a}/{_input[0]:bnn}{\"_%s\" % input_chroms[_index] if input_chroms[_index] != 0 else \"\"}.trans_qtl{\"_p_%.0e\" % pval if pval > 0.0 else \"\"}.pairs.tsv.gz'\n",
1876+
"output: nominal = f'{cwd:a}/{_input[0]:bnn}{\"_chr%s\" % input_chroms[_index] if input_chroms[_index] != 0 else \"\"}{\"_geno_chr%s\" % trans_geno_chromosome if trans_geno_chromosome else \"\"}.trans_qtl{\"_p_%.0e\" % pval if pval > 0.0 else \"\"}.pairs.tsv.gz'\n",
18731877
"\n",
18741878
"task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'\n",
18751879
"python: expand= \"$[ ]\", stderr = f'{_output[0]}.stderr', stdout = f'{_output[0]}.stdout',container =container\n",
@@ -1896,7 +1900,7 @@
18961900
"\n",
18971901
" ## Analyze only the regions listed\n",
18981902
" if $[region_list.is_file()]:\n",
1899-
" region = pd.read_csv(\"$[region_list:a]\")\n",
1903+
" region = pd.read_csv(\"$[region_list:a]\", comment=\"#\", header=None, sep=\"\\t\")\n",
19001904
" phenotype_column = 1 if len(region.columns) == 1 else $[region_list_phenotype_column]\n",
19011905
" keep_region = region.iloc[:, phenotype_column-1].astype(str).str.strip().to_list()\n",
19021906
" phenotype_df = phenotype_df[phenotype_df.index.isin(keep_region)]\n",
@@ -1948,7 +1952,39 @@
19481952
" covariates_df = covariates_df[keep_cols]\n",
19491953
" \n",
19501954
"\n",
1955+
" ## If trans_geno_chromosome is specified, load that genotype chrom instead\n",
1956+
" trans_geno_chrom = \"$[trans_geno_chromosome]\"\n",
1957+
" if trans_geno_chrom:\n",
1958+
" trans_geno_chrom = trans_geno_chrom.replace(\"chr\", \"\")\n",
1959+
" geno_file_str = \"$[genotype_file:a]\"\n",
1960+
" if geno_file_str.endswith(\".bed\") or geno_file_str.endswith(\".pgen\"):\n",
1961+
" # Single plink file: load as-is, will filter variants by chrom after loading\n",
1962+
" print(f\"Trans mode: will filter genotype to chr{trans_geno_chrom} after loading\")\n",
1963+
" else:\n",
1964+
" # Manifest file: find the plink file for the specified chromosome\n",
1965+
" geno_list_df = pd.read_csv(geno_file_str, sep=\"\\t\")\n",
1966+
" geno_list_df[\"#id\"] = [x.replace(\"chr\",\"\") for x in geno_list_df[\"#id\"].astype(str)]\n",
1967+
" geno_row = geno_list_df[geno_list_df[\"#id\"] == trans_geno_chrom]\n",
1968+
" if len(geno_row) == 0:\n",
1969+
" raise ValueError(f\"No genotype file found for chromosome {trans_geno_chrom}\")\n",
1970+
" geno_plink = geno_row[\"#path\"].values[0]\n",
1971+
" if geno_plink.endswith(\".bed\"):\n",
1972+
" geno_plink = geno_plink[:-4]\n",
1973+
" elif geno_plink.endswith(\".pgen\"):\n",
1974+
" geno_plink = geno_plink[:-5]\n",
1975+
" plink_prefix_path = geno_plink\n",
1976+
" print(f\"Trans mode: overriding genotype to chr{trans_geno_chrom} from {plink_prefix_path}\")\n",
1977+
"\n",
19511978
" genotype_df, variant_df = genotypeio.load_genotypes(plink_prefix_path, dosages = True) \n",
1979+
"\n",
1980+
" ## Filter genotype to specified chromosome if trans_geno_chrom is set\n",
1981+
" if trans_geno_chrom:\n",
1982+
" chrom_filter = variant_df[\"chrom\"].astype(str).str.replace(\"chr\", \"\") == trans_geno_chrom\n",
1983+
" if chrom_filter.sum() == 0:\n",
1984+
" raise ValueError(f\"No variants found for chromosome {trans_geno_chrom} in genotype data\")\n",
1985+
" variant_df = variant_df[chrom_filter]\n",
1986+
" genotype_df = genotype_df.loc[variant_df.index]\n",
1987+
" print(f\"Trans mode: filtered to {len(variant_df)} variants on chr{trans_geno_chrom}\")\n",
19521988
" ## use custom sample list to subset the covariates data\n",
19531989
" if $[keep_sample.is_file()]:\n",
19541990
" sample_list = pd.read_csv(\"$[keep_sample:a]\", comment=\"#\", header=None, names=[\"sample_id\"], sep=\"\\t\")\n",
@@ -2178,7 +2214,8 @@
21782214
"\n",
21792215
"bash: expand= \"$[ ]\", stderr = f'{_output[\"nominal\"]:nnn}.stderr', stdout = f'{_output[\"nominal\"]:nnn}.stdout', container = container\n",
21802216
" bgzip --compress-level 9 $[_output[\"nominal\"]:n]\n",
2181-
" tabix -S 1 -s 1 -b 2 -e 2 $[_output[\"nominal\"]]\n"
2217+
" tabix -S 1 -s 1 -b 2 -e 2 $[_output[\"nominal\"]]\n",
2218+
""
21822219
]
21832220
}
21842221
],
@@ -2217,4 +2254,4 @@
22172254
},
22182255
"nbformat": 4,
22192256
"nbformat_minor": 4
2220-
}
2257+
}

0 commit comments

Comments
 (0)