Merge pull request StatFunGen#435 from xueweic/main

gaow · web-flow · commit 4561bfb5ea83 · 2026-03-03T11:36:27.000-05:00
fixed format issue for colocboost pipeline
diff --git a/vignettes/colocboost-pipeline.Rmd b/vignettes/colocboost-pipeline.Rmd
@@ -1,5 +1,7 @@
 ---
 title: "Bioinformatics Pipeline for ColocBoost"
+author: "Xuewei Cao"
+date: "`r Sys.Date()`"
 output: rmarkdown::html_vignette
 vignette: >
   %\VignetteIndexEntry{Bioinformatics Pipeline for ColocBoost}
@@ -21,6 +23,7 @@ This vignette demonstrates how to use the bioinformatics pipeline for ColocBoost
 `colocboost_pipeline` with [link](https://github.com/StatFunGen/pecotmr/blob/main/R/colocboost_pipeline.R). 
 - See more details about input data preparation in `xqtl_protocol` with [link](https://statfungen.github.io/xqtl-protocol/code/mnm_analysis/mnm_methods/colocboost.html).
 
+Acknowledgment: Thanks to Kate (Kathryn) Lawrence (GitHub:@kal26) for her contributions to this vignette.
 
 # 1. Loading Data using `colocboost_analysis_pipeline` function
 
@@ -38,15 +41,16 @@ Below are the input parameters for this function for loading individual-level da
 
 ## 1.1. Loading individual-level data from multiple cohorts
 
-inputs:
+Inputs:
+
 - **`region`**: String ; Genomic region of interest in the format of `chr:start-end` for the phenotype region you want to analyze. 
 - **`genotype_list`**: Character vector; Paths for PLINK bed files containing genotype data (do NOT include .bed suffix). 
 - **`phenotype_list`**: Character vector; Paths for phenotype file names.
 - **`covariate_list`**: Character vector; Paths for covariate file names for each phenotype. Must have the same length as the phenotype file vector.
 - **`conditions_list_individual`**: Character vector; Strings representing different conditions or groups used for naming. Must have the same length as the phenotype file vector.
 - **`match_geno_pheno`**: Integer vector; Indices of phenotypes matched to genotype if multiple genotype PLINK files are used. For each phenotype file in `phenotype_list`, the index of the genotype file in `genotype_list` it matches with.
 - **`association_window`**: String; Genomic region of interest in the format of `chr:start-end` for the association analysis window of variants to test (cis or trans). If not provided, all genotype data will be loaded.
-- **`extract_region_name`**: List of character vectors; Phenotype names (e.g., gene ID `ENSG00000269699`) to subset the phenotype data when there are multiple phenotypes availible in the region. Must have the same length as the phenotype file vector. Default is `NULL`, which will use all phenotypes in the region.
+- **`extract_region_name`**: List of character vectors; Phenotype names (e.g., gene ID `ENSG00000269699`) to subset the phenotype data when there are multiple phenotypes available in the region. Must have the same length as the phenotype file vector. Default is `NULL`, which will use all phenotypes in the region.
 - **`region_name_col`**: Integer; 1-based index of the column containing the region name (i.e. 4 for gene ID in a bed file). Required if `extract_region_name` is not `NULL`, or if multiple phenotypes fall into the same region in one phenotype file
 - **`keep_indel`**: Logical; indicating whether to keep insertions/deletions (INDELs). Default is `TRUE`.
 - **`keep_samples`**: Character vector; Sample names to keep. Default is `NULL`. Currently only supports keeping the same samples from all genotype and phenotype files.
@@ -55,15 +59,16 @@ inputs:
 - **`xvar_cutoff`**: Numeric; Minimum genotype variance cutoff. Default is 0.
 - **`imiss_cutoff`**: Numeric; Maximum individual missingness cutoff. Default is 0.
 
-outputs: 
+Outputs: 
+
 - **`region_data`**: List (with `individual_data`, `sumstat_data`); Output of the `load_multitask_regional_data` function. If only individual-level data is loaded, `sumstat_data` will be `NULL`.
 
 
-**Indivudual-level data loading example**
+**Individual-level data loading example**
 
 The following example demonstrates how to set up input data with 3 phenotypes and 2 cohorts. The first cohort has 2 phenotypes and the second cohort has 1 phenotype. The first phenotype has 2 genes and the second phenotype has 1 gene.
 
-```{r, data-loader-individual}
+```{r, data-loader-individual, eval = FALSE}
 # Example of loading individual-level data
 region = "chr1:1000000-2000000"
 genotype_list = c("plink_cohort1.1", "plink_cohort1.2")
@@ -84,34 +89,32 @@ xvar_cutoff = 0
 imiss_cutoff = 0.9
 
 # More advanced parameters see pecotmr::load_multitask_regional_data()
-
-#### Comment out to avoid running this code here, as we do not have real data files in this example ####
-# region_data_individual <- load_multitask_regional_data(
-#     region = region,
-#     genotype_list = genotype_list,
-#     phenotype_list = phenotype_list,
-#     covariate_list = covariate_list,
-#     conditions_list_individual = conditions_list_individual,
-#     match_geno_pheno = match_geno_pheno,
-#     association_window = association_window,
-#     region_name_col = region_name_col,
-#     extract_region_name = extract_region_name,
-#     keep_indel = keep_indel,
-#     keep_samples = keep_samples,
-#     maf_cutoff = maf_cutoff,
-#     mac_cutoff = mac_cutoff,
-#     xvar_cutoff = xvar_cutoff,
-#     imiss_cutoff = imiss_cutoff
-# )
-#### End of comment out
+region_data_individual <- load_multitask_regional_data(
+    region = region,
+    genotype_list = genotype_list,
+    phenotype_list = phenotype_list,
+    covariate_list = covariate_list,
+    conditions_list_individual = conditions_list_individual,
+    match_geno_pheno = match_geno_pheno,
+    association_window = association_window,
+    region_name_col = region_name_col,
+    extract_region_name = extract_region_name,
+    keep_indel = keep_indel,
+    keep_samples = keep_samples,
+    maf_cutoff = maf_cutoff,
+    mac_cutoff = mac_cutoff,
+    xvar_cutoff = xvar_cutoff,
+    imiss_cutoff = imiss_cutoff
+)
 
 ```
 
 
 
 ## 1.2. Loading summary statistics from multiple cohorts or datasets
 
-inputs:
+Inputs:
+
 - **`sumstat_path_list`**: Character vector; Paths to the summary statistics.
 - **`column_file_path_list`**: Character vector; Paths to the column mapping files. See below for expected format.
 - **`LD_meta_file_path_list`**: Character vector; Paths to LD metadata files. See below for expected format.
@@ -122,14 +125,15 @@ inputs:
 - **`n_cases`**: Integer vector; Number of cases. Set a 0 if `n_samples` is passed explicitly. If unknown, set as 0 and include `n_cases` column in the column mapping file to retrieve from the sumstat file.
 - **`n_controls`**: Integer vector; Number of controls. Set a 0 if `n_samples` is passed explicitly. If unknown, set as 0 and include `n_controls` column in the column mapping file to retrieve from the sumstat file.
 
-outputs: 
+Outputs: 
+
 - **`region_data`**: List (with `individual_data`, `sumstat_data`); Output of the `load_multitask_regional_data` function. If only summary statistics data is loaded, `individual_data` will be `NULL`.
 
 **Summary statistics loading example**
 
 The following example demonstrates how to set up input data with 2 summary statistics and one LD reference.
 
-```{r, data-loader-sumstat}
+```{r, data-loader-sumstat, eval = FALSE}
 # Example of loading summary statistics
 sumstat_path_list = c("sumstat1.tsv.gz", "sumstat2.tsv.gz")
 column_file_path_list = c("column_mapping_sumstat1.yml", "column_mapping_sumstat2.yml")
@@ -145,28 +149,26 @@ n_controls = c(0, 40000)
 
 
 # More advanced parameters see pecotmr::load_multitask_regional_data()
-
-#### Comment out to avoid running this code here, as we do not have real data files in this example ####
-# region_data_sumstat <- load_multitask_regional_data(
-#     sumstat_path_list = sumstat_path_list,
-#     column_file_path_list = column_file_path_list,
-#     LD_meta_file_path_list = LD_meta_file_path_list,
-#     conditions_list_sumstat = conditions_list_sumstat,
-#     match_LD_sumstat = match_LD_sumstat,
-#     association_window = association_window,
-#     n_samples = n_samples,
-#     n_cases = n_cases,
-#     n_controls = n_controls
-# )
-#### End of comment out
+region_data_sumstat <- load_multitask_regional_data(
+    sumstat_path_list = sumstat_path_list,
+    column_file_path_list = column_file_path_list,
+    LD_meta_file_path_list = LD_meta_file_path_list,
+    conditions_list_sumstat = conditions_list_sumstat,
+    match_LD_sumstat = match_LD_sumstat,
+    association_window = association_window,
+    n_samples = n_samples,
+    n_cases = n_cases,
+    n_controls = n_controls
+)
 ```
 
 
 
 **Expected format for column mapping file**
+
 The column mapping file is YAML (`.yml`) with key: value pairs mapping your input column names to the standardized names expected by the loader. 
 Required columns are `chrom`, `pos`, `A1`, and `A2`, and either `z` or `beta` and `sebeta`. 
-Either 'n_case' and 'n_control' or 'n_samples' can be passed as part of the column mapping, but will be overwritten by the n_cases and n_controls or n_samples parameterspassed explicitly.
+Either 'n_case' and 'n_control' or 'n_samples' can be passed as part of the column mapping, but will be overwritten by the n_cases and n_controls or n_samples parameters passed explicitly.
 ```yaml
 # required
 chrom: chromosome
@@ -188,7 +190,7 @@ n_sample: N
 
 
 **Expected format for LD metadata file**
-LD files sould be in the format generated by for instance `plink --r squared`, then xz compressed.
+LD files should be in the format generated by for instance `plink --r squared`, then xz compressed.
 The LD metadata file is a tab-separated file with the following columns:
 - `chrom`: chromosome
 - `start`: start position
@@ -208,84 +210,80 @@ The colocalization analysis can be run in any one of three modes, or in a combin
 - **`joint GWAS mode`**: Perform colocalization analysis in disease-agnostic mode on the individual-level and summary statistics data together.
 - **`separate GWAS mode`**: Perform colocalization analysis in disease-prioritized mode on the the individual-level data and each summary statistics dataset separately, treating each summary statistics dataset as the focal trait.
 
-inputs:
+Inputs:
+
 - **`region_data`**: List (with `individual_data`, `sumstat_data`); Output of the `load_multitask_regional_data` function.
 - **`focal_trait`**: String; For xQTL-only mode, the name of the trait to perform disease-prioritized ColocBoost, from `conditions_list_individual`. If not provided, xQTL-only mode will be run without disease-prioritized mode.
 - **`event_filters`**: List of character vectors; Patterns for filtering events based on context names. 
 Example: for sQTL, `list(type_pattern = ".*clu_(\\d+_[+-?]).*", valid_pattern = "clu_(\\d+_[+-?]):PR:", exclude_pattern = "clu_(\\d+_[+-?]):IN:")`.
 - **`maf_cutoff`**: Numeric; Minor allele frequency cutoff. Default is 0.005.
 - **`pip_cutoff_to_skip_ind`**: Integer vector; Cutoff values for skipping analysis based on pre-screening with single-effect SuSiE (L=1). Context is skipped if none of the variants in the context have PIP values greater than the cutoff. Default is 0 (does not run single-effect SuSiE). Passing a negative value sets the cutoff to 3/number of variants.
 - **`pip_cutoff_to_skip_sumstat`**: Integer vector; Cutoff values for skipping analysis based on pre-screening with single-effect SuSiE (L=1). Sumstat is skipped if none of the variants in the sumstat have PIP values greater than the cutoff. Default is 0 (does not run single-effect SuSiE). Passing a negative value sets the cutoff to 3/number of variants.
-- **`qc_method`**: String; Quality control method to use. Options are "dentist" or "slalom". Default is `dentist`.
+- **`qc_method`**: String; Quality control method to use. Options are "rss_qc", "dentist", or "slalom". Default is `rss_qc`.
 - **`impute`**: Logical; if TRUE, performs imputation for outliers identified in the analysis. Default is `TRUE`.
 - **`impute_opts`**: List of lists; Imputation options including rcond, R2_threshold, and minimum_ld. Default is `list(rcond = 0.01, R2_threshold = 0.6, minimum_ld = 5)`.
 - **`xqtl_coloc`**: Logical; if TRUE, performs xQTL-only mode. Default is `TRUE`.
 - **`joint_gwas`**: Logical; if TRUE, performs joint GWAS mode, mapping all individual-level and sumstat data together.Default is `FALSE`.
 - **`separate_gwas`**: Logical; if TRUE, runs separate GWAS mode, where each sumstat dataset is analyzed separately with all individual-level data, treating each sumstat as the focal trait in disease-prioritized mode. Default is `FALSE`.
 
-outputs:
-- **`colocboost_results`**: List of colocboost objects (with `xqtl_coloc`, `joint_gwas`, `separate_gwas`); Output of the `colocboost_analysis_pipeline` function. If the mode is not run, the corresponding element will be `NULL`.
+Outputs:
 
-```{r, colocboost-analysis}
+- **`colocboost_results`**: List of colocboost objects (with `xqtl_coloc`, `joint_gwas`, `separate_gwas`); Output of the `colocboost_analysis_pipeline` function. If the mode is not run, the corresponding element will be `NULL`.
 
-#### Comment out to avoid running this code here, as we do not have real data files in this example ####
+```{r, colocboost-analysis, eval = FALSE}
 #### Please check the example code below ####
-
 # # load in individual-level and sumstat data
-# region_data_combined <- load_multitask_regional_data(
-#     region = region,
-#     genotype_list = genotype_list,
-#     phenotype_list = phenotype_list,
-#     covariate_list = covariate_list,
-#     conditions_list_individual = conditions_list_individual,
-#     match_geno_pheno = match_geno_pheno,
-#     association_window = association_window,
-#     region_name_col = region_name_col,
-#     extract_region_name = extract_region_name,
-#     keep_indel = keep_indel,
-#     keep_samples = keep_samples,
-#     maf_cutoff = maf_cutoff,
-#     mac_cutoff = mac_cutoff,
-#     xvar_cutoff = xvar_cutoff,
-#     imiss_cutoff = imiss_cutoff,
-#     sumstat_path_list = sumstat_path_list,
-#     column_file_path_list = column_file_path_list,
-#     LD_meta_file_path_list = LD_meta_file_path_list,
-#     conditions_list_sumstat = conditions_list_sumstat,
-#     match_LD_sumstat = match_LD_sumstat,
-#     n_samples = n_samples,
-#     n_cases = n_cases,
-#     n_controls = n_controls
-# )
-
-# maf_cutoff = 0.01
-# pip_cutoff_to_skip_ind = rep(0, length(phenotype_list))
-# pip_cutoff_to_skip_sumstat = rep(0, length(sumstat_path_list))
-# qc_method = "dentist"
-
-# # run colocboost analysis
-# colocboost_results <- colocboost_analysis_pipeline(
-#     region_data_combined,
-#     maf_cutoff = maf_cutoff, 
-#     pip_cutoff_to_skip_ind = pip_cutoff_to_skip_ind,
-#     pip_cutoff_to_skip_sumstat = pip_cutoff_to_skip_sumstat,
-#     qc_method = qc_method,
-#     xqtl_coloc = TRUE,
-#     joint_gwas = TRUE,
-#     separate_gwas = TRUE
-# )
-
-# # visualize results for xQTL-only mode
-# colocboost_plot(colocboost_results$xqtl_coloc)
-
-# # visualize results for joint GWAS mode
-# colocboost_plot(colocboost_results$joint_gwas)
-
-# # visualize results for separate GWAS mode
-# for (i in 1:length(colocboost_results$separate_gwas)) {
-#     colocboost_plot(colocboost_results$separate_gwas[[i]])
-# }
-
-
-#### End of comment out
+region_data_combined <- load_multitask_regional_data(
+    region = region,
+    genotype_list = genotype_list,
+    phenotype_list = phenotype_list,
+    covariate_list = covariate_list,
+    conditions_list_individual = conditions_list_individual,
+    match_geno_pheno = match_geno_pheno,
+    association_window = association_window,
+    region_name_col = region_name_col,
+    extract_region_name = extract_region_name,
+    keep_indel = keep_indel,
+    keep_samples = keep_samples,
+    maf_cutoff = maf_cutoff,
+    mac_cutoff = mac_cutoff,
+    xvar_cutoff = xvar_cutoff,
+    imiss_cutoff = imiss_cutoff,
+    sumstat_path_list = sumstat_path_list,
+    column_file_path_list = column_file_path_list,
+    LD_meta_file_path_list = LD_meta_file_path_list,
+    conditions_list_sumstat = conditions_list_sumstat,
+    match_LD_sumstat = match_LD_sumstat,
+    n_samples = n_samples,
+    n_cases = n_cases,
+    n_controls = n_controls
+)
+
+maf_cutoff = 0.01
+pip_cutoff_to_skip_ind = rep(0, length(phenotype_list))
+pip_cutoff_to_skip_sumstat = rep(0, length(sumstat_path_list))
+qc_method = "rss_qc" 
+
+# run colocboost analysis
+colocboost_results <- colocboost_analysis_pipeline(
+    region_data_combined,
+    maf_cutoff = maf_cutoff, 
+    pip_cutoff_to_skip_ind = pip_cutoff_to_skip_ind,
+    pip_cutoff_to_skip_sumstat = pip_cutoff_to_skip_sumstat,
+    qc_method = qc_method,
+    xqtl_coloc = TRUE,
+    joint_gwas = TRUE,
+    separate_gwas = TRUE
+)
+
+# visualize results for xQTL-only mode
+colocboost_plot(colocboost_results$xqtl_coloc)
+
+# visualize results for joint GWAS mode
+colocboost_plot(colocboost_results$joint_gwas)
+
+# visualize results for separate GWAS mode
+for (i in 1:length(colocboost_results$separate_gwas)) {
+    colocboost_plot(colocboost_results$separate_gwas[[i]])
+}
 ```