From b479a0626d2fc90c72cb94b0e7d99e760dae3a89 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 14 Jan 2026 13:37:00 +0000 Subject: [PATCH 1/6] Initial plan From e0875a6cfa1d0103dbff3ecdd2a0210cea8e7c34 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 14 Jan 2026 13:41:42 +0000 Subject: [PATCH 2/6] Add two comprehensive case study vignettes for diabetic nephropathy and urothelial cancer Co-authored-by: ShixiangWang <25057508+ShixiangWang@users.noreply.github.com> --- vignettes/case-diabetic-nephropathy.Rmd | 301 +++++++++++++++++ vignettes/case-urothelial-cancer.Rmd | 419 ++++++++++++++++++++++++ 2 files changed, 720 insertions(+) create mode 100644 vignettes/case-diabetic-nephropathy.Rmd create mode 100644 vignettes/case-urothelial-cancer.Rmd diff --git a/vignettes/case-diabetic-nephropathy.Rmd b/vignettes/case-diabetic-nephropathy.Rmd new file mode 100644 index 0000000..6c5bfb6 --- /dev/null +++ b/vignettes/case-diabetic-nephropathy.Rmd @@ -0,0 +1,301 @@ +--- +title: "Case Study: Diabetic Nephropathy Meta-Analysis" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Case Study: Diabetic Nephropathy Meta-Analysis} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +``` + +This vignette demonstrates a comprehensive meta-analysis workflow for diabetic nephropathy (DN) gene expression datasets. We'll show how to systematically search, filter, annotate, and visualize relevant GEO datasets to prepare for downstream meta-analysis. + +## Introduction + +Diabetic nephropathy is a major complication of diabetes and a leading cause of chronic kidney disease. Conducting a meta-analysis of gene expression studies can help identify robust molecular signatures. This workflow demonstrates how to use `geokit` to: + +- Search GEO using multiple query strategies +- Build a customized metadata database +- Filter datasets by quality criteria +- Visualize dataset characteristics +- Prepare a curated list for downstream analysis + +```{r setup} +library(geokit) +library(dplyr) +library(stringr) +library(ggplot2) +``` + +## 1. Multi-Strategy Dataset Discovery + +We'll use multiple search terms to comprehensively identify diabetic nephropathy datasets. Different researchers may use varying terminology, so we query for both "diabetic nephropathy" and "diabetic kidney disease". + +```{r search_dn, cache = TRUE} +# Define multiple search strategies +dn_search_terms <- c( + "diabetic nephropathy[ALL] AND Homo sapiens[ORGN] AND GSE[ETYP]", + "diabetic kidney disease[ALL] AND Homo sapiens[ORGN] AND GSE[ETYP]" +) + +# Execute searches and combine results +dn_gse_list <- lapply(dn_search_terms, geo_search) +dn_gse <- unique(dplyr::bind_rows(dn_gse_list)) + +# Display summary +cat(sprintf("Found %d unique GSE datasets\n", nrow(dn_gse))) +head(dn_gse[, 1:3]) +``` + +## 2. Extract Sample Information + +Extract the number of samples from the "Contains" field to enable filtering by sample size. + +```{r extract_samples} +dn_gse <- dn_gse |> + dplyr::mutate( + number_of_samples = stringr::str_match( + Contains, "(\\d+) Samples?" + )[, 2L, drop = TRUE], + number_of_samples = as.integer(number_of_samples) + ) + +# Quick statistics +summary(dn_gse$number_of_samples) +``` + +## 3. Filter by Quality Criteria + +Apply filters for sample size, platform type, and study type to focus on high-quality expression profiling studies. + +```{r filter_datasets} +dn_gse_filtered <- dn_gse |> + dplyr::filter( + # At least 6 samples for meaningful analysis + number_of_samples >= 6, + # Focus on expression profiling studies + stringr::str_detect(Type, "(?i)expression profiling"), + # Exclude certain platform types if needed + !stringr::str_detect(Type, "(?i)methylation") + ) + +cat(sprintf("After filtering: %d datasets (%.1f%% of original)\n", + nrow(dn_gse_filtered), + 100 * nrow(dn_gse_filtered) / nrow(dn_gse))) +``` + +## 4. Build Metadata Database + +Fetch detailed metadata for filtered datasets using `geo_meta()`. This creates a local database for offline analysis and detailed inspection. + +```{r build_metadb, eval = FALSE} +# Create output directory for metadata +dn_metadb_dir <- "dn_metadb" +dir.create(dn_metadb_dir, showWarnings = FALSE, recursive = TRUE) + +# Fetch metadata (this may take several minutes) +dn_metadb <- geo_meta( + dn_gse_filtered[["Series Accession"]], + odir = dn_metadb_dir +) + +# Save for future use +saveRDS(dn_metadb, file.path(dn_metadb_dir, "dn_metadb.rds")) +``` + +```{r load_metadb, eval = FALSE} +# Load previously saved metadb +dn_metadb <- readRDS(file.path(dn_metadb_dir, "dn_metadb.rds")) + +# Calculate actual sample counts from metadb +dn_metadb <- dn_metadb |> + dplyr::mutate( + number_of_samples = lengths( + strsplit(Series_sample_id, "; ", fixed = TRUE) + ) + ) +``` + +## 5. Further Categorization + +Categorize datasets by specific criteria relevant to diabetic nephropathy research. + +```{r categorize, eval = FALSE} +# Identify datasets with specific DN-related keywords +dn_metadb <- dn_metadb |> + dplyr::mutate( + # Check for nephropathy mentions + has_nephropathy = dplyr::if_any( + c(Series_title, Series_summary, Series_overall_design), + ~ stringr::str_detect(.x, "(?i)nephropath") + ), + # Check for kidney mentions + has_kidney = dplyr::if_any( + c(Series_title, Series_summary, Series_overall_design), + ~ stringr::str_detect(.x, "(?i)kidney|renal") + ), + # Check for biopsy samples + has_biopsy = dplyr::if_any( + c(Series_title, Series_summary, Series_overall_design), + ~ stringr::str_detect(.x, "(?i)biopsy|biopsies") + ), + # Extract tissue type mentions + tissue_type = dplyr::case_when( + stringr::str_detect(Series_title, "(?i)glomerul") ~ "glomerular", + stringr::str_detect(Series_title, "(?i)tubul") ~ "tubular", + stringr::str_detect(Series_title, "(?i)kidney|renal") ~ "kidney", + TRUE ~ "other" + ) + ) + +# Summary by tissue type +table(dn_metadb$tissue_type) +``` + +## 6. Visualization: Dataset Characteristics + +### Timeline of Dataset Submissions + +```{r timeline, eval = FALSE, fig.width = 8, fig.height = 4} +# Extract submission year +dn_metadb <- dn_metadb |> + dplyr::mutate( + submission_year = as.integer( + stringr::str_extract(Series_pubmed_id, "^\\d{4}") + ) + ) + +# Create timeline plot +ggplot(dn_metadb, aes(x = submission_year)) + + geom_histogram(binwidth = 1, fill = "steelblue", color = "white") + + labs( + title = "Timeline of Diabetic Nephropathy GEO Dataset Submissions", + x = "Submission Year", + y = "Number of Datasets" + ) + + theme_minimal() + + theme( + plot.title = element_text(hjust = 0.5, face = "bold"), + panel.grid.minor = element_blank() + ) +``` + +### Sample Size Distribution + +```{r sample_distribution, eval = FALSE, fig.width = 8, fig.height = 4} +ggplot(dn_metadb, aes(x = number_of_samples)) + + geom_histogram(binwidth = 5, fill = "coral", color = "white") + + geom_vline( + xintercept = median(dn_metadb$number_of_samples, na.rm = TRUE), + linetype = "dashed", color = "red", linewidth = 1 + ) + + labs( + title = "Distribution of Sample Sizes in DN Datasets", + subtitle = sprintf( + "Median: %d samples", + median(dn_metadb$number_of_samples, na.rm = TRUE) + ), + x = "Number of Samples", + y = "Number of Datasets" + ) + + theme_minimal() + + theme( + plot.title = element_text(hjust = 0.5, face = "bold"), + plot.subtitle = element_text(hjust = 0.5), + panel.grid.minor = element_blank() + ) +``` + +### Platform Usage Trends + +```{r platform_trends, eval = FALSE, fig.width = 10, fig.height = 5} +# Extract platform information +platform_summary <- dn_metadb |> + dplyr::count(Series_platform_id, sort = TRUE) |> + dplyr::slice_max(n, n = 10) + +ggplot(platform_summary, aes(x = reorder(Series_platform_id, n), y = n)) + + geom_col(fill = "darkgreen", alpha = 0.7) + + geom_text(aes(label = n), hjust = -0.2, size = 3.5) + + coord_flip() + + labs( + title = "Top 10 Platforms Used in DN Studies", + x = "Platform ID", + y = "Number of Studies" + ) + + theme_minimal() + + theme( + plot.title = element_text(hjust = 0.5, face = "bold"), + panel.grid.major.y = element_blank() + ) +``` + +## 7. Prepare Curated Dataset List + +Create a final curated list of datasets suitable for meta-analysis. + +```{r curate_list, eval = FALSE} +# Select high-quality datasets with sufficient samples +dn_curated <- dn_metadb |> + dplyr::filter( + number_of_samples >= 10, + has_nephropathy | has_kidney + ) |> + dplyr::arrange(desc(number_of_samples)) |> + dplyr::select( + Series_geo_accession, + Series_title, + number_of_samples, + Series_platform_id, + tissue_type + ) + +# Display top candidates +head(dn_curated, 10) + +# Export for downstream analysis +write.csv(dn_curated, "dn_curated_datasets.csv", row.names = FALSE) +``` + +## 8. Next Steps for Meta-Analysis + +With this curated list, you can proceed to: + +1. **Download expression data**: Use `geo_matrix()` to download series matrix files +2. **Quality control**: Examine sample annotations and expression distributions +3. **Data integration**: Normalize and batch-correct across studies +4. **Differential expression**: Identify genes consistently dysregulated in DN +5. **Pathway analysis**: Investigate enriched biological processes + +```{r download_example, eval = FALSE} +# Example: Download top dataset +top_gse <- dn_curated$Series_geo_accession[1] +eset <- geo_matrix(top_gse, odir = tempdir()) + +# Quick inspection +print(eset) +dim(Biobase::exprs(eset)) +``` + +## Summary + +This workflow demonstrated how to: + +- Use multiple search strategies to comprehensively identify relevant datasets +- Build a customized metadata database with `geo_meta()` +- Apply quality filters for sample size and study type +- Visualize dataset characteristics over time +- Prepare a curated list for downstream meta-analysis + +The systematic approach ensures reproducibility and helps identify the most suitable datasets for gene expression meta-analysis in diabetic nephropathy research. + +## Session Information +```{r} +sessionInfo() +``` diff --git a/vignettes/case-urothelial-cancer.Rmd b/vignettes/case-urothelial-cancer.Rmd new file mode 100644 index 0000000..f3d0dba --- /dev/null +++ b/vignettes/case-urothelial-cancer.Rmd @@ -0,0 +1,419 @@ +--- +title: "Case Study: Urothelial Cancer Expression Analysis" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Case Study: Urothelial Cancer Expression Analysis} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +``` + +This vignette demonstrates a complete workflow for extracting, filtering, downloading, and analyzing urothelial cancer expression data from GEO. We'll cover dataset discovery, quality-based filtering, data retrieval, and exploratory analysis with publication-quality visualizations. + +## Introduction + +Urothelial carcinoma (bladder cancer) is one of the most common urological malignancies. Meta-analysis of gene expression studies can reveal consistent molecular alterations across patient cohorts. This workflow shows how to: + +- Search and combine results for bladder/urothelial cancer datasets +- Filter datasets by sample size and platform type +- Download expression matrices +- Perform log transformation and data integration +- Conduct exploratory analysis (PCA, boxplots, heatmaps) +- Generate publication-quality visualizations + +```{r setup, message = FALSE} +library(geokit) +library(dplyr) +library(stringr) +library(tidyr) +library(ggplot2) +``` + +## 1. Dataset Discovery + +Search for bladder and urothelial cancer datasets using multiple related terms to maximize coverage. + +```{r search_uc, cache = TRUE} +# Multiple search strategies for comprehensive coverage +uc_searches <- list( + bladder = geo_search( + "bladder cancer[ALL] AND Homo sapiens[ORGN] AND GSE[ETYP]" + ), + urothelial = geo_search( + "urothelial cancer[ALL] AND Homo sapiens[ORGN] AND GSE[ETYP]" + ) +) + +# Combine and deduplicate +uc_gse <- unique(dplyr::bind_rows(uc_searches)) + +cat(sprintf("Found %d unique urothelial cancer datasets\n", nrow(uc_gse))) +head(uc_gse[, 1:3]) +``` + +## 2. Extract and Process Sample Information + +```{r extract_info} +# Extract sample counts +uc_gse <- uc_gse |> + dplyr::mutate( + number_of_samples = stringr::str_match( + Contains, "(\\d+) Samples?" + )[, 2L, drop = TRUE], + number_of_samples = as.integer(number_of_samples), + # Extract platform information + platform_count = stringr::str_match( + Contains, "(\\d+) Platforms?" + )[, 2L, drop = TRUE], + platform_count = as.integer(platform_count) + ) + +# Summary statistics +cat("\nSample size summary:\n") +summary(uc_gse$number_of_samples) +``` + +## 3. Filter Datasets by Quality Criteria + +Apply stringent filters to select high-quality datasets suitable for meta-analysis. + +```{r filter_quality} +uc_gse_filtered <- uc_gse |> + dplyr::filter( + # At least 20 samples for robust analysis + number_of_samples >= 20, + # Focus on expression profiling + stringr::str_detect(Type, "(?i)expression profiling"), + # Prefer microarray studies (exclude RNA-seq for now) + stringr::str_detect(Type, "(?i)array"), + # Exclude methylation studies + !stringr::str_detect(Type, "(?i)methylation") + ) |> + dplyr::arrange(desc(number_of_samples)) + +cat(sprintf( + "\nFiltered to %d high-quality datasets (%.1f%% of original)\n", + nrow(uc_gse_filtered), + 100 * nrow(uc_gse_filtered) / nrow(uc_gse) +)) + +# Display top candidates +head(uc_gse_filtered[, c("Series Accession", "Title", "number_of_samples")], 10) +``` + +## 4. Download Expression Data + +For demonstration, we'll work with a subset of datasets. In practice, you would download all filtered datasets. + +```{r select_demo} +# Select top 3 datasets for demonstration +demo_gse <- head(uc_gse_filtered[["Series Accession"]], 3) +cat("Demo datasets:", paste(demo_gse, collapse = ", "), "\n") +``` + +```{r download_data, eval = FALSE} +# Download expression matrices +expression_sets <- lapply(demo_gse, function(gse_id) { + cat(sprintf("Downloading %s...\n", gse_id)) + tryCatch( + { + geo_matrix(gse_id, odir = tempdir()) + }, + error = function(e) { + message(sprintf("Failed to download %s: %s", gse_id, e$message)) + return(NULL) + } + ) +}) + +# Remove NULL entries (failed downloads) +expression_sets <- expression_sets[!sapply(expression_sets, is.null)] +names(expression_sets) <- sapply(expression_sets, function(x) { + Biobase::annotation(x) +}) + +cat(sprintf("Successfully downloaded %d datasets\n", length(expression_sets))) +``` + +## 5. Data Transformation + +Apply log2 transformation to ensure data comparability across datasets. + +```{r transform_data, eval = FALSE} +# Apply log transformation +expression_sets_log <- lapply(expression_sets, function(eset) { + log_trans(eset) +}) + +# Verify transformation +sapply(expression_sets_log, function(eset) { + exprs <- Biobase::exprs(eset) + c( + min = min(exprs, na.rm = TRUE), + median = median(exprs, na.rm = TRUE), + max = max(exprs, na.rm = TRUE) + ) +}) +``` + +## 6. Exploratory Analysis + +### 6.1 Expression Distribution + +Examine the distribution of expression values across samples using boxplots. + +```{r expr_boxplot, eval = FALSE, fig.width = 10, fig.height = 6} +# Prepare data for plotting (first dataset as example) +eset <- expression_sets_log[[1]] +expr_matrix <- Biobase::exprs(eset) + +# Convert to long format for ggplot +expr_df <- as.data.frame(expr_matrix) |> + tibble::rownames_to_column("gene") |> + tidyr::pivot_longer( + cols = -gene, + names_to = "sample", + values_to = "expression" + ) + +# Create boxplot +ggplot(expr_df, aes(x = sample, y = expression)) + + geom_boxplot(fill = "lightblue", outlier.size = 0.5, outlier.alpha = 0.3) + + labs( + title = sprintf("Expression Distribution - %s", names(expression_sets_log)[1]), + x = "Sample", + y = "Log2 Expression" + ) + + theme_minimal() + + theme( + axis.text.x = element_text(angle = 45, hjust = 1, size = 8), + plot.title = element_text(hjust = 0.5, face = "bold"), + panel.grid.major.x = element_blank() + ) +``` + +### 6.2 Principal Component Analysis (PCA) + +Perform PCA to visualize sample relationships and identify potential batch effects. + +```{r pca_analysis, eval = FALSE, fig.width = 8, fig.height = 6} +# Perform PCA on first dataset +eset <- expression_sets_log[[1]] +expr_matrix <- Biobase::exprs(eset) + +# Remove genes with missing values +expr_matrix <- expr_matrix[complete.cases(expr_matrix), ] + +# Perform PCA +pca_result <- stats::prcomp(t(expr_matrix), scale. = TRUE) + +# Extract PC scores +pca_df <- as.data.frame(pca_result$x[, 1:2]) +pca_df$sample <- rownames(pca_df) + +# Calculate variance explained +var_explained <- summary(pca_result)$importance[2, 1:2] * 100 + +# Create PCA plot +ggplot(pca_df, aes(x = PC1, y = PC2)) + + geom_point(size = 3, alpha = 0.7, color = "darkred") + + geom_text(aes(label = sample), size = 2.5, vjust = -1) + + labs( + title = sprintf("PCA - %s", names(expression_sets_log)[1]), + x = sprintf("PC1 (%.1f%% variance)", var_explained[1]), + y = sprintf("PC2 (%.1f%% variance)", var_explained[2]) + ) + + theme_minimal() + + theme( + plot.title = element_text(hjust = 0.5, face = "bold"), + panel.grid = element_line(color = "gray90") + ) +``` + +### 6.3 Sample Correlation Heatmap + +Calculate and visualize pairwise sample correlations to assess data quality and identify outliers. + +```{r correlation_heatmap, eval = FALSE, fig.width = 10, fig.height = 9} +# Calculate sample correlations +eset <- expression_sets_log[[1]] +expr_matrix <- Biobase::exprs(eset) +expr_matrix <- expr_matrix[complete.cases(expr_matrix), ] + +# Compute correlation matrix +cor_matrix <- cor(expr_matrix, method = "pearson") + +# Convert to long format for ggplot +cor_df <- as.data.frame(as.table(cor_matrix)) +colnames(cor_df) <- c("Sample1", "Sample2", "Correlation") + +# Create heatmap +ggplot(cor_df, aes(x = Sample1, y = Sample2, fill = Correlation)) + + geom_tile() + + scale_fill_gradient2( + low = "blue", mid = "white", high = "red", + midpoint = 0.8, + limits = c(min(cor_df$Correlation), 1), + name = "Pearson\nCorrelation" + ) + + labs( + title = sprintf("Sample Correlation Heatmap - %s", names(expression_sets_log)[1]), + x = "Sample", + y = "Sample" + ) + + theme_minimal() + + theme( + axis.text.x = element_text(angle = 45, hjust = 1, size = 8), + axis.text.y = element_text(size = 8), + plot.title = element_text(hjust = 0.5, face = "bold"), + panel.grid = element_blank() + ) +``` + +### 6.4 Summary Statistics + +Generate comprehensive summary statistics across datasets. + +```{r summary_stats, eval = FALSE} +# Calculate summary statistics for each dataset +summary_stats <- lapply(names(expression_sets_log), function(name) { + eset <- expression_sets_log[[name]] + expr_matrix <- Biobase::exprs(eset) + + data.frame( + dataset = name, + n_samples = ncol(expr_matrix), + n_genes = nrow(expr_matrix), + mean_expr = mean(expr_matrix, na.rm = TRUE), + median_expr = median(expr_matrix, na.rm = TRUE), + sd_expr = sd(as.vector(expr_matrix), na.rm = TRUE), + missing_pct = 100 * sum(is.na(expr_matrix)) / length(expr_matrix) + ) +}) + +summary_table <- dplyr::bind_rows(summary_stats) +print(summary_table) + +# Export summary +write.csv(summary_table, "uc_expression_summary.csv", row.names = FALSE) +``` + +## 7. Data Integration Considerations + +When integrating multiple datasets for meta-analysis: + +```{r integration_tips, eval = FALSE} +# 1. Check for common genes across datasets +common_genes <- Reduce(intersect, lapply(expression_sets_log, function(eset) { + rownames(Biobase::exprs(eset)) +})) + +cat(sprintf("Common genes across all datasets: %d\n", length(common_genes))) + +# 2. Subset to common genes +expression_sets_common <- lapply(expression_sets_log, function(eset) { + eset[common_genes, ] +}) + +# 3. Combine expression matrices +combined_expr <- do.call(cbind, lapply(expression_sets_common, Biobase::exprs)) + +cat(sprintf( + "Combined matrix dimensions: %d genes x %d samples\n", + nrow(combined_expr), + ncol(combined_expr) +)) + +# 4. Create study labels for batch correction +study_labels <- unlist(lapply(seq_along(expression_sets_common), function(i) { + rep(names(expression_sets_common)[i], ncol(Biobase::exprs(expression_sets_common[[i]]))) +})) + +# Note: At this point, you would apply batch correction methods +# such as ComBat (sva package) or limma's removeBatchEffect +``` + +## 8. Visualization: Cross-Dataset Comparison + +Compare expression patterns across multiple datasets. + +```{r cross_dataset, eval = FALSE, fig.width = 10, fig.height = 6} +# Calculate mean expression per dataset +mean_expr_per_dataset <- sapply(expression_sets_log, function(eset) { + rowMeans(Biobase::exprs(eset), na.rm = TRUE) +}) + +# Convert to data frame for plotting +mean_expr_df <- as.data.frame(mean_expr_per_dataset) |> + tibble::rownames_to_column("gene") |> + tidyr::pivot_longer( + cols = -gene, + names_to = "dataset", + values_to = "mean_expression" + ) + +# Create density plot +ggplot(mean_expr_df, aes(x = mean_expression, color = dataset)) + + geom_density(linewidth = 1, alpha = 0.7) + + labs( + title = "Mean Expression Distribution Across Datasets", + x = "Mean Log2 Expression", + y = "Density", + color = "Dataset" + ) + + theme_minimal() + + theme( + plot.title = element_text(hjust = 0.5, face = "bold"), + legend.position = "bottom" + ) +``` + +## 9. Next Steps + +With the downloaded and processed data, you can proceed to: + +1. **Batch correction**: Use ComBat or similar methods to remove technical variation +2. **Differential expression**: Identify genes altered in cancer vs. normal samples +3. **Meta-analysis**: Combine results across studies using fixed/random effects models +4. **Pathway analysis**: Investigate enriched biological processes and pathways +5. **Validation**: Cross-validate findings in independent cohorts + +```{r next_steps, eval = FALSE} +# Example: Extract phenotype data for differential expression +eset <- expression_sets_log[[1]] +pdata <- Biobase::pData(eset) + +# View available phenotype columns +colnames(pdata) + +# Typical workflow would identify tumor vs. normal samples +# and perform differential expression analysis +``` + +## Summary + +This workflow demonstrated: + +- Comprehensive dataset discovery using multiple search terms +- Filtering by sample size (≥20), platform type (microarray), and relevance +- Downloading expression matrices with `geo_matrix()` +- Log2 transformation using `log_trans()` +- Exploratory analysis including: + - Expression distribution boxplots + - Principal Component Analysis + - Sample correlation heatmaps + - Summary statistics +- Data integration considerations for meta-analysis + +These steps provide a solid foundation for conducting rigorous gene expression meta-analysis in urothelial cancer research. + +## Session Information +```{r} +sessionInfo() +``` From a348fbc0a8ecdcdca8c97cc4a836c73adaacddc3 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 14 Jan 2026 13:42:15 +0000 Subject: [PATCH 3/6] Add ggplot2 and tidyr to Suggests for new vignettes Co-authored-by: ShixiangWang <25057508+ShixiangWang@users.noreply.github.com> --- DESCRIPTION | 2 ++ 1 file changed, 2 insertions(+) diff --git a/DESCRIPTION b/DESCRIPTION index 8b4100b..2cee42c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -27,6 +27,8 @@ Suggests: stats, dplyr, stringr, + tidyr, + ggplot2, R.utils, testthat (>= 3.0.0), knitr, From 976f7e65685a68e30bd3d0fcac31a30c09607280 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 14 Jan 2026 13:44:41 +0000 Subject: [PATCH 4/6] Address code review feedback: fix date extraction, improve tempdir() documentation, and simplify missing value calculation Co-authored-by: ShixiangWang <25057508+ShixiangWang@users.noreply.github.com> --- vignettes/case-diabetic-nephropathy.Rmd | 6 ++++-- vignettes/case-urothelial-cancer.Rmd | 5 ++++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/vignettes/case-diabetic-nephropathy.Rmd b/vignettes/case-diabetic-nephropathy.Rmd index 6c5bfb6..b77c9a0 100644 --- a/vignettes/case-diabetic-nephropathy.Rmd +++ b/vignettes/case-diabetic-nephropathy.Rmd @@ -163,11 +163,13 @@ table(dn_metadb$tissue_type) ### Timeline of Dataset Submissions ```{r timeline, eval = FALSE, fig.width = 8, fig.height = 4} -# Extract submission year +# Extract submission year from GEO accession or date fields +# Note: This assumes availability of date information in metadata +# You may need to adjust based on actual metadata structure dn_metadb <- dn_metadb |> dplyr::mutate( submission_year = as.integer( - stringr::str_extract(Series_pubmed_id, "^\\d{4}") + stringr::str_extract(Series_geo_accession, "\\d{4}$") ) ) diff --git a/vignettes/case-urothelial-cancer.Rmd b/vignettes/case-urothelial-cancer.Rmd index f3d0dba..a1876b5 100644 --- a/vignettes/case-urothelial-cancer.Rmd +++ b/vignettes/case-urothelial-cancer.Rmd @@ -119,6 +119,9 @@ cat("Demo datasets:", paste(demo_gse, collapse = ", "), "\n") ```{r download_data, eval = FALSE} # Download expression matrices +# Note: Using tempdir() for demo. For persistent storage, use a dedicated directory: +# odir <- "uc_expression_data" +# dir.create(odir, showWarnings = FALSE, recursive = TRUE) expression_sets <- lapply(demo_gse, function(gse_id) { cat(sprintf("Downloading %s...\n", gse_id)) tryCatch( @@ -293,7 +296,7 @@ summary_stats <- lapply(names(expression_sets_log), function(name) { mean_expr = mean(expr_matrix, na.rm = TRUE), median_expr = median(expr_matrix, na.rm = TRUE), sd_expr = sd(as.vector(expr_matrix), na.rm = TRUE), - missing_pct = 100 * sum(is.na(expr_matrix)) / length(expr_matrix) + missing_pct = mean(is.na(expr_matrix)) * 100 ) }) From 1e7e1257e0e785e456ec18e71b2a406c8b67c53d Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 14 Jan 2026 13:46:53 +0000 Subject: [PATCH 5/6] Fix tibble dependency issue and improve date extraction method to use GSE number as time proxy Co-authored-by: ShixiangWang <25057508+ShixiangWang@users.noreply.github.com> --- vignettes/case-diabetic-nephropathy.Rmd | 24 ++++++++++++++---------- vignettes/case-urothelial-cancer.Rmd | 4 ++-- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/vignettes/case-diabetic-nephropathy.Rmd b/vignettes/case-diabetic-nephropathy.Rmd index b77c9a0..66ecc1e 100644 --- a/vignettes/case-diabetic-nephropathy.Rmd +++ b/vignettes/case-diabetic-nephropathy.Rmd @@ -162,28 +162,32 @@ table(dn_metadb$tissue_type) ### Timeline of Dataset Submissions +**Note**: This visualization requires date information from the metadata. The actual implementation depends on the available date fields in the GEO metadata structure. If submission dates are not available in the metadata, consider alternative visualizations such as chronological ordering by GEO accession number ranges. + ```{r timeline, eval = FALSE, fig.width = 8, fig.height = 4} -# Extract submission year from GEO accession or date fields -# Note: This assumes availability of date information in metadata -# You may need to adjust based on actual metadata structure +# Alternative: Create a proxy year from GEO accession number +# GSE numbers are assigned chronologically, so they can serve as a time proxy +# This is a workaround when actual submission dates are not available dn_metadb <- dn_metadb |> dplyr::mutate( - submission_year = as.integer( - stringr::str_extract(Series_geo_accession, "\\d{4}$") + gse_number = as.integer( + stringr::str_extract(Series_geo_accession, "\\d+") ) ) -# Create timeline plot -ggplot(dn_metadb, aes(x = submission_year)) + - geom_histogram(binwidth = 1, fill = "steelblue", color = "white") + +# Create distribution plot by GSE number (proxy for time) +ggplot(dn_metadb, aes(x = gse_number)) + + geom_histogram(bins = 30, fill = "steelblue", color = "white") + labs( - title = "Timeline of Diabetic Nephropathy GEO Dataset Submissions", - x = "Submission Year", + title = "Distribution of DN Datasets by GEO Accession Number", + subtitle = "GSE numbers are assigned chronologically (lower = older)", + x = "GSE Number", y = "Number of Datasets" ) + theme_minimal() + theme( plot.title = element_text(hjust = 0.5, face = "bold"), + plot.subtitle = element_text(hjust = 0.5, size = 9), panel.grid.minor = element_blank() ) ``` diff --git a/vignettes/case-urothelial-cancer.Rmd b/vignettes/case-urothelial-cancer.Rmd index a1876b5..e76c858 100644 --- a/vignettes/case-urothelial-cancer.Rmd +++ b/vignettes/case-urothelial-cancer.Rmd @@ -178,7 +178,7 @@ expr_matrix <- Biobase::exprs(eset) # Convert to long format for ggplot expr_df <- as.data.frame(expr_matrix) |> - tibble::rownames_to_column("gene") |> + dplyr::mutate(gene = rownames(expr_matrix)) |> tidyr::pivot_longer( cols = -gene, names_to = "sample", @@ -354,7 +354,7 @@ mean_expr_per_dataset <- sapply(expression_sets_log, function(eset) { # Convert to data frame for plotting mean_expr_df <- as.data.frame(mean_expr_per_dataset) |> - tibble::rownames_to_column("gene") |> + dplyr::mutate(gene = rownames(mean_expr_per_dataset)) |> tidyr::pivot_longer( cols = -gene, names_to = "dataset", From 5ddd546775f3c9599e6c443d7af750a379af3818 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 14 Jan 2026 13:48:14 +0000 Subject: [PATCH 6/6] Improve backward compatibility by replacing dplyr::if_any() with grepl() and using size instead of linewidth Co-authored-by: ShixiangWang <25057508+ShixiangWang@users.noreply.github.com> --- vignettes/case-diabetic-nephropathy.Rmd | 20 ++++++++++---------- vignettes/case-urothelial-cancer.Rmd | 2 +- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/vignettes/case-diabetic-nephropathy.Rmd b/vignettes/case-diabetic-nephropathy.Rmd index 66ecc1e..54be480 100644 --- a/vignettes/case-diabetic-nephropathy.Rmd +++ b/vignettes/case-diabetic-nephropathy.Rmd @@ -131,19 +131,19 @@ Categorize datasets by specific criteria relevant to diabetic nephropathy resear dn_metadb <- dn_metadb |> dplyr::mutate( # Check for nephropathy mentions - has_nephropathy = dplyr::if_any( - c(Series_title, Series_summary, Series_overall_design), - ~ stringr::str_detect(.x, "(?i)nephropath") + has_nephropathy = grepl( + "(?i)nephropath", + paste(Series_title, Series_summary, Series_overall_design) ), # Check for kidney mentions - has_kidney = dplyr::if_any( - c(Series_title, Series_summary, Series_overall_design), - ~ stringr::str_detect(.x, "(?i)kidney|renal") + has_kidney = grepl( + "(?i)kidney|renal", + paste(Series_title, Series_summary, Series_overall_design) ), # Check for biopsy samples - has_biopsy = dplyr::if_any( - c(Series_title, Series_summary, Series_overall_design), - ~ stringr::str_detect(.x, "(?i)biopsy|biopsies") + has_biopsy = grepl( + "(?i)biopsy|biopsies", + paste(Series_title, Series_summary, Series_overall_design) ), # Extract tissue type mentions tissue_type = dplyr::case_when( @@ -199,7 +199,7 @@ ggplot(dn_metadb, aes(x = number_of_samples)) + geom_histogram(binwidth = 5, fill = "coral", color = "white") + geom_vline( xintercept = median(dn_metadb$number_of_samples, na.rm = TRUE), - linetype = "dashed", color = "red", linewidth = 1 + linetype = "dashed", color = "red", size = 1 ) + labs( title = "Distribution of Sample Sizes in DN Datasets", diff --git a/vignettes/case-urothelial-cancer.Rmd b/vignettes/case-urothelial-cancer.Rmd index e76c858..7ee2dd1 100644 --- a/vignettes/case-urothelial-cancer.Rmd +++ b/vignettes/case-urothelial-cancer.Rmd @@ -363,7 +363,7 @@ mean_expr_df <- as.data.frame(mean_expr_per_dataset) |> # Create density plot ggplot(mean_expr_df, aes(x = mean_expression, color = dataset)) + - geom_density(linewidth = 1, alpha = 0.7) + + geom_density(size = 1, alpha = 0.7) + labs( title = "Mean Expression Distribution Across Datasets", x = "Mean Log2 Expression",