diff --git a/CHANGELOG.md b/CHANGELOG.md index cdfba55..d16a87f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,13 +5,19 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [Unreleased] + +### Fixed + + - `downsample_data` will now default to the number of markers or cells in the data if `n_markers` or `n_cells` are higher than available in the data. + ## [0.11.2] 2026-06-15 -### Changes +### Changed - `component_hashing` now returns a sample confidence plot with either hash purity or hash enrichment factor, depending on which metric is present in the data. - ### Removed +### Removed - `harmony` has been removed, and `do_harmonize` is no longer an option. diff --git a/R/read_data.R b/R/read_data.R index f9f7d45..9ff1fe1 100644 --- a/R/read_data.R +++ b/R/read_data.R @@ -277,14 +277,19 @@ merge_data <- #' Downsample data to a specified number of cells and markers #' -#' Downsamples the data to a specified number of cells and markers, ensuring that control markers are always included. +#' Downsamples the data to a specified number of cells and markers, +#' ensuring that control markers are always included. +#' If a sample has fewer than `n_cells`, all available cells are kept for that sample. +#' If fewer non-control markers are available than requested, all available non-control markers are kept. +#' `control_markers` are always kept. +#' In these cases, the function warns and proceeds instead of failing. #' #' @param pg_data A Seurat object containing the data to be downsampled. #' @param control_markers A character vector of control markers to always include in the downsampled data. #' @param n_cells An integer specifying the number of cells to keep in each sample. #' @param n_markers An integer specifying the total number of markers to keep in the downsampled data. #' -#' @return A downsampled Seurat object with the specified number of cells and markers. +#' @return A downsampled Seurat object with selected cells and markers. #' #' @export #' @@ -295,24 +300,78 @@ downsample_data <- n_markers = 20) { set.seed(37) - keep_cells <- + pixelatorR:::assert_class(pg_data, "Seurat") + pixelatorR:::assert_vector(control_markers, "character", allow_null = TRUE) + pixelatorR:::assert_single_value(n_cells, "integer") + pixelatorR:::assert_single_value(n_markers, "integer") + + if (length(control_markers) > 0) { + pixelatorR:::assert_x_in_y(control_markers, rownames(pg_data)) + } + + control_markers <- unique(control_markers) + + # Downsample cells + cell_data <- FetchData(pg_data, "sample_alias") %>% - as_tibble(rownames = "cell_id") %>% + as_tibble(rownames = "cell_id") + + available_cells <- + cell_data %>% + count(sample_alias, name = "n_available") + + low_cell_samples <- + available_cells %>% + filter(n_available < n_cells) + + if (nrow(low_cell_samples) > 0) { + cli::cli_warn( + c( + "Requested {.val {n_cells}} cells per sample, but some samples have fewer cells.", + "i" = "Using all available cells for those samples." + ) + ) + } + + keep_cells <- + cell_data %>% group_by(sample_alias) %>% - slice_sample(n = n_cells) %>% + dplyr::group_modify(~ slice_sample(.x, n = min(nrow(.x), n_cells))) %>% + ungroup() %>% pull(cell_id) - pixelatorR:::assert_x_in_y(control_markers, rownames(pg_data)) + + # Downsample markers + markers <- rownames(pg_data) + non_control_markers <- + setdiff(markers, control_markers) + + target_non_control <- max(n_markers - length(control_markers), 0) + available_non_control <- length(non_control_markers) + + + if (target_non_control > available_non_control) { + cli::cli_warn( + c( + "Requested {.val {target_non_control}} non-control markers,", + "but only {.val {available_non_control}} are available.", + "i" = "Using all available non-control markers." + ) + ) + } + + n_non_control <- min(target_non_control, available_non_control) + + sampled_non_control <- + if (n_non_control > 0) { + sample(non_control_markers, size = n_non_control, replace = FALSE) + } else { + character(0) + } keep_markers <- - rownames(pg_data) %>% - { - .[!. %in% control_markers] - } %>% - { - .[sample(seq_along(.), size = n_markers - length(control_markers), replace = FALSE)] - } %>% - union(control_markers) + c(control_markers, sampled_non_control) %>% + unique() pg_data <- pg_data[keep_markers, keep_cells] diff --git a/man/downsample_data.Rd b/man/downsample_data.Rd index 842dfaf..99c8b8f 100644 --- a/man/downsample_data.Rd +++ b/man/downsample_data.Rd @@ -16,8 +16,13 @@ downsample_data(pg_data, control_markers = NULL, n_cells = 50, n_markers = 20) \item{n_markers}{An integer specifying the total number of markers to keep in the downsampled data.} } \value{ -A downsampled Seurat object with the specified number of cells and markers. +A downsampled Seurat object with selected cells and markers. } \description{ -Downsamples the data to a specified number of cells and markers, ensuring that control markers are always included. +Downsamples the data to a specified number of cells and markers, +ensuring that control markers are always included. +If a sample has fewer than \code{n_cells}, all available cells are kept for that sample. +If fewer non-control markers are available than requested, all available non-control markers are kept. +\code{control_markers} are always kept. +In these cases, the function warns and proceeds instead of failing. } diff --git a/tests/testthat/test_read_data.R b/tests/testthat/test_read_data.R index 3359098..5ae3816 100644 --- a/tests/testthat/test_read_data.R +++ b/tests/testthat/test_read_data.R @@ -209,6 +209,33 @@ test_that("File reading works as expected", { expect_s4_class(seur_down, "Seurat") expect_equal(dim(seur_down), c(5, 6)) + # Edge case 1: fewer cells than requested in at least one sample + expect_warning( + seur_down_low_cells <- downsample_data( + seur_comb, + control_markers = c("mIgG1", "mIgG2a", "mIgG2b"), + n_cells = 1000, + n_markers = 5 + ), + "fewer cells" + ) + expect_equal(ncol(seur_down_low_cells), ncol(seur_comb)) + + # Edge case 2: fewer non-control markers available than requested + all_markers <- rownames(seur_comb) + expect_gt(length(all_markers), 1) + control_set <- all_markers[-length(all_markers)] + expect_warning( + seur_down_low_markers <- downsample_data( + seur_comb, + control_markers = control_set, + n_cells = 3, + n_markers = nrow(seur_comb) + 5 + ), + "non-control markers" + ) + expect_equal(nrow(seur_down_low_markers), nrow(seur_comb)) + # Sample sheet reading expect_no_error(sample_sheet <- read_samplesheet(test_samplesheet())) expect_equal(