Skip to content
10 changes: 8 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,19 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]

### Fixed

- `downsample_data` no longer fails when `n_cells` or `n_markers` exceeds what is available in the data; it now warns and keeps all available cells (per sample) or markers instead.

## [0.11.2] 2026-06-15

### Changes
### Changed

- `component_hashing` now returns a sample confidence plot with either hash purity or hash enrichment factor, depending on which metric is present in the data.

### Removed
### Removed

- `harmony` has been removed, and `do_harmonize` is no longer an option.

Expand Down
87 changes: 73 additions & 14 deletions R/read_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -277,14 +277,19 @@ merge_data <-

#' Downsample data to a specified number of cells and markers
#'
#' Downsamples the data to a specified number of cells and markers, ensuring that control markers are always included.
#' Downsamples the data to a specified number of cells and markers,
#' ensuring that control markers are always included.
#' If a sample has fewer than `n_cells` cells, all available cells are kept for that sample.
#' If fewer non-control markers are available than requested, all available non-control markers are kept.
#' `control_markers` are always kept.
#' In these cases, the function warns and proceeds instead of failing.
#'
#' @param pg_data A Seurat object containing the data to be downsampled.
#' @param control_markers A character vector of control markers to always include in the downsampled data.
#' @param n_cells An integer specifying the number of cells to keep in each sample.
#' @param n_markers An integer specifying the total number of markers to keep in the downsampled data, counting `control_markers` towards that total.
#'
#' @return A downsampled Seurat object with the specified number of cells and markers.
#' @return A downsampled Seurat object with selected cells and markers.
#'
#' @export
#'
Expand All @@ -295,24 +300,78 @@ downsample_data <-
n_markers = 20) {
set.seed(37)

keep_cells <-
pixelatorR:::assert_class(pg_data, "Seurat")
pixelatorR:::assert_vector(control_markers, "character", allow_null = TRUE)
pixelatorR:::assert_single_value(n_cells, "integer")
pixelatorR:::assert_single_value(n_markers, "integer")

if (length(control_markers) > 0) {
pixelatorR:::assert_x_in_y(control_markers, rownames(pg_data))
}

control_markers <- unique(control_markers)

# Downsample cells
cell_data <-
FetchData(pg_data, "sample_alias") %>%
as_tibble(rownames = "cell_id") %>%
as_tibble(rownames = "cell_id")

available_cells <-
cell_data %>%
count(sample_alias, name = "n_available")

low_cell_samples <-
available_cells %>%
filter(n_available < n_cells)

if (nrow(low_cell_samples) > 0) {
cli::cli_warn(
c(
"Requested {.val {n_cells}} cells per sample, but some samples have fewer cells.",
"i" = "Using all available cells for those samples."
)
)
}
Comment thread
ptajvar marked this conversation as resolved.

keep_cells <-
cell_data %>%
group_by(sample_alias) %>%
slice_sample(n = n_cells) %>%
dplyr::group_modify(~ slice_sample(.x, n = min(nrow(.x), n_cells))) %>%
ungroup() %>%
pull(cell_id)
Comment thread
ptajvar marked this conversation as resolved.

pixelatorR:::assert_x_in_y(control_markers, rownames(pg_data))

# Downsample markers
markers <- rownames(pg_data)
non_control_markers <-
setdiff(markers, control_markers)

target_non_control <- max(n_markers - length(control_markers), 0)
available_non_control <- length(non_control_markers)


if (target_non_control > available_non_control) {
cli::cli_warn(
c(
"Requested {.val {target_non_control}} non-control markers,",
"but only {.val {available_non_control}} are available.",
"i" = "Using all available non-control markers."
)
)
}

n_non_control <- min(target_non_control, available_non_control)

sampled_non_control <-
if (n_non_control > 0) {
sample(non_control_markers, size = n_non_control, replace = FALSE)
} else {
character(0)
}

keep_markers <-
rownames(pg_data) %>%
{
.[!. %in% control_markers]
} %>%
{
.[sample(seq_along(.), size = n_markers - length(control_markers), replace = FALSE)]
} %>%
union(control_markers)
c(control_markers, sampled_non_control) %>%
unique()

pg_data <-
pg_data[keep_markers, keep_cells]
Expand Down
9 changes: 7 additions & 2 deletions man/downsample_data.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

27 changes: 27 additions & 0 deletions tests/testthat/test_read_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,33 @@ test_that("File reading works as expected", {
expect_s4_class(seur_down, "Seurat")
expect_equal(dim(seur_down), c(5, 6))

# Edge case 1: fewer cells than requested in at least one sample
expect_warning(
seur_down_low_cells <- downsample_data(
seur_comb,
control_markers = c("mIgG1", "mIgG2a", "mIgG2b"),
n_cells = 1000,
n_markers = 5
),
"fewer cells"
)
expect_equal(ncol(seur_down_low_cells), ncol(seur_comb))

# Edge case 2: fewer non-control markers available than requested
all_markers <- rownames(seur_comb)
expect_gt(length(all_markers), 1)
control_set <- all_markers[-length(all_markers)]
expect_warning(
seur_down_low_markers <- downsample_data(
seur_comb,
control_markers = control_set,
n_cells = 3,
n_markers = nrow(seur_comb) + 5
),
"non-control markers"
)
expect_equal(nrow(seur_down_low_markers), nrow(seur_comb))

# Sample sheet reading
expect_no_error(sample_sheet <- read_samplesheet(test_samplesheet()))
expect_equal(
Expand Down
Loading