diff --git a/NEWS.md b/NEWS.md index cde98bf6..a7f814d2 100644 --- a/NEWS.md +++ b/NEWS.md @@ -71,6 +71,14 @@ `rbind()` / `cbind()` generics. * New API function `nv_flatten()` for flattening to 1-D. +### NA scanning + +* `nv_array()`, `nv_scalar()`, `as_array()`, and the `as.integer()` / + `as.double()` / `as.logical()` / `as.vector()` methods for + `AnvlArray` gained a `check` argument that opts into scanning for + `NA` values during host -> device and device -> host transfers. See + the "Gotchas" vignette. + ### Misc * New `AnvlArray` -> R `vector` converters: `as.numeric()`, diff --git a/R/api.R b/R/api.R index 441d1aa2..3238fe7c 100644 --- a/R/api.R +++ b/R/api.R @@ -101,7 +101,7 @@ nv_broadcast_scalars <- function(...) { target_shape <- non_scalar_shapes[[1L]] if (!all(vapply(non_scalar_shapes, identical, logical(1L), target_shape))) { - shapes <- paste0(sapply(shapes, shape2string), sep = ", ") + shapes <- paste0(sapply(shapes, shape2string), collapse = ", ") cli_abort( "All non-scalar arrays must have the same shape, but got {shapes}. Use {.fn nv_broadcast_arrays} for general broadcasting." # nolint ) diff --git a/R/array.R b/R/array.R index 5731c65f..170259e9 100644 --- a/R/array.R +++ b/R/array.R @@ -50,6 +50,12 @@ #' default column-major order, mirroring [`base::matrix()`]'s `byrow`. #' Only allowed when `data` is an R object — passing an existing #' `AnvlArray` together with `byrow = TRUE` is an error. +#' @param check (`logical(1)`)\cr +#' If `TRUE`, error when `data` contains any `NA` values. XLA has no +#' representation for missing values, so they are otherwise silently +#' coerced to the closest available value of the target dtype (e.g. `NaN` +#' for floats, the bit pattern `-2147483648` for `i32`, `TRUE` for +#' `bool`). Defaults to `FALSE`. See the "Gotchas" vignette. #' @return ([`AnvlArray`]) #' @examplesIf pjrt::plugins_downloaded() #' # A 1-d array (vector) with shape (4). 
Default type for integers is `i32` @@ -91,8 +97,25 @@ NULL #' @rdname AnvlArray #' @export -nv_array <- function(data, dtype = NULL, device = NULL, shape = NULL, ambiguous = NULL, backend = NULL, byrow = FALSE) { +nv_array <- function( + data, + dtype = NULL, + device = NULL, + shape = NULL, + ambiguous = NULL, + backend = NULL, + byrow = FALSE, + check = FALSE +) { assert_flag(byrow) + assert_flag(check) + if (check && !is_anvl_array(data) && anyNA(data)) { + n_na <- sum(is.na(data)) + cli_abort(c( + "Input {.arg data} contains {n_na} {.val NA} value{?s}, which {?has/have} no representation at the XLA level.", + i = "Replace or drop missing values before transferring, or set {.code check = FALSE} to skip this check." + )) + } if (is_anvl_array(data)) { if (byrow) { cli_abort("{.arg byrow} only applies when constructing an {.cls AnvlArray} from an R object.") @@ -262,8 +285,16 @@ unwrap_if_array <- function(x) { #' @rdname AnvlArray #' @export -nv_scalar <- function(data, dtype = NULL, device = NULL, ambiguous = NULL, backend = NULL) { - nv_array(data, dtype = dtype, device = device, shape = integer(), ambiguous = ambiguous, backend = backend) +nv_scalar <- function(data, dtype = NULL, device = NULL, ambiguous = NULL, backend = NULL, check = FALSE) { + nv_array( + data, + dtype = dtype, + device = device, + shape = integer(), + ambiguous = ambiguous, + backend = backend, + check = check + ) } infer_matrix_dim <- function(n, other, given) { @@ -393,9 +424,19 @@ shape.AnvlArray <- function(x, ...) { globals$backends[[x$backend]]$shape(x) } +#' @rdname as_array +#' @param check (`logical(1)`)\cr +#' If `TRUE`, sanity-check the materialized R vector against losing +#' information across the device-to-host boundary, and abort if any +#' problematic value is detected. Forwarded to the backend; for the +#' `xla` backend the relevant cases are `i32`/`i64` values colliding +#' with the `NA` bit pattern and `ui64` values `>= 2^63` wrapping +#' through `bit64::integer64`. 
See [`pjrt::as_array.PJRTBuffer()`] for +#' the full list. Defaults to `FALSE`. See the "Gotchas" vignette. #' @export -as_array.AnvlArray <- function(x, ...) { - globals$backends[[x$backend]]$as_array(x) +as_array.AnvlArray <- function(x, check = FALSE, ...) { + assert_flag(check) + globals$backends[[x$backend]]$as_array(x, check = check) } #' @export @@ -427,6 +468,8 @@ await.AnvlArray <- function(x, ...) { #' @param mode (`character(1)`)\cr #' For `as.vector()` only. See [base::as.vector()]. Defaults to `"any"`, #' meaning the natural R type for the array's dtype. +#' @param check (`logical(1)`)\cr +#' Forwarded to [`as_array()`]; see there for details. #' @param ... Unused. #' @return An R vector of the corresponding type (`double`, `integer`, or `logical`). #' @examplesIf pjrt::plugins_downloaded() @@ -441,33 +484,33 @@ NULL #' @rdname as-AnvlArray #' @method as.double AnvlArray #' @export -as.double.AnvlArray <- function(x, ...) { +as.double.AnvlArray <- function(x, check = FALSE, ...) { dt <- dtype(x) if (!(inherits(dt, "FloatType") || inherits(dt, "IntegerType") || inherits(dt, "UIntegerType"))) { cli_abort("{.fn as.double} requires a float or integer dtype, but got {.val {as.character(dt)}}.") } - as.double(as_array(x)) + as.double(as_array(x, check = check)) } #' @rdname as-AnvlArray #' @method as.integer AnvlArray #' @export -as.integer.AnvlArray <- function(x, ...) { +as.integer.AnvlArray <- function(x, check = FALSE, ...) { dt <- dtype(x) if (!(inherits(dt, "IntegerType") || inherits(dt, "UIntegerType"))) { cli_abort("{.fn as.integer} requires a (signed or unsigned) integer dtype, but got {.val {as.character(dt)}}.") } - as.integer(as_array(x)) + as.integer(as_array(x, check = check)) } #' @rdname as-AnvlArray #' @method as.logical AnvlArray #' @export -as.logical.AnvlArray <- function(x, ...) { +as.logical.AnvlArray <- function(x, check = FALSE, ...) 
{ if (!inherits(dtype(x), "BooleanType")) { cli_abort("{.fn as.logical} requires a {.val bool} dtype, but got {.val {as.character(dtype(x))}}.") } - as.logical(as_array(x)) + as.logical(as_array(x, check = check)) } #' @rdname as-AnvlArray diff --git a/R/backend-quickr.R b/R/backend-quickr.R index 73c28631..b26348fe 100644 --- a/R/backend-quickr.R +++ b/R/backend-quickr.R @@ -162,7 +162,7 @@ AnvlBackendQuickr <- function() { dtype = function(x) x$dtype, shape = function(x) x$shape, ambiguous = function(x) x$ambiguous, - as_array = function(x) x$data, + as_array = function(x, check) x$data, as_raw = function(x, row_major) as.raw(x$data), platform = function(x) "cpu", device = function(x) quickr_device("cpu"), diff --git a/R/backend-xla.R b/R/backend-xla.R index 45b02270..b781ce74 100644 --- a/R/backend-xla.R +++ b/R/backend-xla.R @@ -319,7 +319,7 @@ AnvlBackendXla <- function() { dtype = function(x) tengen::dtype(x$data), shape = function(x) tengen::shape(x$data), ambiguous = function(x) x$ambiguous, - as_array = function(x) tengen::as_array(x$data), + as_array = function(x, check) tengen::as_array(x$data, check = check), as_raw = function(x, row_major) tengen::as_raw(x$data, row_major = row_major), platform = function(x) pjrt::platform(x$data), device = function(x) device(x$data), diff --git a/R/backend.R b/R/backend.R index 112552b5..4550ec97 100644 --- a/R/backend.R +++ b/R/backend.R @@ -6,7 +6,10 @@ #' @param dtype (`function`)\cr Extracts the dtype from an AnvlArray. #' @param shape (`function`)\cr Extracts the shape from an AnvlArray. #' @param ambiguous (`function`)\cr Extracts the ambiguous flag from an AnvlArray. -#' @param as_array (`function`)\cr Converts an AnvlArray to an R array. +#' @param as_array (`function(x, check)`)\cr Converts an AnvlArray to an R +#' array. The `check` flag is forwarded from [`as_array()`]; backends may use +#' it to abort when materialization would lose information (e.g. ui64 values +#' wrapping through `bit64::integer64`). 
See [`pjrt::as_array.PJRTBuffer()`]. #' @param as_raw (`function`)\cr Converts an AnvlArray to raw bytes. #' @param platform (`function`)\cr Returns the platform name (e.g. `"cpu"`). #' @param device (`function`)\cr Returns the device object for an AnvlArray. @@ -141,7 +144,7 @@ register_backend( dtype = function(x) x$dtype, shape = function(x) x$shape, ambiguous = function(x) x$ambiguous, - as_array = function(x) x$data, + as_array = function(x, check) x$data, as_raw = function(x, row_major) cli_abort("as_raw not supported for plain backend"), platform = function(x) "cpu", device = function(x) PlainDeviceCpu(), diff --git a/man/AnvlArray.Rd b/man/AnvlArray.Rd index 33a14cc0..713f5c92 100644 --- a/man/AnvlArray.Rd +++ b/man/AnvlArray.Rd @@ -18,10 +18,18 @@ nv_array( shape = NULL, ambiguous = NULL, backend = NULL, - byrow = FALSE + byrow = FALSE, + check = FALSE ) -nv_scalar(data, dtype = NULL, device = NULL, ambiguous = NULL, backend = NULL) +nv_scalar( + data, + dtype = NULL, + device = NULL, + ambiguous = NULL, + backend = NULL, + check = FALSE +) nv_matrix( data, @@ -100,6 +108,13 @@ default column-major order, mirroring \code{\link[base:matrix]{base::matrix()}}' Only allowed when \code{data} is an R object — passing an existing \code{AnvlArray} together with \code{byrow = TRUE} is an error.} +\item{check}{(\code{logical(1)})\cr +If \code{TRUE}, error when \code{data} contains any \code{NA} values. XLA has no +representation for missing values, so they are otherwise silently +coerced to the closest available value of the target dtype (e.g. \code{NaN} +for floats, the bit pattern \code{-2147483648} for \code{i32}, \code{TRUE} for +\code{bool}). Defaults to \code{FALSE}. See the "Gotchas" vignette.} + \item{nrow}{(\code{NULL} | \code{integer(1)})\cr Number of rows. Inferred from \code{ncol} and the data length if \code{NULL}. 
Defaults to \code{1} when \code{data} is a scalar.} diff --git a/man/AnvlBackend.Rd b/man/AnvlBackend.Rd index ec55487b..b1e68a38 100644 --- a/man/AnvlBackend.Rd +++ b/man/AnvlBackend.Rd @@ -30,7 +30,10 @@ underlying data (\code{PJRTBuffer} for \code{"xla"} backend, \code{array()} for \item{ambiguous}{(\code{function})\cr Extracts the ambiguous flag from an AnvlArray.} -\item{as_array}{(\code{function})\cr Converts an AnvlArray to an R array.} +\item{as_array}{(\verb{function(x, check)})\cr Converts an AnvlArray to an R +array. The \code{check} flag is forwarded from \code{\link[=as_array]{as_array()}}; backends may use +it to abort when materialization would lose information (e.g. ui64 values +wrapping through \code{bit64::integer64}). See \code{\link[pjrt:as_array.PJRTBuffer]{pjrt::as_array.PJRTBuffer()}}.} \item{as_raw}{(\code{function})\cr Converts an AnvlArray to raw bytes.} diff --git a/man/as-AnvlArray.Rd b/man/as-AnvlArray.Rd index f85646cf..2b7e4f73 100644 --- a/man/as-AnvlArray.Rd +++ b/man/as-AnvlArray.Rd @@ -8,11 +8,11 @@ \alias{as.vector.AnvlArray} \title{Coerce AnvlArray to an R Vector} \usage{ -\method{as.double}{AnvlArray}(x, ...) +\method{as.double}{AnvlArray}(x, check = FALSE, ...) -\method{as.integer}{AnvlArray}(x, ...) +\method{as.integer}{AnvlArray}(x, check = FALSE, ...) -\method{as.logical}{AnvlArray}(x, ...) +\method{as.logical}{AnvlArray}(x, check = FALSE, ...) 
\method{as.vector}{AnvlArray}(x, mode = "any") } @@ -20,6 +20,9 @@ \item{x}{(\code{\link{AnvlArray}})\cr Array to coerce.} +\item{check}{(\code{logical(1)})\cr +Forwarded to \code{\link[=as_array]{as_array()}}; see there for details.} + \item{...}{Unused.} \item{mode}{(\code{character(1)})\cr diff --git a/man/as_array.Rd b/man/as_array.Rd index 0b00f6e8..ecacd553 100644 --- a/man/as_array.Rd +++ b/man/as_array.Rd @@ -1,15 +1,27 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/reexports.R -\name{as_array} +% Please edit documentation in R/array.R, R/reexports.R +\name{as_array.AnvlArray} +\alias{as_array.AnvlArray} \alias{as_array} \title{Convert to an R array} \usage{ +\method{as_array}{AnvlArray}(x, check = FALSE, ...) + as_array(x, ...) } \arguments{ \item{x}{(\code{\link{arrayish}})\cr An array-like object.} +\item{check}{(\code{logical(1)})\cr +If \code{TRUE}, sanity-check the materialized R vector against losing +information across the device-to-host boundary, and abort if any +problematic value is detected. Forwarded to the backend; for the +\code{xla} backend the relevant cases are \code{i32}/\code{i64} values colliding +with the \code{NA} bit pattern and \code{ui64} values \verb{>= 2^63} wrapping +through \code{bit64::integer64}. See \code{\link[pjrt:as_array.PJRTBuffer]{pjrt::as_array.PJRTBuffer()}} for +the full list. Defaults to \code{FALSE}. 
See the "Gotchas" vignette.} + \item{...}{Additional arguments passed to methods (unused).} } \value{ diff --git a/pkgdown/_pkgdown.yml b/pkgdown/_pkgdown.yml index eb1c16bd..19fd0f4f 100644 --- a/pkgdown/_pkgdown.yml +++ b/pkgdown/_pkgdown.yml @@ -39,6 +39,8 @@ navbar: href: articles/random-numbers.html - text: Type Promotion href: articles/type-promotion.html + - text: Gotchas + href: articles/gotchas.html - text: Efficiency href: articles/efficiency.html - text: FAQ diff --git a/vignettes/gotchas.Rmd b/vignettes/gotchas.Rmd new file mode 100644 index 00000000..256a310d --- /dev/null +++ b/vignettes/gotchas.Rmd @@ -0,0 +1,159 @@ +--- +title: "Gotchas" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Gotchas} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +This vignette lists various things to be aware of, specifically in relation to base R. + +```{r, include = FALSE} +library(anvl) +``` + +## Row-major vs column-major ordering + +R stores matrices and arrays in *column-major* order, while {anvl} (following XLA) uses *row-major* order. +For most operations, this is an internal implementation detail that does not change the semantics. +However, for reshaping operations such as `nv_flatten()` there is a difference. + +Consider the 2x2 matrix below: + +```{r} +m <- matrix(1:4, nrow = 2) +m +``` + +In base R, `as.vector()` flattens it column-by-column, so we get `1, 2, 3, 4`: + +```{r} +as.vector(m) +``` + +In {anvl}, reshaping to a length-4 vector traverses the data row-by-row, so we get `1, 3, 2, 4`: + +```{r} +nv_flatten(m) +``` + +If you need column-major flattening in {anvl}, transpose first: + +```{r} +nv_flatten(t(m)) +``` + +## No recycling + +Base R *recycles* the shorter operand when two vectors of different lengths are combined elementwise: + +```{r} +c(1, 2, 3, 4) + c(1, 2) +``` + +{anvl} only auto-broadcasts *scalars* (operands with shape `integer()`). 
+Adding a scalar to an array works as you would expect: + +```{r} +nv_array(1:4) + 10L +``` + +But combining two non-scalar arrays of different shapes errors, even when one shape is a "tile" of the other: + +```{r, error = TRUE} +nv_array(1:4) + nv_array(1:2) +``` + +When two non-scalar arrays differ only by size-1 dimensions (numpy-style broadcasting, e.g. shape `(2, 3)` and `(1, 3)`), use `nv_broadcast_arrays()` to align them explicitly first: + +```{r} +a <- nv_matrix(1:6, nrow = 2) +shape(a) +b <- nv_matrix(c(10, 20, 30), nrow = 1) +shape(b) +xs <- nv_broadcast_arrays(a, b) +lapply(xs, shape) +xs[[1]] + xs[[2]] +``` + +Note that even `nv_broadcast_arrays()` cannot replicate R's recycling for shapes like `(4)` and `(2)` -- the shapes must be broadcast-compatible in the numpy sense. + +## No `NA`s + +R has a dedicated missing-value marker (`NA`) for every atomic type. +{anvl} arrays do not -- there is no representation of "missing" at the XLA level, only `NaN` for floating point numbers. +When you convert R values containing `NA` into an `AnvlArray`, the `NA`s are silently turned into `NaN`s. + +```{r} +nv_array(NA_real_) +``` + +```{r} +nv_array(c(1, NA, 3)) +``` + +Round-tripping back to R is not guaranteed to produce `NA`, but can also yield `NaN`: + +```{r} +as_array(nv_array(c(1, NA, 3))) +``` + +For other data types, the situation is even worse, especially for integers, where R uses the smallest possible value to represent missingness: + +```{r} +nv_scalar(NA_integer_) +``` + +However, when you convert it back, you get a missing value again: + +```{r} +as.integer(nv_scalar(NA_integer_)) +``` + +When creating logicals, `NA` will be interpreted as `TRUE`: + +```{r} +nv_scalar(NA) +as.logical(nv_scalar(NA)) +``` + +In order to avoid these pitfalls, array creators such as `nv_array()` have a `check` argument to prevent the above problems. +It is `FALSE` by default, because it needs to scan the complete data. 
+ +```{r, error = TRUE} +nv_array(c(1, NA, 3), check = TRUE) +``` + +The same flag is available for converters like `as_array()`: + +```{r, error = TRUE} +as_array(nv_scalar(NA_integer_), check = TRUE) +``` + +## No unsigned integers + +R's `integer` type is signed 32-bit (range `-2147483647` to `2147483647`; the bit pattern `-2147483648` is reserved for `NA_integer_`). +{anvl} also exposes unsigned integer dtypes (`ui8`, `ui16`, `ui32`, `ui64`) backed by XLA, but R has no native counterpart. +For values that fit into R's signed integer range, the round-trip works as expected: + +```{r} +as_array(nv_array(c(0L, 200L, 255L), dtype = "ui8")) +``` + +Because `ui32` does not fit into R's native integer type, it is converted to the `bit64::integer64` data type: + + +```{r} +big <- nv_array(2147483647L, dtype = "ui32") + 1L +as_array(big) +``` + +However, for `ui64`, we also convert to `integer64`, which does not cover the whole range, so overflow is possible, but can be detected via the `check` flag: + +```{r, error = TRUE} +big <- nv_array(0L, dtype = "ui64") - 1L +big +as_array(big) +as_array(big, check = TRUE) +```