diff --git a/NAMESPACE b/NAMESPACE index 729e87b9..e8ce17e3 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -65,6 +65,7 @@ export(elucidate) export(extract_as_tidygraph) export(extract_class_axiom) export(extract_eq_axiom) +export(extract_obo_class) export(extract_obo_mappings) export(extract_ordo_mappings) export(extract_pm_date) diff --git a/R/extract.R b/R/extract.R index 05464696..03893ecc 100644 --- a/R/extract.R +++ b/R/extract.R @@ -283,7 +283,99 @@ extract_doid_url <- function(doid_edit, include_obsolete = FALSE, } -#' Extract Subtree +#' Extract OBO Classes with Parents +#' +#' Extracts the classes and their `rdfs:subClassOf` _asserted_ parents from an +#' OBO ontology. +#' +#' @inheritParams robot_query +#' @param class One or more valid OBO classes (see [is_valid_obo()] for +#' valid input formats). Accepted values may depend on `method`. +#' @param method The method to use when extracting classes. Options include: +#' +#' * `"self"`: Extract the class(es) only. +#' +#' * `"parents"`: Extract the class(es) and _asserted_ parent(s). +#' +#' * `"descendants"` (default): Extract the class(es) and all descendants. +#' +#' * `"ancestors"`: Extract the class(es) and all ancestors. +#' +#' * `"common_ancestor"`: Extract the class(es) and all ancestors up to a common +#' ancestor. The common ancestor could be in the set. +#' @inheritDotParams is_valid_obo allow ns_type +#' +#' @examples +#' \dontrun{ +#' # Extract all descendants of 'diabetes mellitus' +#' doid_path <- "" +#' extract_obo_class(doid_path, "DOID:9351") +#' +#' # Extract all ancestors of 'diabetes mellitus' +#' extract_obo_class(doid_path, "DOID:9351", method = "ancestors") +#' +#' # Extract all classes of appendicitis (DOID:8337), alcoholic gastritis +#' (DOID:8680), and viral esophagitis (DOID:6297) up to a common ancestor +#' extract_obo_class( +#' doid_path, +#' c("DOID:8337", "DOID:8680", "DOID:6297"), +#' method = "common_ancestor" +#' ) +#' } +#' +#' @returns +#' A [tibble][tibble::tibble] with the columns: `id`, `label`, `parent_id`, +#' and `parent_label`. +#' +#' @seealso [format_subtree()] to arrange data in a tree structure similar to +#' ontology browsers. +#' +#' @export +extract_obo_class <- function(input, class, method = "descendants", + .robot_path = NULL, + tidy_what = c("header", "uri_to_curie"), + ...) { + invalid_obo <- !is_valid_obo(class, ...) + if (any(invalid_obo)) { + invalid_n <- sum(invalid_obo) + invalid_pos <- which(invalid_obo) + if (invalid_n > 5) { + invalid_id <- paste0( + invalid_n, + " identifiers are invalid, e.g. positions ", + paste0(invalid_pos[1:3], collapse = ", "), + ", etc." + ) + } else { + invalid_id <- paste0( + class[invalid_obo], " (pos ", invalid_pos, ")" + ) + } + + rlang::abort( + c( + "`class` inputs must all be valid OBO identifiers", + purrr::set_names(invalid_id, rlang::rep_along(invalid_id, "x")) + ) + ) + } + obo_class <- format_obo(class, "obo_curie", ...) + + query <- prep_extract_query(obo_class, method) + out <- robot_query( + input, + query, + .robot_path = .robot_path, + tidy_what = tidy_what + ) + + class(out) <- c("obo_class", class(out)) + + out +} + + +#' Extract Subtree (DEPRECATED) #' #' Extracts the classes and parents of a DO subtree from a `pyDOID.owl.xml` #' object. @@ -299,10 +391,15 @@ extract_doid_url <- function(doid_edit, include_obsolete = FALSE, #' and `parent_label`, with one row for each unique combination for each #' subclass below and including `top_node`. #' -#' @seealso [format_subtree()] to arrange data in a tree structure similar to -#' ontology browsers. +#' @seealso [extract_obo_class(method = "descendants")][extract_obo_class] for the +#' same result with a single `class`. [format_subtree()] to arrange data in a +#' tree structure similar to ontology browsers. +#' #' @export extract_subtree <- function(x, top_node, reload = FALSE) { + rlang::warn( + "`extract_subtree()` is deprecated. Use `extract_obo_class()` instead." + ) owl <- access_owl_xml(x) assert_string(top_node) diff --git a/R/extract_helpers.R b/R/extract_helpers.R index a6896e4b..70be33d3 100644 --- a/R/extract_helpers.R +++ b/R/extract_helpers.R @@ -21,3 +21,64 @@ subtree_query_glue <- ' rdfs:label ?parent_label . FILTER(!isblank(?parent)) }}' + + +# extract_subclass() helpers ------------------------------------------------- + +prep_extract_query <- function(class, method) { + sparql_stmts <- c( + self = "?iri rdfs:subClassOf{0} ?set .", + parents = "?iri ^rdfs:subClassOf{0,1} ?set .", + descendants = "?iri rdfs:subClassOf* ?set .", + ancestors = "?iri ^rdfs:subClassOf* ?set .", + common_ancestor = " + { + # find the nearest common ancestor + SELECT ?ancestor_iri (COUNT(?mid) AS ?distance) + WHERE { + ?ancestor_iri ^rdfs:subClassOf* ?mid . + ?mid ^rdfs:subClassOf* !<>! . + FILTER(!isBlank(?ancestor_iri)) + } + GROUP BY ?ancestor_iri + ORDER BY ?distance + LIMIT 1 + } + + # get info for classes in path from set to ancestor + VALUES ?set { !<>! } + + ?iri rdfs:subClassOf* ?ancestor_iri ; + ^rdfs:subClassOf* ?set ." + ) + opts <- names(sparql_stmts) + if (!method %in% opts || length(method) != 1) { + rlang::abort( + c( + paste0( + "`method` must be one of: ", + paste0("'", opts, "'", collapse = ", ")), + x = paste0("'", method, "'", collapse = ", ") + ) + ) + } + + set <- paste0(class, collapse = " ") + sparql_selector <- sparql_stmts[method] + if (method == "common_ancestor") { + set_list <- paste0(class, collapse = ", ") + sparql_selector <- glueV(sparql_selector, set_list = set_list) + } + + q_path <- system.file( + "sparql", "template-subclass-set.rq", + package = "DO.utils", + mustWork = TRUE + ) + query <- glueV( + readr::read_file(q_path), + sparql_selector = sparql_selector, + set = set + ) + query +} diff --git a/R/format.R b/R/format.R index 400999bf..1d4746e9 100644 --- a/R/format.R +++ b/R/format.R @@ -152,9 +152,15 @@ format_doid <- function(x, as = "curie", validate = TRUE, #' @export format_subtree <- function(subtree_df, top_node) { rlang::check_installed("tidygraph", reason = "to use `format_subtree()`") + if ("obo_class" %in% class(subtree_df)) { + id_string <- "iri" + } else { + id_string <- "id" + } + top_class <- format_doid(top_node, as = "curie") - tg <- as_subtree_tidygraph(subtree_df, top_class) - formatted <- pivot_subtree(tg, top_class) + tg <- as_subtree_tidygraph(subtree_df, top_class, id_string) + formatted <- pivot_subtree(tg, top_class, id_string) formatted } diff --git a/R/format_helpers.R b/R/format_helpers.R index 1f303aa4..925c5cf6 100644 --- a/R/format_helpers.R +++ b/R/format_helpers.R @@ -8,33 +8,31 @@ #' [disease-ontology.org](https://disease-ontology.org/). #' #' @inheritParams format_subtree +#' @param parent_col The column with 'parent' class IDs. #' #' @family format_subtree() helpers #' @noRd -as_subtree_tidygraph <- function(subtree_df, top_node) { +as_subtree_tidygraph <- function(subtree_df, top_node, id_string) { + parent_col <- paste0("parent_", id_string) # keep all parent info in labels - label_df <- collapse_col_flex( - subtree_df, - "parent_id", - "parent_label" - ) + label_df <- collapse_col(subtree_df, c(parent_col, "parent_label")) # exclude parents which are not subclasses of top_node (usually due to # multi-parentage) - df <- dplyr::filter(subtree_df, .data$parent_id %in% .data$id) + df <- dplyr::filter(subtree_df, .data[[parent_col]] %in% .data[[id_string]]) # fill in subclasses where multi-parentage is within tree - df <- fill_subclass(df) + df <- fill_subclass(df, id_string) # create tidygraph tg <- df %>% - dplyr::select("id", "parent_id") %>% + dplyr::select(dplyr::all_of(c(id_string, parent_col))) %>% tidygraph::as_tbl_graph() %>% # fix needed for labels to match correctly tidygraph::activate("nodes") %>% dplyr::mutate( name = dplyr::if_else( - .data$name %in% label_df$id, + .data$name %in% label_df[[id_string]], .data$name, stringr::str_remove(.data$name, "-[0-9]+$") ) @@ -42,7 +40,7 @@ as_subtree_tidygraph <- function(subtree_df, top_node) { # add labels tidygraph::left_join( label_df, - by = c("name" = "id") + by = c("name" = id_string) ) tg @@ -57,10 +55,12 @@ as_subtree_tidygraph <- function(subtree_df, top_node) { #' #' @param subtree_tg A subtree tidygraph from [as_subtree_tidygraph()]. #' @inheritParams format_subtree +#' @inheritParams as_subtree_tidygraph #' #' @family format_subtree() helpers #' @noRd -pivot_subtree <- function(subtree_tg, top_node) { +pivot_subtree <- function(subtree_tg, top_node, id_string) { + parent_col <- paste0("parent_", id_string) # ensure alphabetical order of classes to match disease-ontology.org tg <- tidygraph::arrange(subtree_tg, .data$label) @@ -90,7 +90,7 @@ pivot_subtree <- function(subtree_tg, top_node) { dplyr::mutate(duplicated = all_duplicated(.data$name)) %>% # mv supporting info to left & tree to right dplyr::select( - "parent_id", "parent_label", id = "name", + dplyr::all_of(parent_col), "parent_label", id = "name", dplyr::everything() ) %>% tidyr::pivot_wider( @@ -110,12 +110,12 @@ pivot_subtree <- function(subtree_tg, top_node) { #' ensures that the subclasses appear each time their parent/superclass does. #' #' @inheritParams as_subtree_tidygraph -#' +#' @inheritParams as_subtree_tidygraph #' @family format_subtree() > as_subtree_tidygraph() helpers #' @noRd -fill_subclass <- function(subtree_df) { - - not_dup <- dplyr::filter(subtree_df, !duplicated(.data$id)) +fill_subclass <- function(subtree_df, id_string) { + parent_col <- paste0("parent_", id_string) + not_dup <- dplyr::filter(subtree_df, !duplicated(.data[[id_string]])) lvl <- 1 res_n <- 1L @@ -124,14 +124,14 @@ fill_subclass <- function(subtree_df) { while (res_n > 0) { if (lvl == 1) { new_rows[[lvl]] <- subtree_df %>% - dplyr::filter(duplicated(.data$id)) %>% - dplyr::mutate(id_new = paste(.data$id, lvl, sep = "-")) + dplyr::filter(duplicated(.data[[id_string]])) %>% + dplyr::mutate(id_new = paste(.data[[id_string]], lvl, sep = "-")) } else { new_rows[[lvl]] <- subtree_df %>% - dplyr::filter(.data$parent_id %in% new_rows[[lvl - 1]]$id) %>% + dplyr::filter(.data[[parent_col]] %in% new_rows[[lvl - 1]]$id) %>% dplyr::mutate( - id_new = paste(.data$id, lvl, sep = "-"), - parent_id_new = paste(.data$parent_id, lvl - 1, sep = "-") + id_new = paste(.data[[id_string]], lvl, sep = "-"), + parent_id_new = paste(.data[[parent_col]], lvl - 1, sep = "-") ) } res_n <- nrow(new_rows[[lvl]]) @@ -149,14 +149,18 @@ fill_subclass <- function(subtree_df) { # https://github.com/tidyverse/funs/issues/54#issuecomment-892377998 id = do.call( dplyr::coalesce, - rev(dplyr::across(dplyr::starts_with("id"))) + rev(dplyr::across(dplyr::starts_with(id_string))) ), parent_id = do.call( dplyr::coalesce, - rev(dplyr::across(dplyr::starts_with("parent_id"))) + rev(dplyr::across(dplyr::starts_with(parent_col))) ) ) %>% - dplyr::select("id", "label", "parent_id", "parent_label") + dplyr::select( + dplyr::all_of( + c(id_string, "label", parent_col, "parent_label") + ) + ) filled_df } diff --git a/inst/sparql/template-subclass-set.rq b/inst/sparql/template-subclass-set.rq new file mode 100644 index 00000000..a6777091 --- /dev/null +++ b/inst/sparql/template-subclass-set.rq @@ -0,0 +1,18 @@ +# Return class & parent IDs and labels for all classes in a set +PREFIX rdfs: +PREFIX oboInOwl: +PREFIX obo: + +SELECT DISTINCT ?iri ?label ?parent_iri ?parent_label +WHERE { + VALUES ?set { !<>! } + + !<>! + + ?iri rdfs:label ?label . + + OPTIONAL{ + ?iri rdfs:subClassOf ?parent_iri . + ?parent_iri rdfs:label ?parent_label . + } +} diff --git a/man/extract_obo_class.Rd b/man/extract_obo_class.Rd new file mode 100644 index 00000000..4e4b359b --- /dev/null +++ b/man/extract_obo_class.Rd @@ -0,0 +1,102 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/extract.R +\name{extract_obo_class} +\alias{extract_obo_class} +\title{Extract OBO Classes with Parents} +\usage{ +extract_obo_class( + input, + class, + method = "descendants", + .robot_path = NULL, + tidy_what = c("header", "uri_to_curie"), + ... +) +} +\arguments{ +\item{input}{The path to an RDF/OWL file recognized by ROBOT, as a string.} + +\item{class}{One or more valid OBO classes (see \code{\link[=is_valid_obo]{is_valid_obo()}} for +valid input formats). Accepted values may depend on \code{method}.} + +\item{method}{The method to use when extracting classes. Options include: +\itemize{ +\item \code{"self"}: Extract the class(es) only. +\item \code{"parents"}: Extract the class(es) and \emph{asserted} parent(s). +\item \code{"descendants"} (default): Extract the class(es) and all descendants. +\item \code{"ancestors"}: Extract the class(es) and all ancestors. +\item \code{"common_ancestor"}: Extract the class(es) and all ancestors up to a common +ancestor. The common ancestor could be in the set. +}} + +\item{.robot_path}{The path to a ROBOT executable or .jar file, as a string. +When \code{NULL} (default), if a system ROBOT executable is available it will +be used, otherwise an error will be signaled. + +\strong{NOTE:} \code{DO.utils} caches the last ROBOT used for future use.} + +\item{tidy_what}{The elements of a SPARQL-created data.frame to tidy, as a +character vector. One or more of the following: +\itemize{ +\item \code{"everything"} to apply all tidy operations (has precedence over +\code{"nothing"}). +\item \code{"header"} to remove leading \verb{?} from header labels. +\item \code{"unnest"} to unnest list columns with \code{\link[=unnest_cross]{unnest_cross()}}. +\item \code{"uri_to_curie"} to convert all URIs recognized by DO.utils to CURIEs with +\code{\link[=to_curie]{to_curie()}}. +\item \code{"lgl_NA_FALSE"} to replace \code{NA} in logical columns with \code{FALSE}. +\item \code{"as_tibble"} to make the output a \link[tibble:tibble]{tibble}. +\item \code{"rm_lang_tag"} to remove language tags. Tags will only be removed from +\code{character} class columns, and then only if there is one unique language tag +in the given column. +\item \code{"nothing"} to prevent all tidying. +}} + +\item{...}{ + Arguments passed on to \code{\link[=is_valid_obo]{is_valid_obo}} + \describe{ + \item{\code{allow}}{The OBO ID format(s) to consider valid; as a character vector. +One or more of: \code{"curie"}, \code{"obo_curie"} (e.g. \code{obo:DOID_4}), \code{"uri"}, +\code{""} (URI in angle brackets), or \code{"ns.lui"} (i.e. an OBO CURIE without +\verb{obo:}). + +\code{"standard"} (default) is a generic grouping that represents syntactically +correct formats (\code{"curie"}, \code{"obo_curie"}, \code{"uri"} and \code{""}). + +\code{"ns.lui"} will only be included if explicitly specified.} + \item{\code{ns_type}}{The type of OBO namespaces to consider valid; as a character +vector. One of: \code{"obo"} (default), \code{"ont"} (ontology primary namespaces +only), or \code{"prop"} (ontology property namespaces only; may not be exhaustive).} + }} +} +\value{ +A \link[tibble:tibble]{tibble} with the columns: \code{id}, \code{label}, \code{parent_id}, +and \code{parent_label}. +} +\description{ +Extracts the classes and their \code{rdfs:subClassOf} \emph{asserted} parents from an +OBO ontology. +} +\examples{ +\dontrun{ +# Extract all descendants of 'diabetes mellitus' +doid_path <- "" +extract_obo_class(doid_path, "DOID:9351") + +# Extract all ancestors of 'diabetes mellitus' +extract_obo_class(doid_path, "DOID:9351", method = "ancestors") + +# Extract all classes of appendicitis (DOID:8337), alcoholic gastritis +(DOID:8680), and viral esophagitis (DOID:6297) up to a common ancestor +extract_obo_class( + doid_path, + c("DOID:8337", "DOID:8680", "DOID:6297"), + method = "common_ancestor" +) +} + +} +\seealso{ +\code{\link[=format_subtree]{format_subtree()}} to arrange data in a tree structure similar to +ontology browsers. +} diff --git a/man/extract_subtree.Rd b/man/extract_subtree.Rd index 2517fcc8..2f9fd390 100644 --- a/man/extract_subtree.Rd +++ b/man/extract_subtree.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/extract.R \name{extract_subtree} \alias{extract_subtree} -\title{Extract Subtree} +\title{Extract Subtree (DEPRECATED)} \usage{ extract_subtree(x, top_node, reload = FALSE) } @@ -26,6 +26,7 @@ Extracts the classes and parents of a DO subtree from a \code{pyDOID.owl.xml} object. } \seealso{ -\code{\link[=format_subtree]{format_subtree()}} to arrange data in a tree structure similar to -ontology browsers. +\link[=extract_obo_class]{extract_obo_class(method = "descendants")} for the +same result with a single \code{class}. \code{\link[=format_subtree]{format_subtree()}} to arrange data in a +tree structure similar to ontology browsers. }