Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ export(elucidate)
export(extract_as_tidygraph)
export(extract_class_axiom)
export(extract_eq_axiom)
export(extract_obo_class)
export(extract_obo_mappings)
export(extract_ordo_mappings)
export(extract_pm_date)
Expand Down
103 changes: 100 additions & 3 deletions R/extract.R
Original file line number Diff line number Diff line change
Expand Up @@ -283,7 +283,99 @@ extract_doid_url <- function(doid_edit, include_obsolete = FALSE,
}


#' Extract Subtree
#' Extract OBO Classes with Parents
#'
#' Extracts the classes and their `rdfs:subClassOf` _asserted_ parents from an
#' OBO ontology.
#'
#' @inheritParams robot_query
#' @param class One or more valid OBO classes (see [is_valid_obo()] for
#' valid input formats). Accepted values may depend on `method`.
#' @param method The method to use when extracting classes. Options include:
#'
#' * `"self"`: Extract the class(es) only.
#'
#' * `"parents"`: Extract the class(es) and _asserted_ parent(s).
#'
#' * `"descendants"` (default): Extract the class(es) and all descendants.
#'
#' * `"ancestors"`: Extract the class(es) and all ancestors.
#'
#' * `"common_ancestor"`: Extract the class(es) and all ancestors up to a common
#' ancestor. The common ancestor could be in the set.
#' @inheritDotParams is_valid_obo allow ns_type
#'
#' @examples
#' \dontrun{
#' # Extract all descendants of 'diabetes mellitus'
#' doid_path <- "<path to doid.owl>"
#' extract_obo_class(doid_path, "DOID:9351")
#'
#' # Extract all ancestors of 'diabetes mellitus'
#' extract_obo_class(doid_path, "DOID:9351", method = "ancestors")
#'
#' # Extract all classes of appendicitis (DOID:8337), alcoholic gastritis
#' (DOID:8680), and viral esophagitis (DOID:6297) up to a common ancestor
#' extract_obo_class(
#' doid_path,
#' c("DOID:8337", "DOID:8680", "DOID:6297"),
#' method = "common_ancestor"
#' )
#' }
#'
#' @returns
#' A [tibble][tibble::tibble] with the columns: `id`, `label`, `parent_id`,
#' and `parent_label`.
#'
#' @seealso [format_subtree()] to arrange data in a tree structure similar to
#' ontology browsers.
#'
#' @export
extract_obo_class <- function(input, class, method = "descendants",
.robot_path = NULL,
tidy_what = c("header", "uri_to_curie"),
...) {
invalid_obo <- !is_valid_obo(class, ...)
if (any(invalid_obo)) {
invalid_n <- sum(invalid_obo)
invalid_pos <- which(invalid_obo)
if (invalid_n > 5) {
invalid_id <- paste0(
invalid_n,
" identifiers are invalid, e.g. positions ",
paste0(invalid_pos[1:3], collapse = ", "),
", etc."
)
} else {
invalid_id <- paste0(
class[invalid_obo], " (pos ", invalid_pos, ")"
)
}

rlang::abort(
c(
"`class` inputs must all be valid OBO identifiers",
purrr::set_names(invalid_id, rlang::rep_along(invalid_id, "x"))
)
)
}
obo_class <- format_obo(class, "obo_curie", ...)

query <- prep_extract_query(obo_class, method)
out <- robot_query(
input,
query,
.robot_path = .robot_path,
tidy_what = tidy_what
)

class(out) <- c("obo_class", class(out))

out
}


#' Extract Subtree (DEPRECATED)
#'
#' Extracts the classes and parents of a DO subtree from a `pyDOID.owl.xml`
#' object.
Expand All @@ -299,10 +391,15 @@ extract_doid_url <- function(doid_edit, include_obsolete = FALSE,
#' and `parent_label`, with one row for each unique combination for each
#' subclass below and including `top_node`.
#'
#' @seealso [format_subtree()] to arrange data in a tree structure similar to
#' ontology browsers.
#' @seealso [extract_obo_class(method = "descendants")][extract_obo_class] for the
#' same result with a single `class`. [format_subtree()] to arrange data in a
#' tree structure similar to ontology browsers.
#'
#' @export
extract_subtree <- function(x, top_node, reload = FALSE) {
rlang::warn(
"`extract_subtree()` is deprecated. Use `extract_obo_class()` instead."
)
owl <- access_owl_xml(x)
assert_string(top_node)

Expand Down
61 changes: 61 additions & 0 deletions R/extract_helpers.R
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,64 @@ subtree_query_glue <- '
rdfs:label ?parent_label .
FILTER(!isblank(?parent))
}}'


# extract_subclass() helpers -------------------------------------------------

prep_extract_query <- function(class, method) {
sparql_stmts <- c(
self = "?iri rdfs:subClassOf{0} ?set .",
parents = "?iri ^rdfs:subClassOf{0,1} ?set .",
descendants = "?iri rdfs:subClassOf* ?set .",
ancestors = "?iri ^rdfs:subClassOf* ?set .",
common_ancestor = "
{
# find the nearest common ancestor
SELECT ?ancestor_iri (COUNT(?mid) AS ?distance)
WHERE {
?ancestor_iri ^rdfs:subClassOf* ?mid .
?mid ^rdfs:subClassOf* !<<set_list>>! .
FILTER(!isBlank(?ancestor_iri))
}
GROUP BY ?ancestor_iri
ORDER BY ?distance
LIMIT 1
}

# get info for classes in path from set to ancestor
VALUES ?set { !<<set>>! }

?iri rdfs:subClassOf* ?ancestor_iri ;
^rdfs:subClassOf* ?set ."
)
opts <- names(sparql_stmts)
if (!method %in% opts || length(method) != 1) {
rlang::abort(
c(
paste0(
"`method` must be one of: ",
paste0("'", opts, "'", collapse = ", ")),
x = paste0("'", method, "'", collapse = ", ")
)
)
}

set <- paste0(class, collapse = " ")
sparql_selector <- sparql_stmts[method]
if (method == "common_ancestor") {
set_list <- paste0(class, collapse = ", ")
sparql_selector <- glueV(sparql_selector, set_list = set_list)
}

q_path <- system.file(
"sparql", "template-subclass-set.rq",
package = "DO.utils",
mustWork = TRUE
)
query <- glueV(
readr::read_file(q_path),
sparql_selector = sparql_selector,
set = set
)
query
}
10 changes: 8 additions & 2 deletions R/format.R
Original file line number Diff line number Diff line change
Expand Up @@ -152,9 +152,15 @@ format_doid <- function(x, as = "curie", validate = TRUE,
#' @export
format_subtree <- function(subtree_df, top_node) {
rlang::check_installed("tidygraph", reason = "to use `format_subtree()`")
if ("obo_class" %in% class(subtree_df)) {
id_string <- "iri"
} else {
id_string <- "id"
}

top_class <- format_doid(top_node, as = "curie")
tg <- as_subtree_tidygraph(subtree_df, top_class)
formatted <- pivot_subtree(tg, top_class)
tg <- as_subtree_tidygraph(subtree_df, top_class, id_string)
formatted <- pivot_subtree(tg, top_class, id_string)

formatted
}
Expand Down
54 changes: 29 additions & 25 deletions R/format_helpers.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,41 +8,39 @@
#' [disease-ontology.org](https://disease-ontology.org/).
#'
#' @inheritParams format_subtree
#' @param parent_col The column with 'parent' class IDs.
#'
#' @family format_subtree() helpers
#' @noRd
as_subtree_tidygraph <- function(subtree_df, top_node) {
as_subtree_tidygraph <- function(subtree_df, top_node, id_string) {
parent_col <- paste0("parent_", id_string)
# keep all parent info in labels
label_df <- collapse_col_flex(
subtree_df,
"parent_id",
"parent_label"
)
label_df <- collapse_col(subtree_df, c(parent_col, "parent_label"))

# exclude parents which are not subclasses of top_node (usually due to
# multi-parentage)
df <- dplyr::filter(subtree_df, .data$parent_id %in% .data$id)
df <- dplyr::filter(subtree_df, .data[[parent_col]] %in% .data[[id_string]])

# fill in subclasses where multi-parentage is within tree
df <- fill_subclass(df)
df <- fill_subclass(df, id_string)

# create tidygraph
tg <- df %>%
dplyr::select("id", "parent_id") %>%
dplyr::select(dplyr::all_of(c(id_string, parent_col))) %>%
tidygraph::as_tbl_graph() %>%
# fix needed for labels to match correctly
tidygraph::activate("nodes") %>%
dplyr::mutate(
name = dplyr::if_else(
.data$name %in% label_df$id,
.data$name %in% label_df[[id_string]],
.data$name,
stringr::str_remove(.data$name, "-[0-9]+$")
)
) %>%
# add labels
tidygraph::left_join(
label_df,
by = c("name" = "id")
by = c("name" = id_string)
)

tg
Expand All @@ -57,10 +55,12 @@ as_subtree_tidygraph <- function(subtree_df, top_node) {
#'
#' @param subtree_tg A subtree tidygraph from [as_subtree_tidygraph()].
#' @inheritParams format_subtree
#' @inheritParams as_subtree_tidygraph
#'
#' @family format_subtree() helpers
#' @noRd
pivot_subtree <- function(subtree_tg, top_node) {
pivot_subtree <- function(subtree_tg, top_node, id_string) {
parent_col <- paste0("parent_", id_string)

# ensure alphabetical order of classes to match disease-ontology.org
tg <- tidygraph::arrange(subtree_tg, .data$label)
Expand Down Expand Up @@ -90,7 +90,7 @@ pivot_subtree <- function(subtree_tg, top_node) {
dplyr::mutate(duplicated = all_duplicated(.data$name)) %>%
# mv supporting info to left & tree to right
dplyr::select(
"parent_id", "parent_label", id = "name",
dplyr::all_of(parent_col), "parent_label", id = "name",
dplyr::everything()
) %>%
tidyr::pivot_wider(
Expand All @@ -110,12 +110,12 @@ pivot_subtree <- function(subtree_tg, top_node) {
#' ensures that the subclasses appear each time their parent/superclass does.
#'
#' @inheritParams as_subtree_tidygraph
#'
#' @inheritParams as_subtree_tidygraph
#' @family format_subtree() > as_subtree_tidygraph() helpers
#' @noRd
fill_subclass <- function(subtree_df) {

not_dup <- dplyr::filter(subtree_df, !duplicated(.data$id))
fill_subclass <- function(subtree_df, id_string) {
parent_col <- paste0("parent_", id_string)
not_dup <- dplyr::filter(subtree_df, !duplicated(.data[[id_string]]))

lvl <- 1
res_n <- 1L
Expand All @@ -124,14 +124,14 @@ fill_subclass <- function(subtree_df) {
while (res_n > 0) {
if (lvl == 1) {
new_rows[[lvl]] <- subtree_df %>%
dplyr::filter(duplicated(.data$id)) %>%
dplyr::mutate(id_new = paste(.data$id, lvl, sep = "-"))
dplyr::filter(duplicated(.data[[id_string]])) %>%
dplyr::mutate(id_new = paste(.data[[id_string]], lvl, sep = "-"))
} else {
new_rows[[lvl]] <- subtree_df %>%
dplyr::filter(.data$parent_id %in% new_rows[[lvl - 1]]$id) %>%
dplyr::filter(.data[[parent_col]] %in% new_rows[[lvl - 1]]$id) %>%
dplyr::mutate(
id_new = paste(.data$id, lvl, sep = "-"),
parent_id_new = paste(.data$parent_id, lvl - 1, sep = "-")
id_new = paste(.data[[id_string]], lvl, sep = "-"),
parent_id_new = paste(.data[[parent_col]], lvl - 1, sep = "-")
)
}
res_n <- nrow(new_rows[[lvl]])
Expand All @@ -149,14 +149,18 @@ fill_subclass <- function(subtree_df) {
# https://github.com/tidyverse/funs/issues/54#issuecomment-892377998
id = do.call(
dplyr::coalesce,
rev(dplyr::across(dplyr::starts_with("id")))
rev(dplyr::across(dplyr::starts_with(id_string)))
),
parent_id = do.call(
dplyr::coalesce,
rev(dplyr::across(dplyr::starts_with("parent_id")))
rev(dplyr::across(dplyr::starts_with(parent_col)))
)
) %>%
dplyr::select("id", "label", "parent_id", "parent_label")
dplyr::select(
dplyr::all_of(
c(id_string, "label", parent_col, "parent_label")
)
)

filled_df
}
Expand Down
18 changes: 18 additions & 0 deletions inst/sparql/template-subclass-set.rq
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Return class & parent IDs and labels for all classes in a set
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX oboInOwl: <http://www.geneontology.org/formats/oboInOwl#>
PREFIX obo: <http://purl.obolibrary.org/obo/>

SELECT DISTINCT ?iri ?label ?parent_iri ?parent_label
WHERE {
VALUES ?set { !<<set>>! }

!<<sparql_selector>>!

?iri rdfs:label ?label .

OPTIONAL{
?iri rdfs:subClassOf ?parent_iri .
?parent_iri rdfs:label ?parent_label .
}
}
Loading
Loading