diff --git a/NAMESPACE b/NAMESPACE index 729e87b9..a326e735 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -6,6 +6,8 @@ S3method(as_tibble,esummary_list) S3method(as_tibble,esummary_list_nested) S3method(as_tibble,scopus_search) S3method(as_tibble,scopus_search_list) +S3method(curation_template,"NULL") +S3method(curation_template,obo_data) S3method(drop_blank,character) S3method(drop_blank,list) S3method(elucidate,omim_inventory) @@ -34,6 +36,7 @@ S3method(tidy_pub_records,scopus_search_list) S3method(to_character,data.frame) S3method(to_character,default) S3method(to_character,list) +S3method(write_gs,curation_template) S3method(write_gs,data.frame) S3method(write_gs,omim_inventory) export("%>%") @@ -56,6 +59,7 @@ export(confine_list) export(count_alliance_records) export(count_delim) export(cur_yr) +export(curation_template) export(download_alliance_tsv) export(download_file) export(download_obo_ontology) @@ -65,6 +69,8 @@ export(elucidate) export(extract_as_tidygraph) export(extract_class_axiom) export(extract_eq_axiom) +export(extract_obo_anon) +export(extract_obo_data) export(extract_obo_mappings) export(extract_ordo_mappings) export(extract_pm_date) @@ -163,5 +169,6 @@ importFrom(methods,new) importFrom(rentrez,set_entrez_key) importFrom(rlang,":=") importFrom(rlang,.data) +importFrom(rlang,.env) importFrom(tibble,as_tibble) importFrom(tidyr,replace_na) diff --git a/R/curation.R b/R/curation.R new file mode 100644 index 00000000..a0fe4fce --- /dev/null +++ b/R/curation.R @@ -0,0 +1,341 @@ +#' Create a Curation Template +#' +#' Create a curation template in a Google Sheet, optionally including data. +#' +#' @inheritParams googlesheets4::range_write +#' @param .data Data to add to the curation sheet. If `NULL` (default), an empty +#' curation sheet will be created. +#' @param sheet (OPTIONAL) The sheet name, as a string. If `NULL` (default), the +#' sheet name will default to "curation-" with today's date appended (formatted +#' as "%Y%m%d"; see [format.Date()]). 
+#' @param ... Additional arguments passed to methods.
+#'
+#' @returns The Google Sheet info (`ss`), as a [googlesheets4::sheets_id].
+#'
+#' @section Formatting Limitations:
+#' Formatting to make data more visually distinct is not currently supported due
+#' to limitations of the Google Sheets API and the `googlesheets4` package:
+#' * Google Sheets API does not support assigning colors to data validation.
+#' * `googlesheets4` does not support any formatting.
+#'
+#' An alternative approach to support some formatting could be to create a
+#' functional template with the desired formatting and copy that template with
+#' [googlesheets4::sheet_copy()]. The `data_type` could still be populated by
+#' this function (only needed to support types not in
+#' `.curation_opts$data_type`).
+#'
+#' @export
+curation_template <- function(.data = NULL, ss = NULL, sheet = NULL, ...) {
+  # S3 generic: dispatches on the class of `.data` (NULL, obo_data, ...)
+  UseMethod("curation_template", .data)
+}
+
+#' @param nrow The number of rows to create in the curation template when
+#' `.data = NULL` (default: `50`).
+#'
+#' @export
+#' @rdname curation_template
+curation_template.NULL <- function(.data = NULL, ss = NULL, sheet = NULL, ...,
+                                   nrow = 50) {
+  # build one all-NA column of length `nrow` for every curation column
+  # inspired by https://stackoverflow.com/a/60495352/6938922
+  blank_cols <- rlang::rep_named(curation_cols, list(rep(NA, nrow)))
+  template <- tibble::as_tibble(blank_cols)
+  class(template) <- c("curation_template", class(template))
+
+  # default sheet name: "curation-" + today's date
+  if (is.null(sheet)) {
+    sheet <- paste0("curation-", format(Sys.Date(), "%Y%m%d"))
+  }
+  gs_info <- googlesheets4::write_sheet(template, ss, sheet)
+
+  # when no spreadsheet was supplied, validate the one write_sheet() returned
+  if (is.null(ss)) {
+    ss <- gs_info
+  }
+  set_curation_validation(template, ss, sheet)
+
+  invisible(gs_info)
+}
+
+#' @param id_max The maximum number of unique classes to include (default: `20`).
+#' @param n_id_sep The number of blank rows to insert between each `id` group
+#' (default: `2`).
+#' @param debug Controls debug output. `FALSE` (default) writes to Google Sheets
+#' normally.
+#' One or more of:
+#' * `"output"`: returns the final data frame visibly instead of writing to
+#' Google Sheets.
+#' * `"types"`: returns a list with `$matched` (named character vector where
+#' names are the original predicate strings and values are the resolved
+#' `data_type` labels, as mapped by `.sparql_dt_motif`) and `$unmatched`
+#' (character vector of predicates not in `.sparql_dt_motif`, used as-is).
+#' When combined with `"steps"`, the list is added as `$types` in that output.
+#' Combine with `"output"` to also return the final data frame; the result is
+#' then a list with `$types` and `$output`.
+#' * `"steps"`: returns a named list of snapshots at each major pipeline step
+#' (`filtered`, `pivoted`, `typed`, `output`); implies `"output"`. If `"types"`
+#' is also requested, includes `$types` in the returned list.
+#'
+#' @export
+#' @rdname curation_template
+curation_template.obo_data <- function(.data, ss = NULL, sheet = NULL, ...,
+                                       id_max = 20, n_id_sep = 2L,
+                                       debug = FALSE) {
+  # `debug` is either FALSE or any combination of the documented choices
+  if (!isFALSE(debug)) {
+    debug <- match.arg(
+      debug,
+      choices = c("output", "types", "steps"),
+      several.ok = TRUE
+    )
+  }
+  step_filtered <- filter_max_ids(.data, id_max)
+  step_pivoted <- pivot_obo_to_curation(step_filtered)
+
+  # resolve data_type values via .sparql_dt_motif; predicates without an entry
+  # are kept as-is by the coalesce() fallback
+  step_typed <- step_pivoted |>
+    # collapse_col(value) |> # does nothing... probably don't want to collapse
+    dplyr::mutate(
+      data_type = dplyr::coalesce(
+        .sparql_dt_motif[.data$data_type],
+        .data$data_type
+      )
+    )
+
+  # sort, finalise, and add id separators
+  cur_df <- step_typed |>
+    sort_by_curation_dt() |>
+    dplyr::mutate(
+      # show each id only on the first row of its group
+      id = dplyr::if_else(duplicated(.data$id), NA_character_, .data$id),
+      # set default action for existing data
+      action = "retain"
+    ) |>
+    append_empty_col(curation_cols, order = TRUE) |>
+    add_id_sep(n = n_id_sep)
+
+  class(cur_df) <- c("curation_template", class(cur_df))
+
+  # debug paths: never write to Google Sheets
+  if (!isFALSE(debug)) {
+    types_info <- NULL
+    if ("types" %in% debug) {
+      raw_types <- unique(step_pivoted$data_type)
+      is_known <- raw_types %in% names(.sparql_dt_motif)
+      # $matched: names are original predicates, values are resolved data_types
+      # $unmatched: predicates not in .sparql_dt_motif, used as-is
+      types_info <- list(
+        matched = .sparql_dt_motif[raw_types[is_known]],
+        unmatched = raw_types[!is_known]
+      )
+    }
+    if ("steps" %in% debug) {
+      out <- list(
+        filtered = step_filtered,
+        pivoted = step_pivoted,
+        typed = step_typed,
+        output = cur_df
+      )
+      if (!is.null(types_info)) out$types <- types_info
+      return(out)
+    }
+    # only "output" requested
+    if (is.null(types_info)) return(cur_df)
+    # "types" + "output": return both, as documented (previously the final
+    # data frame was silently dropped)
+    if ("output" %in% debug) {
+      return(list(types = types_info, output = cur_df))
+    }
+    return(types_info)
+  }
+
+  if (is.null(sheet)) sheet <- paste0("curation-", format(Sys.Date(), "%Y%m%d"))
+  gs_info <- googlesheets4::write_sheet(cur_df, ss, sheet)
+
+  # when no spreadsheet was supplied, validate the one write_sheet() returned
+  if (is.null(ss)) ss <- gs_info
+  set_curation_validation(cur_df, ss, sheet)
+
+  invisible(gs_info)
+}
+
+
+# helpers --------------------------------------------------------------------
+
+# Filter obo_data to the first id_max unique IDs; informs the user if any are
+# excluded, listing up to 10 by name.
+filter_max_ids <- function(.data, id_max) {
+  # is.na() is checked before the comparison so that `id_max = NA` raises the
+  # validation error below instead of failing inside `if ()`
+  if (!is.numeric(id_max) || length(id_max) != 1L || is.na(id_max) ||
+      id_max < 1L) {
+    rlang::abort("`id_max` must be a single positive integer.")
+  }
+  all_ids <- unique(.data$id)
+  incl_ids <- utils::head(all_ids, id_max)
+  excl_ids <- all_ids[!all_ids %in% incl_ids]
+  if (length(excl_ids) > 0) {
+    # list at most `max_show` excluded IDs, summarising the remainder
+    max_show <- 10L
+    if (length(excl_ids) <= max_show) {
+      excl_txt <- paste(excl_ids, collapse = ", ")
+    } else {
+      excl_txt <- paste0(
+        paste(utils::head(excl_ids, max_show), collapse = ", "),
+        ", ... and ", length(excl_ids) - max_show, " more"
+      )
+    }
+    rlang::inform(
+      paste0(
+        length(incl_ids), " of ", length(all_ids),
+        " unique IDs included (id_max = ", id_max, ").",
+        "\nExcluded: ", excl_txt
+      )
+    )
+  }
+  dplyr::filter(.data, .data$id %in% incl_ids)
+}
+
+
+# Pivot obo_data to the long curation format: resolves compound predicate
+# strings (handling oboInOwl:hasSynonymType annotations specially), pivots
+# axiom columns into rows, arranges, renames to data_type/curation_notes,
+# and deduplicates.
+# NOTE(review): the rename step requires an `extra` column in the input --
+# confirm the SPARQL query result always provides it.
+pivot_obo_to_curation <- function(.data) {
+  .data |>
+    # need smarter indexing... I think, not currently used (see below)
+    dplyr::mutate(
+      index = dplyr::dense_rank(paste0(.data$predicate, .data$value)),
+      .by = "id",
+      .before = "id"
+    ) |>
+    # convert predicate & axiom_predicate to patterns in .sparql_dt_motif
+    # (the three assignments are order-dependent: each one reads the values
+    # written by the assignment before it)
+    dplyr::mutate(
+      predicate = dplyr::if_else(
+        !is.na(.data$axiom_predicate) & .data$axiom_predicate == "oboInOwl:hasSynonymType",
+        paste0(.data$predicate, "-", .data$axiom_value),
+        .data$predicate
+      ),
+      axiom_predicate = dplyr::if_else(
+        !is.na(.data$axiom_predicate) & .data$axiom_predicate != "oboInOwl:hasSynonymType",
+        paste0(.data$predicate, "-", .data$axiom_predicate),
+        NA_character_
+      ),
+      # removes axiom value where predicate is updated (redundant)
+      axiom_value = dplyr::if_else(
+        is.na(.data$axiom_predicate),
+        NA_character_,
+        .data$axiom_value
+      )
+    ) |>
+    # stack `axiom_predicate`/`axiom_value` under `predicate`/`value` by
+    # stripping the `axiom_` prefix and using the remainder as the column name
+    tidyr::pivot_longer(
+      cols = -c("index", "id"),
+      names_to = ".value",
+      names_prefix = "^axiom_",
+      values_drop_na = TRUE
+    ) |>
+    dplyr::arrange(
+      .data$id,
+      .data$index,
+      stringr::str_length(.data$predicate)
+    ) |>
+    dplyr::rename(data_type = "predicate", "curation_notes" = "extra") |>
+    # for now, just remove index --> need to use for sorting at some point
+    dplyr::select(-"index") |>
+    unique()
+}
+
+
+# Insert n blank rows between each id group (identified by non-NA id values).
+# Zero-row input is returned unchanged (there are no groups to separate).
+add_id_sep <- function(.data, n = 2L) {
+  if (nrow(.data) == 0L) {
+    return(.data)
+  }
+  # a new group starts at every non-NA id; NA ids belong to the group above
+  grp_id <- cumsum(!is.na(.data$id))
+  groups <- split(.data, grp_id)
+  # indexing rows with NA yields all-NA rows with the same columns
+  blanks <- .data[rep(NA_integer_, n), ]
+  purrr::reduce(
+    groups[-1],
+    function(acc, grp) dplyr::bind_rows(acc, blanks, grp),
+    .init = groups[[1]]
+  )
+}
+
+
+# Sort data_type values within each id group per .curation_opts ordering.
+# data_type values not found in .curation_opts are placed at the end.
+# The existing order of id groups is preserved (not sorted alphabetically).
+sort_by_curation_dt <- function(.data) {
+  dt_order <- .curation_opts$data_type
+
+  # temporary sort keys: position of each run of identical ids, and the rank
+  # of the data_type in `dt_order` (unknown types rank after all known ones)
+  keyed <- dplyr::mutate(
+    .data,
+    .grp = dplyr::consecutive_id(.data$id),
+    .dt_rank = match(.data$data_type, dt_order, nomatch = length(dt_order) + 1L)
+  )
+  sorted <- dplyr::arrange(keyed, .data$.grp, .data$.dt_rank)
+
+  # drop the temporary keys again
+  dplyr::select(sorted, -c(".grp", ".dt_rank"))
+}
+
+
+### define expected columns for curation template (in order) ###
+
+# full set of curations columns
+curation_cols <- c(
+  "id", "data_type", "value", "action", "curation_notes", "links",
+  "action_notes"
+)
+
+#' Curation Action
+#'
+#' Values used to establish `action` data validation in Google Sheets
+#' [curation templates][curation_template()].
+#'
+#' * `retain`: data already in ontology that should be kept; this is the default
+#' `action` for existing data when creating a [curation_template()]
+#'
+#' * `add`: new data that should be added
+#'
+#' * `remove`: existing ontology data that should be removed
+#'
+#' * `exclude`: data relevant to the ontology that should be actively excluded
+#' (e.g. an incorrect mapping) -- details should be included in `action_notes`
+#'
+#' * `ignore`: data not for active inclusion or exclusion that should be ignored
+#' (e.g.
+#' dubious synonyms, incomplete curation data)
+#'
+#' * `restore`: data that was removed from the ontology and should be added back
+#'
+#' @keywords internal
+curation_action <- c("retain", "add", "remove", "exclude", "ignore", "restore")
+
+
+# Set Data Validation for Curation Templates
+#
+# Adds dropdown validation to the `data_type` and `action` columns of the
+# written sheet, then freezes the first two columns.
+set_curation_validation <- function(cur_df, ss, sheet) {
+  # column name -> allowed dropdown values, applied in this order
+  dropdown_values <- list(
+    data_type = .curation_opts$data_type,
+    action = curation_action
+  )
+  for (col_nm in names(dropdown_values)) {
+    col_range <- spreadsheet_range(cur_df, col_nm)
+    range_add_dropdown(ss, sheet, col_range, values = dropdown_values[[col_nm]])
+  }
+
+  # freeze first two columns
+  googlesheets4::with_gs4_quiet(
+    googlesheets4:::sheet_freeze(ss, sheet = sheet, ncol = 2)
+  )
+}
+
+#' Calculate a Spreadsheet Range
+#'
+#' Calculate a range for a spreadsheet program (Google Sheets or Excel).
+#'
+#' @inheritParams curation_template
+#' @param .data A tibble.
+#' @param .col The column to use for the range, as a string.
+#' @param rows (OPTIONAL) The rows to use for the range, either as a continuous
+#' integer vector or as a string (e.g. "1:10"). If `NULL` (default), the entire
+#' column will be used.
+#' @param n_header The number of header rows to skip (default: `1`).
+#'
+#' @keywords internal
+spreadsheet_range <- function(.data, .col, sheet = NULL, rows = NULL,
+                              n_header = 1) {
+  # validate the column match first so a missing/duplicated column reports the
+  # `.col` error instead of the internal one from colnum_to_ss_letter()
+  col_pos <- which(names(.data) == .col)
+  if (length(col_pos) != 1) {
+    rlang::abort("Exactly one column must be specified in `.col`")
+  }
+  col_letter <- colnum_to_ss_letter(col_pos)
+
+  # `row_ends` holds the first and last sheet row, shifted past the header
+  if (is.null(rows)) {
+    row_ends <- c(1, nrow(.data)) + n_header
+  } else if (is.numeric(rows)) {
+    # check one continuous range
+    # NOTE(review): a count of exactly one separator is accepted, so a
+    # two-element non-adjacent input (e.g. c(1, 3)) may pass if to_range()
+    # renders it as "1,3" -- confirm against to_range()
+    collapsed_range <- to_range(rows, sep = c(",", ":"))
+    if (stringr::str_count(collapsed_range, "[,:]") > 1) {
+      rlang::abort(
+        c("`rows` must be one continuous range", x = collapsed_range)
+      )
+    }
+    row_ends <- c(rows[1], utils::tail(rows, 1)) + n_header
+  } else {
+    # string input such as "1:10"; parse `rows` (the original read the
+    # not-yet-defined `row_ends` here, which always errored)
+    row_ends <- as.integer(stringr::str_split(rows, ":")[[1]]) + n_header
+  }
+
+  # e.g. "B2:B51", optionally prefixed with the sheet name ("Sheet1!B2:B51")
+  range <- paste0(col_letter, row_ends, collapse = ":")
+  if (!is.null(sheet)) {
+    range <- paste0(sheet, "!", range)
+  }
+  range
+}
diff --git a/R/extract.R b/R/extract.R
index 05464696..0ab7af0c 100644
--- a/R/extract.R
+++ b/R/extract.R
@@ -611,3 +611,195 @@ extract_obo_mappings <- function(onto_path, id = NULL, version_as = "release",
 
   out
 }
+
+
+#' Extract Anonymous Relationships from OBO Foundry Ontology
+#'
+#' Extracts all anonymous relations (logical/complex) from any OBO Foundry
+#' Ontology in Manchester format, including equivalent classes, subclasses, and
+#' disjoint classes. `extract_obo_anon()` is designed to supplement SPARQL
+#' queries that generally cannot return anonymous relationships.
+#'
+#' @param obo_ont The path to an ontology file, as a string.
+#' @param prefix A character vector of OBO prefixes (aka ID spaces) to filter
+#' results to, or `NULL` (default) to return all axioms. _Ignored if `id` is_
+#' _provided._
+#' @param id A character vector of OBO IDs (CURIEs) to filter results to or
+#' `NULL` (default) to return all entities with logical relations.
+#' @param render The format for rendering classes & properties, as a string.
+#' One of:
+#' * `"label"` (default): Use labels, quoting as needed.
+#' * `"id"`: Use OBO IDs (CURIEs).
+#' @inheritParams robot
+#'
+#' @returns A tibble with the columns: `id`, `label`, `predicate`, and `value`,
+#' where `value` is the axiom in Manchester syntax rendered according to
+#' `render`.
+#'
+#' @section NOTES:
+#' Uses [ROBOT export](https://robot.obolibrary.org/export) internally.
+#'
+#' @export
+extract_obo_anon <- function(obo_ont, prefix = NULL, id = NULL,
+                             render = "label", .robot_path = NULL) {
+  render <- match.arg(render, c("label", "id"))
+
+  # export IDs, labels and the three anonymous-axiom columns to a temp TSV
+  temp <- tempfile(fileext = ".tsv")
+  robot(
+    "export",
+    i = obo_ont,
+    header = '"ID|LABEL|Equivalent Class [ANON ID]|SubClass Of [ANON ID]|Disjoint With [ANON ID]"',
+    include = '"classes properties"',
+    export = temp,
+    .robot_path = .robot_path
+  )
+  # skip the exported header row and name the axiom columns by predicate
+  .df <- readr::read_tsv(
+    temp,
+    col_names = c("id", "label", "owl:equivalentClass", "rdfs:subClassOf", "owl:disjointWith"),
+    skip = 1,
+    show_col_types = FALSE
+  )
+
+  # `id` takes precedence over `prefix`; `.env$id` is the function argument,
+  # `.data$id` the column
+  if (!is.null(id)) {
+    out <- dplyr::filter(.df, .data$id %in% .env$id)
+  } else if (!is.null(prefix)) {
+    # regex anchored at the start of the ID, one alternative per prefix
+    pattern <- sandwich_text(paste0(prefix, collapse = "|"), c("^(", ")"))
+    out <- dplyr::filter(
+      .df,
+      stringr::str_detect(.data$id, pattern)
+    )
+  } else {
+    out <- .df
+  }
+
+  # one row per id/predicate pair; entities without an axiom are dropped
+  out <- tidyr::pivot_longer(
+    out,
+    cols = c("owl:equivalentClass", "rdfs:subClassOf", "owl:disjointWith"),
+    names_to = "predicate",
+    values_to = "value",
+    values_drop_na = TRUE
+  )
+
+  if (render == "label") {
+    # subset to IDs actually in axioms
+    id_keep <- stringr::str_extract_all(
+      out$value,
+      "[A-Za-z0-9_]+:[A-Za-z0-9_#]+"
+    ) |>
+      unlist() |>
+      unique()
+    # labels come from the unfiltered export (`.df`), so IDs referenced in an
+    # axiom can be resolved even when their own rows were filtered out above
+    label_df <- .df |>
+      dplyr::select("id", "label") |>
+      dplyr::filter(.data$id %in% id_keep & !is.na(.data$label)) |>
+      dplyr::mutate(
+        # labels containing a single quote are wrapped in double quotes, other
+        # labels with non-alphanumeric characters in single quotes
+        label = dplyr::case_when(
+          stringr::str_detect(.data$label, "'") ~ sandwich_text(.data$label, '"'),
+          stringr::str_detect(.data$label, "[^[:alnum:]]") ~ sandwich_text(.data$label, "'"),
+          TRUE ~ .data$label
+        )
+      ) |>
+      dplyr::arrange(-stringr::str_length(.data$id), .data$id)
+    # lookup vector: names are IDs, values are (quoted) labels
+    label_replace <- purrr::set_names(
+      label_df$label,
+      label_df$id
+    )
+    out <- dplyr::mutate(
+      out,
+      value = stringr::str_replace_all(
+        .data$value,
+        "[A-Za-z0-9_]+:[A-Za-z0-9_#]+",
+        # direct selection via replace fxn ~200x faster than
+        # label_replace used directly; returns input if ID not found
+        function(x) dplyr::coalesce(label_replace[x], x)
+      )
+    )
+  }
+
+  ### FUTURE WORK -- identify subclass anon subtypes? ###
+  # presumably splits multi-valued `value` cells into one row each -- confirm
+  # against lengthen_col()
+  lengthen_col(out, "value")
+}
+
+#' Extract OBO Foundry Ontology Data
+#'
+#' Extracts data from an OBO Foundry ontology.
+#'
+#' @param obo_ont The path to an ontology file, as a string.
+#' @param prefix A character vector of OBO prefixes (aka ID spaces) to filter
+#' results to, or `NULL` (default) to return all axioms. _Ignored if `id` is_
+#' _provided._
+#' @param id A character vector of OBO IDs (CURIEs) to filter results to or
+#' `NULL` (default) to return all entities with logical relations.
+#' @param include_anon Whether to include anonymous relationships
+#' (logical/complex) in the output, as a boolean (default: `TRUE`). See
+#' [extract_obo_anon()].
+#' @inheritDotParams extract_obo_anon
+#' @inheritParams robot
+#'
+#' @returns A tibble of class `obo_data` with the columns: `id`, `predicate`,
+#' `value`, `axiom_predicate`, and `axiom_value`. An additional class is added
+#' to indicate the file the data came from (without extension or directories).
+#'
+#' @section NOTES:
+#' Uses [ROBOT query](https://robot.obolibrary.org/query) internally.
+#'
+#' @export
+extract_obo_data <- function(obo_ont, prefix = NULL, id = NULL,
+                             include_anon = TRUE, ..., .robot_path = NULL) {
+  # SPARQL template shipped with the package; `#@values#` and `#@filter#` are
+  # the glue placeholders filled in below
+  query <- system.file(
+    "sparql",
+    "obo-data.rq",
+    package = "DO.utils",
+    mustWork = TRUE
+  ) |>
+    readr::read_file()
+
+  if (!is.null(id)) {
+    # restrict to the requested IDs with a VALUES clause of <IRI>s
+    iri <- to_uri(id) |>
+      sandwich_text(c("<", ">"))
+    values_stmt <- paste0(iri, collapse = " ") |>
+      sandwich_text(c("VALUES ?id { ", " }"))
+    query <- glue::glue(
+      query,
+      values = values_stmt,
+      filter = "",
+      .open = "#@",
+      .close = "#"
+    )
+  } else if (!is.null(prefix)) {
+    # One CONTAINS() test per prefix, OR-ed together. CONTAINS() is a literal
+    # substring match, so the previous "A|B" alternation never matched when
+    # more than one prefix was supplied. Output for a single prefix is
+    # unchanged.
+    # NOTE(review): the generated statement leaves the outer `FILTER(` without
+    # a closing parenthesis (as before) -- confirm whether obo-data.rq supplies
+    # it; if not, the second sandwich element should end in '"))'.
+    filter_stmt <- paste0(prefix, collapse = '_") || CONTAINS(str(?id), "') |>
+      sandwich_text(c('FILTER(CONTAINS(str(?id), "', '_")'))
+    query <- glue::glue(
+      query,
+      values = "",
+      filter = filter_stmt,
+      .open = "#@",
+      .close = "#"
+    )
+  }
+
+  # all columns are read as character
+  qres <- robot_query(
+    input = obo_ont,
+    query = query,
+    tidy_what = c("header", "uri_to_curie"),
+    col_types = readr::cols(.default = readr::col_character()),
+    .robot_path = .robot_path
+  )
+
+  if (!include_anon) {
+    out <- qres
+  } else {
+    # anonymous (logical) axioms are appended as extra id/predicate/value rows
+    anon <- extract_obo_anon(
+      obo_ont,
+      prefix = prefix,
+      id = id,
+      ...,
+      .robot_path = .robot_path
+    ) |>
+      dplyr::select("id", "predicate", "value")
+
+    out <- dplyr::bind_rows(qres, anon)
+  }
+
+  # tag the result with the source file name (no directory, no extension)
+  ont_src <- tools::file_path_sans_ext(basename(obo_ont))
+  class(out) <- c(ont_src, "obo_data", class(out))
+  out
+}
diff --git a/R/googlesheets.R b/R/googlesheets.R
new file mode 100644
index 00000000..5242f750
--- /dev/null
+++ b/R/googlesheets.R
@@ -0,0 +1,85 @@
+# For information about data validation see,
+# https://github.com/tidyverse/googlesheets4/blob/main/R/range_add_validation.R
+
+#' Add Data Validation to Google Sheet Range
+#'
+#' Add data validation to a Google Sheet range.
+#'
+#' @inheritParams googlesheets4::range_write
+#' @param range Cells to apply data validation to. This `range` argument has
+#' important similarities and differences to `range` elsewhere (e.g.
+#' [googlesheets4::range_read()]):
+#' * Similarities: Can be a cell range, using A1 notation ("A1:D3") or using
+#' the helpers in [googlesheets4::cell-specification]. Can combine sheet
+#' name and cell range ("Sheet1!A5:A") or refer to a sheet by name
+#' (`range = "Sheet1"`, although `sheet = "Sheet1"` is preferred for clarity).
+#' * Difference: Can NOT be a named range.
+#' @param msg The message to display when the user types in a value that
+#' violates the data validation rule. For `range_add_dropdown()`, only displayed
+#' if `reject_input` is `TRUE`.
+#' @param values The values to use for the dropdown list, as a character vector.
+#' @param reject_input Whether to "Reject the input" (default: `TRUE`) if a
+#' value violates the data validation rule or "Show a warning" (`FALSE`).
+#' @param display_arrow Whether to display a dropdown arrow next to the cell
+#' (default: `TRUE`) or not (`FALSE`).
+#' @param quiet Whether to run the validation request inside
+#' [googlesheets4::with_gs4_quiet()] (default: `TRUE`) or not (`FALSE`).
+#' @name range_add_validation
+NULL
+
+#' @rdname range_add_validation
+#' @keywords internal
+range_add_checkbox <- function(ss, sheet = NULL, range,
+                               msg = "Value must be TRUE or FALSE",
+                               quiet = TRUE) {
+  checkbox_rule <- googlesheets4:::new(
+    "DataValidationRule",
+    condition = googlesheets4:::new_BooleanCondition(type = "BOOLEAN"),
+    inputMessage = msg,
+    strict = TRUE, # same as in range_add_dropdown(), FALSE doesn't make sense
+    showCustomUi = TRUE # seems to be ignored
+  )
+
+  # same request either way; `quiet` only decides whether it is wrapped
+  if (quiet) {
+    googlesheets4::with_gs4_quiet(
+      googlesheets4:::range_add_validation(
+        ss = ss, sheet = sheet, range = range, rule = checkbox_rule
+      )
+    )
+  } else {
+    googlesheets4:::range_add_validation(
+      ss = ss, sheet = sheet, range = range, rule = checkbox_rule
+    )
+  }
+}
+
+#' @rdname range_add_validation
+#' @section Limitations of the Google Sheets API/`googlesheets4`:
+#' - The API does not support chipset multi-selection in dropdowns:
+#' https://stackoverflow.com/questions/79653536/how-to-enable-multiple-selection-in-data-validation-dropdown-using-google-sheets
+#' @keywords internal
+range_add_dropdown <- function(ss, sheet = NULL, range, values,
+                               msg = "Choose a valid value",
+                               reject_input = TRUE, display_arrow = TRUE,
+                               quiet = TRUE) {
+  # restrict the cells to one of `values`
+  list_condition <- googlesheets4:::new_BooleanCondition(
+    type = "ONE_OF_LIST",
+    values = values
+  )
+  dropdown_rule <- googlesheets4:::new(
+    "DataValidationRule",
+    condition = list_condition,
+    inputMessage = msg,
+    strict = reject_input,
+    showCustomUi = display_arrow
+  )
+
+  # same request either way; `quiet` only decides whether it is wrapped
+  if (quiet) {
+    googlesheets4::with_gs4_quiet(
+      googlesheets4:::range_add_validation(
+        ss = ss, sheet = sheet, range = range, rule = dropdown_rule
+      )
+    )
+  } else {
+    googlesheets4:::range_add_validation(
+      ss = ss, sheet = sheet, range = range, rule = dropdown_rule
+    )
+  }
+}
diff --git a/R/sysdata.rda b/R/sysdata.rda
index 8fbd5b36..7efedf42 100644
Binary files a/R/sysdata.rda and b/R/sysdata.rda differ
diff --git a/R/utils-spreadsheet.R b/R/utils-spreadsheet.R
new file mode 100644
index 00000000..4dd6b1c1
--- /dev/null
+++ b/R/utils-spreadsheet.R
@@ -0,0 +1,30 @@
+#' Convert Column Number to Spreadsheet Column Letter
+#'
+#' Convert a column numeric position into the corresponding spreadsheet column
+#' letter reference (e.g. 1 -> "A", 27 -> "AA").
+#'
+#' @param x The column number, as a single, positive whole number.
+#' @return A spreadsheet column letter, as a string.
+#'
+#' @section Notes:
+#' Modified from a vectorized version created by GPT-5 mini (2025-12-17).
+#'
+#' @examples
+#' colnum_to_ss_letter(1)
+#' sapply(c(26, 27, 52, 703), colnum_to_ss_letter)
+#'
+#' @noRd
+colnum_to_ss_letter <- function(x) {
+  if (!is_scalar_whole_number(x) || !is_positive(x) || is.na(x)) {
+    rlang::abort("`x` must be a single, positive whole number")
+  }
+
+  x <- as.integer(x)
+  pieces <- character()
+  # columns are numbered 1..26 per letter with no zero digit, hence the
+  # `x - 1L` shift before each modulo/division step
+  while (x > 0L) {
+    r <- (x - 1L) %% 26L
+    # letters are produced least-significant first, so prepend
+    pieces <- c(LETTERS[r + 1L], pieces)
+    x <- (x - 1L) %/% 26L
+  }
+  paste0(pieces, collapse = "")
+}
diff --git a/R/utils-tidyverse.R b/R/utils-tidyverse.R
index a293d2ae..be70f7d9 100644
--- a/R/utils-tidyverse.R
+++ b/R/utils-tidyverse.R
@@ -29,6 +29,7 @@ NULL
 #' @name rlang_imports
 #' @noRd
 #' @importFrom rlang .data
+#' @importFrom rlang .env
 #' @importFrom rlang :=
 NULL
 
diff --git a/R/write.R b/R/write.R
index 926d1e20..97a78137 100644
--- a/R/write.R
+++ b/R/write.R
@@ -121,3 +121,31 @@ write_gs.data.frame <- function(data, ss, sheet = "data-%Y%m%d",
 
   invisible(gs_info)
 }
+
+#' @rdname write_gs
+#' @export
+write_gs.curation_template <- function(data, ss = NULL,
+                                       sheet = "curation-%Y%m%d", ...) {
+  # render the `links` column for Google Sheets
+  data <- dplyr::mutate(data, links = format_hyperlink(data$links, as = "gs"))
+
+  # resolve the date placeholders in `sheet` (e.g. "curation-%Y%m%d")
+  if (is.null(sheet)) {
+    sheet_nm <- "curation"
+  } else {
+    sheet_nm <- format(Sys.Date(), sheet)
+  }
+
+  # write to the resolved sheet name (the unformatted `sheet` was used before,
+  # leaving `sheet_nm` unused)
+  gs_info <- googlesheets4::write_sheet(data, ss, sheet_nm)
+
+  # when no spreadsheet was supplied, validate the one write_sheet() returned
+  if (is.null(ss)) ss <- gs_info
+
+  # add curation template data validation and freeze the first two columns;
+  # shared with curation_template() so both use the current `data_type` and
+  # `action` columns (the previous code targeted a non-existent `annotation`
+  # column and passed the range in the `sheet` position)
+  set_curation_validation(data, ss, sheet_nm)
+
+  invisible(gs_info)
+}
diff --git a/data-raw/curation_opts.csv b/data-raw/curation_opts.csv
index 46a16c17..39b500e5 100644
--- a/data-raw/curation_opts.csv
+++ b/data-raw/curation_opts.csv
@@ -1,57 +1,65 @@
-header,template,type,example,optional_values,alternate_title,alternate_format,notes
-iri/curie,ID,required manual,DOID:0080943,IRI or CURIE,NA,NA,NA
-label,AL rdfs:label@en,required manual,"46,XX sex reversal 5",NA,NA,NA,NA
-parent iri/curie,SC % SPLIT=|,required manual,DOID:0111760,disease by infectious agent,CI,IRI or CURIE; CI means Class IRI --> type will be CLASS_TYPE,NA
-definition,AL obo:IAO_0000115@en,required manual,"A 46,XX sex reversal that is characterized by genital virilization in 46,XX individuals, associated with congenital heart disease and variable somatic anomalies including blepharophimosis-ptosis-epicanthus inversus syndrome and congenital diaphragmatic hernia and that has_material_basis_in heterozygous mutation in the NR2F2 gene on chromosome 15q26.",NA,NA,NA,NA
-definition source(s),>A oboInOwl:hasDbXref SPLIT=|,required manual,url:https://pubmed.ncbi.nlm.nih.gov/29478779/,NA,NA,NA,NA
-definition source type(s),>AI dc11:type SPLIT=|,optional manual,curator inference from journal publication,"ECO codes, e.g. ECO:0007645",NA,NA,do not quote!!!
-synonym(s): exact,AL oboInOwl:hasExactSynonym@en SPLIT=|,optional manual,hemangiosarcoma,NA,NA,NA,do not quote!!!
-synonym(s): broad,AL oboInOwl:hasBroadSynonym@en SPLIT=|,optional manual,NA,NA,NA,NA,NA -synonym(s): narrow,AL oboInOwl:hasNarrowSynonym@en SPLIT=|,optional manual,NA,NA,NA,NA,NA -synonym(s): related,AL oboInOwl:hasRelatedSynonym@en SPLIT=|,optional manual,NA,NA,NA,NA,NA -acronym(s): exact,AL oboInOwl:hasExactSynonym@en SPLIT=|,optional manual,CAMRQ,NA,NA,NA,"must be accompanied by ""acronym annotation"" header/template in the adjacent column to the rigth in robot template" -acronym(s): broad,AL oboInOwl:hasBroadSynonym@en SPLIT=|,optional manual,NA,NA,NA,NA,NA -acronym(s): narrow,AL oboInOwl:hasNarrowSynonym@en SPLIT=|,optional manual,NA,NA,NA,NA,NA -acronym(s): related,AL oboInOwl:hasRelatedSynonym@en SPLIT=|,optional manual,DES,NA,NA,NA,"must be accompanied by ""acronym annotation"" header/template in the adjacent column to the rigth in robot template" -acronym annotation,>AI oboInOwl:hasSynonymType,optional auto,acronym,NA,NA,NA,NA -xref(s),A oboInOwl:hasDbXref SPLIT=|,optional manual,OMIM:618901,NA,NA,NA,NA -skos mapping(s): exact,A skos:exactMatch SPLIT=|,optional manual,OMIM:618901,NA,NA,"should use IRIs and be as follows: +data_type,template,inclusion,example,notes,optional_values,alternate_title,alternate_format,sparql,export_header,sparql_dt_motif +id,ID,required manual,DOID:0080943,replaced older 'iri/curie' header for simplicity; id now covered by obo id field,IRI or CURIE,CI,IRI or CURIE; CI means Class IRI --> type will be CLASS_TYPE,?iri a owl:Class .,ID,NA +label,A rdfs:label,required manual,"46,XX sex reversal 5",NA,NA,NA,NA,?iri rdfs:label ?label .,LABEL,rdfs:label +parent id,SC % SPLIT=|,required manual,DOID:0111760,"accepts CURIE or IRI; intended for only asserted subclass relationships between named classes +--> separated from subclass anon for practical purposes - ROBOT template is the same, ROBOT export differs",disease by infectious agent,NA,NA,"?iri rdfs:subClassOf ?parent . 
+FILTER(!isBlank(?parent))",SubClass Of [NAMED ID],rdfs:subClassOf +definition,A IAO:0000115,required manual,"A 46,XX sex reversal that is characterized by genital virilization in 46,XX individuals, associated with congenital heart disease and variable somatic anomalies including blepharophimosis-ptosis-epicanthus inversus syndrome and congenital diaphragmatic hernia and that has_material_basis_in heterozygous mutation in the NR2F2 gene on chromosome 15q26.",NA,NA,NA,NA,?iri obo:IAO_0000115 ?definition .,obo:IAO_0000115,IAO:0000115 +definition source(s),>A oboInOwl:hasDbXref SPLIT=|,required manual,url:https://pubmed.ncbi.nlm.nih.gov/29478779/,NA,NA,NA,NA,"!<<.definition_axiom>>! + oboInOwl:hasDbXref ?def_src .",NA,IAO:0000115-oboInOwl:hasDbXref +definition source type(s),>AI dc11:type SPLIT=|,optional manual,curator inference from journal publication,do not quote!!!,"ECO codes, e.g. ECO:0007645",NA,NA,"!<<.definition_axiom>>! + dc:type ?src_type .",NA,IAO:0000115-dc:type +synonym(s): exact,A oboInOwl:hasExactSynonym SPLIT=|,optional manual,hemangiosarcoma,do not quote!!!,NA,NA,NA,"glueV: .synonym_stmt, syn_scope = ""Exact""",oboInOwl:hasExactSynonym,oboInOwl:hasExactSynonym +synonym(s): broad,A oboInOwl:hasBroadSynonym SPLIT=|,optional manual,NA,NA,NA,NA,NA,"glueV: .synonym_stmt, syn_scope = ""Broad""",oboInOwl:hasBroadSynonym,oboInOwl:hasBroadSynonym +synonym(s): narrow,A oboInOwl:hasNarrowSynonym SPLIT=|,optional manual,NA,NA,NA,NA,NA,"glueV: .synonym_stmt, syn_scope = ""Narrow""",oboInOwl:hasNarrowSynonym,oboInOwl:hasNarrowSynonym +synonym(s): related,A oboInOwl:hasRelatedSynonym SPLIT=|,optional manual,NA,NA,NA,NA,NA,"glueV: .synonym_stmt, syn_scope = ""Related""",oboInOwl:hasRelatedSynonym,oboInOwl:hasRelatedSynonym +acronym(s): exact,A oboInOwl:hasExactSynonym SPLIT=|,optional manual,CAMRQ,"must be accompanied by ""acronym annotation"" header/template in the adjacent column to the rigth in robot template",NA,NA,NA,"glueV: .acronym_stmt, acronym_scope = 
""Exact""",NA,oboInOwl:hasExactSynonym-OMO:0003012 +acronym(s): broad,A oboInOwl:hasBroadSynonym SPLIT=|,optional manual,NA,NA,NA,NA,NA,"glueV: .acronym_stmt, acronym_scope = ""Broad""",NA,oboInOwl:hasBroadSynonym-OMO:0003012 +acronym(s): narrow,A oboInOwl:hasNarrowSynonym SPLIT=|,optional manual,NA,NA,NA,NA,NA,"glueV: .acronym_stmt, acronym_scope = ""Narrow""",NA,oboInOwl:hasNarrowSynonym-OMO:0003012 +acronym(s): related,A oboInOwl:hasRelatedSynonym SPLIT=|,optional manual,DES,"must be accompanied by ""acronym annotation"" header/template in the adjacent column to the rigth in robot template",NA,NA,NA,"glueV: .acronym_stmt, acronym_scope = ""Related""",NA,oboInOwl:hasRelatedSynonym-OMO:0003012 +acronym annotation,>AI oboInOwl:hasSynonymType,optional auto,acronym,NA,NA,NA,NA,NA,NA,NA +xref(s),A oboInOwl:hasDbXref SPLIT=|,optional manual,OMIM:618901,NA,NA,NA,NA,?iri oboInOwl:hasDbXref ?xref .,oboInOwl:hasDbXref,oboInOwl:hasDbXref +skos mapping(s): exact,A skos:exactMatch SPLIT=|,optional manual,OMIM:618901,adds skos mappings as strings; current INCORRECT DO format,NA,NA,"should use IRIs and be as follows: AI skos:exactMatch SPLIT=| - - example input: https://omim.org/MIM:618901",adds skos mappings as strings; current INCORRECT DO format -skos mapping(s): broad,A skos:broadMatch SPLIT=|,optional manual,OMIM:PS613135,NA,NA,"should use IRIs and be as follows: + - example input: https://omim.org/MIM:618901",?iri skos:exactMatch ?skos_exact .,skos:exactMatch,skos:exactMatch +skos mapping(s): close,A skos:closeMatch SPLIT=|,optional manual,OMIM:618901,adds skos mappings as strings; current INCORRECT DO format,NA,NA,"should use IRIs and be as follows: +AI skos:closeMatch SPLIT=| + - example input: https://omim.org/MIM:618901",?iri skos:closeMatch ?skos_close .,skos:closeMatch,skos:closeMatch +skos mapping(s): broad,A skos:broadMatch SPLIT=|,optional manual,OMIM:PS613135,adds skos mappings as strings; current INCORRECT DO format,NA,NA,"should use IRIs and be as follows: AI 
skos:exactMatch SPLIT=| - - example input: https://omim.org/MIM:618901",adds skos mappings as strings; current INCORRECT DO format -skos mapping(s): narrow,A skos:narrowMatch SPLIT=|,optional manual,OMIM:618901,NA,NA,"should use IRIs and be as follows: + - example input: https://omim.org/MIM:618901",?iri skos:broadMatch ?skos_broad .,skos:broadMatch,skos:broadMatch +skos mapping(s): narrow,A skos:narrowMatch SPLIT=|,optional manual,OMIM:618901,adds skos mappings as strings; current INCORRECT DO format,NA,NA,"should use IRIs and be as follows: AI skos:exactMatch SPLIT=| - - example input: https://omim.org/MIM:618901",adds skos mappings as strings; current INCORRECT DO format -skos mapping(s): related,A skos:relatedMatch SPLIT=|,optional manual,NA,NA,NA,"should use IRIs and be as follows: + - example input: https://omim.org/MIM:618901",?iri skos:narrowMatch ?skos_narrow .,skos:narrowMatch,skos:narrowMatch +skos mapping(s): related,A skos:relatedMatch SPLIT=|,optional manual,NA,adds skos mappings as strings; current INCORRECT DO format,NA,NA,"should use IRIs and be as follows: AI skos:exactMatch SPLIT=| - - example input: https://omim.org/MIM:618901",adds skos mappings as strings; current INCORRECT DO format -equivalent class,EC %,optional manual,disease and ('has material basis in' some (Viruses or Bacteria or Eukaryota)),NA,NA,NA,NA -sc axiom: inheritance,SC 'has material basis in' some % SPLIT=|,optional manual,NA,NA,NA,NA,NA -sc axiom: anatomical location,SC 'disease has location' some %,optional manual,NA,NA,NA,NA,NA -sc axiom: onset,SC 'existence starts during' some %,optional manual,NA,NA,NA,NA,NA -sc axiom: has_material_basis_in,SC has_material_basis_in some %,optional manual,autosomal dominant inheritance,NA,NA,NA,do not quote!!! 
-sc axiom: located_in,SC located_in some %,optional manual,NA,rdfs:label (preferred); IRI or CURIE (possible),NA,NA,NA -disjoint class,DC %,optional manual,NA,NA,NA,NA,NA -subset(s),AI oboInOwl:inSubset SPLIT=|,optional manual,DO_AGR_slim,any subset (aka 'slim') defined in doid-edit.owl,NA,NA,NA -deprecate,AT owl:deprecated^^xsd:boolean,optional manual,true,NA,NA,NA,NA -alternate id(s),A oboInOwl:hasAlternativeId SPLIT=|,optional manual,DOID:4,CURIE of deprecated term,NA,NA,NA -term replaced by,AI obo:IAO_0100001,optional manual,DOID:4,IRI or CURIE of term to replace by,NA,NA,NA -comment,AL rdfs:comment@en,optional manual,This is a comment. There should only be one per term.,NA,NA,NA,NA -obo id,A oboInOwl:id,required auto,DOID:0080943,OBO CURIE,NA,NA,"required data, but not necessary to include in manual curation; will be inferred from iri/curie + - example input: https://omim.org/MIM:618901",?iri skos:relatedMatch ?skos_related .,skos:relatedMatch,skos:relatedMatch +eq class,EC % SPLIT=|,optional manual,disease and ('has material basis in' some (Viruses or Bacteria or Eukaryota)),NA,NA,NA,NA,"?iri owl:equivalentClass ?eq . 
+FILTER(!isBlank(?eq))",Equivalent Class [NAMED ID],owl:equivalentClass +eq class anon,EC % SPLIT=|,optional manual,NA,NA,NA,NA,NA,NA,Equivalent Class [ANON ID],NA +subclass anon,SC % SPLIT=|,optional manual,'disease has feature' some cherubism,"intended for only subclass of anonymous logical expressions +--> separated from parent id for practical purposes - ROBOT template is the same, ROBOT export differs",NA,NA,NA,NA,SubClass Of [ANON ID],NA +subclass anon: inheritance,SC 'has material basis in' some % SPLIT=|,optional manual,NA,NA,NA,NA,NA,NA,NA,NA +subclass anon: anatomical location,SC 'disease has location' some % SPLIT=|,optional manual,NA,NA,NA,NA,NA,NA,NA,NA +subclass anon: onset,SC 'existence starts during' some % SPLIT=|,optional manual,NA,NA,NA,NA,NA,NA,NA,NA +subclass anon: has_material_basis_in,SC has_material_basis_in some % SPLIT=|,optional manual,autosomal dominant inheritance,do not quote standalone terms!!!,NA,NA,NA,NA,NA,NA +subclass anon: located_in,SC located_in some % SPLIT=|,optional manual,NA,NA,rdfs:label (preferred); IRI or CURIE (possible),NA,NA,NA,NA,NA +disjoint class,DC % SPLIT=|,optional manual,NA,NA,NA,NA,NA,"?iri owl:disjointClass ?disjoint . +FILTER(!isBlank(?disjoint))",Disjoint With [NAMED ID],owl:disjointWith +disjoint class anon,DC % SPLIT=|,optional manual,NA,NA,NA,NA,NA,NA,Disjoint With [ANON ID],NA +subset(s),AI oboInOwl:inSubset SPLIT=|,optional manual,DO_AGR_slim,NA,any subset (aka 'slim') defined in doid-edit.owl,NA,NA,"?iri oboInOwl:inSubset ?subset_iri . 
+?subset_iri rdfs:label ?subset .",oboInOwl:inSubset,oboInOwl:inSubset +alternate id(s),A oboInOwl:hasAlternativeId SPLIT=|,optional manual,DOID:4,NA,CURIE of deprecated term,NA,NA,?iri oboInOwl:hasAlternativeId ?alt_id .,oboInOwl:hasAlternativeId,oboInOwl:hasAlternativeId +deprecated,AT owl:deprecated^^xsd:boolean,optional manual,true,NA,NA,NA,NA,?iri owl:deprecated ?deprecate .,owl:deprecated,owl:deprecated +obsolescence reason,AI IAO:0000231,optional manual,terms merged,must be IRI or label of child of 'obsolescence reason specification' (IAO:0000225),NA,NA,NA,?iri IAO:0000231 ?obs_reason .,NA,IAO:0000231 +term replaced by,AI IAO:0100001,optional manual,DOID:4,NA,IRI or CURIE of term to replace by,NA,NA,?iri obo:IAO_0100001 ?term_replaced_by .,IAO:0100001,IAO:0100001 +consider instead,oboInOwl:consider,optional manual,NA,NA,NA,NA,NA,?iri oboInOwl:consider ?consider .,oboInOwl:consider,oboInOwl:consider +comment,A rdfs:comment,optional manual,This is a comment. There should only be one per term.,NA,NA,NA,NA,?iri rdfs:comment ?comment .,rdfs:comment,rdfs:comment +created by,A oboInOwl:created_by,optional manual,NA,NA,NA,NA,NA,?iri oboInOwl:created_by ?created_by .,oboInOwl:created_by,oboInOwl:created_by +creation date,A oboInOwl:creation_date,optional manual,NA,NA,NA,NA,NA,?iri oboInOwl:creation_date ?creation_date .,oboInOwl:creation_date,oboInOwl:creation_date +obo id,A oboInOwl:id,required auto,DOID:0080943,"required data, but not necessary to include in manual curation; will be inferred from iri/curie -if manually entered it must match the CURIE form of iri/curie" -obo namespace,A oboInOwl:hasOBONamespace,required auto,disease_ontology,"OBO namespace of ontology: disease_ontology, symptoms, transmission_process",NA,NA,"required data, but not necessary to include in manual curation; will be automatically added for any new disease +if manually entered it must match the CURIE form of iri/curie",OBO CURIE,NA,NA,?iri oboInOwl:id ?id .,NA,oboInOwl:id +obo 
namespace,A oboInOwl:hasOBONamespace,required auto,disease_ontology,"required data, but not necessary to include in manual curation; will be automatically added for any new disease -if manually entered it must be ""disease_ontology"" (without quotes)" -español - label,AL rdfs:label@es,optional manual,NA,NA,NA,NA,NA -español - definition,AL obo:IAO_0000115@es,optional manual,NA,NA,NA,NA,NA -español - synonym(s): exact,AL oboInOwl:hasExactSynonym@es SPLIT=|,optional manual,NA,NA,NA,NA,NA -español - synonym(s): broad,AL oboInOwl:hasBroadSynonym@es SPLIT=|,optional manual,NA,NA,NA,NA,NA -español - synonym(s): narrow,AL oboInOwl:hasNarrowSynonym@es SPLIT=|,optional manual,NA,NA,NA,NA,NA -español - synonym(s): related,AL oboInOwl:hasRelatedSynonym@es SPLIT=|,optional manual,NA,NA,NA,NA,NA -español - acronym(s): exact,AL oboInOwl:hasExactSynonym@es SPLIT=|,optional manual,NA,NA,NA,NA,NA -español - acronym(s): broad,AL oboInOwl:hasBroadSynonym@es SPLIT=|,optional manual,NA,NA,NA,NA,NA -español - acronym(s): narrow,AL oboInOwl:hasNarrowSynonym@es SPLIT=|,optional manual,NA,NA,NA,NA,NA -español - acronym(s): related,AL oboInOwl:hasRelatedSynonym@es SPLIT=|,optional manual,NA,NA,NA,NA,NA +if manually entered it must be ""disease_ontology"" (without quotes)","OBO namespace of ontology: disease_ontology, symptoms, transmission_process",NA,NA,?iri oboInOwl:hasOBONamespace ?obo_namespace .,oboInOwl:hasOBONamespace,oboInOwl:hasOBONamespace diff --git a/data-raw/internal-curation_opts.R b/data-raw/internal-curation_opts.R deleted file mode 100644 index 4b887a31..00000000 --- a/data-raw/internal-curation_opts.R +++ /dev/null @@ -1,19 +0,0 @@ -## code to prepare `curation_opts` internal dataset -rlang::check_installed("googlesheets4") -devtools::load_all() - -curation_opts <- googlesheets4::read_sheet( - "https://docs.google.com/spreadsheets/d/1Zn6p5xkVHUwbWe1N8FUa3fNcEkAOoE9P4ADb12f69hQ/edit", - sheet = "template_options", - col_types = "c" -) |> - 
dplyr::filter(!is.na(.data$template)) - -readr::write_csv(curation_opts, "data-raw/curation_opts.csv") - -.curation_opts <- dplyr::select( - curation_opts, - tidyselect::all_of(c("header", "template", "type")) -) - -use_data_internal(.curation_opts, overwrite = TRUE) \ No newline at end of file diff --git a/data-raw/internal-sssom_spec.R b/data-raw/internal-sssom_spec.R deleted file mode 100644 index 57739278..00000000 --- a/data-raw/internal-sssom_spec.R +++ /dev/null @@ -1,31 +0,0 @@ -# Capture official SSSOM specification and parse for use by DO.utils -rlang::check_installed("yaml") -devtools::load_all() - -sssom_version <- stringr::str_remove( - httr::HEAD("https://github.com/mapping-commons/sssom/releases/latest/")$url, - ".*/" -) -sssom_yaml_path <- glueV( - "https://raw.githubusercontent.com/mapping-commons/sssom/!<>!/src/sssom_schema/schema/sssom_schema.yaml" -) -.sssom_spec <- yaml::read_yaml(sssom_yaml_path) -.sssom_spec$version <- sssom_version -.sssom_spec$access_date <- Sys.Date() - -.sssom_slot_types <- purrr::map_chr(.sssom_spec$slots, ~ .$range) -.sssom_mapping_slots <- .sssom_spec$classes$mapping$slots - -use_data_internal( - .sssom_spec, - .sssom_slot_types, - .sssom_mapping_slots, - overwrite = TRUE -) - -# save YAML for dev reference -dev_dir <- "setup_docs" -yaml_file <- file.path(dev_dir, paste0("sssom_schema-", sssom_version, ".yaml")) - -if (!dir.exists(dev_dir)) dir.create(dev_dir) -download.file(sssom_yaml_path, yaml_file) diff --git a/data-raw/internal-DO_gs.R b/data-raw/internal/DO_gs.R similarity index 53% rename from data-raw/internal-DO_gs.R rename to data-raw/internal/DO_gs.R index e7cda198..881f39b0 100644 --- a/data-raw/internal-DO_gs.R +++ b/data-raw/internal/DO_gs.R @@ -1,5 +1,9 @@ -## code to save DO Google Sheet information internally (`.DO_gs`) -devtools::load_all() +## code to prepare `.DO_gs` internal dataset ## +# +# Serves as a reference for DO-related Google Sheets and relevant sheets (tabs) +# for data retrieval + 
rlang::check_installed("here") .DO_gs <- list( users = list( @@ -12,4 +16,8 @@ devtools::load_all() ) ) -use_data_internal(.DO_gs, overwrite = TRUE) +saveRDS( + .DO_gs, + file = here::here("data-raw", "internal", "DO_gs.rds"), + compress = "bzip2" +) diff --git a/data-raw/internal/DO_gs.rds b/data-raw/internal/DO_gs.rds new file mode 100644 index 00000000..3af90b10 Binary files /dev/null and b/data-raw/internal/DO_gs.rds differ diff --git a/data-raw/internal/curation_opts.R b/data-raw/internal/curation_opts.R new file mode 100644 index 00000000..9a6c85c0 --- /dev/null +++ b/data-raw/internal/curation_opts.R @@ -0,0 +1,56 @@ +## code to prepare `.curation_opts` and `.sparql_dt_motif` internal datasets ## +# +# .curation_opts is updated from a Google Sheet and serves as a schema for +# establishment of curation templates and their conversion to robot templates +# +# .sparql_dt_motif is a named character vector of data types and their +# associated SPARQL "motif" for automated identification of curation data types +# supporting curation templates from ontology data + +rlang::check_installed( + c("dplyr", "googlesheets4", "here", "vroom") +) +devtools::load_all() + + +out_dir <- here::here("data-raw", "internal") + +# save full curation_opts schema for developer reference +curation_opts <- googlesheets4::read_sheet( + "https://docs.google.com/spreadsheets/d/1Zn6p5xkVHUwbWe1N8FUa3fNcEkAOoE9P4ADb12f69hQ/edit", + sheet = "template_options", + col_types = "c" +) |> + dplyr::filter(!is.na(.data$template) & !.data$inclusion == "deprecated") + +vroom::vroom_write( + curation_opts, + file.path(out_dir, "curation_opts.tsv"), + na = "" +) + +# save internal data +.curation_opts <- dplyr::select( + curation_opts, + dplyr::all_of(c("data_type", "template", "inclusion")) +) + +saveRDS( + .curation_opts, + file = file.path(out_dir, "curation_opts.rds"), + compress = "bzip2" +) + + +.sparql_dt_motif <- curation_opts |> + dplyr::filter(!is.na(.data$sparql_dt_motif)) |> + with( + 
purrr::set_names(data_type, sparql_dt_motif) + ) |> + length_sort(by_name = TRUE, decreasing = TRUE) + +saveRDS( + .sparql_dt_motif, + file = file.path(out_dir, "sparql_dt_motif.rds"), + compress = "bzip2" +) diff --git a/data-raw/internal/curation_opts.rds b/data-raw/internal/curation_opts.rds new file mode 100644 index 00000000..3c778bf4 Binary files /dev/null and b/data-raw/internal/curation_opts.rds differ diff --git a/data-raw/internal/curation_opts.tsv b/data-raw/internal/curation_opts.tsv new file mode 100644 index 00000000..6f32c3e2 --- /dev/null +++ b/data-raw/internal/curation_opts.tsv @@ -0,0 +1,65 @@ +data_type template inclusion example notes optional_values alternate_title alternate_format sparql export_header sparql_dt_motif +id ID required manual DOID:0080943 replaced older 'iri/curie' header for simplicity; id now covered by obo id field IRI or CURIE CI IRI or CURIE; CI means Class IRI --> type will be CLASS_TYPE ?iri a owl:Class . ID +label A rdfs:label required manual 46,XX sex reversal 5 ?iri rdfs:label ?label . LABEL rdfs:label +parent id SC % SPLIT=| required manual DOID:0111760 "accepts CURIE or IRI; intended for only asserted subclass relationships between named classes +--> separated from subclass anon for practical purposes - ROBOT template is the same, ROBOT export differs" disease by infectious agent "?iri rdfs:subClassOf ?parent . +FILTER(!isBlank(?parent))" SubClass Of [NAMED ID] rdfs:subClassOf +definition A IAO:0000115 required manual A 46,XX sex reversal that is characterized by genital virilization in 46,XX individuals, associated with congenital heart disease and variable somatic anomalies including blepharophimosis-ptosis-epicanthus inversus syndrome and congenital diaphragmatic hernia and that has_material_basis_in heterozygous mutation in the NR2F2 gene on chromosome 15q26. ?iri obo:IAO_0000115 ?definition . 
obo:IAO_0000115 IAO:0000115 +definition source(s) >A oboInOwl:hasDbXref SPLIT=| required manual url:https://pubmed.ncbi.nlm.nih.gov/29478779/ "!<<.definition_axiom>>! + oboInOwl:hasDbXref ?def_src ." IAO:0000115-oboInOwl:hasDbXref +definition source type(s) >AI dc11:type SPLIT=| optional manual curator inference from journal publication do not quote!!! ECO codes, e.g. ECO:0007645 "!<<.definition_axiom>>! + dc:type ?src_type ." IAO:0000115-dc:type +xref(s) A oboInOwl:hasDbXref SPLIT=| optional manual OMIM:618901 ?iri oboInOwl:hasDbXref ?xref . oboInOwl:hasDbXref oboInOwl:hasDbXref +skos mapping(s): exact A skos:exactMatch SPLIT=| optional manual OMIM:618901 adds skos mappings as strings; current INCORRECT DO format "should use IRIs and be as follows: +AI skos:exactMatch SPLIT=| + - example input: https://omim.org/MIM:618901" ?iri skos:exactMatch ?skos_exact . skos:exactMatch skos:exactMatch +skos mapping(s): close A skos:closeMatch SPLIT=| optional manual OMIM:618901 adds skos mappings as strings; current INCORRECT DO format "should use IRIs and be as follows: +AI skos:closeMatch SPLIT=| + - example input: https://omim.org/MIM:618901" ?iri skos:closeMatch ?skos_close . skos:closeMatch skos:closeMatch +skos mapping(s): broad A skos:broadMatch SPLIT=| optional manual OMIM:PS613135 adds skos mappings as strings; current INCORRECT DO format "should use IRIs and be as follows: +AI skos:exactMatch SPLIT=| + - example input: https://omim.org/MIM:618901" ?iri skos:broadMatch ?skos_broad . skos:broadMatch skos:broadMatch +skos mapping(s): narrow A skos:narrowMatch SPLIT=| optional manual OMIM:618901 adds skos mappings as strings; current INCORRECT DO format "should use IRIs and be as follows: +AI skos:exactMatch SPLIT=| + - example input: https://omim.org/MIM:618901" ?iri skos:narrowMatch ?skos_narrow . 
skos:narrowMatch skos:narrowMatch +skos mapping(s): related A skos:relatedMatch SPLIT=| optional manual adds skos mappings as strings; current INCORRECT DO format "should use IRIs and be as follows: +AI skos:exactMatch SPLIT=| + - example input: https://omim.org/MIM:618901" ?iri skos:relatedMatch ?skos_related . skos:relatedMatch skos:relatedMatch +synonym(s): exact A oboInOwl:hasExactSynonym SPLIT=| optional manual hemangiosarcoma do not quote!!! "glueV: .synonym_stmt, syn_scope = ""Exact""" oboInOwl:hasExactSynonym oboInOwl:hasExactSynonym +synonym(s): broad A oboInOwl:hasBroadSynonym SPLIT=| optional manual "glueV: .synonym_stmt, syn_scope = ""Broad""" oboInOwl:hasBroadSynonym oboInOwl:hasBroadSynonym +synonym(s): narrow A oboInOwl:hasNarrowSynonym SPLIT=| optional manual "glueV: .synonym_stmt, syn_scope = ""Narrow""" oboInOwl:hasNarrowSynonym oboInOwl:hasNarrowSynonym +synonym(s): related A oboInOwl:hasRelatedSynonym SPLIT=| optional manual "glueV: .synonym_stmt, syn_scope = ""Related""" oboInOwl:hasRelatedSynonym oboInOwl:hasRelatedSynonym +acronym(s): exact A oboInOwl:hasExactSynonym SPLIT=| optional manual CAMRQ "must be accompanied by ""acronym annotation"" header/template in the adjacent column to the rigth in robot template" "glueV: .acronym_stmt, acronym_scope = ""Exact""" oboInOwl:hasExactSynonym-OMO:0003012 +acronym(s): broad A oboInOwl:hasBroadSynonym SPLIT=| optional manual "glueV: .acronym_stmt, acronym_scope = ""Broad""" oboInOwl:hasBroadSynonym-OMO:0003012 +acronym(s): narrow A oboInOwl:hasNarrowSynonym SPLIT=| optional manual "glueV: .acronym_stmt, acronym_scope = ""Narrow""" oboInOwl:hasNarrowSynonym-OMO:0003012 +acronym(s): related A oboInOwl:hasRelatedSynonym SPLIT=| optional manual DES "must be accompanied by ""acronym annotation"" header/template in the adjacent column to the rigth in robot template" "glueV: .acronym_stmt, acronym_scope = ""Related""" oboInOwl:hasRelatedSynonym-OMO:0003012 +acronym annotation >AI oboInOwl:hasSynonymType 
optional auto acronym +eq class EC % SPLIT=| optional manual disease and ('has material basis in' some (Viruses or Bacteria or Eukaryota)) "?iri owl:equivalentClass ?eq . +FILTER(!isBlank(?eq))" Equivalent Class [NAMED ID] owl:equivalentClass +eq class anon EC % SPLIT=| optional manual Equivalent Class [ANON ID] +subclass anon SC % SPLIT=| optional manual 'disease has feature' some cherubism "intended for only subclass of anonymous logical expressions +--> separated from parent id for practical purposes - ROBOT template is the same, ROBOT export differs" SubClass Of [ANON ID] +subclass anon: inheritance SC 'has material basis in' some % SPLIT=| optional manual +subclass anon: anatomical location SC 'disease has location' some % SPLIT=| optional manual +subclass anon: onset SC 'existence starts during' some % SPLIT=| optional manual +subclass anon: has_material_basis_in SC has_material_basis_in some % SPLIT=| optional manual autosomal dominant inheritance do not quote standalone terms!!! +subclass anon: located_in SC located_in some % SPLIT=| optional manual rdfs:label (preferred); IRI or CURIE (possible) +disjoint class DC % SPLIT=| optional manual "?iri owl:disjointClass ?disjoint . +FILTER(!isBlank(?disjoint))" Disjoint With [NAMED ID] owl:disjointWith +disjoint class anon DC % SPLIT=| optional manual Disjoint With [ANON ID] +subset(s) AI oboInOwl:inSubset SPLIT=| optional manual DO_AGR_slim any subset (aka 'slim') defined in doid-edit.owl "?iri oboInOwl:inSubset ?subset_iri . +?subset_iri rdfs:label ?subset ." oboInOwl:inSubset oboInOwl:inSubset +alternate id(s) A oboInOwl:hasAlternativeId SPLIT=| optional manual DOID:4 CURIE of deprecated term ?iri oboInOwl:hasAlternativeId ?alt_id . oboInOwl:hasAlternativeId oboInOwl:hasAlternativeId +deprecated AT owl:deprecated^^xsd:boolean optional manual true ?iri owl:deprecated ?deprecate . 
owl:deprecated owl:deprecated +obsolescence reason AI IAO:0000231 optional manual terms merged must be IRI or label of child of 'obsolescence reason specification' (IAO:0000225) ?iri IAO:0000231 ?obs_reason . IAO:0000231 +term replaced by AI IAO:0100001 optional manual DOID:4 IRI or CURIE of term to replace by ?iri obo:IAO_0100001 ?term_replaced_by . IAO:0100001 IAO:0100001 +consider instead oboInOwl:consider optional manual ?iri oboInOwl:consider ?consider . oboInOwl:consider oboInOwl:consider +comment A rdfs:comment optional manual This is a comment. There should only be one per term. ?iri rdfs:comment ?comment . rdfs:comment rdfs:comment +created by A oboInOwl:created_by optional manual ?iri oboInOwl:created_by ?created_by . oboInOwl:created_by oboInOwl:created_by +creation date A oboInOwl:creation_date optional manual ?iri oboInOwl:creation_date ?creation_date . oboInOwl:creation_date oboInOwl:creation_date +obo id A oboInOwl:id required auto DOID:0080943 "required data, but not necessary to include in manual curation; will be inferred from iri/curie + +if manually entered it must match the CURIE form of iri/curie" OBO CURIE ?iri oboInOwl:id ?id . oboInOwl:id +obo namespace A oboInOwl:hasOBONamespace required auto disease_ontology "required data, but not necessary to include in manual curation; will be automatically added for any new disease + +if manually entered it must be ""disease_ontology"" (without quotes)" OBO namespace of ontology: disease_ontology, symptoms, transmission_process ?iri oboInOwl:hasOBONamespace ?obo_namespace . 
oboInOwl:hasOBONamespace oboInOwl:hasOBONamespace diff --git a/data-raw/internal/html4_tags.tsv b/data-raw/internal/html4_tags.tsv new file mode 100644 index 00000000..34ae8e9e --- /dev/null +++ b/data-raw/internal/html4_tags.tsv @@ -0,0 +1,97 @@ +name start_tag end_tag deprecated dtd description +a required required FALSE anchor +abbr required required FALSE "abbreviated form (e.g., www, http, +etc.)" +acronym required required FALSE +address required required FALSE information on author +applet required required TRUE loose java applet +area required forbidden FALSE client-side image map area +b required required FALSE bold text style +base required forbidden FALSE document base uri +basefont required forbidden TRUE loose base font size +bdo required required FALSE i18n bidi over-ride +big required required FALSE large text style +blockquote required required FALSE long quotation +body optional optional FALSE document body +br required forbidden FALSE forced line break +button required required FALSE push button +caption required required FALSE table caption +center required required TRUE loose shorthand for div align=center +cite required required FALSE citation +code required required FALSE computer code fragment +col required forbidden FALSE table column +colgroup required optional FALSE table column group +dd required optional FALSE definition description +del required required FALSE deleted text +dfn required required FALSE instance definition +dir required required TRUE loose directory list +div required required FALSE generic language/style container +dl required required FALSE definition list +dt required optional FALSE definition term +em required required FALSE emphasis +fieldset required required FALSE form control group +font required required TRUE loose local change to font +form required required FALSE interactive form +frame required forbidden FALSE forbidden subwindow +frameset required required FALSE forbidden window subdivision +h1 required 
required FALSE heading +h2 required required FALSE heading +h3 required required FALSE heading +h4 required required FALSE heading +h5 required required FALSE heading +h6 required required FALSE heading +head optional optional FALSE document head +hr required forbidden FALSE horizontal rule +html optional optional FALSE document root element +i required required FALSE italic text style +iframe required required FALSE loose inline subwindow +img required forbidden FALSE embedded image +input required forbidden FALSE form control +ins required required FALSE inserted text +isindex required forbidden TRUE loose single line prompt +kbd required required FALSE text to be entered by the user +label required required FALSE form field label text +legend required required FALSE fieldset legend +li required optional FALSE list item +link required forbidden FALSE a media-independent link +map required required FALSE client-side image map +menu required required TRUE loose menu list +meta required forbidden FALSE generic metainformation +noframes required required FALSE forbidden "alternate content container for non +frame-based rendering" +noscript required required FALSE "alternate content container for non +script-based rendering" +object required required FALSE generic embedded object +ol required required FALSE ordered list +optgroup required required FALSE option group +option required optional FALSE selectable choice +p required optional FALSE paragraph +param required forbidden FALSE named property value +pre required required FALSE preformatted text +q required required FALSE short inline quotation +s required required TRUE loose strike-through text style +samp required required FALSE "sample program output, scripts, +etc." 
+script required required FALSE script statements +select required required FALSE option selector +small required required FALSE small text style +span required required FALSE generic language/style container +strike required required TRUE loose strike-through text +strong required required FALSE strong emphasis +style required required FALSE style info +sub required required FALSE subscript +sup required required FALSE superscript +table required required FALSE +tbody optional optional FALSE table body +td required optional FALSE table data cell +textarea required required FALSE multi-line text field +tfoot required optional FALSE table footer +th required optional FALSE table header cell +thead required optional FALSE table header +title required required FALSE document title +tr required optional FALSE table row +tt required required FALSE teletype or monospaced text style +u required required TRUE loose underlined text style +ul required required FALSE unordered list +var required required FALSE "instance of a variable or program +argument" diff --git a/data-raw/internal-html_tags.R b/data-raw/internal/html_tags.R similarity index 51% rename from data-raw/internal-html_tags.R rename to data-raw/internal/html_tags.R index 157ab11f..95de392a 100644 --- a/data-raw/internal-html_tags.R +++ b/data-raw/internal/html_tags.R @@ -1,13 +1,31 @@ +## code to prepare `.html_tags` internal dataset ## +# +# HTML tag information is retrieved from the W3C HTML 4.01 specification and +# serves as a reference for parsing and validation of HTML elements in DO +# website curation +# +# NOTE: HTML 4.01 is used as a reference for tag information, but the web +# now supports the HTML Living Standard (https://html.spec.whatwg.org/), which +# includes additional tags and attributes. 
+ rlang::check_installed( - c("dplyr", "janitor", "purrr", "rvest", "stringr", "tidyr") + c("dplyr", "here", "janitor", "purrr", "rvest", "stringr", "tidyr", "vroom") ) -raw_element_index <- rvest::read_html("https://www.w3.org/TR/html401/index/elements.html") +outdir <- here::here("data-raw", "internal") + +raw_element_index <- rvest::read_html( + "https://www.w3.org/TR/html401/index/elements.html" +) index_legend <- raw_element_index |> rvest::html_text() |> stringr::str_match( - stringr::regex("legend:(.*)name[^[:alnum:]]", dotall = TRUE, ignore_case = TRUE) + stringr::regex( + "legend:(.*)name[^[:alnum:]]", + dotall = TRUE, + ignore_case = TRUE + ) ) |> (\(x) x[, 2])() |> stringr::str_split(",[[:space:]]*") |> @@ -39,4 +57,15 @@ if (nrow(.html_tags) != dplyr::n_distinct(.html_tags$name)) { rlang::abort("Duplicate HTML tag names found") } -use_data_internal(.html_tags, overwrite = TRUE) +# save tabular data for reference +vroom::vroom_write( + .html_tags, + file = file.path(outdir, "html4_tags.tsv"), + na = "" +) + +saveRDS( + .html_tags, + file = file.path(outdir, "html_tags.rds"), + compress = "bzip2" +) diff --git a/data-raw/internal/html_tags.rds b/data-raw/internal/html_tags.rds new file mode 100644 index 00000000..4e3f4e30 Binary files /dev/null and b/data-raw/internal/html_tags.rds differ diff --git a/data-raw/internal/sparql_dt_motif.rds b/data-raw/internal/sparql_dt_motif.rds new file mode 100644 index 00000000..49ad0c8a Binary files /dev/null and b/data-raw/internal/sparql_dt_motif.rds differ diff --git a/data-raw/internal/sssom_mapping_slots.rds b/data-raw/internal/sssom_mapping_slots.rds new file mode 100644 index 00000000..623c38cd Binary files /dev/null and b/data-raw/internal/sssom_mapping_slots.rds differ diff --git a/data-raw/internal/sssom_schema-v1.0.0.yaml b/data-raw/internal/sssom_schema-v1.0.0.yaml new file mode 100644 index 00000000..099e8928 --- /dev/null +++ b/data-raw/internal/sssom_schema-v1.0.0.yaml @@ -0,0 +1,792 @@ +id: 
https://w3id.org/sssom/schema/ +name: sssom +description: Datamodel for Simple Standard for Sharing Ontological Mappings (SSSOM) +imports: +- linkml:types +prefixes: + dcterms: http://purl.org/dc/terms/ + linkml: https://w3id.org/linkml/ + sssom: https://w3id.org/sssom/ + rdfs: http://www.w3.org/2000/01/rdf-schema# + rdf: http://www.w3.org/1999/02/22-rdf-syntax-ns# + oboInOwl: http://www.geneontology.org/formats/oboInOwl# + pav: http://purl.org/pav/ + prov: http://www.w3.org/ns/prov# + skos: http://www.w3.org/2004/02/skos/core# + xsd: http://www.w3.org/2001/XMLSchema# + semapv: https://w3id.org/semapv/vocab/ +see_also: +- https://github.com/mapping-commons/sssom +- https://mapping-commons.github.io/sssom/home/ +default_curi_maps: +- semweb_context +- obo_context +default_prefix: sssom +default_range: string + +enums: + entity_type_enum: + permissible_values: + owl class: + meaning: owl:Class + owl object property: + meaning: owl:ObjectProperty + owl data property: + meaning: owl:DataProperty + owl annotation property: + meaning: owl:AnnotationProperty + owl named individual: + meaning: owl:NamedIndividual + skos concept: + meaning: skos:Concept + rdfs resource: + meaning: rdfs:Resource + rdfs class: + meaning: rdfs:Class + rdfs literal: + meaning: rdfs:Literal + description: This value indicate that the entity being mapped is not a semantic entity with a distinct identifier, but is instead represented entirely by its literal label. This value MUST NOT be used in the predicate_type slot. + see_also: + - https://mapping-commons.github.io/sssom/spec-model/#literal-mappings + - https://github.com/mapping-commons/sssom/issues/234 + - https://github.com/mapping-commons/sssom/blob/master/examples/schema/literals.sssom.tsv + rdfs datatype: + meaning: rdfs:Datatype + rdf property: + meaning: rdf:Property + + predicate_modifier_enum: + permissible_values: + Not: Negating the mapping predicate. 
The meaning of the triple becomes subject_id is not a predicate_id match to object_id. + mapping_cardinality_enum: + permissible_values: + "1:1": One-to-one mapping + "1:n": One-to-many mapping + "n:1": Many-to-one mapping + "1:0": One-to-none mapping + "0:1": None-to-one mapping + "n:n": Many-to-many mapping + +types: + EntityReference: + typeof: uriorcurie + description: | + A reference to an entity involved in the mapping. + base: str + uri: rdfs:Resource + see_also: + - https://mapping-commons.github.io/sssom/spec/#tsv + +slots: + prefix_name: + key: true + range: ncname + prefix_url: + range: uri + curie_map: + description: A dictionary that contains prefixes as keys and their URI expansions as values. + range: prefix + multivalued: true + inlined: true + see_also: + - https://github.com/mapping-commons/sssom/issues/225 + - https://github.com/mapping-commons/sssom/pull/349 + - https://github.com/mapping-commons/sssom/blob/master/examples/schema/curie_map.sssom.tsv + mirror_from: + description: A URL location from which to obtain a resource, such as a mapping set. + range: uri + registry_confidence: + description: This value is set by the registry that indexes the mapping set. It reflects the confidence the registry has in the correctness of the mappings in the mapping set. + range: double + last_updated: + description: The date this reference was last updated. + range: date + local_name: + description: The local name assigned to file that corresponds to the downloaded mapping set. + range: string + mapping_set_references: + description: A list of mapping set references. + range: mapping set reference + multivalued: true + recommended: true + mapping_registry_id: + description: The unique identifier of a mapping registry. + range: EntityReference + required: true + mapping_registry_title: + description: The title of a mapping registry. + range: string + mapping_registry_description: + description: The description of a mapping registry. 
+ range: string + imports: + description: A list of registries that should be imported into this one. + multivalued: true + range: uri + documentation: + description: A URL to the documentation of this mapping commons. + range: uri + homepage: + description: A URL to a homepage of this mapping commons. + range: uri + mappings: + description: Contains a list of mapping objects + range: mapping + multivalued: true + inlined_as_list: true + recommended: true + subject_id: + description: The ID of the subject of the mapping. + range: EntityReference + mappings: + - owl:annotatedSource + slot_uri: owl:annotatedSource + examples: + - value: HP:0009894 + description: The CURIE denoting the Human Phenotype Ontology concept of 'Thickened ears' + subject_label: + description: The label of subject of the mapping + range: string + examples: + - value: Thickened ears + recommended: true + subject_category: + description: The conceptual category to which the subject belongs to. This can + be a string denoting the category or a term from a controlled vocabulary. + This slot is deliberately underspecified. Conceptual categories can range from + those that are found in general upper ontologies such as BFO (e.g. process, temporal region, etc) to those that serve + as upper ontologies in specific domains, such as COB or BioLink (e.g. gene, disease, chemical entity). The purpose of this + optional field is documentation for human reviewers - when a category is known + and documented clearly, the cost of interpreting and evaluating the mapping decreases. + range: string + see_also: + - https://github.com/mapping-commons/sssom/issues/13 + - https://github.com/mapping-commons/sssom/issues/256 + examples: + - value: UBERON:0001062 + description: (The CURIE of the Uberon term for "anatomical entity".) + - value: anatomical entity + description: (A string, rather than ID, describing the "anatomical entity" category. This is possible, but less preferred than using an ID.) 
+ - value: biolink:Gene + description: (The CURIE of the biolink class for genes.) + subject_type: + description: The type of entity that is being mapped. + range: entity_type_enum + instantiates: sssom:Propagatable + annotations: + propagated: true + examples: + - value: owl:Class + predicate_id: + description: The ID of the predicate or relation that relates the subject and + object of this match. + mappings: + - owl:annotatedProperty + range: EntityReference + required: true + slot_uri: owl:annotatedProperty + + examples: + - value: owl:sameAs + description: The subject and the object are instances (owl individuals), and the two instances are the same. + - value: owl:equivalentClass + description: The subject and the object are classes (owl class), and the two classes are the same. + - value: owl:equivalentProperty + description: The subject and the object are properties (owl object, data, annotation properties), and the two properties are the same. + - value: rdfs:subClassOf + description: The subject and the object are classes (owl class), and the subject is a subclass of the object. + - value: rdfs:subPropertyOf + description: The subject and the object are properties (owl object, data, annotation properties), and the subject is a subproperty of the object. + - value: skos:relatedMatch + description: The subject and the object are associated in some unspecified way. + - value: skos:closeMatch + description: The subject and the object are sufficiently similar that they can be used interchangeably in some information retrieval applications. + - value: skos:exactMatch + description: The subject and the object can, with a high degree of confidence, be used interchangeably across a wide range of information retrieval applications. + - value: skos:narrowMatch + description: "From the SKOS primer: A triple skos:narrower (and skos:narrowMatch) asserts that , the object of the triple, is a narrower concept than , the subject of the triple." 
+ - value: skos:broadMatch + description: "From the SKOS primer: A triple skos:broader (and skos:broadMatch) asserts that , the object of the triple, is a broader concept than , the subject of the triple." + - value: oboInOwl:hasDbXref + description: Two terms are related in some way. The meaning is frequently consistent across a single set of mappings. Note this property is often overloaded even where the terms are of a different nature (e.g. interpro2go) + - value: rdfs:seeAlso + description: The subject and the object are associated in some unspecified way. The object IRI often resolves to a resource on the web that provides additional information. + predicate_modifier: + description: A modifier for negating the predicate. See https://github.com/mapping-commons/sssom/issues/40 for discussion + range: predicate_modifier_enum + see_also: + - https://github.com/mapping-commons/sssom/issues/107 + examples: + - value: Not + description: Negates the predicate, see documentation of predicate_modifier_enum + predicate_label: + description: The label of the predicate/relation of the mapping + range: string + examples: + - value: has cross-reference + description: The label of the oboInOwl:hasDbXref property to represent cross-references. + predicate_type: + description: The type of entity that is being mapped. + range: entity_type_enum + examples: + - value: owl:AnnotationProperty + - value: owl:ObjectProperty + object_id: + description: The ID of the object of the mapping. + mappings: + - owl:annotatedTarget + range: EntityReference + slot_uri: owl:annotatedTarget + examples: + - value: HP:0009894 + description: The CURIE denoting the Human Phenotype Ontology concept of 'Thickened ears' + object_label: + description: The label of object of the mapping + range: string + examples: + - value: Thickened ears + recommended: true + object_category: + description: The conceptual category to which the subject belongs to. 
This can + be a string denoting the category or a term from a controlled vocabulary. + This slot is deliberately underspecified. Conceptual categories can range from + those that are found in general upper ontologies such as BFO (e.g. process, temporal region, etc) to those that serve + as upper ontologies in specific domains, such as COB or BioLink (e.g. gene, disease, chemical entity). The purpose of this + optional field is documentation for human reviewers - when a category is known + and documented clearly, the cost of interpreting and evaluating the mapping decreases. + range: string + see_also: + - https://github.com/mapping-commons/sssom/issues/13 + - https://github.com/mapping-commons/sssom/issues/256 + examples: + - value: UBERON:0001062 + description: (The CURIE of the Uberon term for "anatomical entity".) + - value: anatomical entity + description: (A string, rather than ID, describing the "anatomical entity" category. This is possible, but less preferred than using an ID.) + - value: biolink:Gene + description: (The CURIE of the biolink class for genes.) + mapping_justification: + description: A mapping justification is an action (or the written representation of that action) of showing a mapping to be right or reasonable. 
+ range: EntityReference + pattern: "^semapv:(MappingReview|ManualMappingCuration|LogicalReasoning|LexicalMatching|CompositeMatching|UnspecifiedMatching|SemanticSimilarityThresholdMatching|LexicalSimilarityThresholdMatching|MappingChaining)$" + required: true + any_of: + - equals_string: semapv:LexicalMatching + - equals_string: semapv:LogicalReasoning + - equals_string: semapv:CompositeMatching + - equals_string: semapv:UnspecifiedMatching + - equals_string: semapv:SemanticSimilarityThresholdMatching + - equals_string: semapv:LexicalSimilarityThresholdMatching + - equals_string: semapv:MappingChaining + - equals_string: semapv:MappingReview + - equals_string: semapv:ManualMappingCuration + examples: + - value: semapv:LexicalMatching + - value: semapv:ManualMappingCuration + object_type: + description: The type of entity that is being mapped. + range: entity_type_enum + instantiates: sssom:Propagatable + annotations: + propagated: true + examples: + - value: owl:Class + mapping_set_id: + description: A globally unique identifier for the mapping set (not each individual + mapping). Should be IRI, ideally resolvable. + required: true + range: uri + examples: + - value: http://purl.obolibrary.org/obo/mondo/mappings/mondo_exactmatch_ncit.sssom.tsv + description: (A persistent URI pointing to the latest version of the Mondo - NCIT mapping in the Mondo namespace.) + mapping_set_version: + description: A version string for the mapping. + range: string + slot_uri: owl:versionInfo + examples: + - value: "2020-01-01" + description: (A date-based version that indicates that the mapping was published on the 1st January in 2021.) + - value: "1.2.1" + description: "(A semantic version tag that indicates that this is the 1st major, 2nd minor version, patch 1 (https://semver.org/).)" + mapping_set_group: + description: Set by the owners of the mapping registry. A way to group . + range: string + mapping_set_title: + description: The display name of a mapping set. 
+ range: string + slot_uri: dcterms:title + examples: + - value: "The Mondo-OMIM mappings by Monarch Initiative." + mapping_set_description: + description: A description of the mapping set. + range: string + slot_uri: dcterms:description + examples: + - value: "This mapping set was produced to integrate human and mouse phenotype data at the IMPC. It is primarily used for making mouse phenotypes searchable by human synonyms at https://mousephenotype.org/." + creator_id: + description: Identifies the persons or groups responsible for the creation of + the mapping. The creator is the agent that put the mapping in its published form, + which may be different from the author, which is a person that was actively involved + in the assertion of the mapping. + Recommended to be a list of ORCIDs or otherwise + identifying URIs. + slot_uri: dcterms:creator + range: EntityReference + multivalued: true + creator_label: + description: A string identifying the creator of this mapping. In the spirit of + provenance, consider using creator_id instead. + range: string + multivalued: true + author_id: + description: Identifies the persons or groups responsible for asserting the mappings. + Recommended to be a list of ORCIDs or otherwise + identifying URIs. + slot_uri: pav:authoredBy + range: EntityReference + multivalued: true + author_label: + description: A string identifying the author of this mapping. In the spirit of + provenance, consider using author_id instead. + range: string + multivalued: true + reviewer_id: + description: Identifies the persons or groups that reviewed and confirmed the mapping. + Recommended to be a list of ORCIDs or otherwise + identifying URIs. + range: EntityReference + multivalued: true + reviewer_label: + description: A string identifying the reviewer of this mapping. In the spirit of + provenance, consider using reviewer_id instead. + range: string + multivalued: true + license: + description: A url to the license of the mapping. 
In absence of a license we assume + no license. + range: uri + slot_uri: dcterms:license + subject_source: + description: URI of vocabulary or identifier source for the subject. + range: EntityReference + instantiates: sssom:Propagatable + annotations: + propagated: true + examples: + - value: obo:mondo.owl + description: A persistent OBO CURIE pointing to the latest version of the Mondo ontology. + - value: wikidata:Q7876491 + description: A Wikidata identifier for the Uberon ontology resource. + subject_source_version: + description: Version IRI or version string of the source of the subject term. + range: string + instantiates: sssom:Propagatable + annotations: + propagated: true + examples: + - value: http://purl.obolibrary.org/obo/mondo/releases/2021-01-30/mondo.owl + description: (A persistent Version IRI pointing to the Mondo version '2021-01-30') + object_source: + description: URI of vocabulary or identifier source for the object. + range: EntityReference + instantiates: sssom:Propagatable + annotations: + propagated: true + examples: + - value: obo:mondo.owl + description: A persistent OBO CURIE pointing to the latest version of the Mondo ontology. + - value: wikidata:Q7876491 + description: A Wikidata identifier for the Uberon ontology resource. + object_source_version: + description: Version IRI or version string of the source of the object term. + range: string + instantiates: sssom:Propagatable + annotations: + propagated: true + examples: + - value: http://purl.obolibrary.org/obo/mondo/releases/2021-01-30/mondo.owl + description: (A persistent Version IRI pointing to the Mondo version '2021-01-30') + mapping_provider: + description: URL pointing to the source that provided the mapping, for example + an ontology that already contains the mappings, or a database from which it was derived. 
+ range: uri + instantiates: sssom:Propagatable + annotations: + propagated: true + mapping_set_source: + description: A mapping set or set of mapping set that was used to derive the mapping set. + slot_uri: prov:wasDerivedFrom + range: uri + multivalued: true + examples: + - value: http://purl.obolibrary.org/obo/mondo/mappings/2022-05-20/mondo_exactmatch_ncit.sssom.tsv + description: A persistent, ideally versioned, link to the mapping set from which the current mapping set is derived. + mapping_source: + description: The mapping set this mapping was originally defined in. mapping_source is used for example when merging multiple + mapping sets or deriving one mapping set from another. + range: EntityReference + examples: + - value: MONDO_MAPPINGS:mondo_exactmatch_ncit.sssom.tsv + mapping_cardinality: + description: A string indicating whether this mapping is from a 1:1 (the subject_id + maps to a single object_id), 1:n (the subject maps to more than one object_id), + n:1, 1:0, 0:1 or n:n group. Note that this is a convenience field that should be derivable + from the mapping set. + range: mapping_cardinality_enum + mapping_tool: + description: A reference to the tool or algorithm that was used to generate the + mapping. Should be a URL pointing to more info about it, but can be free text. + range: string + instantiates: sssom:Propagatable + annotations: + propagated: true + examples: + - value: https://github.com/AgreementMakerLight/AML-Project + mapping_tool_version: + description: Version string that denotes the version of the mapping tool used. + range: string + instantiates: sssom:Propagatable + annotations: + propagated: true + examples: + - value: v3.2 + mapping_date: + description: The date the mapping was asserted. This is different from the date the mapping was published or compiled in a SSSOM file. 
+ slot_uri: pav:authoredOn + range: date + instantiates: sssom:Propagatable + annotations: + propagated: true + publication_date: + description: The date the mapping was published. This is different from the date the mapping was asserted. + slot_uri: dcterms:created + range: date + confidence: + description: A score between 0 and 1 to denote the confidence or probability that + the match is correct, where 1 denotes total confidence. + range: double + minimum_value: 0.0 + maximum_value: 1.0 + subject_match_field: + description: A list of properties (term annotations on the subject) that was used + for the match. + range: EntityReference + multivalued: true + instantiates: sssom:Propagatable + annotations: + propagated: true + object_match_field: + description: A list of properties (term annotations on the object) that was used + for the match. + range: EntityReference + multivalued: true + instantiates: sssom:Propagatable + annotations: + propagated: true + match_string: + description: String that is shared by subj/obj. It is recommended to indicate the + fields for the match using the object and subject_match_field slots. + range: string + multivalued: true + subject_preprocessing: + description: Method of preprocessing applied to the fields of the subject. + If different preprocessing steps were performed on different fields, it is + recommended to store the match in separate rows. + range: EntityReference + multivalued: true + instantiates: sssom:Propagatable + annotations: + propagated: true + examples: + - value: semapv:Stemming + - value: semapv:StopWordRemoval + object_preprocessing: + description: Method of preprocessing applied to the fields of the object. + If different preprocessing steps were performed on different fields, it is + recommended to store the match in separate rows. 
+ range: EntityReference + multivalued: true + instantiates: sssom:Propagatable + annotations: + propagated: true + examples: + - value: semapv:Stemming + - value: semapv:StopWordRemoval + curation_rule: + description: A curation rule is a (potentially) complex condition executed by an agent that led to the establishment of a mapping. + Curation rules often involve complex domain-specific considerations, which are hard to capture in an automated fashion. The curation + rule is captured as a resource rather than a string, which enables higher levels of transparency and sharing across mapping sets. + The URI representation of the curation rule is expected to be a resolvable identifier which provides details about the nature of the curation rule. + range: EntityReference + multivalued: true + see_also: + - https://github.com/mapping-commons/sssom/issues/166 + - https://github.com/mapping-commons/sssom/pull/258 + - https://github.com/mapping-commons/sssom/blob/master/examples/schema/curation_rule.sssom.tsv + curation_rule_text: + description: A curation rule is a (potentially) complex condition executed by an agent that led to the establishment of a mapping. + Curation rules often involve complex domain-specific considerations, which are hard to capture in an automated fashion. The curation + rule should be captured as a resource (entity reference) rather than a string (see curation_rule element), which enables higher levels of transparency and sharing across mapping sets. + The textual representation of curation rule is intended to be used in cases where (1) the creation of a resource is not practical from the + perspective of the mapping_provider and (2) as an additional piece of metadata to augment the curation_rule element with a human readable text. 
+ range: string + multivalued: true + see_also: + - https://github.com/mapping-commons/sssom/issues/166 + - https://github.com/mapping-commons/sssom/pull/258 + - https://github.com/mapping-commons/sssom/blob/master/examples/schema/curation_rule_text.sssom.tsv + similarity_score: + description: A score between 0 and 1 to denote the similarity between two entities, where + 1 denotes equivalence, and 0 denotes disjointness. The score is meant to be used in conjunction + with the similarity_measure field, to document, for example, the lexical or semantic match + of a matching algorithm. + range: double + minimum_value: 0.0 + maximum_value: 1.0 + see_also: + - https://github.com/mapping-commons/sssom/issues/385 + - https://github.com/mapping-commons/sssom/pull/386 + - https://github.com/mapping-commons/sssom/blob/master/examples/schema/similarity_score.sssom.tsv + similarity_measure: + description: The measure used for computing a similarity score. + This field is meant to be used in conjunction with the similarity_score field, to document, + for example, the lexical or semantic match of a matching algorithm. + To make processing this field as unambiguous as possible, we recommend using + wikidata CURIEs, but the type of this field is deliberately unspecified. + range: string + examples: + - value: https://www.wikidata.org/entity/Q865360 + description: (the Wikidata IRI for the Jaccard index measure). + - value: wikidata:Q865360 + description: (the Wikidata CURIE for the Jaccard index measure). + - value: Levenshtein distance + description: (a score to measure the distance between two character sequences). + see_also: + - https://github.com/mapping-commons/sssom/issues/385 + - https://github.com/mapping-commons/sssom/pull/386 + - https://github.com/mapping-commons/sssom/blob/master/examples/schema/similarity_score.sssom.tsv + issue_tracker_item: + description: The issue tracker item discussing this mapping. 
+ range: EntityReference + examples: + - value: SSSOM_GITHUB_ISSUE:166 + description: (A URL resolving to an issue discussing a new SSSOM element request) + see_also: + - https://github.com/mapping-commons/sssom/issues/78 + - https://github.com/mapping-commons/sssom/pull/259 + - https://github.com/mapping-commons/sssom/blob/master/examples/schema/issue_tracker_item.sssom.tsv + issue_tracker: + description: A URL location of the issue tracker for this entity. + range: uri + examples: + - value: https://github.com/mapping-commons/mh_mapping_initiative/issues + description: (A URL resolving to the issue tracker of the Mouse-Human mapping initiative) + see_also: + - https://github.com/mapping-commons/sssom/issues/78 + - https://github.com/mapping-commons/sssom/pull/259 + - https://github.com/mapping-commons/sssom/blob/master/examples/schema/issue_tracker.sssom.tsv + see_also: + description: A URL specific for the mapping instance. E.g. for kboom we have a + per-mapping image that shows surrounding axioms that drive probability. Could + also be a github issue URL that discussed a complicated alignment + slot_uri: rdfs:seeAlso + range: string + multivalued: true + other: + description: Pipe separated list of key value pairs for properties not part of + the SSSOM spec. Can be used to encode additional provenance data. + range: string + comment: + description: Free text field containing either curator notes or text generated + by tool providing additional informative information. + slot_uri: rdfs:comment + range: string + extension_definitions: + description: A list that defines the extension slots used in the mapping set. 
+ range: extension definition + multivalued: true + see_also: + - https://github.com/mapping-commons/sssom/issues/328 + - https://github.com/mapping-commons/sssom/blob/master/examples/schema/extension-slots.sssom.tsv +classes: + mapping set: + description: Represents a set of mappings + slot_usage: + license: + required: true + slots: + - curie_map + - mappings + - mapping_set_id + - mapping_set_version + - mapping_set_source + - mapping_set_title + - mapping_set_description + - creator_id + - creator_label + - license + - subject_type + - subject_source + - subject_source_version + - object_type + - object_source + - object_source_version + - mapping_provider + - mapping_tool + - mapping_tool_version + - mapping_date + - publication_date + - subject_match_field + - object_match_field + - subject_preprocessing + - object_preprocessing + - see_also + - issue_tracker + - other + - comment + - extension_definitions + mapping: + description: Represents an individual mapping between a pair of entities + slots: + - subject_id + - subject_label + - subject_category + - predicate_id + - predicate_label + - predicate_modifier + - object_id + - object_label + - object_category + - mapping_justification + - author_id + - author_label + - reviewer_id + - reviewer_label + - creator_id + - creator_label + - license + - subject_type + - subject_source + - subject_source_version + - object_type + - object_source + - object_source_version + - mapping_provider + - mapping_source + - mapping_cardinality + - mapping_tool + - mapping_tool_version + - mapping_date + - publication_date + - confidence + - curation_rule + - curation_rule_text + - subject_match_field + - object_match_field + - match_string + - subject_preprocessing + - object_preprocessing + - similarity_score + - similarity_measure + - see_also + - issue_tracker_item + - other + - comment + class_uri: owl:Axiom + rules: + - preconditions: + slot_conditions: + subject_type: + equals_string: "rdfs literal" + postconditions: 
+ slot_conditions: + subject_label: + required: true + - preconditions: + slot_conditions: + subject_type: + none_of: + equals_string: "rdfs literal" + postconditions: + slot_conditions: + subject_id: + required: true + - preconditions: + slot_conditions: + object_type: + equals_string: "rdfs literal" + postconditions: + slot_conditions: + object_label: + required: true + - preconditions: + slot_conditions: + object_type: + none_of: + equals_string: "rdfs literal" + postconditions: + slot_conditions: + object_id: + required: true + mapping registry: + description: A registry for managing mapping sets. It holds a set of + mapping set references, and can import other registries. + slots: + - mapping_registry_id + - mapping_registry_title + - mapping_registry_description + - imports + - mapping_set_references + - documentation + - homepage + - issue_tracker + mapping set reference: + description: A reference to a mapping set. It allows to augment mapping + set metadata from the perspective of the registry, for example, providing + confidence, or a local filename or a grouping. + slots: + - mapping_set_id + - mirror_from + - registry_confidence + - mapping_set_group + - last_updated + - local_name + prefix: + slots: + - prefix_name + - prefix_url + extension definition: + description: A definition of an extension (non-standard) slot. + attributes: + slot_name: + description: The name of the extension slot. + range: ncname + required: true + property: + description: The property associated with the extension slot. It is + intended to provide a non-ambiguous meaning to the slot (contrary + to the slot_name, which for brevity reasons may be ambiguous). + range: uriorcurie + type_hint: + description: Expected type of the values of the extension slot. + range: uriorcurie + Propagatable: + class_uri: sssom:Propagatable + description: Metamodel extension class to describe slots whose value can be + propagated down from the MappingSet class to the Mapping class. 
+ see_also: + - https://github.com/mapping-commons/sssom/issues/305 + attributes: + propagated: + description: Indicates whether a slot can be propagated from a mapping + down to individual mappings. + range: boolean + NoTermFound: + class_uri: sssom:NoTermFound + description: sssom:NoTermFound can be used in place of a subject_id or object_id + when the corresponding entity could not be found. It SHOULD be used in conjuction with + a corresponding subject_source or object_source to signify where the term was not found. + see_also: + - https://github.com/mapping-commons/sssom/issues/28 + - https://github.com/mapping-commons/sssom/blob/master/examples/schema/no_term_found.sssom.tsv + diff --git a/data-raw/internal/sssom_slot_types.rds b/data-raw/internal/sssom_slot_types.rds new file mode 100644 index 00000000..a124dbea Binary files /dev/null and b/data-raw/internal/sssom_slot_types.rds differ diff --git a/data-raw/internal/sssom_spec.R b/data-raw/internal/sssom_spec.R new file mode 100644 index 00000000..bfd43f26 --- /dev/null +++ b/data-raw/internal/sssom_spec.R @@ -0,0 +1,58 @@ +## code to prepare `.sssom_spec`, `.sssom_slot_types`, and +## `.sssom_mapping_slots` internal datasets ## +# +# Capture official SSSOM specification and parse for use by DO.utils + +rlang::check_installed( + c("glue", "here", "httr", "purrr", "stringr", "yaml") # httr: HEAD() below +) + + +# identify latest SSSOM version and construct URL for raw YAML +sssom_version <- stringr::str_remove( + httr::HEAD("https://github.com/mapping-commons/sssom/releases/latest/")$url, + ".*/" +) +sssom_yaml_path <- glue::glue( + "https://raw.githubusercontent.com/mapping-commons/sssom/@sssom_version@/src/sssom_schema/schema/sssom_schema.yaml", + .open = "@", + .close = "@" +) + + +# download YAML schema and parse for internal use +outdir <- here::here("data-raw", "internal") +yaml_file <- file.path(outdir, paste0("sssom_schema-", sssom_version, ".yaml")) + +dl_status <- download.file(sssom_yaml_path, yaml_file) + +if (dl_status != 0) 
{ + rlang::abort( + glue::glue( + "Failed to download SSSOM specification from {sssom_yaml_path}" + ) + ) +} + +.sssom_spec <- yaml::read_yaml(yaml_file) +.sssom_spec$version <- sssom_version +.sssom_spec$access_date <- Sys.Date() + +.sssom_slot_types <- purrr::map_chr(.sssom_spec$slots, ~ .$range) +.sssom_mapping_slots <- .sssom_spec$classes$mapping$slots + +saveRDS( + .sssom_spec, + file = file.path(outdir, "sssom_spec.rds"), + compress = "bzip2" +) +saveRDS( + .sssom_slot_types, + file = file.path(outdir, "sssom_slot_types.rds"), + compress = "bzip2" +) +saveRDS( + .sssom_mapping_slots, + file = file.path(outdir, "sssom_mapping_slots.rds"), + compress = "bzip2" +) diff --git a/data-raw/internal/sssom_spec.rds b/data-raw/internal/sssom_spec.rds new file mode 100644 index 00000000..199abb36 Binary files /dev/null and b/data-raw/internal/sssom_spec.rds differ diff --git a/data-raw/sysdata-update.R b/data-raw/sysdata-update.R new file mode 100644 index 00000000..22aae504 --- /dev/null +++ b/data-raw/sysdata-update.R @@ -0,0 +1,34 @@ +rlang::check_installed(c("here", "usethis")) # one vector; 2nd arg is `reason` + +indir <- here::here("data-raw", "internal") + + +# DO Google Sheets reference +.DO_gs <- readRDS(file.path(indir, "DO_gs.rds")) + + +# HTML tags reference +.html_tags <- readRDS(file.path(indir, "html_tags.rds")) + + +# curation template specification +.curation_opts <- readRDS(file.path(indir, "curation_opts.rds")) +.sparql_dt_motif <- readRDS(file.path(indir, "sparql_dt_motif.rds")) + +# SSSOM specification +.sssom_spec <- readRDS(file.path(indir, "sssom_spec.rds")) +.sssom_slot_types <- readRDS(file.path(indir, "sssom_slot_types.rds")) +.sssom_mapping_slots <- readRDS(file.path(indir, "sssom_mapping_slots.rds")) + + +usethis::use_data( + .DO_gs, + .html_tags, + .curation_opts, + .sparql_dt_motif, + .sssom_spec, + .sssom_slot_types, + .sssom_mapping_slots, + internal = TRUE, + overwrite = TRUE +) diff --git a/inst/sparql/obo-data.rq b/inst/sparql/obo-data.rq new file mode 100644 index 
00000000..b56f09e8 --- /dev/null +++ b/inst/sparql/obo-data.rq @@ -0,0 +1,34 @@ +# returns all class annotations, with any axiom annotations, and axioms to +# named classes (no anonymous classes); with optional VALUES or FILTER clauses +PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> +PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> +PREFIX owl: <http://www.w3.org/2002/07/owl#> +PREFIX obo: <http://purl.obolibrary.org/obo/> +PREFIX oboInOwl: <http://www.geneontology.org/formats/oboInOwl#> + +SELECT ?id ?predicate ?value ?axiom_predicate ?axiom_value ?extra +WHERE { + #@values# + ?id a owl:Class ; + ?predicate ?value . + FILTER(!isBlank(?id) && !isBlank(?value)) + FILTER (?predicate NOT IN (oboInOwl:id, rdf:type)) + #@filter# + + OPTIONAL { + ?axiom a owl:Axiom ; + owl:annotatedSource ?id ; + owl:annotatedProperty ?predicate ; + owl:annotatedTarget ?value ; + ?axiom_predicate ?axiom_value . + FILTER ( + ?axiom_predicate NOT IN ( + rdf:type, owl:annotatedSource, owl:annotatedProperty, owl:annotatedTarget + ) + ) + } + + OPTIONAL { + ?value rdfs:label ?extra . + } +} diff --git a/man/curation_template.Rd b/man/curation_template.Rd new file mode 100644 index 00000000..1c85fccf --- /dev/null +++ b/man/curation_template.Rd @@ -0,0 +1,90 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/curation.R +\name{curation_template} +\alias{curation_template} +\alias{curation_template.NULL} +\alias{curation_template.obo_data} +\title{Create a Curation Template} +\usage{ +curation_template(.data = NULL, ss = NULL, sheet = NULL, ...) + +\method{curation_template}{`NULL`}(.data = NULL, ss = NULL, sheet = NULL, ..., nrow = 50) + +\method{curation_template}{obo_data}( + .data, + ss = NULL, + sheet = NULL, + ..., + id_max = 20, + n_id_sep = 2L, + debug = FALSE +) +} +\arguments{ +\item{.data}{Data to add to the curation sheet. 
If \code{NULL} (default), an empty +curation sheet will be created.} + +\item{ss}{Something that identifies a Google Sheet: +\itemize{ +\item its file id as a string or \code{\link[googledrive:drive_id]{drive_id}} +\item a URL from which we can recover the id +\item a one-row \code{\link[googledrive:dribble]{dribble}}, which is how googledrive +represents Drive files +\item an instance of \code{googlesheets4_spreadsheet}, which is what \code{\link[googlesheets4:gs4_get]{gs4_get()}} +returns +} + +Processed through \code{\link[googlesheets4:as_sheets_id]{as_sheets_id()}}.} + +\item{sheet}{(OPTIONAL) The sheet name, as a string. If \code{NULL} (default), the +sheet name will default to "curation-" with today's date appended (formatted +as "\%Y\%m\%d"; see \code{\link[=format.Date]{format.Date()}}).} + +\item{...}{Additional arguments passed to methods.} + +\item{nrow}{The number of rows to create in the curation template when +\code{.data = NULL} (default: \code{50}).} + +\item{id_max}{The maximum number of unique classes to include (default: \code{20}).} + +\item{n_id_sep}{The number of blank rows to insert between each \code{id} group +(default: \code{2}).} + +\item{debug}{Controls debug output. \code{FALSE} (default) writes to Google Sheets +normally. One or more of: +\itemize{ +\item \code{"output"}: returns the final data frame visibly instead of writing to +Google Sheets. +\item \code{"types"}: returns a list with \verb{$matched} (named character vector where +names are the original predicate strings and values are the resolved +\code{data_type} labels, as mapped by \code{.sparql_dt_motif}) and \verb{$unmatched} +(character vector of predicates not in \code{.sparql_dt_motif}, used as-is). +When combined with \code{"steps"}, the list is added as \verb{$types} in that output. +Combine with \code{"output"} to also return the final data frame. 
+\item \code{"steps"}: returns a named list of snapshots at each major pipeline step +(\code{filtered}, \code{pivoted}, \code{typed}, \code{output}); implies \code{"output"}. If \code{"types"} +is also requested, includes \verb{$types} in the returned list. +}} +} +\value{ +The Google Sheet info (\code{ss}), as a \link[googlesheets4:sheets_id]{googlesheets4::sheets_id}. +} +\description{ +Create a curation template in a Google Sheet, optionally including data. +} +\section{Formatting Limitations}{ + +Formatting to make data more visually distinct is not currently supported due +to limitations of the Google Sheets API and the \code{googlesheets4} package: +\itemize{ +\item Google Sheets API does not support assigning colors to data validation. +\item \code{googlesheets4} does not support any formatting. +} + +An alternative approach to support some formatting could be to create a +functional template with the desired formatting and copy that template with +\code{\link[googlesheets4:sheet_copy]{googlesheets4::sheet_copy()}}. The \code{data_type} could still be populated by +this function (only needed to support types not in +\code{.curation_opts$data_type}). +} + diff --git a/man/extract_obo_anon.Rd b/man/extract_obo_anon.Rd new file mode 100644 index 00000000..ef3ace2a --- /dev/null +++ b/man/extract_obo_anon.Rd @@ -0,0 +1,52 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/extract.R +\name{extract_obo_anon} +\alias{extract_obo_anon} +\title{Extract Anonymous Relationships from OBO Foundry Ontology} +\usage{ +extract_obo_anon( + obo_ont, + prefix = NULL, + id = NULL, + render = "label", + .robot_path = NULL +) +} +\arguments{ +\item{obo_ont}{The path to an ontology file, as a string.} + +\item{prefix}{A character vector of OBO prefixes (aka ID spaces) to filter +results to, or \code{NULL} (default) to return all axioms. 
\emph{Ignored if \code{id} is} +\emph{provided.}} + +\item{id}{A character vector of OBO IDs (CURIEs) to filter results to or +\code{NULL} (default) to return all entities with logical relations.} + +\item{render}{The format for rendering classes & properties, as a string. +One of: +\itemize{ +\item \code{"label"} (default): Use labels, quoting as needed. +\item \code{"id"}: Use OBO IDs (CURIEs). +}} + +\item{.robot_path}{The path to a ROBOT executable or .jar file, as a string. +When \code{NULL} (default), if a system ROBOT executable is available it will +be used, otherwise an error will be signaled. + +\strong{NOTE:} \code{DO.utils} caches the last ROBOT used for future use.} +} +\value{ +A tibble with the columns: \code{id}, \code{data_type}, and \code{value}, where \code{value} +is the axiom in Manchester syntax rendered according to \code{render}. +} +\description{ +Extracts all anonymous relations (logical/complex) from any OBO Foundry +Ontology in Manchester format, including equivalent classes, subclasses, and +disjoint classes. \code{extract_obo_anon()} is designed to supplement SPARQL +queries that generally cannot return anonymous relationships. +} +\section{NOTES}{ + +Uses \href{https://robot.obolibrary.org/export}{ROBOT export} internally. +} + diff --git a/man/extract_obo_data.Rd b/man/extract_obo_data.Rd new file mode 100644 index 00000000..1b0fba19 --- /dev/null +++ b/man/extract_obo_data.Rd @@ -0,0 +1,59 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/extract.R +\name{extract_obo_data} +\alias{extract_obo_data} +\title{Extract OBO Foundry Ontology Data} +\usage{ +extract_obo_data( + obo_ont, + prefix = NULL, + id = NULL, + include_anon = TRUE, + ..., + .robot_path = NULL +) +} +\arguments{ +\item{obo_ont}{The path to an ontology file, as a string.} + +\item{prefix}{A character vector of OBO prefixes (aka ID spaces) to filter +results to, or \code{NULL} (default) to return all axioms.
\emph{Ignored if \code{id} is} +\emph{provided.}} + +\item{id}{A character vector of OBO IDs (CURIEs) to filter results to or +\code{NULL} (default) to return all entities with logical relations.} + +\item{include_anon}{Whether to include anonymous relationships +(logical/complex) in the output, as a boolean (default: \code{TRUE}). See +\code{\link[=extract_obo_anon]{extract_obo_anon()}}.} + +\item{...}{ + Arguments passed on to \code{\link[=extract_obo_anon]{extract_obo_anon}} + \describe{ + \item{\code{render}}{The format for rendering classes & properties, as a string. +One of: +\itemize{ +\item \code{"label"} (default): Use labels, quoting as needed. +\item \code{"id"}: Use OBO IDs (CURIEs). +}} + }} + +\item{.robot_path}{The path to a ROBOT executable or .jar file, as a string. +When \code{NULL} (default), if a system ROBOT executable is available it will +be used, otherwise an error will be signaled. + +\strong{NOTE:} \code{DO.utils} caches the last ROBOT used for future use.} +} +\value{ +A tibble of class \code{obo_data} with the columns: \code{id}, \code{predicate}, +\code{value}, \verb{axiom predicate}, and \verb{axiom value}. An additional class is added +to indicate the file the data came from (without extension or directories). +} +\description{ +Extracts data from an OBO Foundry ontology. +} +\section{NOTES}{ + +Uses \href{https://robot.obolibrary.org/query}{ROBOT query} internally. 
+} + diff --git a/man/range_add_validation.Rd b/man/range_add_validation.Rd new file mode 100644 index 00000000..e017e5e0 --- /dev/null +++ b/man/range_add_validation.Rd @@ -0,0 +1,77 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/googlesheets.R +\name{range_add_validation} +\alias{range_add_validation} +\alias{range_add_checkbox} +\alias{range_add_dropdown} +\title{Add Data Validation to Google Sheet Range} +\usage{ +range_add_checkbox( + ss, + sheet = NULL, + range, + msg = "Value must be TRUE or FALSE", + quiet = TRUE +) + +range_add_dropdown( + ss, + sheet = NULL, + range, + values, + msg = "Choose a valid value", + reject_input = TRUE, + display_arrow = TRUE, + quiet = TRUE +) +} +\arguments{ +\item{ss}{Something that identifies a Google Sheet: +\itemize{ +\item its file id as a string or \code{\link[googledrive:drive_id]{drive_id}} +\item a URL from which we can recover the id +\item a one-row \code{\link[googledrive:dribble]{dribble}}, which is how googledrive +represents Drive files +\item an instance of \code{googlesheets4_spreadsheet}, which is what \code{\link[googlesheets4:gs4_get]{gs4_get()}} +returns +} + +Processed through \code{\link[googlesheets4:as_sheets_id]{as_sheets_id()}}.} + +\item{sheet}{Sheet to write into, in the sense of "worksheet" or "tab". You can identify a sheet by name, with a string, or by position, with a number. Ignored if the sheet is specified via \code{range}. If neither argument specifies the sheet, defaults to the first visible sheet.} + +\item{range}{Cells to apply data validation to. This \code{range} argument has +important similarities and differences to \code{range} elsewhere (e.g. +\code{\link[googlesheets4:range_read]{googlesheets4::range_read()}}): +\itemize{ +\item Similarities: Can be a cell range, using A1 notation ("A1:D3") or using +the helpers in \link[googlesheets4:cell-specification]{googlesheets4::cell-specification}. 
Can combine sheet +name and cell range ("Sheet1!A5:A") or refer to a sheet by name +(\code{range = "Sheet1"}, although \code{sheet = "Sheet1"} is preferred for clarity). +\item Difference: Can NOT be a named range. +}} + +\item{msg}{The message to display when the user types in a value that +violates the data validation rule. For \code{range_add_dropdown()}, only displayed +if \code{reject_input} is \code{TRUE}.} + +\item{values}{The values to use for the dropdown list, as a character vector.} + +\item{reject_input}{Whether to "Reject the input" (default: \code{TRUE}) if a +value violates the data validation rule or "Show a warning" (\code{FALSE}).} + +\item{display_arrow}{Whether to display a dropdown arrow next to the cell +(default: \code{TRUE}) or not (\code{FALSE}).} +} +\description{ +Add data validation to a Google Sheet range. +} +\section{Limitations of the Google Sheets API/\code{googlesheets4}}{ + +\itemize{ +\item The API does not support chipset multi-selection in dropdowns: +https://stackoverflow.com/questions/79653536/how-to-enable-multiple-selection-in-data-validation-dropdown-using-google-sheets +} +} + +\keyword{internal} diff --git a/man/spreadsheet_range.Rd b/man/spreadsheet_range.Rd new file mode 100644 index 00000000..f2e5629a --- /dev/null +++ b/man/spreadsheet_range.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/curation.R +\name{spreadsheet_range} +\alias{spreadsheet_range} +\title{Calculate a Spreadsheet Range} +\usage{ +spreadsheet_range(.data, .col, sheet = NULL, rows = NULL, n_header = 1) +} +\arguments{ +\item{.data}{A tibble.} + +\item{.col}{The column to use for the range, as a string.} + +\item{sheet}{(OPTIONAL) The sheet name, as a string. 
If \code{NULL} (default), the +sheet name will default to "curation-" with today's date appended (formatted +as "\%Y\%m\%d"; see \code{\link[=format.Date]{format.Date()}}).} + +\item{rows}{(OPTIONAL) The rows to use for the range, either as a continuous +integer vector or as a string (e.g. "1:10"). If \code{NULL} (default), the entire +column will be used.} + +\item{n_header}{The number of header rows to skip (default: \code{1}).} +} +\description{ +Calculate a range for a spreadsheet program (Google Sheets or Excel). +} +\keyword{internal} diff --git a/man/write_gs.Rd b/man/write_gs.Rd index 1f309de1..91e6d237 100644 --- a/man/write_gs.Rd +++ b/man/write_gs.Rd @@ -4,6 +4,7 @@ \alias{write_gs} \alias{write_gs.omim_inventory} \alias{write_gs.data.frame} +\alias{write_gs.curation_template} \title{Write Data to a Google Sheet} \usage{ write_gs(data, ss, sheet = NULL, hyperlink_curie = NULL, ...) @@ -17,6 +18,8 @@ write_gs(data, ss, sheet = NULL, hyperlink_curie = NULL, ...) ) \method{write_gs}{data.frame}(data, ss, sheet = "data-\%Y\%m\%d", hyperlink_curie = NULL, ...) + +\method{write_gs}{curation_template}(data, ss = NULL, sheet = "curation-\%Y\%m\%d", ...) } \arguments{ \item{data}{A data.frame, possibly with a defined method.} diff --git a/tests/testthat/test-utils-spreadsheet.R b/tests/testthat/test-utils-spreadsheet.R new file mode 100644 index 00000000..2c83efda --- /dev/null +++ b/tests/testthat/test-utils-spreadsheet.R @@ -0,0 +1,7 @@ +test_that("colnum_to_ss_letter() works", { + expect_equal(colnum_to_ss_letter(1), "A") + expect_equal(colnum_to_ss_letter(26), "Z") + expect_equal(colnum_to_ss_letter(27), "AA") + expect_equal(colnum_to_ss_letter(52), "AZ") + expect_equal(colnum_to_ss_letter(703), "AAA") +})