diff --git a/.gitignore b/.gitignore index fa43643..2a8a2b3 100644 --- a/.gitignore +++ b/.gitignore @@ -14,4 +14,5 @@ __dev/ # Agents .claude/ CLAUDE.md -AGENTS.md \ No newline at end of file +AGENTS.md +SKILL.md diff --git a/julia/RtemisA3/src/api.jl b/julia/RtemisA3/src/api.jl index c578b98..3c7bf84 100644 --- a/julia/RtemisA3/src/api.jl +++ b/julia/RtemisA3/src/api.jl @@ -22,12 +22,12 @@ function create_a3( variant !== nothing && (annot["variant"] = variant) raw = Dict{String,Any}( - "\$schema" => _A3_SCHEMA_URI, - "a3_version" => _A3_VERSION, - "sequence" => sequence, + "\$schema" => _A3_SCHEMA_URI, + "a3_version" => _A3_VERSION, + "sequence" => sequence, "annotations" => annot, + "metadata" => metadata !== nothing ? metadata : Dict{String,Any}(), ) - metadata !== nothing && (raw["metadata"] = metadata) A3(raw) end diff --git a/julia/RtemisA3/src/validate.jl b/julia/RtemisA3/src/validate.jl index 490807c..ebae1e9 100644 --- a/julia/RtemisA3/src/validate.jl +++ b/julia/RtemisA3/src/validate.jl @@ -236,10 +236,14 @@ function A3(raw::AbstractDict) end haskey(raw, "sequence") || throw(A3ValidationError("missing required field 'sequence'")) + haskey(raw, "annotations") || + throw(A3ValidationError("missing required field 'annotations'")) + haskey(raw, "metadata") || + throw(A3ValidationError("missing required field 'metadata'")) seq = validate_sequence(raw["sequence"], "sequence") - annotations = parse_annotations(get(raw, "annotations", Dict{String,Any}()), "annotations") - metadata = parse_metadata(get(raw, "metadata", Dict{String,Any}()), "metadata") + annotations = parse_annotations(raw["annotations"], "annotations") + metadata = parse_metadata(raw["metadata"], "metadata") validate_bounds(seq, annotations) A3(seq, annotations, metadata) diff --git a/python/rtemis_a3/src/rtemis/a3/api.py b/python/rtemis_a3/src/rtemis/a3/api.py index b7dc394..43c62fd 100644 --- a/python/rtemis_a3/src/rtemis/a3/api.py +++ b/python/rtemis_a3/src/rtemis/a3/api.py @@ -124,6 +124,12 
@@ def a3_from_json(text: str) -> A3: raise A3ParseError( f"'a3_version' must be '{_A3_VERSION}', got '{version_val}'" ) + if "sequence" not in data: + raise A3ParseError("missing required field 'sequence'") + if "annotations" not in data: + raise A3ParseError("missing required field 'annotations'") + if "metadata" not in data: + raise A3ParseError("missing required field 'metadata'") # Strip envelope keys before passing to the data model for key in _ENVELOPE_KEYS: data.pop(key, None) diff --git a/python/rtemis_a3/tests/test_api.py b/python/rtemis_a3/tests/test_api.py index 9fda01e..bea4e27 100644 --- a/python/rtemis_a3/tests/test_api.py +++ b/python/rtemis_a3/tests/test_api.py @@ -56,7 +56,7 @@ def test_bounds_error(self): # a3_from_json / a3_to_json # --------------------------------------------------------------------------- -MINIMAL_JSON = '{"$schema": "https://schema.rtemis.org/a3/v1/schema.json", "a3_version": "1.0.0", "sequence": "MAEPRQ"}' +MINIMAL_JSON = '{"$schema": "https://schema.rtemis.org/a3/v1/schema.json", "a3_version": "1.0.0", "sequence": "MAEPRQ", "annotations": {}, "metadata": {}}' FULL_JSON = """{ "$schema": "https://schema.rtemis.org/a3/v1/schema.json", @@ -104,9 +104,21 @@ def test_invalid_json(self): def test_valid_json_invalid_a3(self): with pytest.raises(A3ValidationError): a3_from_json( - '{"$schema": "https://schema.rtemis.org/a3/v1/schema.json", "a3_version": "1.0.0", "sequence": "M"}' + '{"$schema": "https://schema.rtemis.org/a3/v1/schema.json", "a3_version": "1.0.0", "sequence": "M", "annotations": {}, "metadata": {}}' ) # too short + def test_missing_annotations_field(self): + with pytest.raises(A3ParseError, match="annotations"): + a3_from_json( + '{"$schema": "https://schema.rtemis.org/a3/v1/schema.json", "a3_version": "1.0.0", "sequence": "MAEPRQ", "metadata": {}}' + ) + + def test_missing_metadata_field(self): + with pytest.raises(A3ParseError, match="metadata"): + a3_from_json( + '{"$schema": 
"https://schema.rtemis.org/a3/v1/schema.json", "a3_version": "1.0.0", "sequence": "MAEPRQ", "annotations": {}}' + ) + def test_missing_schema_field(self): with pytest.raises(A3ParseError, match=r"\$schema"): a3_from_json('{"a3_version": "1.0.0", "sequence": "MAEPRQ"}') diff --git a/r/.Rbuildignore b/r/.Rbuildignore index dda173c..b789db9 100644 --- a/r/.Rbuildignore +++ b/r/.Rbuildignore @@ -18,6 +18,5 @@ ^docs$ ^AGENTS\.md$ ^CLAUDE\.md$ -^NEWS\.md$ ^pkgdown$ ^SKILL\.md$ diff --git a/r/DESCRIPTION b/r/DESCRIPTION index 338b5a6..4efe4d9 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -1,14 +1,16 @@ Package: rtemis.a3 -Title: Amino Acid Annotation (A3) format -Version: 0.5.1 -Date: 2026-03-29 -Authors@R: - person(given = "E.D.", family = "Gennatas", role = c("aut", "cre"), +Title: Amino Acid Annotation (A3) Format +Version: 0.5.2 +Date: 2026-04-03 +Authors@R: + person(given = "E.D.", family = "Gennatas", role = c("aut", "cre", "cph"), email = "gennatas@gmail.com", comment = c(ORCID = "0000-0001-9280-3609")) -Description: Defines the annotated amino acid (A3) format using S7 classes. Provides functions to - create A3 objects, read and write A3 JSON files. -URL: https://rtemis.a3.rtemis.org +Description: Implements the Amino Acid Annotation (A3) format using 'S7' classes. + The A3 format is a structured 'JSON' schema for annotating amino acid sequences + with site, region, post-translational modification (PTMs), processing event, + and sequence variant annotations. Provides functions to create, read, and write A3 objects. +URL: https://a3.rtemis.org License: GPL (>= 3) Encoding: UTF-8 Roxygen: list(markdown = TRUE) @@ -22,8 +24,6 @@ Imports: utils Suggests: biomaRt, - colorspace, - dplyr, httr, jsonlite, seqinr, diff --git a/r/NEWS.md b/r/NEWS.md new file mode 100644 index 0000000..717bef5 --- /dev/null +++ b/r/NEWS.md @@ -0,0 +1,17 @@ +# rtemis.a3 0.5.2 + +## New features + +* Initial CRAN release. +* Defines the Amino Acid Annotation (A3) format using S7 classes. 
+* Core classes: `A3`, `A3Sequence`, `A3Annotation`, `A3Metadata`, `A3Site`, + `A3Region`, `A3PTM`, `A3Processing`, `A3Variant`. +* `create_A3()`: create A3 objects with full validation. +* `annotation_position()`, `annotation_range()`, `annotation_variant()`: + helpers to build annotation entries. +* `write_A3json()` / `read_A3json()`: serialize and deserialize A3 objects to + and from JSON files. +* `concat()`: concatenate a character vector into a single sequence string. +* Database utilities: `uniprot_to_A3()`, `uniprot_sequence()`, + `gene2sequence()`, `get_alphafold()`, `pdb_annotations()`, + `clinvar_variants()`. diff --git a/r/R/0_init.R b/r/R/0_init.R index e3e1541..7b880cb 100644 --- a/r/R/0_init.R +++ b/r/R/0_init.R @@ -724,7 +724,6 @@ format_caller <- function(call_stack, call_depth, caller_id, max_char = 30L) { # \code{current <- as.list(sys.call())[[1]]} #' #' @param ... Message to print -#' @param date Logical: if TRUE, include date and time in the prefix #' @param caller Character: Name of calling function #' @param call_depth Integer: Print the system call path of this depth. #' @param caller_id Integer: Which function in the call stack to print @@ -732,8 +731,9 @@ format_caller <- function(call_stack, call_depth, caller_id, max_char = 30L) { #' @param newline Logical: If TRUE end with a new line. #' @param format_fn Function: Formatting function to use on the message text. #' @param sep Character: Use to separate objects in `...` +#' @param verbosity Integer: Verbosity level. 
#' -#' @return Invisibly: List with call, message, and date +#' @return Invisibly: `NULL` #' #' @author EDG #' @noRd @@ -742,27 +742,28 @@ format_caller <- function(call_stack, call_depth, caller_id, max_char = 30L) { #' msg("Hello, world!") msg <- function( ..., - date = TRUE, caller = NULL, call_depth = 1L, caller_id = 1L, newline_pre = FALSE, newline = TRUE, format_fn = plain, - sep = " " + sep = " ", + verbosity = 1L ) { + if (verbosity == 0L) { + return(invisible(NULL)) + } if (is.null(caller)) { call_stack <- as.list(sys.calls()) caller <- format_caller(call_stack, call_depth, caller_id) - } + } # / get caller txt <- Filter(Negate(is.null), list(...)) if (newline_pre) { message("") } - if (date) { - msgdatetime() - } + msgdatetime() message( format_fn(paste(txt, collapse = sep)), appendLF = FALSE @@ -791,8 +792,12 @@ msg0 <- function( newline_pre = FALSE, newline = TRUE, format_fn = plain, - sep = "" + sep = "", + verbosity = 1L ) { + if (verbosity == 0L) { + return(invisible(NULL)) + } if (is.null(caller)) { call_stack <- as.list(sys.calls()) caller <- format_caller(call_stack, call_depth, caller_id) @@ -828,13 +833,9 @@ msg0 <- function( #' @noRd #' #' @examples -#' \dontrun{ -#' { -#' msg("Hello") -#' pcat("super", "wow") -#' pcat(NULL, "oooo") -#' } -#' } +#' msg("Hello") +#' pcat("super", "potato") +#' pcat(NULL, "oooo") pcat <- function(left, right, pad = 17, newline = TRUE) { lpad <- max(0, pad - 1 - max(0, nchar(left))) cat(pad_string(left), right) diff --git a/r/R/a3.R b/r/R/a3.R index ccd9f75..e962c15 100644 --- a/r/R/a3.R +++ b/r/R/a3.R @@ -955,6 +955,15 @@ A3from_json <- function(x, ...) { "Field {.field a3_version} must be {.val {.A3_VERSION}}, got {.val {version_field}}." 
) } + if (is.null(x[["sequence"]])) { + cli::cli_abort("JSON input missing required field {.field sequence}.") + } + if (is.null(x[["annotations"]])) { + cli::cli_abort("JSON input missing required field {.field annotations}.") + } + if (is.null(x[["metadata"]])) { + cli::cli_abort("JSON input missing required field {.field metadata}.") + } sequence <- x[["sequence"]] annotations <- x[["annotations"]] diff --git a/r/R/gene2sequence.R b/r/R/gene2sequence.R index be3ee42..b6a328b 100644 --- a/r/R/gene2sequence.R +++ b/r/R/gene2sequence.R @@ -14,7 +14,7 @@ #' #' @examples #' \dontrun{ -#' mapt_seqs <- gene2sequence("MAPT") +#' mapt_seq <- gene2sequence("MAPT") #' } gene2sequence <- function( gene, @@ -50,15 +50,14 @@ gene2sequence <- function( mart = mart ) - if (verbosity > 0) { - msg0( - "Found ", - bold(nrow(transcripts)), - " transcripts for gene ", - highlight(gene), - "." - ) - } + msg0( + "Found ", + bold(nrow(transcripts)), + " transcripts for gene ", + highlight(gene), + ".", + verbosity = verbosity + ) # Get sequence ---- # Retrieve sequence(s) using transcript ID diff --git a/r/R/rtemis_color_system.R b/r/R/rtemis_color_system.R index e3b71b0..6558046 100644 --- a/r/R/rtemis_color_system.R +++ b/r/R/rtemis_color_system.R @@ -27,39 +27,10 @@ rt_teal <- rtemis_teal rt_purple <- rtemis_purple rt_magenta <- rtemis_light_magenta highlight_col <- coastside_orange - col_object <- rt_green -#' rtemis Color System -#' -#' A named list of colors used consistently across all packages -#' in the rtemis ecosystem. -#' -#' Colors are provided as hex strings. 
-#' -#' @format A named list with the following elements: -#' \describe{ -#' \item{red}{"kaimana red"} -#' \item{blue}{"kaimana light blue"} -#' \item{green}{"kaimana medium green"} -#' \item{orange}{"coastside orange"} -#' \item{teal}{"rtemis teal"} -#' \item{purple}{"rtemis purple"} -#' \item{magenta}{"rtemis magenta"} -#' \item{highlight_col}{"highlight color"} -#' \item{object}{"rtemis teal"} -#' \item{info}{"lmd burgundy"} -#' \item{outer}{"kaimana red"} -#' \item{tuner}{"coastside orange"} -#' } -#' -#' @author EDG -#' -#' @noRd -#' -#' @examples -#' rtemis_colors[["teal"]] +# %% rtemis_colors ---- rtemis_colors <- list( red = kaimana_red, light_blue = kaimana_light_blue, diff --git a/r/R/utils_a3.R b/r/R/utils_a3.R index efb85d0..dc4fec8 100644 --- a/r/R/utils_a3.R +++ b/r/R/utils_a3.R @@ -55,20 +55,17 @@ aa_sub <- function(x, substitutions, verbosity = 1L) { from <- strngs[1] to <- strngs[length(strngs)] pos <- as.numeric(strngs[2:(length(strngs) - 1)] |> paste(collapse = "")) - if (verbosity > 0) { - msg( - "Substituting", - highlight(from), - "at position", - highlight(pos), - "with", - highlight(to) - ) - } + msg( + "Substituting", + highlight(from), + "at position", + highlight(pos), + "with", + highlight(to), + verbosity = verbosity + ) x[pos] <- to } - if (verbosity > 0) { - msg("All done.") - } + msg("All done.", verbosity = verbosity) x } diff --git a/r/R/utils_clinvar.R b/r/R/utils_clinvar.R index 5630f5a..b8e26f6 100644 --- a/r/R/utils_clinvar.R +++ b/r/R/utils_clinvar.R @@ -84,19 +84,20 @@ clinvar_variants <- function( uids <- search_dat[["esearchresult"]][["idlist"]] if (length(uids) == 0L) { - if (verbosity > 0L) { - msg("No ClinVar records found for gene:", highlight(gene)) - } - return(list()) - } - if (verbosity > 0L) { msg( - "Found", - highlight(length(uids)), - "ClinVar records for gene:", - highlight(gene) + "No ClinVar records found for gene:", + highlight(gene), + verbosity = verbosity ) + return(list()) } + msg( + "Found", + 
highlight(length(uids)), + "ClinVar records for gene:", + highlight(gene), + verbosity = verbosity + ) # -- Summarize: fetch variant details in batches ---- batches <- split(uids, ceiling(seq_along(uids) / batch_size)) @@ -199,9 +200,7 @@ clinvar_variants <- function( } if (length(variant_list) == 0L) { - if (verbosity > 0L) { - msg("No protein-level variants parsed.") - } + msg("No protein-level variants parsed.", verbosity = verbosity) return(list()) } @@ -230,13 +229,12 @@ clinvar_variants <- function( ) names(variant_list) <- final_names - if (verbosity > 0L) { - msg( - "Parsed", - highlight(length(variant_list)), - "protein-level variants." - ) - } + msg( + "Parsed", + highlight(length(variant_list)), + "protein-level variants.", + verbosity = verbosity + ) variant_list } diff --git a/r/R/utils_pdb.R b/r/R/utils_pdb.R index f3c2d16..91a5e06 100644 --- a/r/R/utils_pdb.R +++ b/r/R/utils_pdb.R @@ -78,13 +78,12 @@ pdb_annotations <- function( if (is.null(pdb_id)) { # First entry is PDBe's top-ranked structure pdb_id <- tolower(all_segs[[1L]][["pdb_id"]]) - if (verbosity > 0L) { - msg( - "Selected PDB structure:", - highlight(toupper(pdb_id)), - "(PDBe top-ranked)" - ) - } + msg( + "Selected PDB structure:", + highlight(toupper(pdb_id)), + "(PDBe top-ranked)", + verbosity = verbosity + ) } else { pdb_id <- tolower(pdb_id) seg_ids <- vapply( @@ -258,16 +257,15 @@ pdb_annotations <- function( region_list <- name_by_type(region_list) site_list <- name_by_type(site_list) - if (verbosity > 0L) { - msg( - "Parsed", - highlight(length(region_list)), - "secondary structure regions and", - highlight(length(site_list)), - "binding sites from", - highlight(toupper(pdb_id)) - ) - } + msg( + "Parsed", + highlight(length(region_list)), + "secondary structure regions and", + highlight(length(site_list)), + "binding sites from", + highlight(toupper(pdb_id)), + verbosity = verbosity + ) list(region = region_list, site = site_list) } diff --git a/r/R/utils_uniprot.R 
b/r/R/utils_uniprot.R index 4bb2acd..a0c4897 100644 --- a/r/R/utils_uniprot.R +++ b/r/R/utils_uniprot.R @@ -27,9 +27,7 @@ uniprot_sequence <- function( path <- paste0(base_url, "/", accession, ".fasta") dat <- seqinr::read.fasta(path, seqtype = "AA") - if (verbosity > 0L) { - msg("Got:", highlight(attr(dat[[1L]], "Annot"))) - } + msg("Got:", highlight(attr(dat[[1L]], "Annot")), verbosity = verbosity) paste(as.character(dat[[1L]]), collapse = "") } diff --git a/r/README.md b/r/README.md index 7add270..cb0c40d 100644 --- a/r/README.md +++ b/r/README.md @@ -12,10 +12,16 @@ which provides A3 implementations in R, TypeScript, Python, Julia, and Rust. ## Installation ```r -# From r-universe install.packages("rtemis.a3", repos = "https://rtemis-org.r-universe.dev") ``` +or using pak: + +```r +pak::repo_add(myuniverse = "https://rtemis-org.r-universe.dev") +pak::pak("rtemis.a3") +``` + ## Quick Start ```r @@ -43,18 +49,9 @@ a3 <- create_A3( print(a3) ``` -## Parsing JSON - -```r -a3 <- A3from_json("path/to/protein.json") -# or from a JSON string -a3 <- A3from_json(json_string) -``` - -## Serialization +## Read / Write JSON ```r -json_string <- to_json(a3) write_A3json(a3, "path/to/output.json") a3 <- read_A3json("path/to/protein.json") ``` @@ -101,8 +98,6 @@ entry is `{ index, type }` — bare arrays are rejected. 
Positions are | Function | Description | |---|---| -| `to_json(x)` | Serialize an A3 object to a JSON string | -| `A3from_json(x)` | Parse a JSON string or pre-parsed list into an A3 object | | `write_A3json(x, path)` | Write an A3 object to a JSON file | | `read_A3json(path)` | Read an A3 object from a JSON file | diff --git a/r/cran-comments.md b/r/cran-comments.md new file mode 100644 index 0000000..5ca35bc --- /dev/null +++ b/r/cran-comments.md @@ -0,0 +1,28 @@ +# CRAN Submission Comments + +## Test environments + +* macOS (local): R 4.4.x, via `devtools::check()` +* R CMD check via GitHub Actions (Ubuntu, macOS, Windows) + +## R CMD check results + +There were no ERRORs, WARNINGs, or NOTEs. + +## First submission + +This is the first submission of this package to CRAN. + +## Method references + +There are no published references describing the methods in this package. +The package implements original functionality for creating and manipulating +the Amino Acid Annotation (A3) format. + +## Suggested packages with external dependencies + +Several packages in `Suggests` (biomaRt, httr, jsonlite, seqinr) are used +only for optional database-fetching utilities (`uniprot_to_A3()`, +`gene2sequence()`, `get_alphafold()`, `pdb_annotations()`, +`clinvar_variants()`). Examples for these functions are wrapped in +`\dontrun{}` because they require network access and external API availability. 
diff --git a/r/man/gene2sequence.Rd b/r/man/gene2sequence.Rd index fb5d001..320a46f 100644 --- a/r/man/gene2sequence.Rd +++ b/r/man/gene2sequence.Rd @@ -34,7 +34,7 @@ Get the sequence of a gene } \examples{ \dontrun{ - mapt_seqs <- gene2sequence("MAPT") + mapt_seq <- gene2sequence("MAPT") } } \author{ diff --git a/r/man/rtemis.a3-package.Rd b/r/man/rtemis.a3-package.Rd index 83dd413..fbb71ca 100644 --- a/r/man/rtemis.a3-package.Rd +++ b/r/man/rtemis.a3-package.Rd @@ -11,11 +11,11 @@ Amino Acid Annotation format utilities \seealso{ Useful links: \itemize{ - \item \url{https://rtemis.a3.rtemis.org} + \item \url{https://a3.rtemis.org} } } \author{ -\strong{Maintainer}: E.D. Gennatas \email{gennatas@gmail.com} (\href{https://orcid.org/0000-0001-9280-3609}{ORCID}) +\strong{Maintainer}: E.D. Gennatas \email{gennatas@gmail.com} (\href{https://orcid.org/0000-0001-9280-3609}{ORCID}) [copyright holder] } diff --git a/r/tests/testthat/test_A3.R b/r/tests/testthat/test_A3.R index b1922f5..81f3aa4 100644 --- a/r/tests/testthat/test_A3.R +++ b/r/tests/testthat/test_A3.R @@ -592,12 +592,13 @@ test_that("A3from_json rejects legacy bare-array format", { "ptm": {}, "processing": {}, "variant": [] - } + }, + "metadata": {} }' expect_error(A3from_json(legacy_json), "index") }) -test_that("A3from_json handles missing metadata gracefully", { +test_that("A3from_json rejects missing metadata", { json <- '{ "$schema": "https://schema.rtemis.org/a3/v1/schema.json", "a3_version": "1.0.0", @@ -610,10 +611,7 @@ test_that("A3from_json handles missing metadata gracefully", { "variant": [] } }' - x <- A3from_json(json) - expect_s7_class(x, A3) - expect_identical(x@metadata@uniprot_id, "") - expect_identical(x@metadata@description, "") + expect_error(A3from_json(json), "metadata") }) test_that("A3from_json accepts pre-parsed list", { diff --git a/rust/Cargo.toml b/rust/Cargo.toml index 8856ad6..bcc66f2 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -9,6 +9,7 @@ path = "src/main.rs" 
[dependencies] clap = { version = "4", features = ["derive"] } +colored = "2" serde = { version = "1.0.228", features = ["derive"] } serde_json = "1.0.149" thiserror = "2.0.18" diff --git a/rust/README.md b/rust/README.md index 0fd1963..f3f304a 100644 --- a/rust/README.md +++ b/rust/README.md @@ -9,6 +9,10 @@ processing, and variant annotations. Part of the [rtemis-org/a3](https://github.com/rtemis-org/a3) monorepo, which provides A3 implementations in Python, TypeScript, R, Julia, and Rust. +> [!NOTE] +> The Rust code contains extensive comments and documentation as an educational tool to help learn +> Rust. + ## Installation Add to your `Cargo.toml`: @@ -38,44 +42,59 @@ Pass `-` as `` to read from stdin. **Options:** -| Flag | Description | -|---|---| -| `-l, --limit ` | Max sequence residues to display (default: 10) | -| `-q, --quiet` | Suppress all output; use exit code only | -| `-j, --json` | Output results in JSON format | -| `-h, --help` | Print help | -| `-V, --version` | Print version | +| Flag | Description | +| ----------------- | -------------------------------------------------------- | +| `-l, --limit ` | Max sequence residues to display (default: 20) | +| `-q, --quiet` | Suppress all output; use exit code only | +| `-j, --json` | Output results in JSON format | +| `-D, --diagnose` | Full step-by-step diagnostic validation (all errors) | +| `-h, --help` | Print help | +| `-V, --version` | Print version | **Example — valid file:** ``` $ a3 tau.json -✓ valid A3 schema version 1.0.0 (https://schema.rtemis.org/a3/v1/schema.json) -UniProt ID: P10636 -Description: Microtubule-associated protein tau -Reference: -Organism: Homo sapiens -Sequence: MAEPRQEFEV... 
(758) -Annotations: site: 2 region: 1 ptm: 3 processing: 0 variant: 5 + + ✓ valid A3 1.0.0 https://schema.rtemis.org/a3/v1/schema.json + + Sequence MAEPRQEFEVMEDHAGTYGL… (length = 441) + + Annotations + ├── site 2 + ├── region 1 + ├── ptm 3 + ├── processing — + └── variant 5 + + Metadata + ├── UniProt ID P10636 + ├── Description Microtubule-associated protein tau + ├── Reference — + └── Organism Homo sapiens ``` **Example — invalid file:** ``` $ a3 bad.json -✗ invalid: - - annotations.site.foo: position 999 is out of bounds for sequence of length 6 (must be 1–6) -UniProt ID: P10636 -... + + ✗ invalid + + ├── annotations.site.foo: position 999 is out of bounds for sequence of length 6 (must be 1–6) + └── annotations.region: annotation name must not be empty + + Sequence MAEPRQ (length = 6) + ... ``` **Exit codes:** -| Code | Meaning | -|---|---| -| `0` | Valid | -| `1` | Invalid (A3 validation errors) | -| `2` | Error (bad arguments, file not found, JSON parse failure) | +| Code | Meaning | +| ---- | --------------------------------------------------------- | +| `0` | Valid | +| `1` | Invalid (A3 validation errors) | +| `2` | Error (bad arguments, file not found, JSON parse failure) | Use `--quiet` for scripting: @@ -150,17 +169,17 @@ let vars = variants_at(&a3, 3); { "sequence": "MKTAYIAKQR", "annotations": { - "site": { "Active site": { "index": [3, 5], "type": "activeSite" } }, - "region": { "Repeat 1": { "index": [[1, 4]], "type": "" } }, - "ptm": { "Phospho": { "index": [7], "type": "" } }, + "site": { "Active site": { "index": [3, 5], "type": "activeSite" } }, + "region": { "Repeat 1": { "index": [[1, 4]], "type": "" } }, + "ptm": { "Phospho": { "index": [7], "type": "" } }, "processing": {}, - "variant": [{ "position": 3, "from": "K", "to": "R" }] + "variant": [{ "position": 3, "from": "K", "to": "R" }] }, "metadata": { - "uniprot_id": "P12345", + "uniprot_id": "P12345", "description": "Example protein", - "reference": "", - "organism": "Homo sapiens" + 
"reference": "", + "organism": "Homo sapiens" } } ``` @@ -174,17 +193,17 @@ entry is `{ index, type }` — bare arrays are rejected. Positions are ### Parsing and serialization -| Function | Description | -|---|---| -| `a3_from_json(text: &str)` | Parse a JSON string into a validated `A3` | +| Function | Description | +| -------------------------------------------- | --------------------------------------------------------------- | +| `a3_from_json(text: &str)` | Parse a JSON string into a validated `A3` | | `a3_to_json(a3: &A3, indent: Option)` | Serialize to JSON; `None` = compact, `Some(n)` = n-space indent | ### Queries -| Function | Description | -|---|---| -| `residue_at(a3: &A3, position: u32)` | Residue at a 1-based position; `None` if out of bounds | -| `variants_at<'a>(a3: &'a A3, position: u32)` | All variant records at a 1-based position | +| Function | Description | +| -------------------------------------------- | ------------------------------------------------------ | +| `residue_at(a3: &A3, position: u32)` | Residue at a 1-based position; `None` if out of bounds | +| `variants_at<'a>(a3: &'a A3, position: u32)` | All variant records at a 1-based position | ### Type hierarchy diff --git a/rust/src/diagnostic.rs b/rust/src/diagnostic.rs new file mode 100644 index 0000000..cd7b9d5 --- /dev/null +++ b/rust/src/diagnostic.rs @@ -0,0 +1,574 @@ +//! Diagnostic mode — full step-by-step A3 validation. +//! +//! Implements the 6-step plan from `specs/diagnostic.md`: +//! +//! 1. Valid JSON [fatal] +//! 2. Envelope: `$schema` and `a3_version` +//! 3. Top-level field presence, types, no unknown keys [fatal per field] +//! 4. Sequence value +//! 5. Annotation families: site, region, ptm, processing, variant +//! 6. Metadata fields +//! +//! Every non-fatal error is accumulated before returning, so the caller sees +//! all violations at once. Fatal errors halt only the steps that depend on +//! their output — unrelated checks still run. 
+ +use rtemis_a3::normalization::{normalize_positions, normalize_ranges, normalize_sequence}; +use rtemis_a3::{A3, A3_SCHEMA_URI, A3_VERSION, a3_from_json}; +use serde_json::{Map, Value}; + +const TOP_LEVEL_KEYS: &[&str] = &[ + "$schema", + "a3_version", + "sequence", + "annotations", + "metadata", +]; +const ANN_FAMILIES: &[&str] = &["site", "region", "ptm", "processing", "variant"]; +const METADATA_KEYS: &[&str] = &["uniprot_id", "description", "reference", "organism"]; + +// --------------------------------------------------------------------------- +// Public entry point +// --------------------------------------------------------------------------- + +/// Typed diagnostic failure — distinguishes a fatal parse error (exit 2) +/// from A3 validation errors (exit 1). +pub enum DiagnoseError { + /// Step 1 failed: the input is not valid JSON or not a JSON object. + /// Callers should exit with code 2 (system/parse error). + Fatal(Vec), + /// One or more A3 validation errors. Callers should exit with code 1. + Invalid(Vec), +} + +/// Full diagnostic validation of an A3 JSON string. +/// +/// Follows the 6-step plan in `specs/diagnostic.md`. Returns `Ok(A3)` when +/// every check passes, or `Err(DiagnoseError)` with every violation collected. +/// `DiagnoseError::Fatal` signals a JSON parse failure (exit 2); +/// `DiagnoseError::Invalid` signals A3 validation errors (exit 1). +/// +/// On success the standard `a3_from_json` path is used to construct the `A3`, +/// so the returned value is identical to what the fast path would produce. 
+pub fn a3_diagnose(text: &str) -> Result { + let mut errors: Vec = Vec::new(); + + // ----------------------------------------------------------------------- + // Step 1: Valid JSON [fatal] + // ----------------------------------------------------------------------- + + let value: Value = match serde_json::from_str(text) { + Ok(v) => v, + Err(e) => return Err(DiagnoseError::Fatal(vec![format!("Invalid JSON: {e}")])), + }; + + let obj = match value.as_object() { + Some(o) => o, + None => { + return Err(DiagnoseError::Fatal(vec![ + "Expected a JSON object at the top level".to_string(), + ])); + } + }; + + // ----------------------------------------------------------------------- + // Step 2: Envelope + // ----------------------------------------------------------------------- + + check_envelope(obj, &mut errors); + + // ----------------------------------------------------------------------- + // Step 3: Top-level field presence, types, unknown keys + // + // Each extraction returns `None` when the field is absent or has the wrong + // type — that `None` propagates to disable the steps that depend on it. + // ----------------------------------------------------------------------- + + let seq_raw = require_string_field(obj, "sequence", &mut errors); + + // `annotations` and `metadata` are required even when empty (`{}`). + let ann_obj = required_object_field(obj, "annotations", &mut errors); + let meta_obj = required_object_field(obj, "metadata", &mut errors); + + for key in obj.keys() { + if !TOP_LEVEL_KEYS.contains(&key.as_str()) { + errors.push(format!("unknown top-level key '{key}'")); + } + } + + // ----------------------------------------------------------------------- + // Step 4: Sequence value + // + // Normalize the raw string (uppercase, character set, min length). + // `seq_len` is `Some` only when this step fully passes — Step 5 needs it + // for bounds checking. 
+ // ----------------------------------------------------------------------- + + let seq_len: Option = seq_raw.and_then(|s| match normalize_sequence(s) { + Ok(normalized) => Some(normalized.len() as u32), + Err(e) => { + errors.push(e); + None + } + }); + + // ----------------------------------------------------------------------- + // Step 5: Annotation families + // ----------------------------------------------------------------------- + + if let Some(ann) = ann_obj { + check_annotations(ann, seq_len, &mut errors); + } + + // ----------------------------------------------------------------------- + // Step 6: Metadata fields + // ----------------------------------------------------------------------- + + if let Some(meta) = meta_obj { + check_metadata(meta, &mut errors); + } + + // ----------------------------------------------------------------------- + // Return + // ----------------------------------------------------------------------- + + if errors.is_empty() { + // All diagnostic checks passed — use the standard fast path to build + // a validated A3. This should never fail: if it does, the diagnostic + // checks have a gap that needs fixing. 
+ Ok(a3_from_json(text).expect("diagnostic passed but standard parse failed")) + } else { + Err(DiagnoseError::Invalid(errors)) + } +} + +// --------------------------------------------------------------------------- +// Step implementations +// --------------------------------------------------------------------------- + +fn check_envelope(obj: &Map, errors: &mut Vec) { + match obj.get("$schema") { + None => errors.push(format!("'$schema' is required; must be '{A3_SCHEMA_URI}'")), + Some(v) => match v.as_str() { + None => errors.push(format!( + "'$schema' must be a string; expected '{A3_SCHEMA_URI}'" + )), + Some(s) if s != A3_SCHEMA_URI => { + errors.push(format!("'$schema' must be '{A3_SCHEMA_URI}', got '{s}'")) + } + _ => {} + }, + } + + match obj.get("a3_version") { + None => errors.push(format!("'a3_version' is required; must be '{A3_VERSION}'")), + Some(v) => match v.as_str() { + None => errors.push(format!( + "'a3_version' must be a string; expected '{A3_VERSION}'" + )), + Some(s) if s != A3_VERSION => { + errors.push(format!("'a3_version' must be '{A3_VERSION}', got '{s}'")) + } + _ => {} + }, + } +} + +fn check_annotations(ann: &Map, seq_len: Option, errors: &mut Vec) { + for key in ann.keys() { + if !ANN_FAMILIES.contains(&key.as_str()) { + errors.push(format!("annotations: unknown family '{key}'")); + } + } + + if let Some(v) = ann.get("site") { + match v.as_object() { + Some(o) => check_site_entries(o, seq_len, errors), + None => errors.push("'annotations.site' must be an object".to_string()), + } + } + + if let Some(v) = ann.get("region") { + match v.as_object() { + Some(o) => check_region_entries(o, seq_len, errors), + None => errors.push("'annotations.region' must be an object".to_string()), + } + } + + if let Some(v) = ann.get("ptm") { + match v.as_object() { + Some(o) => check_flex_entries(o, "ptm", seq_len, errors), + None => errors.push("'annotations.ptm' must be an object".to_string()), + } + } + + if let Some(v) = ann.get("processing") { + 
match v.as_object() { + Some(o) => check_flex_entries(o, "processing", seq_len, errors), + None => errors.push("'annotations.processing' must be an object".to_string()), + } + } + + if let Some(v) = ann.get("variant") { + match v.as_array() { + Some(a) => check_variant_entries(a, seq_len, errors), + None => errors.push("'annotations.variant' must be an array".to_string()), + } + } +} + +fn check_site_entries( + entries: &Map, + seq_len: Option, + errors: &mut Vec, +) { + for (name, val) in entries { + if name.is_empty() { + errors.push("annotations.site: annotation name must not be empty".to_string()); + continue; + } + let field = format!("annotations.site.{name}"); + + let Some(entry) = require_object(val, &field, errors) else { + continue; + }; + let Some(index_val) = require_field(entry, "index", &field, errors) else { + continue; + }; + let Some(arr) = require_array(index_val, &format!("{field}.index"), errors) else { + continue; + }; + let Some(positions) = parse_positions(arr, &format!("{field}.index"), errors) else { + continue; + }; + + match normalize_positions(positions, &field) { + Err(e) => errors.push(e), + Ok(positions) => check_position_bounds(&positions, seq_len, &field, errors), + } + + check_kind_field(entry, &field, errors); + } +} + +fn check_region_entries( + entries: &Map, + seq_len: Option, + errors: &mut Vec, +) { + for (name, val) in entries { + if name.is_empty() { + errors.push("annotations.region: annotation name must not be empty".to_string()); + continue; + } + let field = format!("annotations.region.{name}"); + + let Some(entry) = require_object(val, &field, errors) else { + continue; + }; + let Some(index_val) = require_field(entry, "index", &field, errors) else { + continue; + }; + let Some(arr) = require_array(index_val, &format!("{field}.index"), errors) else { + continue; + }; + let Some(ranges) = parse_ranges(arr, &format!("{field}.index"), errors) else { + continue; + }; + + match normalize_ranges(ranges, &field) { + Err(e) => 
errors.push(e), + Ok(ranges) => check_range_bounds(&ranges, seq_len, &field, errors), + } + + check_kind_field(entry, &field, errors); + } +} + +fn check_flex_entries( + entries: &Map, + family: &str, + seq_len: Option, + errors: &mut Vec, +) { + for (name, val) in entries { + if name.is_empty() { + errors.push(format!( + "annotations.{family}: annotation name must not be empty" + )); + continue; + } + let field = format!("annotations.{family}.{name}"); + + let Some(entry) = require_object(val, &field, errors) else { + continue; + }; + let Some(index_val) = require_field(entry, "index", &field, errors) else { + continue; + }; + let Some(arr) = require_array(index_val, &format!("{field}.index"), errors) else { + continue; + }; + + // Detect positions vs ranges by the type of the first element. + // Empty arrays are valid for either — treat as positions (no-op). + let is_ranges = arr.first().map(|v| v.is_array()).unwrap_or(false); + + if is_ranges { + let Some(ranges) = parse_ranges(arr, &format!("{field}.index"), errors) else { + continue; + }; + match normalize_ranges(ranges, &field) { + Err(e) => errors.push(e), + Ok(ranges) => check_range_bounds(&ranges, seq_len, &field, errors), + } + } else { + let Some(positions) = parse_positions(arr, &format!("{field}.index"), errors) else { + continue; + }; + match normalize_positions(positions, &field) { + Err(e) => errors.push(e), + Ok(positions) => check_position_bounds(&positions, seq_len, &field, errors), + } + } + + check_kind_field(entry, &field, errors); + } +} + +fn check_variant_entries(entries: &[Value], seq_len: Option, errors: &mut Vec) { + for (i, val) in entries.iter().enumerate() { + let field = format!("annotations.variant[{i}]"); + + let Some(entry) = require_object(val, &field, errors) else { + continue; + }; + + match entry.get("position") { + None => errors.push(format!("{field}: missing required field 'position'")), + Some(v) => match v.as_u64().and_then(|n| u32::try_from(n).ok()) { + None => 
errors.push(format!("{field}.position: must be a positive integer")), + Some(0) => errors.push(format!("{field}.position: must be ≥ 1 (1-based); got 0")), + Some(pos) => { + if let Some(len) = seq_len + && pos > len + { + errors.push(format!( + "{field}.position: {pos} is out of bounds \ + for sequence of length {len} (must be 1–{len})" + )); + } + } + }, + } + } +} + +fn check_metadata(meta: &Map, errors: &mut Vec) { + for key in meta.keys() { + if !METADATA_KEYS.contains(&key.as_str()) { + errors.push(format!("metadata: unknown field '{key}'")); + } + } + for &key in METADATA_KEYS { + if let Some(v) = meta.get(key) + && !v.is_string() + { + errors.push(format!("metadata.{key}: must be a string")); + } + } +} + +// --------------------------------------------------------------------------- +// Bounds helpers +// --------------------------------------------------------------------------- + +fn check_position_bounds( + positions: &[u32], + seq_len: Option, + field: &str, + errors: &mut Vec, +) { + let Some(len) = seq_len else { return }; + for &pos in positions { + if pos > len { + errors.push(format!( + "{field}.index: position {pos} is out of bounds \ + for sequence of length {len} (must be 1–{len})" + )); + } + } +} + +fn check_range_bounds( + ranges: &[[u32; 2]], + seq_len: Option, + field: &str, + errors: &mut Vec, +) { + let Some(len) = seq_len else { return }; + for [_start, end] in ranges { + if *end > len { + errors.push(format!( + "{field}.index: range endpoint {end} is out of bounds \ + for sequence of length {len} (must be 1–{len})" + )); + } + } +} + +// --------------------------------------------------------------------------- +// Field extraction helpers +// --------------------------------------------------------------------------- + +/// Require a string field in `obj`. Pushes an error and returns `None` if +/// absent or not a string. 
+fn require_string_field<'a>( + obj: &'a Map, + key: &str, + errors: &mut Vec, +) -> Option<&'a str> { + match obj.get(key) { + None => { + errors.push(format!("'{key}' is required")); + None + } + Some(v) => match v.as_str() { + Some(s) => Some(s), + None => { + errors.push(format!("'{key}' must be a string")); + None + } + }, + } +} + +/// Require an object field in `obj`. Pushes an error and returns `None` if +/// absent or not an object. +fn required_object_field<'a>( + obj: &'a Map, + key: &str, + errors: &mut Vec, +) -> Option<&'a Map> { + match obj.get(key) { + None => { + errors.push(format!("'{key}' is required")); + None + } + Some(v) => match v.as_object() { + Some(o) => Some(o), + None => { + errors.push(format!("'{key}' must be an object")); + None + } + }, + } +} + +fn require_object<'a>( + val: &'a Value, + field: &str, + errors: &mut Vec, +) -> Option<&'a Map> { + match val.as_object() { + Some(o) => Some(o), + None => { + errors.push(format!("{field}: must be an object")); + None + } + } +} + +fn require_field<'a>( + obj: &'a Map, + key: &str, + field: &str, + errors: &mut Vec, +) -> Option<&'a Value> { + match obj.get(key) { + Some(v) => Some(v), + None => { + errors.push(format!("{field}: missing required field '{key}'")); + None + } + } +} + +fn require_array<'a>( + val: &'a Value, + field: &str, + errors: &mut Vec, +) -> Option<&'a Vec> { + match val.as_array() { + Some(a) => Some(a), + None => { + errors.push(format!("{field}: must be an array")); + None + } + } +} + +/// Parse an array of JSON values as `Vec` positions. +/// +/// Returns `None` if any element is not a non-negative integer that fits in +/// `u32` — all bad elements are reported before returning. 
+fn parse_positions(arr: &[Value], field: &str, errors: &mut Vec) -> Option> { + let mut positions = Vec::with_capacity(arr.len()); + let mut ok = true; + for (i, v) in arr.iter().enumerate() { + match v.as_u64().and_then(|n| u32::try_from(n).ok()) { + Some(pos) => positions.push(pos), + None => { + errors.push(format!("{field}[{i}]: must be a positive integer")); + ok = false; + } + } + } + ok.then_some(positions) +} + +/// Parse an array of JSON values as `Vec<[u32; 2]>` ranges. +/// +/// Each element must be a 2-element array of non-negative integers that fit in +/// `u32`. All bad elements are reported before returning `None`. +fn parse_ranges(arr: &[Value], field: &str, errors: &mut Vec) -> Option> { + let mut ranges = Vec::with_capacity(arr.len()); + let mut ok = true; + for (i, v) in arr.iter().enumerate() { + let elem = format!("{field}[{i}]"); + match v.as_array() { + None => { + errors.push(format!("{elem}: must be a [start, end] array")); + ok = false; + } + Some(pair) if pair.len() != 2 => { + errors.push(format!( + "{elem}: must be a 2-element [start, end] array, got {} elements", + pair.len() + )); + ok = false; + } + Some(pair) => { + let s = pair[0].as_u64().and_then(|n| u32::try_from(n).ok()); + let e = pair[1].as_u64().and_then(|n| u32::try_from(n).ok()); + match (s, e) { + (Some(s), Some(e)) => ranges.push([s, e]), + _ => { + errors.push(format!("{elem}: start and end must be positive integers")); + ok = false; + } + } + } + } + } + ok.then_some(ranges) +} + +/// Check that the optional `"type"` field in an annotation entry is a string. +fn check_kind_field(entry: &Map, field: &str, errors: &mut Vec) { + if let Some(v) = entry.get("type") + && !v.is_string() + { + errors.push(format!("{field}.type: must be a string")); + } +} diff --git a/rust/src/error.rs b/rust/src/error.rs index 9b4bf00..50bfdbe 100644 --- a/rust/src/error.rs +++ b/rust/src/error.rs @@ -3,8 +3,9 @@ //! All fallible operations return `Result`. The two variants //! 
map onto the two failure modes described in the A3 spec: //! -//! - [`A3Error::Parse`] — the input was not valid JSON -//! - [`A3Error::Validate`] — the JSON parsed but violated A3 rules +//! - [`A3Error::Parse`] — the input was not valid JSON +//! - [`A3Error::Serialize`] — a valid A3 value could not be serialized to JSON +//! - [`A3Error::Validate`] — the JSON parsed but violated A3 rules // `thiserror::Error` is a derive macro that generates the boilerplate needed // to make our enum implement the standard `std::error::Error` trait. diff --git a/rust/src/lib.rs b/rust/src/lib.rs index 0402fdc..8b177b3 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -38,7 +38,8 @@ pub mod validation; // `use rtemis_a3::A3` instead of `use rtemis_a3::types::A3`. pub use error::A3Error; pub use types::{ - A3, A3Index, Annotations, FlexEntry, Metadata, RegionEntry, SiteEntry, VariantRecord, + A3, A3_SCHEMA_URI, A3_VERSION, A3Index, Annotations, FlexEntry, Metadata, RegionEntry, + SiteEntry, VariantRecord, }; pub use validation::validate; @@ -93,9 +94,9 @@ pub fn a3_from_json(text: &str) -> Result { pub fn a3_to_json(a3: &A3, indent: Option) -> Result { match indent { // Compact output — single line, no extra whitespace. - // `.map_err(A3Error::Serialize)` converts the serde_json::Error into - // the correct variant. We cannot use `?` here because `#[from]` is - // only implemented for A3Error::Parse, not A3Error::Serialize. + // `.map_err(A3Error::Serialize)` is required before `?` because + // `#[from]` is only on A3Error::Parse, so serde_json::Error does not + // auto-convert into A3Error::Serialize. None => Ok(serde_json::to_string(a3).map_err(A3Error::Serialize)?), // Pretty output with a custom indent width. @@ -129,9 +130,9 @@ pub fn a3_to_json(a3: &A3, indent: Option) -> Result { /// /// Returns `None` if `position` is 0 or beyond the sequence length. 
/// -/// `Option` is Rust's null-safe alternative to nullable values — there is -/// no `null` or `None` that can sneak in unexpectedly; you must explicitly -/// handle the `None` case wherever you use the result. +/// `Option` is Rust's null-safe alternative to nullable values — unlike +/// `null` in other languages, the compiler forces callers to handle both +/// `Some(value)` and `None` before they can use the result. pub fn residue_at(a3: &A3, position: u32) -> Option { if position == 0 || position > a3.sequence.len() as u32 { return None; diff --git a/rust/src/main.rs b/rust/src/main.rs index 8812b81..ae82a03 100644 --- a/rust/src/main.rs +++ b/rust/src/main.rs @@ -3,10 +3,13 @@ //! Usage: `a3 [OPTIONS] ` //! Pass `-` as `` to read from stdin. +mod diagnostic; + use clap::Parser; -use rtemis_a3::{A3, A3Error, validate}; +use colored::Colorize; +use rtemis_a3::{A3, A3_SCHEMA_URI, A3_VERSION, A3Error, validate}; use serde_json::{Value, json}; -use std::io::{self, Read}; +use std::io::{self, IsTerminal, Read}; use std::process; // --------------------------------------------------------------------------- @@ -24,7 +27,7 @@ struct Cli { file: String, /// Maximum number of sequence residues to display - #[arg(short, long, default_value_t = 10)] + #[arg(short, long, default_value_t = 20)] limit: usize, /// Suppress all output; use exit code only @@ -34,49 +37,236 @@ struct Cli { /// Output results in JSON format #[arg(short, long)] json: bool, + + /// Run full diagnostic validation (accumulates all errors) + #[arg(short = 'D', long)] + diagnose: bool, } // --------------------------------------------------------------------------- // Output helpers // --------------------------------------------------------------------------- +/// Word-wrap `text` to `width` columns, returning one string per line. +/// +/// Words that individually exceed `width` are placed on their own line +/// unbroken. If `text` fits within `width`, returns a single-element vec. 
+fn wrap_words(text: &str, width: usize) -> Vec { + if width == 0 || text.chars().count() <= width { + return vec![text.to_string()]; + } + let mut lines: Vec = Vec::new(); + let mut current = String::new(); + let mut current_width = 0usize; + for word in text.split_whitespace() { + let word_width = word.chars().count(); + if current.is_empty() { + current.push_str(word); + current_width = word_width; + } else if current_width + 1 + word_width <= width { + current.push(' '); + current.push_str(word); + current_width += 1 + word_width; + } else { + lines.push(current.clone()); + current = word.to_string(); + current_width = word_width; + } + } + if !current.is_empty() { + lines.push(current); + } + if lines.is_empty() { + vec![text.to_string()] + } else { + lines + } +} + +/// Build the parenthetical name hint for an annotation row. +/// +/// Shows up to 3 names. Appends `…` if there are more than 3 total, or if +/// any name had to be cropped to stay within `available` display columns. +/// `available` is the space for the content *inside* the parentheses. +fn build_hint(names: &[String], available: usize) -> String { + if names.is_empty() || available < 2 { + return String::new(); + } + let more_than_three = names.len() > 3; + let mut result = String::new(); + + for (i, name) in names.iter().take(3).enumerate() { + let sep = if i == 0 { "" } else { ", " }; + let candidate = format!("{}{}", sep, name); + let after_cols = result.chars().count() + candidate.chars().count(); + // Reserve 1 display column for "…" unless this is provably the last item. + let is_last = i + 1 == names.len() && !more_than_three; + let reserve = if is_last { 0 } else { 1 }; + + if after_cols + reserve <= available { + result.push_str(&candidate); + } else { + // Crop: append "…" to whatever we've accumulated so far. 
+ if result.chars().count() < available { + result.push('…'); + } + return result; + } + } + + if more_than_three && result.chars().count() < available { + result.push('…'); + } + result +} + /// Print human-readable output. /// /// `errors` is empty when the file is valid, non-empty when validation failed. /// In both cases we print whatever metadata and stats are available. fn print_human(a3: &A3, errors: &[String], limit: usize) { + println!(); + // --- Status line --- if errors.is_empty() { - println!("✓ valid A3 schema version 1.0.0 (https://schema.rtemis.org/a3/v1/schema.json)"); + println!( + " {} {} {}", + "✓ valid".green().bold(), + format!("A3 {}", a3.a3_version()) + .bold() + .truecolor(71, 156, 255), + a3.schema().dimmed(), + ); } else { - println!("✗ invalid:"); - for e in errors { - println!(" - {e}"); + println!(" {}", "✗ invalid".red().bold()); + println!(); + let last = errors.len() - 1; + for (i, e) in errors.iter().enumerate() { + let connector = if i == last { "└──" } else { "├──" }; + println!(" {} {}", connector.dimmed(), e.red()); } } - let meta = a3.metadata(); - let ann = a3.annotations(); + println!(); + + // --- Sequence --- let seq = a3.sequence(); let n = limit.min(seq.len()); - let seq_line = if seq.len() > n { - format!("{}... 
({})", &seq[..n], seq.len()) + let seq_display = if seq.len() > n { + format!("{}… (length = {})", &seq[..n], seq.len()) } else { - format!("{} ({})", seq, seq.len()) + format!("{} (length = {})", seq, seq.len()) }; - - println!("UniProt ID: {}", meta.uniprot_id()); - println!("Description: {}", meta.description()); - println!("Reference: {}", meta.reference()); - println!("Organism: {}", meta.organism()); - println!("Sequence: {}", seq_line); println!( - "Annotations: site: {} region: {} ptm: {} processing: {} variant: {}", - ann.site().len(), - ann.region().len(), - ann.ptm().len(), - ann.processing().len(), - ann.variant().len(), + " {} {}", + "Sequence".bold(), + seq_display.truecolor(220, 150, 86) ); + + // --- Annotations --- + println!(); + println!(" {}", "Annotations".bold()); + + let ann = a3.annotations(); + + // Sorted names per family (all of them — build_hint decides how many fit). + // Variant has no names; show positions instead. + let mut site_names: Vec = ann.site().keys().cloned().collect(); + site_names.sort(); + let mut region_names: Vec = ann.region().keys().cloned().collect(); + region_names.sort(); + let mut ptm_names: Vec = ann.ptm().keys().cloned().collect(); + ptm_names.sort(); + let mut proc_names: Vec = ann.processing().keys().cloned().collect(); + proc_names.sort(); + let var_names: Vec = ann + .variant() + .iter() + .map(|v| format!("pos {}", v.position())) + .collect(); + + let entries = [ + ("site", ann.site().len(), site_names), + ("region", ann.region().len(), region_names), + ("ptm", ann.ptm().len(), ptm_names), + ("processing", ann.processing().len(), proc_names), + ("variant", ann.variant().len(), var_names), + ]; + let last = entries.len() - 1; + for (i, (name, count, names)) in entries.iter().enumerate() { + let connector = if i == last { "└──" } else { "├──" }; + let padded = format!("{:<12}", name); + let count_str = if *count == 0 { + "—".dimmed().to_string() + } else { + count.to_string().truecolor(220, 150, 
86).to_string() + }; + // Columns consumed before the opening paren: + // 2 (indent) + 3 (connector) + 1 (space) + 12 (padded name) + count digits + 2 (gap) + 1 '(' + let prefix_cols = 21 + + if *count == 0 { + 1 + } else { + count.to_string().len() + }; + let available = 90usize.saturating_sub(prefix_cols + 1); // +1 for ')' + let hint_content = build_hint(names, available); + let hint = if hint_content.is_empty() { + String::new() + } else { + format!(" {}", format!("({})", hint_content).dimmed()) + }; + println!(" {} {}{}{}", connector.dimmed(), padded, count_str, hint); + } + + // --- Metadata --- + println!(); + println!(" {}", "Metadata".bold()); + + let meta = a3.metadata(); + let meta_rows: [(&str, &str); 4] = [ + ("UniProt ID", meta.uniprot_id()), + ("Description", meta.description()), + ("Reference", meta.reference()), + ("Organism", meta.organism()), + ]; + let label_width = meta_rows.iter().map(|(l, _)| l.len()).max().unwrap_or(0); + // 2 (indent) + 3 (connector) + 1 (space) + label_width + 2 (gap) + let value_col = 8 + label_width; + let value_width = 90usize.saturating_sub(value_col); + let last = meta_rows.len() - 1; + for (i, (label, value)) in meta_rows.iter().enumerate() { + let is_last = i == last; + let connector = if is_last { "└──" } else { "├──" }; + // Non-last items get a │ at the connector column to keep the list + // visually uninterrupted across wrapped value lines. + let continuation = if is_last { + " ".repeat(value_col) + } else { + format!(" {}{}", "│".dimmed(), " ".repeat(value_col - 3)) + }; + if value.is_empty() { + println!( + " {} {: Result { fn main() { let cli = Cli::parse(); + // Disable colors when stdout is not a terminal (pipe, redirect, --quiet). + if !std::io::stdout().is_terminal() { + colored::control::set_override(false); + } + // Read input — exit 2 on I/O error. 
let content = read_input(&cli.file).unwrap_or_else(|e| { if !cli.quiet { @@ -140,24 +335,105 @@ fn main() { process::exit(2); }); + // --diagnose: full step-by-step validation that accumulates all errors. + if cli.diagnose { + match diagnostic::a3_diagnose(&content) { + Ok(a3) => { + if !cli.quiet { + if cli.json { + println!( + "{}", + serde_json::to_string_pretty(&build_json(&a3, &[], cli.limit)).unwrap() + ); + } else { + print_human(&a3, &[], cli.limit); + } + } + process::exit(0); + } + Err(err) => { + let (errors, exit_code) = match &err { + diagnostic::DiagnoseError::Fatal(e) => (e.as_slice(), 2i32), + diagnostic::DiagnoseError::Invalid(e) => (e.as_slice(), 1i32), + }; + if !cli.quiet { + if cli.json { + println!( + "{}", + serde_json::to_string_pretty(&json!({ + "valid": false, + "errors": errors, + })) + .unwrap() + ); + } else { + println!("\n {}", "✗ invalid".red().bold()); + println!(); + let last = errors.len() - 1; + for (i, msg) in errors.iter().enumerate() { + let connector = if i == last { "└──" } else { "├──" }; + println!(" {} {}", connector.dimmed(), msg.red()); + } + println!(); + } + } + process::exit(exit_code); + } + } + } + // Stage 1: JSON parse — exit 2 on failure. let raw: A3 = match serde_json::from_str(&content) { Ok(r) => r, Err(e) => { if !cli.quiet { - let msg = format!("Failed to parse JSON: {e}"); + let mut errors = vec![format!("Invalid A3: {e}")]; + + // Even though full deserialization failed, try parsing to a + // generic Value so we can check envelope fields and surface + // *all* errors at once instead of just the first serde failure. 
+ if let Ok(value) = serde_json::from_str::(&content) { + match value.get("$schema").and_then(|v| v.as_str()) { + Some(s) if s != A3_SCHEMA_URI => { + errors.push(format!("'$schema' must be '{A3_SCHEMA_URI}', got '{s}'")); + } + None => { + errors.push(format!( + "'$schema' is required and must be '{A3_SCHEMA_URI}'" + )); + } + _ => {} + } + match value.get("a3_version").and_then(|v| v.as_str()) { + Some(v) if v != A3_VERSION => { + errors.push(format!("'a3_version' must be '{A3_VERSION}', got '{v}'")); + } + None => { + errors.push(format!( + "'a3_version' is required and must be '{A3_VERSION}'" + )); + } + _ => {} + } + } + if cli.json { println!( "{}", serde_json::to_string_pretty(&json!({ "valid": false, - "errors": [msg], + "errors": errors, })) .unwrap() ); } else { - println!("✗ invalid:"); - println!(" - {msg}"); + println!("\n {}", "✗ invalid".red().bold()); + println!(); + let last = errors.len() - 1; + for (i, msg) in errors.iter().enumerate() { + let connector = if i == last { "└──" } else { "├──" }; + println!(" {} {}", connector.dimmed(), msg.red()); + } } } process::exit(2); diff --git a/rust/src/types.rs b/rust/src/types.rs index 9b4a126..b87c769 100644 --- a/rust/src/types.rs +++ b/rust/src/types.rs @@ -282,9 +282,9 @@ impl Metadata { // --------------------------------------------------------------------------- /// Expected value for the `$schema` envelope field. -pub(crate) const A3_SCHEMA_URI: &str = "https://schema.rtemis.org/a3/v1/schema.json"; +pub const A3_SCHEMA_URI: &str = "https://schema.rtemis.org/a3/v1/schema.json"; /// Expected value for the `a3_version` envelope field. -pub(crate) const A3_VERSION: &str = "1.0.0"; +pub const A3_VERSION: &str = "1.0.0"; /// The root A3 object. /// @@ -305,12 +305,10 @@ pub struct A3 { /// Lowercase input is normalized to uppercase during validation. pub(crate) sequence: String, - /// All annotation families. Defaults to all-empty if omitted from JSON. - #[serde(default)] + /// All annotation families. 
Required; use an empty object `{}` if none. pub(crate) annotations: Annotations, - /// Sequence metadata. Defaults to all-empty strings if omitted from JSON. - #[serde(default)] + /// Sequence metadata. Required; use an empty object `{}` if none. pub(crate) metadata: Metadata, } diff --git a/specs/A3.md b/specs/A3.md index 95f1ec8..c36fb7b 100644 --- a/specs/A3.md +++ b/specs/A3.md @@ -93,10 +93,24 @@ A3 ### sequence +- Required on JSON input; always present in serialized output - Non-empty string; minimum 2 characters - Characters: `[A-Z*]` — standard IUPAC amino acid codes plus `*` (stop codon) - Normalization: lowercase input is uppercased on parse +### annotations + +- Required on JSON input; always present in serialized output +- Object keyed by the five families `site`, `region`, `ptm`, `processing`, `variant`; a family omitted on input defaults to empty +- Use an empty object `{}` when there are no annotations + +### metadata + +- Required on JSON input; always present in serialized output +- Object with four string fields: `uniprot_id`, `description`, `reference`, `organism` +- All four fields are optional within the object (default `""`) +- Use an empty object `{}` when there is no metadata + ### Positions (`integer[]`) An ordered collection of 1-based residue positions. 
@@ -167,7 +181,9 @@ Performed field-by-field on raw input: - `$schema`: required string; must equal `"https://schema.rtemis.org/a3/v1/schema.json"` - `a3_version`: required string; must equal `"1.0.0"` -- `sequence`: non-empty, `[A-Za-z*]+` (uppercased on parse), ≥ 2 characters +- `sequence`: required non-empty string, `[A-Za-z*]+` (uppercased on parse), ≥ 2 characters +- `annotations`: required object; empty object `{}` accepted +- `metadata`: required object; empty object `{}` accepted - Positions: positive integers, sorted, deduplicated - Ranges: positive integers, `start < end`, sorted, overlapping ranges rejected - Annotation entries: `{ index, type }` objects — bare arrays rejected diff --git a/specs/cli.md b/specs/cli.md index 675759e..4509a0c 100644 --- a/specs/cli.md +++ b/specs/cli.md @@ -12,26 +12,42 @@ Pass `-` as `` to read from stdin. ## Human-readable output (default) +Output mirrors the schema structure: sequence → annotations → metadata. + ``` -✓ valid A3 schema version 1.0.0 (https://schema.rtemis.org/a3/v1/schema.json) -UniProt ID: P10636 -Description: Microtubule-associated protein tau -Reference: -Organism: Homo sapiens -Sequence: MAEPRQEFEV... 
(758) -Annotations: site: 2 region: 1 ptm: 3 processing: 0 variant: 5 + ✓ valid A3 1.0.0 https://schema.rtemis.org/a3/v1/schema.json + + Sequence MAEPRQEFEVMEDHAGTYGL… (length = 441) + + Annotations + ├── site 2 + ├── region 1 + ├── ptm 3 + ├── processing — + └── variant 5 + + Metadata + ├── UniProt ID P10636 + ├── Description Microtubule-associated protein tau + ├── Reference — + └── Organism Homo sapiens ``` -Or on failure: +On failure, errors are listed first (with tree connectors), followed by +whatever metadata and stats are available from the partial parse: ``` -✗ invalid: - - annotations.site.foo: position 999 is out of bounds for sequence of length 6 (must be 1–6) - - annotations.region: annotation name must not be empty + ✗ invalid + + ├── annotations.site.foo: position 999 is out of bounds for sequence of length 6 (must be 1–6) + └── annotations.region: annotation name must not be empty + + Sequence MAEPRQ (length = 6) + ... ``` -- Sequence preview shows the first `min(l, sequence_length)` residues, with total length in parentheses. -- All errors are listed (not just the first). +- Sequence preview shows the first `min(l, sequence_length)` residues, followed by the total length as `(length = N)`. +- All errors are collected and listed before returning (not just the first). ## JSON output (`-j, --json`) @@ -65,9 +81,10 @@ fields are absent. ## Options - ``: Path to the `.json` file to validate. Use `-` for stdin. -- `-l, --limit `: Limit the number of sequence residues displayed (default: 10) +- `-l, --limit `: Limit the number of sequence residues displayed (default: 20) - `-q, --quiet`: Suppress all output; use exit code only - `-j, --json`: Output results in JSON format +- `-D, --diagnose`: Run full step-by-step diagnostic validation (accumulates all errors) - `-h, --help`: Print help information - `-V, --version`: Print version information @@ -81,4 +98,16 @@ fields are absent. `clap` emits exit code 2 for argument errors automatically. 
I/O and parse failures also exit 2 so callers can distinguish "invalid A3" from "tool could -not run." +not run." In `--diagnose` mode the same contract applies: fatal parse failures +exit 2, A3 validation errors exit 1. + +## Styling + +- `✓ valid` — bold green; `✗ invalid` — bold red; status line indented like all other output +- Schema name and version (`A3 1.0.0`) — bold rgb(71, 156, 255); URL — dimmed +- Errors — red +- `Sequence`, `Annotations`, `Metadata` section headers — bold +- Annotation and metadata field names — dimmed +- All values (sequence, counts, metadata) — rgb(220, 150, 86) +- Zero annotation counts and empty metadata values rendered as dimmed `—` +- Colors disabled automatically when stdout is not a terminal (`NO_COLOR` respected) \ No newline at end of file diff --git a/specs/diagnostic.md b/specs/diagnostic.md new file mode 100644 index 0000000..60761b3 --- /dev/null +++ b/specs/diagnostic.md @@ -0,0 +1,34 @@ +# Diagnostic mode + +Principled step-by-step check of A3 files. All errors are accumulated and +reported together. Steps marked **[fatal]** halt diagnostics if they fail — +subsequent steps cannot run without their output. All other steps are +**continuable**: failure is recorded but checking proceeds. + +1. **Valid JSON** [fatal] + Check that the input is syntactically valid JSON. + +2. **Envelope** — `$schema` and `a3_version` + Check that both fields are present and equal the required values. + +3. **Top-level field presence, types, and no unknown keys** [fatal per field] + Check that `sequence` (string), `annotations` (object), and `metadata` + (object) are present and of the correct type, and that no unknown top-level + keys exist. A field with the wrong type is fatal for the steps that depend + on it (e.g. a non-object `annotations` blocks step 5). + +4. **Sequence value** + Check character validity (standard amino acid letters or `*`) and minimum + length. Required for bounds checking in step 5; if this step fails, + bounds checking is skipped. + +5. 
**Annotation families**, one by one: `site`, `region`, `ptm`, `processing`, `variant` + For each family: + a. Correct container type (object for site/region/ptm/processing, array for variant). + b. Entry structure and index field types. + c. Bounds: every position and range endpoint within sequence length + (skipped if step 4 failed). + +6. **Metadata fields** + Check that all fields (`uniprot_id`, `description`, `reference`, `organism`) + are strings if present, and that no unknown metadata keys exist. diff --git a/typescript/src/schemas.ts b/typescript/src/schemas.ts index e157d2d..fbfe806 100644 --- a/typescript/src/schemas.ts +++ b/typescript/src/schemas.ts @@ -108,14 +108,8 @@ export const A3InputSchema = z .min(2, "sequence must be at least 2 characters") .regex(/^[A-Za-z*]+$/, "sequence must contain only amino acid letters [A-Za-z] or '*'") .transform((s) => s.toUpperCase()), - annotations: AnnotationsSchema.default({ - site: {}, - region: {}, - ptm: {}, - processing: {}, - variant: [], - }), - metadata: MetadataSchema.default({}), + annotations: AnnotationsSchema, + metadata: MetadataSchema, }) .strict() .superRefine((data, ctx) => { diff --git a/typescript/tests/a3.test.ts b/typescript/tests/a3.test.ts index ed63779..da730f7 100644 --- a/typescript/tests/a3.test.ts +++ b/typescript/tests/a3.test.ts @@ -52,6 +52,7 @@ describe("A3 constructor", () => { a3_version: A3_VERSION, sequence: "MKTAY", annotations: { site: { A: { index: [99], type: "" } } }, + metadata: {}, }), ).toThrow(A3ValidationError); }); @@ -166,6 +167,8 @@ describe("A3.toJSON and JSON.stringify", () => { $schema: A3_SCHEMA_URI, a3_version: A3_VERSION, sequence: "MKTAY", + annotations: {}, + metadata: {}, }); const parsed = JSON.parse(a3.toJSONString()) as { annotations: Record }; expect(parsed.annotations).toHaveProperty("site"); @@ -183,6 +186,7 @@ describe("A3.toJSON and JSON.stringify", () => { annotations: { site: { A: { index: [1, 2] } }, // type omitted — defaults to "" }, + metadata: 
{}, }); const parsed = JSON.parse(a3.toJSONString()) as { annotations: { site: { A: { type: string } } }; diff --git a/typescript/tests/schemas.test.ts b/typescript/tests/schemas.test.ts index 44c7a16..eb87f9d 100644 --- a/typescript/tests/schemas.test.ts +++ b/typescript/tests/schemas.test.ts @@ -63,11 +63,23 @@ describe("annotation validation", () => { expect(result.success).toBe(false); }); - it("defaults missing annotations families to empty", () => { + it("rejects missing annotations object", () => { const result = A3InputSchema.safeParse({ $schema: A3_SCHEMA_URI, a3_version: A3_VERSION, sequence: "MKTAYIAKQR", + metadata: {}, + }); + expect(result.success).toBe(false); + }); + + it("accepts empty annotations object and defaults families to empty", () => { + const result = A3InputSchema.safeParse({ + $schema: A3_SCHEMA_URI, + a3_version: A3_VERSION, + sequence: "MKTAYIAKQR", + annotations: {}, + metadata: {}, }); expect(result.success).toBe(true); if (result.success) { @@ -289,11 +301,23 @@ describe("variant validation", () => { }); describe("metadata validation", () => { - it("defaults all metadata fields to empty string", () => { + it("rejects missing metadata object", () => { + const result = A3InputSchema.safeParse({ + $schema: A3_SCHEMA_URI, + a3_version: A3_VERSION, + sequence: "MKTAY", + annotations: {}, + }); + expect(result.success).toBe(false); + }); + + it("defaults all metadata fields to empty string when metadata is {}", () => { const result = A3InputSchema.safeParse({ $schema: A3_SCHEMA_URI, a3_version: A3_VERSION, sequence: "MKTAY", + annotations: {}, + metadata: {}, }); expect(result.success).toBe(true); if (result.success) { @@ -311,6 +335,7 @@ describe("metadata validation", () => { $schema: A3_SCHEMA_URI, a3_version: A3_VERSION, sequence: "MKTAY", + annotations: {}, metadata: { uniprot_id: "P10636" }, }); expect(result.success).toBe(true);