From 3bae680e25b97ac9989a6f4c97f565cb0701194a Mon Sep 17 00:00:00 2001 From: Simone Silvestri Date: Fri, 22 May 2026 13:22:57 +0200 Subject: [PATCH 1/7] first commit --- Project.toml | 2 + src/DataWrangling/DataModes/DataModes.jl | 115 +++++++ .../DataModes/data_manifest_wrangling.jl | 277 +++++++++++++++ src/DataWrangling/DataModes/dry_run_value.jl | 88 +++++ .../DataModes/parse_and_rewrite_script.jl | 118 +++++++ src/DataWrangling/DataWrangling.jl | 47 ++- src/DataWrangling/ECCO/ECCO.jl | 5 + src/DataWrangling/EN4/EN4.jl | 1 + src/DataWrangling/ERA5/ERA5.jl | 6 +- src/DataWrangling/ETOPO/ETOPO.jl | 1 + src/DataWrangling/GEBCO/GEBCO.jl | 3 +- src/DataWrangling/GLORYS/GLORYS.jl | 6 +- src/DataWrangling/IBCAO/IBCAO.jl | 3 +- src/DataWrangling/IBCSO/IBCSO.jl | 1 + src/DataWrangling/JRA55/JRA55.jl | 2 + src/DataWrangling/ORCA/ORCA.jl | 2 + src/DataWrangling/OSPapa/OSPapa.jl | 4 +- src/DataWrangling/WOA/WOA.jl | 2 + src/DataWrangling/metadata.jl | 39 ++- src/DataWrangling/metadata_field.jl | 2 +- .../metadata_field_time_series.jl | 4 +- src/DataWrangling/restoring.jl | 2 +- test/test_data_modes.jl | 320 ++++++++++++++++++ 23 files changed, 1024 insertions(+), 26 deletions(-) create mode 100644 src/DataWrangling/DataModes/DataModes.jl create mode 100644 src/DataWrangling/DataModes/data_manifest_wrangling.jl create mode 100644 src/DataWrangling/DataModes/dry_run_value.jl create mode 100644 src/DataWrangling/DataModes/parse_and_rewrite_script.jl create mode 100644 test/test_data_modes.jl diff --git a/Project.toml b/Project.toml index 98a0ba09..dbbf6d63 100644 --- a/Project.toml +++ b/Project.toml @@ -31,6 +31,7 @@ SeawaterPolynomials = "d496a93d-167e-4197-9f49-d3af4ff8fe40" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" Thermodynamics = "b60c26fb-14c3-4610-9d3e-2d17fe7ff00c" +TOML = "fa267f1f-6049-4f14-aa54-33bafae1ed76" ZipFile = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea" [weakdeps] @@ -90,6 +91,7 @@ SpeedyWeather = 
"0.20" StaticArrays = "1" Statistics = "<0.0.1, 1" Thermodynamics = "0.15.3" +TOML = "<0.0.1, 1" WorldOceanAtlasTools = "0.6" ZipFile = "0.10" julia = "1.10" diff --git a/src/DataWrangling/DataModes/DataModes.jl b/src/DataWrangling/DataModes/DataModes.jl new file mode 100644 index 00000000..c30daf5a --- /dev/null +++ b/src/DataWrangling/DataModes/DataModes.jl @@ -0,0 +1,115 @@ +""" + DataModes + +Three-mode download dispatch and a declarative `DataManifest.toml` for NumericalEarth. +Modes are selected by the `NUMERICALEARTH_DATA` environment variable: + +| Value | Behavior | +|-------------------|------------------------------------------------------------| +| `"auto"` (default)| Download on demand (current behavior). | +| `"existing"` | Error if any required file is missing. Never download. | +| `"build:<path>"` | Trace the running script; write a manifest to `<path>`. | + +See [`NumericalEarth.DataWrangling.download_dataset`](@ref) for the dispatch and [`build_dataset_manifest`](@ref) for the trace entry point. +""" +module DataModes + +using DocStringExtensions: TYPEDSIGNATURES +using TOML: TOML + +using ..DataWrangling: DataWrangling, AbstractMetadata, Metadata, Metadatum, MetadataSet, BoundingBox, Column, Linear, Nearest +using ..DataWrangling: DatewiseFilename, metadata_path, default_download_directory, download_dataset + +export DryRunValue +export build_dataset_manifest, download_datasets +export register_dataset! + +const DATA_MODE = Ref{Symbol}(:auto) + +include("dry_run_value.jl") +include("data_manifest_wrangling.jl") +include("parse_and_rewrite_script.jl") + +DataWrangling.observe_metadata(m::Metadata) = (DATA_MODE[] === :build && record_for_manifest(m); nothing) +DataWrangling.observe_metadata(m::MetadataSet) = (DATA_MODE[] === :build && record_for_manifest(m); nothing) + +""" + $(TYPEDSIGNATURES) + +Acquire every dataset listed in `metadata...` (varargs form) or in the manifest at `path` +(file-path form). 
Each entry is routed through [`download_dataset`](@ref), so the current +`NUMERICALEARTH_DATA` mode applies. + +For the file-path form, pass `dir` to override the default download directory for every reconstructed +entry (e.g. when login-node and compute-node filesystems differ). +""" +function download_datasets(metadata::AbstractMetadata...) + foreach(download_dataset, metadata) + return nothing +end + +function download_datasets(path::AbstractString; dir = nothing) + foreach(download_dataset, read_manifest(path; dir)) + return nothing +end + +expected_paths(metadata::Metadatum) = String[metadata_path(metadata)] + +function expected_paths(metadata::Metadata) + p = metadata_path(metadata) + return p isa Vector ? collect(String, p) : String[p] +end + +function expected_paths(mset::MetadataSet) + paths = String[] + for name in mset.names + append!(paths, expected_paths(mset[name])) + end + return paths +end + +""" + $(TYPEDSIGNATURES) + +Verify that every file required by `metadata` is already on disk. Raises a single error listing +every missing file. Returns `nothing` on success. 
+""" +function check_files_exist(metadata::AbstractMetadata) + paths = expected_paths(metadata) + missing_paths = filter(p -> !isfile(p), paths) + isempty(missing_paths) && return nothing + list = join((" " * p for p in missing_paths), "\n") + error("NUMERICALEARTH_DATA=existing: $(length(missing_paths)) required file(s) missing:\n$list") +end + +function __init__() + env = get(ENV, "NUMERICALEARTH_DATA", "auto") + mode, path = parse_data_mode(env) + DATA_MODE[] = mode + MANIFEST_PATH[] = path + mode === :build || return nothing + + if !isempty(Base.PROGRAM_FILE) + script = abspath(Base.PROGRAM_FILE) + atexit() do + try + build_dataset_manifest(script; manifest = MANIFEST_PATH[]) + @info "NUMERICALEARTH_DATA=build: wrote manifest via AST trace" path=MANIFEST_PATH[] script + catch err + @error "NUMERICALEARTH_DATA=build: trace failed" path=MANIFEST_PATH[] script exception=(err, catch_backtrace()) + end + end + else + atexit() do + try + write_manifest(MANIFEST_PATH[], copy(RECORDED)) + @info "NUMERICALEARTH_DATA=build: wrote manifest" path=MANIFEST_PATH[] entries=length(RECORDED) + catch err + @error "NUMERICALEARTH_DATA=build: failed to write manifest" path=MANIFEST_PATH[] exception=(err, catch_backtrace()) + end + end + end + return nothing +end + +end # module diff --git a/src/DataWrangling/DataModes/data_manifest_wrangling.jl b/src/DataWrangling/DataModes/data_manifest_wrangling.jl new file mode 100644 index 00000000..dc6134e2 --- /dev/null +++ b/src/DataWrangling/DataModes/data_manifest_wrangling.jl @@ -0,0 +1,277 @@ +const MANIFEST_PATH = Ref{String}("") +const RECORDED = AbstractMetadata[] +const DATASET_REGISTRY = Dict{String, Any}() + +""" + $(TYPEDSIGNATURES) + +Parse a `NUMERICALEARTH_DATA` value into a `(mode, manifest_path)` tuple. + +Recognized values: +- `""` or `"auto"` → `(:auto, "")` +- `"existing"` → `(:existing, "")` +- `"build:<path>"` → `(:build, "<path>")` + +Throws `ArgumentError` on any other value or on `"build:"` without a path. 
+""" +function parse_data_mode(s::AbstractString) + (isempty(s) || s == "auto") && return (:auto, "") + s == "existing" && return (:existing, "") + if startswith(s, "build:") + path = s[length("build:")+1:end] + isempty(path) && throw(ArgumentError("`NUMERICALEARTH_DATA=build:` requires a non-empty manifest path")) + return (:build, path) + end + throw(ArgumentError("Unrecognized NUMERICALEARTH_DATA value: $(repr(s)). Expected \"auto\", \"existing\", or \"build:\".")) +end + +""" + $(TYPEDSIGNATURES) + +Record `metadata` into [`RECORDED`](@ref) for later serialization to a `DataManifest.toml`. Deduplication +is by `metadata` equality on the recorded vector. Returns `nothing`. +""" +function record_for_manifest(metadata::AbstractMetadata) + any(==(metadata), RECORDED) || push!(RECORDED, metadata) + return nothing +end + +""" + $(TYPEDSIGNATURES) + +Register a dataset constructor under a string name so that the manifest writer can serialize it +(`dataset = "Name"`) and the loader can reconstruct it via `DATASET_REGISTRY[name]()`. Idempotent. +""" +function register_dataset!(constructor, name::AbstractString) + DATASET_REGISTRY[String(name)] = constructor + return nothing +end + +function dataset_name(d) + T = typeof(d) + for (name, ctor) in DATASET_REGISTRY + ctor === T && return name + end + return string(nameof(T)) +end + +region_to_dict(::Nothing) = nothing + +function region_to_dict(bb::BoundingBox) + d = Dict{String, Any}("kind" => "BoundingBox") + bb.longitude === nothing || (d["longitude"] = collect(bb.longitude)) + bb.latitude === nothing || (d["latitude"] = collect(bb.latitude)) + bb.z === nothing || (d["z"] = collect(bb.z)) + return d +end + +function region_to_dict(col::Column) + interp = col.interpolation isa Nearest ? 
"Nearest" : "Linear" + d = Dict{String, Any}("kind" => "Column", "longitude" => col.longitude, "latitude" => col.latitude, "interpolation" => interp) + col.z === nothing || (d["z"] = collect(col.z)) + return d +end + +filename_to_toml(::Nothing) = nothing +filename_to_toml(s::AbstractString) = String(s) +filename_to_toml(f::DatewiseFilename) = collect(String, f.filenames) + +function metadata_to_dict(m::Metadatum) + d = Dict{String, Any}("variable_name" => String(m.name), "dataset" => dataset_name(m.dataset)) + m.dates === nothing || (d["date"] = m.dates) + m.region === nothing || (d["region"] = region_to_dict(m.region)) + m.filename === nothing || (d["filename"] = filename_to_toml(m.filename)) + return d +end + +function metadata_to_dict(m::Metadata) + d = Dict{String, Any}("variable_name" => String(m.name), "dataset" => dataset_name(m.dataset), + "start_date" => first(m.dates), "end_date" => last(m.dates)) + m.region === nothing || (d["region"] = region_to_dict(m.region)) + m.filename === nothing || (d["filename"] = filename_to_toml(m.filename)) + return d +end + +function metadata_to_dict(mset::MetadataSet) + d = Dict{String, Any}("variable_names" => [String(n) for n in mset.names], "dataset" => dataset_name(mset.dataset)) + if mset.dates isa AbstractVector + d["start_date"] = first(mset.dates) + d["end_date"] = last(mset.dates) + elseif mset.dates !== nothing + d["date"] = mset.dates + end + mset.region === nothing || (d["region"] = region_to_dict(mset.region)) + return d +end + +manifest_table_key(::Metadatum) = "metadatum" +manifest_table_key(::Metadata) = "metadata" +manifest_table_key(::MetadataSet) = "metadataset" + +""" + $(TYPEDSIGNATURES) + +Serialize `records` (a vector of `AbstractMetadata`) to `io` as a `DataManifest.toml` with three +table arrays: `[[metadatum]]`, `[[metadata]]`, `[[metadataset]]`. + +The download directory (`dir`) is not stored. 
The loader uses each dataset's default directory +unless overridden by `download_datasets(...; dir=...)`. +""" +function write_manifest(io::IO, records::AbstractVector) + grouped = Dict{String, Vector{Dict{String, Any}}}("metadatum" => [], + "metadata" => [], + "metadataset" => []) + for r in records + push!(grouped[manifest_table_key(r)], metadata_to_dict(r)) + end + for k in ("metadatum", "metadata", "metadataset") + isempty(grouped[k]) && delete!(grouped, k) + end + TOML.print(io, grouped) + return nothing +end + +function write_manifest(path::AbstractString, records::AbstractVector) + open(io -> write_manifest(io, records), path, "w") + return nothing +end + +##### +##### filename and `region` reconstruction +##### + +region_from_toml(::Nothing) = nothing + +function region_from_toml(d::AbstractDict) + kind = d["kind"] + if kind == "BoundingBox" + longitude = haskey(d, "longitude") ? Tuple(d["longitude"]) : nothing + latitude = haskey(d, "latitude") ? Tuple(d["latitude"]) : nothing + z = haskey(d, "z") ? Tuple(d["z"]) : nothing + return BoundingBox(; longitude, latitude, z) + elseif kind == "Column" + z = haskey(d, "z") ? Tuple(d["z"]) : nothing + interpolation = get(d, "interpolation", "Linear") == "Nearest" ? Nearest() : Linear() + return Column(d["longitude"], d["latitude"]; z, interpolation) + else + throw(ArgumentError("Unknown region kind: $(repr(kind))")) + end +end + +filename_from_toml(::Nothing) = nothing +filename_from_toml(s::AbstractString) = String(s) +filename_from_toml(v::AbstractVector) = DatewiseFilename(collect(String, v)) + +function lookup_dataset(name::AbstractString) + haskey(DATASET_REGISTRY, name) || + throw(ArgumentError("Unknown dataset $(repr(name)). 
Did you `using` the dataset module so its __init__ runs and registers it?")) + return Base.invokelatest(DATASET_REGISTRY[name]) +end + +##### +##### AbstractMetadata reconstruction +##### + +function from_toml(kind::Symbol, entry::AbstractDict; dir = nothing) + dataset = lookup_dataset(entry["dataset"]) + region = region_from_toml(get(entry, "region", nothing)) + filename = filename_from_toml(get(entry, "filename", nothing)) + download_dir = dir === nothing ? default_download_directory(dataset) : String(dir) + if kind === :metadatum + name = Symbol(entry["variable_name"]) + return Metadatum(name; dataset, region, filename, dir = download_dir, date = get(entry, "date", nothing)) + elseif kind === :metadata + name = Symbol(entry["variable_name"]) + return Metadata(name; dataset, region, filename, dir = download_dir, + start_date = entry["start_date"], end_date = entry["end_date"]) + elseif kind === :metadataset + names = Tuple(Symbol(n) for n in entry["variable_names"]) + haskey(entry, "date") && + return MetadataSet(names...; dataset, region, dir = download_dir, date = entry["date"]) + return MetadataSet(names...; dataset, region, dir = download_dir, + start_date = entry["start_date"], end_date = entry["end_date"]) + else + throw(ArgumentError("Unknown manifest record kind: $(repr(kind))")) + end +end + +""" + $(TYPEDSIGNATURES) + +Read a `DataManifest.toml` and reconstruct every record as the matching `Metadatum`/`Metadata`/`MetadataSet`. +Datasets are looked up by name in [`DATASET_REGISTRY`](@ref). + +Pass `dir` to override every reconstructed record's download directory (useful when login-node and +compute-node filesystems differ); otherwise `default_download_directory(dataset)` is used. 
+""" +function read_manifest(path::AbstractString; dir = nothing) + raw = TOML.parsefile(path) + return manifest_from_dict(raw; dir) +end + +function read_manifest(io::IO; dir = nothing) + raw = TOML.parse(read(io, String)) + return manifest_from_dict(raw; dir) +end + +function manifest_from_dict(raw::AbstractDict; dir = nothing) + records = AbstractMetadata[] + for k in (:metadatum, :metadata, :metadataset) + haskey(raw, String(k)) || continue + for entry in raw[String(k)] + push!(records, Base.invokelatest(from_toml, k, entry; dir)) + end + end + return records +end + +""" + $(TYPEDSIGNATURES) + +Trace `script` in build-mode and write the resulting `DataManifest.toml` to `manifest`. + +The script's source is parsed with `Meta.parseall`, every statement is wrapped in a per-statement +`try`/`catch` that rebinds failed assignments to [`DryRunValue`](@ref), and the rewritten code is +evaluated in a fresh sandbox module with `DATA_MODE[] = :build`. Each [`download_dataset`](@ref) call +records its metadata into [`RECORDED`](@ref) instead of downloading. The accumulated records are +then serialized via [`write_manifest`](@ref). + +When `overwrite_existing = false` and `manifest` already exists, the existing records are read first +and merged (deduplicated) with the newly recorded ones, so this call appends rather than replaces. +Defaults to `true` (replace). +""" +function build_dataset_manifest(script::AbstractString; + manifest::AbstractString = "DataManifest.toml", + overwrite_existing::Bool = true) + script_abs = abspath(script) + source = read(script_abs, String) + parsed = Meta.parseall(source; filename = script_abs) + basedir = dirname(script_abs) + rewritten = Expr(:toplevel, [rewrite_statement(a, basedir) for a in parsed.args]...) 
+ + saved_mode = DATA_MODE[] + saved_records = copy(RECORDED) + empty!(RECORDED) + DATA_MODE[] = :build + + new_records = AbstractMetadata[] + try + sandbox = Module(:DataModesSandbox) + Core.eval(sandbox, :(eval(x) = Core.eval($sandbox, x))) + Core.eval(sandbox, :(include(p) = Base.include($sandbox, p))) + Core.eval(sandbox, rewritten) + new_records = copy(RECORDED) + finally + DATA_MODE[] = saved_mode + empty!(RECORDED) + append!(RECORDED, saved_records) + end + + if !overwrite_existing && isfile(manifest) + for r in read_manifest(manifest) + any(==(r), new_records) || pushfirst!(new_records, r) + end + end + write_manifest(manifest, new_records) + return manifest +end \ No newline at end of file diff --git a/src/DataWrangling/DataModes/dry_run_value.jl b/src/DataWrangling/DataModes/dry_run_value.jl new file mode 100644 index 00000000..0850f577 --- /dev/null +++ b/src/DataWrangling/DataModes/dry_run_value.jl @@ -0,0 +1,88 @@ +""" + DryRunValue() + +Sentinel returned when a statement under [`build_dataset_manifest`](@ref) tracing either errors or +stands in for a value that real data would have produced. The per-statement `try`/`catch` wrappers +rebind any failed assignment to a `DryRunValue`, so the script continues running and downstream +`download_dataset` calls still register their metadata. + +To maximise script reach without touching `src` outside this module, `DryRunValue` absorbs almost +every common operation — call, property access, indexing, iteration, broadcasting, arithmetic, and +comparison all return another `DryRunValue`. Operations that fall outside this set still throw and +are caught by the surrounding per-statement wrapper. 
+""" +struct DryRunValue end + +Base.show(io::IO, ::DryRunValue) = print(io, "DryRunValue()") +Base.print(io::IO, ::DryRunValue) = print(io, "DryRunValue()") +Base.string(::DryRunValue) = "DryRunValue()" + +Base.getproperty(::DryRunValue, ::Symbol) = DryRunValue() +Base.setproperty!(::DryRunValue, ::Symbol, _) = DryRunValue() +Base.propertynames(::DryRunValue, ::Bool = false) = () +Base.hasproperty(::DryRunValue, ::Symbol) = true + +(::DryRunValue)(args...; kwargs...) = DryRunValue() + +Base.length(::DryRunValue) = 0 +Base.size(::DryRunValue) = () +Base.size(::DryRunValue, ::Int) = 0 +Base.axes(::DryRunValue) = () +Base.axes(::DryRunValue, ::Int) = Base.OneTo(0) +Base.eltype(::Type{DryRunValue}) = DryRunValue +Base.ndims(::DryRunValue) = 0 +Base.ndims(::Type{DryRunValue}) = 0 +Base.isempty(::DryRunValue) = true +Base.firstindex(::DryRunValue) = 1 +Base.lastindex(::DryRunValue) = 0 +Base.keys(::DryRunValue) = () +Base.values(::DryRunValue) = () +Base.pairs(::DryRunValue) = () + +Base.iterate(::DryRunValue, state = nothing) = nothing +Base.IteratorSize(::Type{DryRunValue}) = Base.HasShape{0}() +Base.IteratorEltype(::Type{DryRunValue}) = Base.HasEltype() + +Base.broadcastable(::DryRunValue) = Ref(DryRunValue()) +Base.materialize(::DryRunValue) = DryRunValue() + +Base.getindex(::DryRunValue, args...) = DryRunValue() +Base.setindex!(::DryRunValue, args...) = DryRunValue() +Base.view(::DryRunValue, args...) = DryRunValue() + +Base.adjoint(::DryRunValue) = DryRunValue() +Base.transpose(::DryRunValue) = DryRunValue() +Base.collect(::DryRunValue) = DryRunValue() +Base.copy(::DryRunValue) = DryRunValue() +Base.deepcopy(::DryRunValue) = DryRunValue() +Base.similar(::DryRunValue, args...) 
= DryRunValue() + +Base.convert(::Type{DryRunValue}, ::DryRunValue) = DryRunValue() +Base.promote_rule(::Type{DryRunValue}, ::Type) = DryRunValue +Base.promote_rule(::Type, ::Type{DryRunValue}) = DryRunValue + +Base.hash(::DryRunValue, h::UInt) = hash(DryRunValue, h) +Base.:(==)(::DryRunValue, ::DryRunValue) = true +Base.isequal(::DryRunValue, ::DryRunValue) = true + +for op in (:+, :-, :*, :/, :\, :^, :%, :÷, :&, :|, :⊻, :>>, :<<, :>>>, + :<, :>, :<=, :>=, :min, :max) + @eval Base.$op(::DryRunValue, ::Any) = DryRunValue() + @eval Base.$op(::Any, ::DryRunValue) = DryRunValue() + @eval Base.$op(::DryRunValue, ::DryRunValue) = DryRunValue() +end + +for op in (:-, :+, :abs, :abs2, :sqrt, :cbrt, :exp, :exp2, :exp10, :expm1, + :log, :log2, :log10, :log1p, :sin, :cos, :tan, :asin, :acos, :atan, + :sinh, :cosh, :tanh, :floor, :ceil, :round, :real, :imag, :conj, + :inv, :sign, :signbit, :one, :zero, :oneunit, :isnan, :isinf, :isfinite, + :iszero, :isone, :isreal, :isinteger) + @eval Base.$op(::DryRunValue) = DryRunValue() +end + +Base.:(:)(::DryRunValue, ::Any) = DryRunValue() +Base.:(:)(::Any, ::DryRunValue) = DryRunValue() +Base.:(:)(::DryRunValue, ::DryRunValue) = DryRunValue() +Base.:(:)(::DryRunValue, ::Any, ::Any) = DryRunValue() +Base.:(:)(::Any, ::DryRunValue, ::Any) = DryRunValue() +Base.:(:)(::Any, ::Any, ::DryRunValue) = DryRunValue() diff --git a/src/DataWrangling/DataModes/parse_and_rewrite_script.jl b/src/DataWrangling/DataModes/parse_and_rewrite_script.jl new file mode 100644 index 00000000..22194930 --- /dev/null +++ b/src/DataWrangling/DataModes/parse_and_rewrite_script.jl @@ -0,0 +1,118 @@ +const PASSTHROUGH_HEADS = Set([:using, :import, :export, :module, :struct, :abstract, :primitive, :macro, :macrocall, :const]) + +function is_include_call(s) + s isa Expr && s.head === :call && !isempty(s.args) || return false + f = s.args[1] + f === :include && return true + if f isa Expr && f.head === :. 
&& length(f.args) == 2 + f.args[2] === QuoteNode(:include) && return true + end + return false +end + +function is_function_def(s) + s isa Expr || return false + s.head === :function && return true + if s.head === :(=) && length(s.args) == 2 + lhs = s.args[1] + lhs isa Expr || return false + lhs.head === :call && return true + lhs.head === :where && return is_function_def(Expr(:(=), lhs.args[1], s.args[2])) + lhs.head === :(::) && lhs.args[1] isa Expr && lhs.args[1].head === :call && return true + end + return false +end + +function wrap_assignment(lhs, rhs) + if lhs isa Symbol + return Expr(:(=), lhs, :(try; $rhs; catch; $DryRunValue(); end)) + end + return :(try; $lhs = $rhs; catch; end) +end + +wrap_return(args::Vector) = Expr(:try, Expr(:return, args...), false, Expr(:return, :($DryRunValue()))) +wrap_bare(expr) = Expr(:try, expr, false, Expr(:block)) + +function rewrite_block(body, basedir::AbstractString) + if body isa Expr && (body.head === :block || body.head === :toplevel) + return Expr(body.head, [rewrite_statement(a, basedir) for a in body.args]...) + else + return rewrite_statement(body, basedir) + end +end + +function rewrite_if(s, basedir::AbstractString) + new_then = rewrite_block(s.args[2], basedir) + if length(s.args) >= 3 + else_branch = s.args[3] + new_else = (else_branch isa Expr && (else_branch.head === :elseif || else_branch.head === :if)) ? 
+ rewrite_if(else_branch, basedir) : rewrite_block(else_branch, basedir) + return Expr(s.head, s.args[1], new_then, new_else) + end + return Expr(s.head, s.args[1], new_then) +end + +function rewrite_function_body(s, basedir::AbstractString) + if s.head === :function + return Expr(:function, s.args[1], rewrite_block(s.args[2], basedir)) + elseif s.head === :(=) + return Expr(:(=), s.args[1], rewrite_block(s.args[2], basedir)) + elseif s.head === :(->) + return Expr(:(->), s.args[1], rewrite_block(s.args[2], basedir)) + elseif s.head === :do + anon = s.args[2] + new_anon = Expr(:(->), anon.args[1], rewrite_block(anon.args[2], basedir)) + return Expr(:do, s.args[1], new_anon) + end + return s +end + +function inline_include(s, basedir::AbstractString) + path_arg = s.args[2] + path_arg isa AbstractString || return wrap_bare(s) + full_path = isabspath(path_arg) ? path_arg : joinpath(basedir, path_arg) + isfile(full_path) || return wrap_bare(s) + inner_source = read(full_path, String) + inner_parsed = Meta.parseall(inner_source; filename = full_path) + inner_basedir = dirname(abspath(full_path)) + return Expr(:toplevel, [rewrite_statement(a, inner_basedir) for a in inner_parsed.args]...) 
+end + +function rewrite_statement(s, basedir::AbstractString) + s isa LineNumberNode && return s + s isa Expr || return wrap_bare(s) + + h = s.head + + h in PASSTHROUGH_HEADS && return s + + is_include_call(s) && return inline_include(s, basedir) + + if is_function_def(s) || h === :(->) + return rewrite_function_body(s, basedir) + end + + h === :do && return wrap_bare(rewrite_function_body(s, basedir)) + + if h === :(=) + lhs, rhs = s.args + return wrap_assignment(lhs, rhs) + end + + h === :return && return wrap_return(s.args) + + if h === :for || h === :while || h === :let + new_body = rewrite_block(s.args[2], basedir) + return wrap_bare(Expr(h, s.args[1], new_body)) + end + + (h === :if || h === :elseif) && return wrap_bare(rewrite_if(s, basedir)) + + if h === :block || h === :toplevel + return Expr(h, [rewrite_statement(a, basedir) for a in s.args]...) + end + + h === :quote && return s + + return wrap_bare(s) +end \ No newline at end of file diff --git a/src/DataWrangling/DataWrangling.jl b/src/DataWrangling/DataWrangling.jl index a1003d57..2e6d5232 100644 --- a/src/DataWrangling/DataWrangling.jl +++ b/src/DataWrangling/DataWrangling.jl @@ -4,7 +4,8 @@ restoring, or validation. """ module DataWrangling -export Metadata, Metadatum, MetadataSet, DatewiseFilename, ECCOMetadatum, EN4Metadatum, all_dates, first_date, last_date +export AbstractMetadata, Metadata, Metadatum, MetadataSet, DatewiseFilename, ECCOMetadatum, EN4Metadatum, all_dates, first_date, last_date +export download_dataset export validate_dataset_coverage, metadata_filename export BoundingBox, Column, Linear, Nearest export WOAClimatology, WOAAnnual, WOAMonthly @@ -233,8 +234,27 @@ abstract type AbstractStaticBathymetry <: AbstractStaticDataset end z_interfaces(::AbstractStaticBathymetry) = (0, 1) Base.size(dataset::AbstractStaticBathymetry, variable) = size(dataset) +""" + AbstractMetadata + +Common supertype for [`Metadata`](@ref), [`Metadatum`](@ref), and [`MetadataSet`](@ref). 
+Used to dispatch [`download_dataset`](@ref) on the three concrete kinds with a single method. +""" +abstract type AbstractMetadata end + +""" +Hook called at the end of every `AbstractMetadata` inner constructor. The default is a no-op; +[`NumericalEarth.DataWrangling.DataModes`](@ref) adds more-specific methods on `Metadata` and +`MetadataSet` that record into the manifest in `:build` mode, so the trace captures Metadata +constructed inside library functions too. +""" +observe_metadata(::AbstractMetadata) = nothing + # Fundamentals include("metadata.jl") + +function download_dataset end + include("set_region_data.jl") include("metadata_field.jl") include("dataset_backend.jl") @@ -242,6 +262,10 @@ include("metadata_field_time_series.jl") include("inpainting.jl") include("restoring.jl") +# parse and verify what data is needed +# download it all in one pass if needed +include("DataModes/DataModes.jl") + function metadata_time_step end function metadata_epoch end @@ -345,4 +369,25 @@ function Downloads.download(metadata::Metadata) error("No download method for $metadata is available (is the backend package loaded?)") end +""" + download_dataset(metadata::AbstractMetadata) + +Acquire the data referenced by `metadata` according to the current +`NUMERICALEARTH_DATA` mode (see [`DataModes`](@ref)): + +- `:auto` — call `Downloads.download(metadata)` (the per-dataset method). +- `:existing` — verify every required file is already on disk; error otherwise. +- `:build` — no-op (metadata is recorded into the manifest by `observe_metadata` at construction). + +This is the single chokepoint through which every code path that needs dataset files must go. +Per-dataset modules keep extending `Downloads.download` for the `:auto` branch only. 
+""" +function download_dataset(metadata::AbstractMetadata) + mode = DataModes.DATA_MODE[] + mode === :auto && return Downloads.download(metadata) + mode === :existing && return DataModes.check_files_exist(metadata) + mode === :build && return nothing + error("Unknown NUMERICALEARTH_DATA mode: $(repr(mode))") +end + end # module diff --git a/src/DataWrangling/ECCO/ECCO.jl b/src/DataWrangling/ECCO/ECCO.jl index fbb1b665..26e31105 100644 --- a/src/DataWrangling/ECCO/ECCO.jl +++ b/src/DataWrangling/ECCO/ECCO.jl @@ -55,6 +55,11 @@ import ..DataWrangling: download_ECCO_cache::String = "" function __init__() global download_ECCO_cache = @get_scratch!("ECCO") + DataWrangling.DataModes.register_dataset!(ECCO2Monthly, "ECCO2Monthly") + DataWrangling.DataModes.register_dataset!(ECCO2Daily, "ECCO2Daily") + DataWrangling.DataModes.register_dataset!(ECCO4Monthly, "ECCO4Monthly") + DataWrangling.DataModes.register_dataset!(ECCO2DarwinMonthly, "ECCO2DarwinMonthly") + DataWrangling.DataModes.register_dataset!(ECCO4DarwinMonthly, "ECCO4DarwinMonthly") end # Datasets diff --git a/src/DataWrangling/EN4/EN4.jl b/src/DataWrangling/EN4/EN4.jl index c08e1189..1e7309a3 100644 --- a/src/DataWrangling/EN4/EN4.jl +++ b/src/DataWrangling/EN4/EN4.jl @@ -16,6 +16,7 @@ using ..DataWrangling: DataWrangling, Metadata, Metadatum, DownloadProgress, Kel download_EN4_cache::String = "" function __init__() global download_EN4_cache = @get_scratch!("EN4") + DataWrangling.DataModes.register_dataset!(EN4Monthly, "EN4Monthly") end EN4_dataset_variable_names = Dict( diff --git a/src/DataWrangling/ERA5/ERA5.jl b/src/DataWrangling/ERA5/ERA5.jl index 3341b8b2..7b9c4872 100644 --- a/src/DataWrangling/ERA5/ERA5.jl +++ b/src/DataWrangling/ERA5/ERA5.jl @@ -17,7 +17,7 @@ using Printf: Printf, @sprintf using Scratch: Scratch, @get_scratch! 
using Statistics: Statistics, mean -using ..DataWrangling: Metadata, Metadatum, metadata_path, native_grid, +using ..DataWrangling: DataWrangling, Metadata, Metadatum, metadata_path, native_grid, InverseGravity using NumericalEarth.Grids: PressureLevelVerticalDiscretization @@ -42,6 +42,10 @@ download_ERA5_cache::String = "" function __init__() global download_ERA5_cache = @get_scratch!("ERA5") + DataWrangling.DataModes.register_dataset!(ERA5HourlySingleLevel, "ERA5HourlySingleLevel") + DataWrangling.DataModes.register_dataset!(ERA5MonthlySingleLevel, "ERA5MonthlySingleLevel") + DataWrangling.DataModes.register_dataset!(ERA5HourlyPressureLevels, "ERA5HourlyPressureLevels") + DataWrangling.DataModes.register_dataset!(ERA5MonthlyPressureLevels, "ERA5MonthlyPressureLevels") end ##### diff --git a/src/DataWrangling/ETOPO/ETOPO.jl b/src/DataWrangling/ETOPO/ETOPO.jl index 010cbc50..838b1ae2 100644 --- a/src/DataWrangling/ETOPO/ETOPO.jl +++ b/src/DataWrangling/ETOPO/ETOPO.jl @@ -21,6 +21,7 @@ import ..DataWrangling: download_ETOPO_cache::String = "" function __init__() global download_ETOPO_cache = @get_scratch!("ETOPO") + DataWrangling.DataModes.register_dataset!(ETOPO2022, "ETOPO2022") end ETOPO_bathymetry_variable_names = Dict( diff --git a/src/DataWrangling/GEBCO/GEBCO.jl b/src/DataWrangling/GEBCO/GEBCO.jl index 4b32b892..75396609 100644 --- a/src/DataWrangling/GEBCO/GEBCO.jl +++ b/src/DataWrangling/GEBCO/GEBCO.jl @@ -8,7 +8,7 @@ using Oceananigans.DistributedComputations: @root using Scratch: Scratch, @get_scratch! 
using ZipFile: ZipFile -using ..DataWrangling: DownloadProgress, Metadatum, metadata_path, AbstractStaticBathymetry +using ..DataWrangling: DataWrangling, DownloadProgress, Metadatum, metadata_path, AbstractStaticBathymetry import ..DataWrangling: metadata_filename, @@ -21,6 +21,7 @@ import ..DataWrangling: download_GEBCO_cache::String = "" function __init__() global download_GEBCO_cache = @get_scratch!("GEBCO") + DataWrangling.DataModes.register_dataset!(GEBCO2024, "GEBCO2024") end GEBCO_bathymetry_variable_names = Dict( diff --git a/src/DataWrangling/GLORYS/GLORYS.jl b/src/DataWrangling/GLORYS/GLORYS.jl index 30177671..9c4b74c5 100644 --- a/src/DataWrangling/GLORYS/GLORYS.jl +++ b/src/DataWrangling/GLORYS/GLORYS.jl @@ -8,7 +8,7 @@ using NCDatasets: NCDatasets, Dataset using Printf: Printf, @sprintf using Scratch: Scratch, @get_scratch! -using ..DataWrangling: Metadata, Metadatum, metadata_path +using ..DataWrangling: DataWrangling, Metadata, Metadatum, metadata_path import ..DataWrangling: all_dates, @@ -27,6 +27,9 @@ import ..DataWrangling: download_GLORYS_cache::String = "" function __init__() global download_GLORYS_cache = @get_scratch!("GLORYS") + DataWrangling.DataModes.register_dataset!(GLORYSStatic, "GLORYSStatic") + DataWrangling.DataModes.register_dataset!(GLORYSDaily, "GLORYSDaily") + DataWrangling.DataModes.register_dataset!(GLORYSMonthly, "GLORYSMonthly") end # Datasets @@ -155,4 +158,3 @@ function z_interfaces(metadata::GLORYSMetadata) end end # module GLORYS - diff --git a/src/DataWrangling/IBCAO/IBCAO.jl b/src/DataWrangling/IBCAO/IBCAO.jl index be9deb37..25030f5a 100644 --- a/src/DataWrangling/IBCAO/IBCAO.jl +++ b/src/DataWrangling/IBCAO/IBCAO.jl @@ -7,7 +7,7 @@ using Oceananigans: Oceananigans using Oceananigans.DistributedComputations: @root using Scratch: Scratch, @get_scratch! 
-using ..DataWrangling: DownloadProgress, Metadatum, metadata_path, AbstractStaticBathymetry +using ..DataWrangling: DataWrangling, DownloadProgress, Metadatum, metadata_path, AbstractStaticBathymetry import ..DataWrangling: metadata_filename, @@ -22,6 +22,7 @@ import ..DataWrangling: download_IBCAO_cache::String = "" function __init__() global download_IBCAO_cache = @get_scratch!("IBCAO") + DataWrangling.DataModes.register_dataset!(IBCAOv5, "IBCAOv5") end IBCAO_bathymetry_variable_names = Dict( diff --git a/src/DataWrangling/IBCSO/IBCSO.jl b/src/DataWrangling/IBCSO/IBCSO.jl index ec799a7a..dfafdf6e 100644 --- a/src/DataWrangling/IBCSO/IBCSO.jl +++ b/src/DataWrangling/IBCSO/IBCSO.jl @@ -22,6 +22,7 @@ import ..DataWrangling: download_IBCSO_cache::String = "" function __init__() global download_IBCSO_cache = @get_scratch!("IBCSO") + DataWrangling.DataModes.register_dataset!(IBCSOv2, "IBCSOv2") end IBCSO_bathymetry_variable_names = Dict( diff --git a/src/DataWrangling/JRA55/JRA55.jl b/src/DataWrangling/JRA55/JRA55.jl index e347664a..08c3f336 100644 --- a/src/DataWrangling/JRA55/JRA55.jl +++ b/src/DataWrangling/JRA55/JRA55.jl @@ -27,6 +27,8 @@ download_JRA55_cache::String = "" function __init__() global download_JRA55_cache = @get_scratch!("JRA55") + DataWrangling.DataModes.register_dataset!(RepeatYearJRA55, "RepeatYearJRA55") + DataWrangling.DataModes.register_dataset!(MultiYearJRA55, "MultiYearJRA55") end include("JRA55_metadata.jl") diff --git a/src/DataWrangling/ORCA/ORCA.jl b/src/DataWrangling/ORCA/ORCA.jl index 910c722b..ec2feaf5 100644 --- a/src/DataWrangling/ORCA/ORCA.jl +++ b/src/DataWrangling/ORCA/ORCA.jl @@ -25,6 +25,8 @@ download_ORCA_cache::String = "" function __init__() global download_ORCA_cache = @get_scratch!("ORCA") + DataWrangling.DataModes.register_dataset!(ORCA1, "ORCA1") + DataWrangling.DataModes.register_dataset!(ORCA12, "ORCA12") end abstract type ORCADataset end diff --git a/src/DataWrangling/OSPapa/OSPapa.jl 
b/src/DataWrangling/OSPapa/OSPapa.jl index b51d2d43..2fe554a5 100644 --- a/src/DataWrangling/OSPapa/OSPapa.jl +++ b/src/DataWrangling/OSPapa/OSPapa.jl @@ -20,7 +20,7 @@ using NCDatasets: NCDatasets, NCDataset, defDim, defVar using Scratch: Scratch, @get_scratch! using Thermodynamics: q_vap_from_RH, Liquid -using ..DataWrangling: DownloadProgress +using ..DataWrangling: DataWrangling, DownloadProgress using ...Atmospheres: PrescribedAtmosphere, PrescribedPrecipitationFlux, AtmosphereThermodynamicsParameters using ..DataWrangling: Metadata, Metadatum, metadata_path, first_date, last_date, @@ -57,6 +57,8 @@ download_OSPapa_cache::String = "" function __init__() global download_OSPapa_cache = @get_scratch!("OSPapa") + DataWrangling.DataModes.register_dataset!(OSPapaFluxHourly, "OSPapaFluxHourly") + DataWrangling.DataModes.register_dataset!(OSPapaHourly, "OSPapaHourly") end function download_ospapa_file(dir=download_OSPapa_cache) diff --git a/src/DataWrangling/WOA/WOA.jl b/src/DataWrangling/WOA/WOA.jl index 70f1e964..1995ef23 100644 --- a/src/DataWrangling/WOA/WOA.jl +++ b/src/DataWrangling/WOA/WOA.jl @@ -12,6 +12,8 @@ using ..DataWrangling: DataWrangling, Metadata, Metadatum, metadata_path, download_WOA_cache::String = "" function __init__() global download_WOA_cache = @get_scratch!("WOA") + DataWrangling.DataModes.register_dataset!(WOAAnnual, "WOAAnnual") + DataWrangling.DataModes.register_dataset!(WOAMonthly, "WOAMonthly") end WOA_variable_names = Dict( diff --git a/src/DataWrangling/metadata.jl b/src/DataWrangling/metadata.jl index c77ef783..35ebd948 100644 --- a/src/DataWrangling/metadata.jl +++ b/src/DataWrangling/metadata.jl @@ -72,15 +72,21 @@ getfilename(f::DatewiseFilename, i) = f.filenames[i] getfilename(f::String, i) = f getfilename(::Nothing, i) = nothing -struct Metadata{V, D, R, S, F} +struct Metadata{V, D, R, S, F} <: AbstractMetadata name :: S dataset :: V dates :: D region :: R dir :: String filename :: F + function Metadata{V, D, R, S, F}(name, 
dataset, dates, region, dir, filename) where {V, D, R, S, F} + m = new{V, D, R, S, F}(name, dataset, dates, region, dir, filename) + observe_metadata(m) + return m + end end +Metadata(name::S, dataset::V, dates::D, region::R, dir::String, filename::F) where {V, D, R, S, F} = Metadata{V, D, R, S, F}(name, dataset, dates, region, dir, filename) Metadata(name, dataset, dates, region, dir) = Metadata(name, dataset, dates, region, dir, nothing) is_three_dimensional(::Metadata) = true @@ -300,15 +306,23 @@ end ##### `download`, ...) keeps working unchanged on the elements. ##### -struct MetadataSet{V, D, R, N, F} +struct MetadataSet{V, D, R, N, F} <: AbstractMetadata names :: N # NTuple{K, Symbol} — verbose dataset variable names dataset :: V # shared dates :: D # shared; scalar or AbstractVector region :: R # shared dir :: String # shared filenames :: F # NamedTuple keyed by `names`, one entry per variable + function MetadataSet{V, D, R, N, F}(names, dataset, dates, region, dir, filenames) where {V, D, R, N, F} + m = new{V, D, R, N, F}(names, dataset, dates, region, dir, filenames) + observe_metadata(m) + return m + end end +MetadataSet(names::N, dataset::V, dates::D, region::R, dir::String, filenames::F) where {V, D, R, N, F} = + MetadataSet{V, D, R, N, F}(names, dataset, dates, region, dir, filenames) + """ MetadataSet(variable_names::Symbol...; dataset, @@ -320,12 +334,11 @@ end start_date = nothing, end_date = nothing) -A bundle of [`Metadata`](@ref) for many variables that share `dataset`, `dates`, -`region`, and `dir` — differing only in variable name. +A bundle of [`Metadata`](@ref) for many variables that share `dataset`, `dates`, `region`, and `dir` +— differing only in variable name. -Each element `mset[name]` (or equivalently `mset.name` or `mset[i]`) is itself a -`Metadata` — or a `Metadatum` when `dates` is a single date. Iteration walks the -variable axis, yielding one `Metadata` per variable. 
+Each element `mset[name]` (or equivalently `mset.name` or `mset[i]`) is itself a `Metadata` — or a +`Metadatum` when `dates` is a single date. Iteration walks the variable axis, yielding one `Metadata` per variable. Arguments ========= @@ -357,8 +370,7 @@ function MetadataSet(variable_names::Symbol...; start_date = nothing, end_date = nothing) - isempty(variable_names) && - throw(ArgumentError("MetadataSet requires at least one variable name")) + isempty(variable_names) && throw(ArgumentError("MetadataSet requires at least one variable name")) if !isnothing(date) && !isnothing(dates) throw(ArgumentError("Specify either `date` (scalar) or `dates` (vector), not both")) @@ -415,8 +427,7 @@ end Base.propertynames(mset::MetadataSet) = (getfield(mset, :names)..., fieldnames(MetadataSet)...) -# Indexed access. We use `getfield` here so subsequent edits to `getproperty` -# can't make these recursive. +# Indexed access. We use `getfield` here so subsequent edits to `getproperty` can't make these recursive. 
function Base.getindex(mset::MetadataSet, name::Symbol) fname = getfield(mset, :filenames)[name] return Metadata(name, @@ -427,8 +438,7 @@ function Base.getindex(mset::MetadataSet, name::Symbol) fname) end -@propagate_inbounds Base.getindex(mset::MetadataSet, i::Int) = - getindex(mset, getfield(mset, :names)[i]) +@propagate_inbounds Base.getindex(mset::MetadataSet, i::Int) = getindex(mset, getfield(mset, :names)[i]) Base.length(mset::MetadataSet) = length(getfield(mset, :names)) Base.keys(mset::MetadataSet) = getfield(mset, :names) @@ -441,8 +451,7 @@ Base.lastindex(mset::MetadataSet) = length(mset) return mset[state], state + 1 end -Base.NamedTuple(mset::MetadataSet) = - NamedTuple{getfield(mset, :names)}(map(n -> mset[n], getfield(mset, :names))) +Base.NamedTuple(mset::MetadataSet) = NamedTuple{getfield(mset, :names)}(map(n -> mset[n], getfield(mset, :names))) """ metadata_path(mset::MetadataSet) diff --git a/src/DataWrangling/metadata_field.jl b/src/DataWrangling/metadata_field.jl index c90b3ef9..cde27106 100644 --- a/src/DataWrangling/metadata_field.jl +++ b/src/DataWrangling/metadata_field.jl @@ -197,7 +197,7 @@ function Oceananigans.Fields.Field(metadata::Metadatum, arch=CPU(); halo = (3, 3, 3), cache_inpainted_data = true) - Downloads.download(metadata) + download_dataset(metadata) # Inpainting on a (Flat, Flat, *) column field is meaningless and the # iterative algorithm doesn't terminate gracefully without horizontal diff --git a/src/DataWrangling/metadata_field_time_series.jl b/src/DataWrangling/metadata_field_time_series.jl index d25cc042..7b801e35 100644 --- a/src/DataWrangling/metadata_field_time_series.jl +++ b/src/DataWrangling/metadata_field_time_series.jl @@ -30,7 +30,7 @@ Keyword Arguments Default: `true`. """ function Oceananigans.OutputReaders.FieldTimeSeries(metadata::Metadata, arch::AbstractArchitecture=CPU(); kw...) 
- Downloads.download(metadata) + download_dataset(metadata) grid = native_grid(metadata, arch) return FieldTimeSeries(metadata, grid; kw...) end @@ -41,7 +41,7 @@ function Oceananigans.OutputReaders.FieldTimeSeries(metadata::Metadata, grid::Ab inpainting = default_inpainting(metadata), cache_inpainted_data = true) - Downloads.download(metadata) + download_dataset(metadata) times = native_times(metadata) diff --git a/src/DataWrangling/restoring.jl b/src/DataWrangling/restoring.jl index e85de0cf..ada59f9a 100644 --- a/src/DataWrangling/restoring.jl +++ b/src/DataWrangling/restoring.jl @@ -194,7 +194,7 @@ function DatasetRestoring(metadata::Metadata, inpainting = NearestNeighborInpainting(Inf), cache_inpainted_data = true) - Downloads.download(metadata) + download_dataset(metadata) fts = FieldTimeSeries(metadata, arch_or_grid; time_indices_in_memory, diff --git a/test/test_data_modes.jl b/test/test_data_modes.jl new file mode 100644 index 00000000..4308903d --- /dev/null +++ b/test/test_data_modes.jl @@ -0,0 +1,320 @@ +include("runtests_setup.jl") + +using NumericalEarth.DataWrangling: AbstractMetadata, Metadatum, Metadata, MetadataSet, + BoundingBox, Column, Nearest, Linear, DatewiseFilename, + download_dataset +using NumericalEarth.DataWrangling.DataModes: DataModes, parse_data_mode, register_dataset!, + write_manifest, read_manifest, download_datasets, + build_dataset_manifest, DryRunValue + +using Downloads: Downloads +using Dates: DateTime +using TOML: TOML + +struct FakeDataset end + +NumericalEarth.DataWrangling.all_dates(::FakeDataset, ::Symbol) = [DateTime(2020, m, 1) for m in 1:12] +NumericalEarth.DataWrangling.build_filename(::FakeDataset, name, dates::AbstractArray, region) = "$(name).nc" +NumericalEarth.DataWrangling.build_filename(::FakeDataset, name, date, region) = "$(name).nc" +NumericalEarth.DataWrangling.metadata_filename(::FakeDataset, name, date, region) = "$(name)_$(date).nc" +NumericalEarth.DataWrangling.default_download_directory(::FakeDataset) 
= "/tmp/fake_dataset_test" +NumericalEarth.DataWrangling.first_date(::FakeDataset, ::Symbol) = DateTime(2020, 1, 1) + +struct MockMetadatum <: AbstractMetadata + name :: Symbol +end + +const MOCK_DOWNLOAD_CALLS = Ref(0) +Downloads.download(::MockMetadatum) = (MOCK_DOWNLOAD_CALLS[] += 1; nothing) + +@testset "AbstractMetadata supertype" begin + @test Metadata <: AbstractMetadata + @test Metadatum <: AbstractMetadata + @test MetadataSet <: AbstractMetadata +end + +@testset "parse_data_mode" begin + @test parse_data_mode("auto") == (:auto, "") + @test parse_data_mode("") == (:auto, "") + @test parse_data_mode("existing") == (:existing, "") + @test parse_data_mode("build:foo.toml") == (:build, "foo.toml") + @test parse_data_mode("build:path/to/manifest.toml") == (:build, "path/to/manifest.toml") + + @test_throws ArgumentError parse_data_mode("build:") + @test_throws ArgumentError parse_data_mode("garbage") +end + +@testset "check_files_exist" begin + mktempdir() do dir + m_missing = Metadata(:t, nothing, nothing, nothing, dir, "missing_file.nc") + @test_throws ErrorException DataModes.check_files_exist(m_missing) + + present_path = joinpath(dir, "present.nc") + write(present_path, "x") + m_present = Metadata(:t, nothing, nothing, nothing, dir, "present.nc") + @test DataModes.check_files_exist(m_present) === nothing + + dates_vec = [DateTime(2020, 1, 1), DateTime(2020, 1, 2)] + m_multi_missing = Metadata(:t, nothing, dates_vec, nothing, dir, + NumericalEarth.DataWrangling.DatewiseFilename(["a.nc", "b.nc"])) + err = try + DataModes.check_files_exist(m_multi_missing) + nothing + catch e + e + end + @test err !== nothing + @test occursin("a.nc", sprint(showerror, err)) + @test occursin("b.nc", sprint(showerror, err)) + end +end + +@testset "write_manifest TOML serialization" begin + register_dataset!(FakeDataset, "FakeDataset") + + md_um = Metadata(:bathymetry, FakeDataset(), nothing, nothing, "/tmp", "b.nc") + md_one = Metadata(:temperature, FakeDataset(), DateTime(2020, 
1, 1), nothing, "/tmp", "t.nc") + dates = [DateTime(2020, 1, 1), DateTime(2020, 12, 31)] + md_range = Metadata(:salinity, FakeDataset(), dates, nothing, "/tmp", + DatewiseFilename(["s1.nc", "s2.nc"])) + region = BoundingBox(longitude=(200.0, 220.0), latitude=(35.0, 55.0)) + md_region = Metadata(:eastward_velocity, FakeDataset(), DateTime(2020, 1, 1), region, "/tmp", "u.nc") + mset = MetadataSet((:T, :S), FakeDataset(), dates, nothing, "/tmp", + (T = DatewiseFilename(["T1.nc", "T2.nc"]), S = DatewiseFilename(["S1.nc", "S2.nc"]))) + + records = AbstractMetadata[md_um, md_one, md_range, md_region, mset] + io = IOBuffer() + write_manifest(io, records) + parsed = TOML.parse(String(take!(io))) + + @test haskey(parsed, "metadatum") + @test haskey(parsed, "metadata") + @test haskey(parsed, "metadataset") + + @test any(e -> e["variable_name"] == "bathymetry" && e["dataset"] == "FakeDataset" + && !haskey(e, "date") && !haskey(e, "dir") && !haskey(e, "region"), + parsed["metadatum"]) + @test any(e -> e["variable_name"] == "temperature" && e["date"] == DateTime(2020, 1, 1), + parsed["metadatum"]) + @test any(e -> e["variable_name"] == "eastward_velocity" && haskey(e, "region") + && e["region"]["kind"] == "BoundingBox" + && e["region"]["longitude"] == [200.0, 220.0], + parsed["metadatum"]) + + @test any(e -> e["variable_name"] == "salinity" + && e["start_date"] == DateTime(2020, 1, 1) + && e["end_date"] == DateTime(2020, 12, 31), + parsed["metadata"]) + + @test any(e -> e["variable_names"] == ["T", "S"] + && e["dataset"] == "FakeDataset" + && e["start_date"] == DateTime(2020, 1, 1) + && e["end_date"] == DateTime(2020, 12, 31), + parsed["metadataset"]) + + # Region: Column + col = Column(45.0, 30.0; z=(-400.0, 0.0), interpolation=Nearest()) + md_col = Metadata(:temperature, FakeDataset(), DateTime(2020, 1, 1), col, "/tmp", "t.nc") + io2 = IOBuffer() + write_manifest(io2, AbstractMetadata[md_col]) + parsed2 = TOML.parse(String(take!(io2))) + @test 
parsed2["metadatum"][1]["region"]["kind"] == "Column" + @test parsed2["metadatum"][1]["region"]["longitude"] == 45.0 + @test parsed2["metadatum"][1]["region"]["latitude"] == 30.0 + @test parsed2["metadatum"][1]["region"]["interpolation"] == "Nearest" +end + +@testset "read_manifest round-trip" begin + register_dataset!(FakeDataset, "FakeDataset") + + md_one = Metadatum(:temperature; dataset=FakeDataset(), date=DateTime(2020, 6, 1)) + md_range = Metadata(:salinity; dataset=FakeDataset(), + start_date=DateTime(2020, 3, 1), end_date=DateTime(2020, 8, 1)) + region = BoundingBox(longitude=(200.0, 220.0), latitude=(35.0, 55.0)) + md_region = Metadatum(:temperature; dataset=FakeDataset(), date=DateTime(2020, 6, 1), region=region) + mset = MetadataSet(:T, :S; dataset=FakeDataset(), + start_date=DateTime(2020, 3, 1), end_date=DateTime(2020, 8, 1)) + + mktempdir() do dir + path = joinpath(dir, "DataManifest.toml") + write_manifest(path, AbstractMetadata[md_one, md_range, md_region, mset]) + records = read_manifest(path) + @test length(records) == 4 + + rt = first(r for r in records if r isa Metadatum && r.name == :temperature && r.region === nothing) + @test rt.dataset isa FakeDataset + @test rt.dates == DateTime(2020, 6, 1) + + rrange = first(r for r in records if r isa Metadata && !(r isa Metadatum) && r.name == :salinity) + @test rrange.dates == [DateTime(2020, m, 1) for m in 3:8] + + rregion = first(r for r in records if r isa Metadatum && r.region !== nothing) + @test rregion.region isa BoundingBox + @test rregion.region.longitude == (200.0, 220.0) + @test rregion.region.latitude == (35.0, 55.0) + + rset = first(r for r in records if r isa MetadataSet) + @test rset.names == (:T, :S) + @test rset.dates == [DateTime(2020, m, 1) for m in 3:8] + end +end + +@testset "download_datasets varargs and manifest path" begin + register_dataset!(FakeDataset, "FakeDataset") + saved = DataModes.DATA_MODE[] + REAL_DOWNLOAD_CALLS = Ref(0) + Downloads.download(::Metadata{<:FakeDataset}) 
= (REAL_DOWNLOAD_CALLS[] += 1; nothing) + try + DataModes.DATA_MODE[] = :auto + m1 = Metadatum(:temperature; dataset=FakeDataset(), date=DateTime(2020, 6, 1)) + m2 = Metadatum(:salinity; dataset=FakeDataset(), date=DateTime(2020, 7, 1)) + REAL_DOWNLOAD_CALLS[] = 0 + download_datasets(m1, m2) + @test REAL_DOWNLOAD_CALLS[] == 2 + + mktempdir() do dir + path = joinpath(dir, "DataManifest.toml") + write_manifest(path, AbstractMetadata[m1, m2]) + REAL_DOWNLOAD_CALLS[] = 0 + download_datasets(path) + @test REAL_DOWNLOAD_CALLS[] == 2 + end + finally + DataModes.DATA_MODE[] = saved + end +end + +@testset "observe_metadata hook fires inside library-style functions" begin + register_dataset!(FakeDataset, "FakeDataset") + saved = DataModes.DATA_MODE[] + try + DataModes.DATA_MODE[] = :build + empty!(DataModes.RECORDED) + + library_constructor() = (Metadatum(:temperature; dataset=FakeDataset(), date=DateTime(2020, 6, 1)), + Metadatum(:salinity; dataset=FakeDataset(), date=DateTime(2020, 6, 1))) + library_constructor() + + @test length(DataModes.RECORDED) == 2 + names = sort([String(r.name) for r in DataModes.RECORDED]) + @test names == ["salinity", "temperature"] + finally + DataModes.DATA_MODE[] = saved + empty!(DataModes.RECORDED) + end +end + +@testset "DryRunValue minimal stub" begin + v = DryRunValue() + @test v.anything isa DryRunValue + @test v.something_else isa DryRunValue + @test sprint(show, v) == "DryRunValue()" +end + +@testset "build_dataset_manifest append (overwrite_existing=false)" begin + mktempdir() do dir + manifest = joinpath(dir, "DataManifest.toml") + + script_one = joinpath(dir, "one.jl") + write(script_one, """ + using NumericalEarth + using NumericalEarth.DataWrangling: Metadatum, download_dataset + using NumericalEarth.DataWrangling.DataModes: register_dataset! 
+ using Dates: DateTime + + struct AppendDataset end + NumericalEarth.DataWrangling.metadata_filename(::AppendDataset, name, date, region) = string(name, ".nc") + NumericalEarth.DataWrangling.default_download_directory(::AppendDataset) = "/tmp" + register_dataset!(AppendDataset, "AppendDataset") + + download_dataset(Metadatum(:temperature; dataset=AppendDataset(), date=DateTime(2020, 1, 1))) + """) + build_dataset_manifest(script_one; manifest) + parsed_after_one = TOML.parsefile(manifest) + @test length(parsed_after_one["metadatum"]) == 1 + + script_two = joinpath(dir, "two.jl") + write(script_two, """ + using NumericalEarth + using NumericalEarth.DataWrangling: Metadatum, download_dataset + using NumericalEarth.DataWrangling.DataModes: register_dataset! + using Dates: DateTime + + struct AppendDataset end + NumericalEarth.DataWrangling.metadata_filename(::AppendDataset, name, date, region) = string(name, ".nc") + NumericalEarth.DataWrangling.default_download_directory(::AppendDataset) = "/tmp" + register_dataset!(AppendDataset, "AppendDataset") + + download_dataset(Metadatum(:salinity; dataset=AppendDataset(), date=DateTime(2020, 1, 1))) + """) + + build_dataset_manifest(script_two; manifest, overwrite_existing = false) + parsed_after_two = TOML.parsefile(manifest) + @test length(parsed_after_two["metadatum"]) == 2 + @test sort([e["variable_name"] for e in parsed_after_two["metadatum"]]) == ["salinity", "temperature"] + + build_dataset_manifest(script_two; manifest, overwrite_existing = false) + parsed_after_two_repeat = TOML.parsefile(manifest) + @test length(parsed_after_two_repeat["metadatum"]) == 2 + + build_dataset_manifest(script_two; manifest, overwrite_existing = true) + parsed_after_overwrite = TOML.parsefile(manifest) + @test length(parsed_after_overwrite["metadatum"]) == 1 + @test parsed_after_overwrite["metadatum"][1]["variable_name"] == "salinity" + end +end + +@testset "build_dataset_manifest end-to-end" begin + mktempdir() do dir + script = 
joinpath(dir, "demo.jl") + write(script, """ + using NumericalEarth + using NumericalEarth.DataWrangling: Metadatum, download_dataset + using NumericalEarth.DataWrangling.DataModes: register_dataset! + using Dates: DateTime + + struct DemoDataset end + NumericalEarth.DataWrangling.metadata_filename(::DemoDataset, name, date, region) = string(name, ".nc") + NumericalEarth.DataWrangling.default_download_directory(::DemoDataset) = "/tmp" + register_dataset!(DemoDataset, "DemoDataset") + + bad = something_undefined() + + function helper() + x = bad.field + y = download_dataset(Metadatum(:T; dataset=DemoDataset(), date=DateTime(2020, 1, 1))) + z = download_dataset(Metadatum(:S; dataset=DemoDataset(), date=DateTime(2020, 1, 1))) + return z + end + + helper() + """) + manifest = joinpath(dir, "DataManifest.toml") + build_dataset_manifest(script; manifest) + parsed = TOML.parsefile(manifest) + @test length(get(parsed, "metadatum", [])) == 2 + @test sort([e["variable_name"] for e in parsed["metadatum"]]) == ["S", "T"] + end +end + +@testset "download_dataset chokepoint" begin + md = MockMetadatum(:t) + saved_mode = DataModes.DATA_MODE[] + try + DataModes.DATA_MODE[] = :auto + MOCK_DOWNLOAD_CALLS[] = 0 + download_dataset(md) + @test MOCK_DOWNLOAD_CALLS[] == 1 + + DataModes.DATA_MODE[] = :build + MOCK_DOWNLOAD_CALLS[] = 0 + download_dataset(md) + @test MOCK_DOWNLOAD_CALLS[] == 0 + + DataModes.DATA_MODE[] = :existing + @test_throws Exception download_dataset(md) + finally + DataModes.DATA_MODE[] = saved_mode + empty!(DataModes.RECORDED) + end +end From 31147b3d6b1b07ff12149ecf08f72cf10735ab94 Mon Sep 17 00:00:00 2001 From: Simone Silvestri Date: Fri, 22 May 2026 15:00:40 +0200 Subject: [PATCH 2/7] more changes --- src/Bathymetry/Bathymetry.jl | 2 +- src/Bathymetry/regrid_bathymetry.jl | 4 +- src/DataWrangling/DataModes/DataModes.jl | 67 ++++---- .../DataModes/data_manifest_wrangling.jl | 158 ++++++++++-------- src/DataWrangling/DataModes/dry_run_value.jl | 2 +- 
.../DataModes/parse_and_rewrite_script.jl | 3 + src/DataWrangling/DataWrangling.jl | 10 +- .../ERA5/ERA5_pressure_levels.jl | 4 +- .../OSPapa/OSPapa_prescribed_atmosphere.jl | 2 +- .../OSPapa/OSPapa_prescribed_fluxes.jl | 2 +- .../OSPapa/OSPapa_prescribed_radiation.jl | 2 +- src/DataWrangling/metadata.jl | 2 +- src/NumericalEarth.jl | 27 +++ test/download_utils.jl | 5 + test/runtests.jl | 6 +- test/runtests_setup.jl | 6 +- test/test_bathymetry.jl | 2 +- test/test_cds_downloading.jl | 8 +- test/test_data_modes.jl | 119 +++++++------ test/test_distributed_utils.jl | 2 +- test/test_ecco2_daily.jl | 2 +- test/test_ecco2_monthly.jl | 2 +- test/test_ecco4_en4.jl | 2 +- test/test_ecco_atmosphere.jl | 2 +- test/test_glorys_downloading.jl | 2 +- test/test_jra55_ecco_en4_etopo_downloading.jl | 4 +- test/test_mangling.jl | 2 +- test/test_orca_grid.jl | 4 +- test/test_woa.jl | 2 +- 29 files changed, 256 insertions(+), 199 deletions(-) diff --git a/src/Bathymetry/Bathymetry.jl b/src/Bathymetry/Bathymetry.jl index c44d4de0..0cb7222a 100644 --- a/src/Bathymetry/Bathymetry.jl +++ b/src/Bathymetry/Bathymetry.jl @@ -22,7 +22,7 @@ using Printf: Printf using Scratch: Scratch, @get_scratch! 
using ..DataWrangling: Metadatum, native_grid, metadata_path, - dataset_variable_name, validate_dataset_coverage + dataset_variable_name, validate_dataset_coverage, download_dataset using ..DataWrangling.ETOPO: ETOPO2022 include("regrid_bathymetry.jl") diff --git a/src/Bathymetry/regrid_bathymetry.jl b/src/Bathymetry/regrid_bathymetry.jl index 8e7a8195..daf6ae88 100644 --- a/src/Bathymetry/regrid_bathymetry.jl +++ b/src/Bathymetry/regrid_bathymetry.jl @@ -203,7 +203,7 @@ function regrid_bathymetry(target_grid, metadata; end end - download(metadata) + download_dataset(metadata) target_z = _regrid_bathymetry(target_grid, metadata; height_above_water, @@ -302,7 +302,7 @@ function regrid_bathymetry(target_grid::DistributedGrid, metadata; interpolation_passes, major_basins) # download uses @root internally; all ranks must call it - download(metadata) + download_dataset(metadata) # Only rank 0 performs cache lookup and computation to avoid OOM bottom_height = if arch.local_rank == 0 diff --git a/src/DataWrangling/DataModes/DataModes.jl b/src/DataWrangling/DataModes/DataModes.jl index c30daf5a..b8a95b10 100644 --- a/src/DataWrangling/DataModes/DataModes.jl +++ b/src/DataWrangling/DataModes/DataModes.jl @@ -1,16 +1,21 @@ """ DataModes -Three-mode download dispatch and a declarative `DataManifest.toml` for NumericalEarth. +Three-mode download dispatch and a declarative `NumericalEarthDataManifest.toml` for NumericalEarth. Modes are selected by the `NUMERICALEARTH_DATA` environment variable: -| Value | Behavior | -|-------------------|------------------------------------------------------------| -| `"auto"` (default)| Download on demand (current behavior). | -| `"existing"` | Error if any required file is missing. Never download. | -| `"build:<path>"` | Trace the running script; write a manifest to `<path>`. 
| +| Value | Behavior | +|-------------------|----------------------------------------------------------------------------| +| `"auto"` (default)| Download on demand (current behavior). | +| `"strict"` | Error if any required file is missing. Never download. | +| `"pregenerate"` | Trace the running script; write the manifest to `pwd()`. | +| `"pregenerate:<dir>"` | Same as `"pregenerate"` but write to `<dir>/NumericalEarthDataManifest.toml`. | -See [`NumericalEarth.DataWrangling.download_dataset`](@ref) for the dispatch and [`build_dataset_manifest`](@ref) for the trace entry point. +The filename is fixed (`NumericalEarthDataManifest.toml`) so manifests don't collide with Pkg's +`Project.toml` / `Manifest.toml` and there is one canonical manifest per directory. + +See [`NumericalEarth.DataWrangling.download_dataset`](@ref) for the dispatch and +[`pregenerate_dataset_manifest`](@ref) for the trace entry point. """ module DataModes @@ -21,7 +26,7 @@ using ..DataWrangling: DataWrangling, AbstractMetadata, Metadata, Metadatum, Met using ..DataWrangling: DatewiseFilename, metadata_path, default_download_directory, download_dataset export DryRunValue -export build_dataset_manifest, download_datasets +export pregenerate_dataset_manifest, download_datasets export register_dataset! 
const DATA_MODE = Ref{Symbol}(:auto) @@ -30,26 +35,27 @@ include("dry_run_value.jl") include("data_manifest_wrangling.jl") include("parse_and_rewrite_script.jl") -DataWrangling.observe_metadata(m::Metadata) = (DATA_MODE[] === :build && record_for_manifest(m); nothing) -DataWrangling.observe_metadata(m::MetadataSet) = (DATA_MODE[] === :build && record_for_manifest(m); nothing) +DataWrangling.observe_metadata(m::Metadata) = (DATA_MODE[] === :pregenerate && record_for_manifest(m); nothing) +DataWrangling.observe_metadata(m::MetadataSet) = (DATA_MODE[] === :pregenerate && record_for_manifest(m); nothing) """ $(TYPEDSIGNATURES) -Acquire every dataset listed in `metadata...` (varargs form) or in the manifest at `path` -(file-path form). Each entry is routed through [`download_dataset`](@ref), so the current -`NUMERICALEARTH_DATA` mode applies. +Acquire every dataset listed in `metadata...` (varargs form) or in the manifest at +`joinpath(dir, "NumericalEarthDataManifest.toml")` (zero-arg form). Each entry is routed through +[`download_dataset`](@ref), so the current `NUMERICALEARTH_DATA` mode applies. -For the file-path form, pass `dir` to override the default download directory for every reconstructed -entry (e.g. when login-node and compute-node filesystems differ). +For the manifest form, `dir` is the directory containing the manifest (defaults to `pwd()`). Pass +`download_dir` to override the default download directory for every reconstructed entry (e.g. when +login-node and compute-node filesystems differ). """ function download_datasets(metadata::AbstractMetadata...) 
foreach(download_dataset, metadata) return nothing end -function download_datasets(path::AbstractString; dir = nothing) - foreach(download_dataset, read_manifest(path; dir)) +function download_datasets(; dir::AbstractString = pwd(), download_dir = nothing) + foreach(download_dataset, read_manifest(; dir, download_dir)) return nothing end @@ -79,33 +85,38 @@ function check_files_exist(metadata::AbstractMetadata) missing_paths = filter(p -> !isfile(p), paths) isempty(missing_paths) && return nothing list = join((" " * p for p in missing_paths), "\n") - error("NUMERICALEARTH_DATA=existing: $(length(missing_paths)) required file(s) missing:\n$list") + error("NUMERICALEARTH_DATA=strict: $(length(missing_paths)) required file(s) missing:\n$list") end function __init__() + # Skip everything if we're inside a precompile / sysimage build subprocess — + # `Base.PROGRAM_FILE` is "-" there and we'd write garbage manifests at exit. + ccall(:jl_generating_output, Cint, ()) == 1 && return nothing + env = get(ENV, "NUMERICALEARTH_DATA", "auto") - mode, path = parse_data_mode(env) + mode, dir_from_env = parse_data_mode(env) DATA_MODE[] = mode - MANIFEST_PATH[] = path - mode === :build || return nothing + MANIFEST_DIR[] = isempty(dir_from_env) ? 
pwd() : abspath(dir_from_env) + mode === :pregenerate || return nothing - if !isempty(Base.PROGRAM_FILE) + if !isempty(Base.PROGRAM_FILE) && isfile(Base.PROGRAM_FILE) script = abspath(Base.PROGRAM_FILE) atexit() do try - build_dataset_manifest(script; manifest = MANIFEST_PATH[]) - @info "NUMERICALEARTH_DATA=build: wrote manifest via AST trace" path=MANIFEST_PATH[] script + manifest = pregenerate_dataset_manifest(script; dir = MANIFEST_DIR[]) + @info "NUMERICALEARTH_DATA=pregenerate: wrote manifest via AST trace" manifest script catch err - @error "NUMERICALEARTH_DATA=build: trace failed" path=MANIFEST_PATH[] script exception=(err, catch_backtrace()) + @error "NUMERICALEARTH_DATA=pregenerate: trace failed" dir=MANIFEST_DIR[] script exception=(err, catch_backtrace()) end end else atexit() do try - write_manifest(MANIFEST_PATH[], copy(RECORDED)) - @info "NUMERICALEARTH_DATA=build: wrote manifest" path=MANIFEST_PATH[] entries=length(RECORDED) + manifest = manifest_path_in(MANIFEST_DIR[]) + write_manifest(manifest, copy(RECORDED)) + @info "NUMERICALEARTH_DATA=pregenerate: wrote manifest" manifest entries=length(RECORDED) catch err - @error "NUMERICALEARTH_DATA=build: failed to write manifest" path=MANIFEST_PATH[] exception=(err, catch_backtrace()) + @error "NUMERICALEARTH_DATA=pregenerate: failed to write manifest" dir=MANIFEST_DIR[] exception=(err, catch_backtrace()) end end end diff --git a/src/DataWrangling/DataModes/data_manifest_wrangling.jl b/src/DataWrangling/DataModes/data_manifest_wrangling.jl index dc6134e2..b25d5885 100644 --- a/src/DataWrangling/DataModes/data_manifest_wrangling.jl +++ b/src/DataWrangling/DataModes/data_manifest_wrangling.jl @@ -1,28 +1,42 @@ -const MANIFEST_PATH = Ref{String}("") -const RECORDED = AbstractMetadata[] -const DATASET_REGISTRY = Dict{String, Any}() +const MANIFEST_FILENAME = "NumericalEarthDataManifest.toml" +const MANIFEST_DIR = Ref{String}("") +const RECORDED = AbstractMetadata[] +const DATASET_REGISTRY = Dict{String, Any}() 
""" $(TYPEDSIGNATURES) -Parse a `NUMERICALEARTH_DATA` value into a `(mode, manifest_path)` tuple. +Return the absolute path to the data manifest inside `dir`. The basename is fixed +(`NumericalEarthDataManifest.toml`) to avoid name collisions with Pkg's `Project.toml` / +`Manifest.toml` and similar Julia-ecosystem files, and to give one canonical manifest per +directory — analogous to how each project directory has one `Project.toml`. +""" +manifest_path_in(dir::AbstractString) = joinpath(abspath(dir), MANIFEST_FILENAME) + +""" + $(TYPEDSIGNATURES) + +Parse a `NUMERICALEARTH_DATA` value into a `(mode, dir)` tuple. `dir` is the directory the manifest +will be written to / read from; the filename is always `NumericalEarthDataManifest.toml`. Recognized values: -- `""` or `"auto"` → `(:auto, "")` -- `"existing"` → `(:existing, "")` -- `"build:"` → `(:build, "")` +- `""` or `"auto"` → `(:auto, "")` +- `"strict"` → `(:strict, "")` +- `"pregenerate"` → `(:pregenerate, "")` — writes to the cwd at trace time +- `"pregenerate:"` → `(:pregenerate, "")` — writes to `/NumericalEarthDataManifest.toml` -Throws `ArgumentError` on any other value or on `"build:"` without a path. +Throws `ArgumentError` on any other value. """ function parse_data_mode(s::AbstractString) (isempty(s) || s == "auto") && return (:auto, "") - s == "existing" && return (:existing, "") - if startswith(s, "build:") - path = s[length("build:")+1:end] - isempty(path) && throw(ArgumentError("`NUMERICALEARTH_DATA=build:` requires a non-empty manifest path")) - return (:build, path) + s == "strict" && return (:strict, "") + s == "pregenerate" && return (:pregenerate, "") + if startswith(s, "pregenerate:") + dir = s[length("pregenerate:")+1:end] + isempty(dir) && throw(ArgumentError("`NUMERICALEARTH_DATA=pregenerate:` requires a non-empty directory")) + return (:pregenerate, dir) end - throw(ArgumentError("Unrecognized NUMERICALEARTH_DATA value: $(repr(s)). 
Expected \"auto\", \"existing\", or \"build:\".")) + throw(ArgumentError("Unrecognized NUMERICALEARTH_DATA value: $(repr(s)). Expected \"auto\", \"strict\", \"pregenerate\", or \"pregenerate:\".")) end """ @@ -77,7 +91,7 @@ filename_to_toml(s::AbstractString) = String(s) filename_to_toml(f::DatewiseFilename) = collect(String, f.filenames) function metadata_to_dict(m::Metadatum) - d = Dict{String, Any}("variable_name" => String(m.name), "dataset" => dataset_name(m.dataset)) + d = Dict{String, Any}("variable_name" => String(m.name)) m.dates === nothing || (d["date"] = m.dates) m.region === nothing || (d["region"] = region_to_dict(m.region)) m.filename === nothing || (d["filename"] = filename_to_toml(m.filename)) @@ -85,7 +99,7 @@ function metadata_to_dict(m::Metadatum) end function metadata_to_dict(m::Metadata) - d = Dict{String, Any}("variable_name" => String(m.name), "dataset" => dataset_name(m.dataset), + d = Dict{String, Any}("variable_name" => String(m.name), "start_date" => first(m.dates), "end_date" => last(m.dates)) m.region === nothing || (d["region"] = region_to_dict(m.region)) m.filename === nothing || (d["filename"] = filename_to_toml(m.filename)) @@ -93,7 +107,7 @@ function metadata_to_dict(m::Metadata) end function metadata_to_dict(mset::MetadataSet) - d = Dict{String, Any}("variable_names" => [String(n) for n in mset.names], "dataset" => dataset_name(mset.dataset)) + d = Dict{String, Any}("variable_names" => [String(n) for n in mset.names]) if mset.dates isa AbstractVector d["start_date"] = first(mset.dates) d["end_date"] = last(mset.dates) @@ -104,28 +118,34 @@ function metadata_to_dict(mset::MetadataSet) return d end -manifest_table_key(::Metadatum) = "metadatum" -manifest_table_key(::Metadata) = "metadata" -manifest_table_key(::MetadataSet) = "metadataset" - """ $(TYPEDSIGNATURES) -Serialize `records` (a vector of `AbstractMetadata`) to `io` as a `DataManifest.toml` with three -table arrays: `[[metadatum]]`, `[[metadata]]`, `[[metadataset]]`. 
+Serialize `records` (a vector of `AbstractMetadata`) to `io` as a `NumericalEarthDataManifest.toml`, +with one table array per dataset: + +```toml +[[ETOPO2022]] +variable_name = "bathymetry" + +[[JRA55RepeatYear]] +variable_names = ["eastward_wind", "northward_wind", ...] +start_date = 1990-01-01T00:00:00 +end_date = 1990-12-31T18:00:00 + +[[GLORYSDaily]] +variable_name = "temperature" +date = 2020-06-15T00:00:00 +``` The download directory (`dir`) is not stored. The loader uses each dataset's default directory -unless overridden by `download_datasets(...; dir=...)`. +unless overridden by `download_datasets(; download_dir=...)`. """ function write_manifest(io::IO, records::AbstractVector) - grouped = Dict{String, Vector{Dict{String, Any}}}("metadatum" => [], - "metadata" => [], - "metadataset" => []) + grouped = Dict{String, Vector{Dict{String, Any}}}() for r in records - push!(grouped[manifest_table_key(r)], metadata_to_dict(r)) - end - for k in ("metadatum", "metadata", "metadataset") - isempty(grouped[k]) && delete!(grouped, k) + entries = get!(() -> Dict{String, Any}[], grouped, dataset_name(r.dataset)) + push!(entries, metadata_to_dict(r)) end TOML.print(io, grouped) return nothing @@ -172,54 +192,48 @@ end ##### AbstractMetadata reconstruction ##### -function from_toml(kind::Symbol, entry::AbstractDict; dir = nothing) - dataset = lookup_dataset(entry["dataset"]) +function from_toml(dataset_name::AbstractString, entry::AbstractDict; download_dir = nothing) + dataset = lookup_dataset(dataset_name) region = region_from_toml(get(entry, "region", nothing)) filename = filename_from_toml(get(entry, "filename", nothing)) - download_dir = dir === nothing ? 
default_download_directory(dataset) : String(dir) - if kind === :metadatum - name = Symbol(entry["variable_name"]) - return Metadatum(name; dataset, region, filename, dir = download_dir, date = get(entry, "date", nothing)) - elseif kind === :metadata - name = Symbol(entry["variable_name"]) - return Metadata(name; dataset, region, filename, dir = download_dir, - start_date = entry["start_date"], end_date = entry["end_date"]) - elseif kind === :metadataset + dir = download_dir === nothing ? default_download_directory(dataset) : String(download_dir) + if haskey(entry, "variable_names") names = Tuple(Symbol(n) for n in entry["variable_names"]) haskey(entry, "date") && - return MetadataSet(names...; dataset, region, dir = download_dir, date = entry["date"]) - return MetadataSet(names...; dataset, region, dir = download_dir, + return MetadataSet(names...; dataset, region, dir, date = entry["date"]) + return MetadataSet(names...; dataset, region, dir, start_date = entry["start_date"], end_date = entry["end_date"]) - else - throw(ArgumentError("Unknown manifest record kind: $(repr(kind))")) end + name = Symbol(entry["variable_name"]) + haskey(entry, "start_date") && + return Metadata(name; dataset, region, filename, dir, + start_date = entry["start_date"], end_date = entry["end_date"]) + return Metadatum(name; dataset, region, filename, dir, date = get(entry, "date", nothing)) end """ $(TYPEDSIGNATURES) -Read a `DataManifest.toml` and reconstruct every record as the matching `Metadatum`/`Metadata`/`MetadataSet`. -Datasets are looked up by name in [`DATASET_REGISTRY`](@ref). +Read the manifest at `joinpath(dir, "NumericalEarthDataManifest.toml")` and reconstruct every +record as the matching `Metadatum`/`Metadata`/`MetadataSet`. Datasets are looked up by name in +[`DATASET_REGISTRY`](@ref). 
-Pass `dir` to override every reconstructed record's download directory (useful when login-node and -compute-node filesystems differ); otherwise `default_download_directory(dataset)` is used. +Pass `download_dir` to override every reconstructed record's download directory (useful when +login-node and compute-node filesystems differ); otherwise `default_download_directory(dataset)` +is used. """ -function read_manifest(path::AbstractString; dir = nothing) - raw = TOML.parsefile(path) - return manifest_from_dict(raw; dir) +function read_manifest(; dir::AbstractString = pwd(), download_dir = nothing) + raw = TOML.parsefile(manifest_path_in(dir)) + return manifest_from_dict(raw; download_dir) end -function read_manifest(io::IO; dir = nothing) - raw = TOML.parse(read(io, String)) - return manifest_from_dict(raw; dir) -end +read_manifest(io::IO; download_dir = nothing) = manifest_from_dict(TOML.parse(read(io, String)); download_dir) -function manifest_from_dict(raw::AbstractDict; dir = nothing) +function manifest_from_dict(raw::AbstractDict; download_dir = nothing) records = AbstractMetadata[] - for k in (:metadatum, :metadata, :metadataset) - haskey(raw, String(k)) || continue - for entry in raw[String(k)] - push!(records, Base.invokelatest(from_toml, k, entry; dir)) + for (name, entries) in raw + for entry in entries + push!(records, Base.invokelatest(from_toml, name, entry; download_dir)) end end return records @@ -228,20 +242,21 @@ end """ $(TYPEDSIGNATURES) -Trace `script` in build-mode and write the resulting `DataManifest.toml` to `manifest`. +Trace `script` in `:pregenerate` mode and write the resulting manifest to +`joinpath(dir, "NumericalEarthDataManifest.toml")`. The script's source is parsed with `Meta.parseall`, every statement is wrapped in a per-statement `try`/`catch` that rebinds failed assignments to [`DryRunValue`](@ref), and the rewritten code is -evaluated in a fresh sandbox module with `DATA_MODE[] = :build`. 
Each [`download_dataset`](@ref) call +evaluated in a fresh sandbox module with `DATA_MODE[] = :pregenerate`. Each [`download_dataset`](@ref) call records its metadata into [`RECORDED`](@ref) instead of downloading. The accumulated records are then serialized via [`write_manifest`](@ref). -When `overwrite_existing = false` and `manifest` already exists, the existing records are read first -and merged (deduplicated) with the newly recorded ones, so this call appends rather than replaces. -Defaults to `true` (replace). +When `overwrite_existing = false` and a manifest already exists at `dir`, the existing records are +read first and merged (deduplicated) with the newly recorded ones, so this call appends rather +than replaces. Defaults to `true` (replace). """ -function build_dataset_manifest(script::AbstractString; - manifest::AbstractString = "DataManifest.toml", +function pregenerate_dataset_manifest(script::AbstractString; + dir::AbstractString = pwd(), overwrite_existing::Bool = true) script_abs = abspath(script) source = read(script_abs, String) @@ -252,7 +267,7 @@ function build_dataset_manifest(script::AbstractString; saved_mode = DATA_MODE[] saved_records = copy(RECORDED) empty!(RECORDED) - DATA_MODE[] = :build + DATA_MODE[] = :pregenerate new_records = AbstractMetadata[] try @@ -267,8 +282,9 @@ function build_dataset_manifest(script::AbstractString; append!(RECORDED, saved_records) end + manifest = manifest_path_in(dir) if !overwrite_existing && isfile(manifest) - for r in read_manifest(manifest) + for r in read_manifest(; dir) any(==(r), new_records) || pushfirst!(new_records, r) end end diff --git a/src/DataWrangling/DataModes/dry_run_value.jl b/src/DataWrangling/DataModes/dry_run_value.jl index 0850f577..e3affeb3 100644 --- a/src/DataWrangling/DataModes/dry_run_value.jl +++ b/src/DataWrangling/DataModes/dry_run_value.jl @@ -1,7 +1,7 @@ """ DryRunValue() -Sentinel returned when a statement under [`build_dataset_manifest`](@ref) tracing either errors or 
+Sentinel returned when a statement under [`pregenerate_dataset_manifest`](@ref) tracing either errors or stands in for a value that real data would have produced. The per-statement `try`/`catch` wrappers rebind any failed assignment to a `DryRunValue`, so the script continues running and downstream `download_dataset` calls still register their metadata. diff --git a/src/DataWrangling/DataModes/parse_and_rewrite_script.jl b/src/DataWrangling/DataModes/parse_and_rewrite_script.jl index 22194930..0082c83d 100644 --- a/src/DataWrangling/DataModes/parse_and_rewrite_script.jl +++ b/src/DataWrangling/DataModes/parse_and_rewrite_script.jl @@ -26,6 +26,9 @@ end function wrap_assignment(lhs, rhs) if lhs isa Symbol return Expr(:(=), lhs, :(try; $rhs; catch; $DryRunValue(); end)) + elseif lhs isa Expr && lhs.head === :tuple + fallback = Expr(:tuple, fill(:($DryRunValue()), length(lhs.args))...) + return Expr(:(=), lhs, :(try; $rhs; catch; $fallback; end)) end return :(try; $lhs = $rhs; catch; end) end diff --git a/src/DataWrangling/DataWrangling.jl b/src/DataWrangling/DataWrangling.jl index 2e6d5232..56fcad5b 100644 --- a/src/DataWrangling/DataWrangling.jl +++ b/src/DataWrangling/DataWrangling.jl @@ -245,7 +245,7 @@ abstract type AbstractMetadata end """ Hook called at the end of every `AbstractMetadata` inner constructor. The default is a no-op; [`NumericalEarth.DataWrangling.DataModes`](@ref) adds more-specific methods on `Metadata` and -`MetadataSet` that record into the manifest in `:build` mode, so the trace captures Metadata +`MetadataSet` that record into the manifest in `:pregenerate` mode, so the trace captures Metadata constructed inside library functions too. """ observe_metadata(::AbstractMetadata) = nothing @@ -376,8 +376,8 @@ Acquire the data referenced by `metadata` according to the current `NUMERICALEARTH_DATA` mode (see [`DataModes`](@ref)): - `:auto` — call `Downloads.download(metadata)` (the per-dataset method). 
-- `:existing` — verify every required file is already on disk; error otherwise. -- `:build` — no-op (metadata is recorded into the manifest by `observe_metadata` at construction). +- `:strict` — verify every required file is already on disk; error otherwise. +- `:pregenerate` — no-op (metadata is recorded into the manifest by `observe_metadata` at construction). This is the single chokepoint through which every code path that needs dataset files must go. Per-dataset modules keep extending `Downloads.download` for the `:auto` branch only. @@ -385,8 +385,8 @@ Per-dataset modules keep extending `Downloads.download` for the `:auto` branch o function download_dataset(metadata::AbstractMetadata) mode = DataModes.DATA_MODE[] mode === :auto && return Downloads.download(metadata) - mode === :existing && return DataModes.check_files_exist(metadata) - mode === :build && return nothing + mode === :strict && return DataModes.check_files_exist(metadata) + mode === :pregenerate && return nothing error("Unknown NUMERICALEARTH_DATA mode: $(repr(mode))") end diff --git a/src/DataWrangling/ERA5/ERA5_pressure_levels.jl b/src/DataWrangling/ERA5/ERA5_pressure_levels.jl index 84dbe000..3a6bb830 100644 --- a/src/DataWrangling/ERA5/ERA5_pressure_levels.jl +++ b/src/DataWrangling/ERA5/ERA5_pressure_levels.jl @@ -266,8 +266,8 @@ function per_column_geopotential_discretization(metadata::ERA5PressureMetadata) ϕ_sl_meta = Metadata(:geopotential_height; dataset=sl_ds, dates=metadata.dates, region=metadata.region, dir=metadata.dir) - Downloads.download(ϕ_meta) - Downloads.download(ϕ_sl_meta) + download_dataset(ϕ_meta) + download_dataset(ϕ_sl_meta) Φ = Field(first(ϕ_meta)) # 3-D geopotential, m²/s² Φ_sfc = Field(first(ϕ_sl_meta)) # 2-D surface geopotential, m²/s² diff --git a/src/DataWrangling/OSPapa/OSPapa_prescribed_atmosphere.jl b/src/DataWrangling/OSPapa/OSPapa_prescribed_atmosphere.jl index f03a3d67..180d0c69 100644 --- a/src/DataWrangling/OSPapa/OSPapa_prescribed_atmosphere.jl +++ 
b/src/DataWrangling/OSPapa/OSPapa_prescribed_atmosphere.jl @@ -59,7 +59,7 @@ function OSPapaPrescribedAtmosphere(architecture = CPU(), FT = Float32; function ospapa_fts(name) md = Metadata(name; mdkw...) - Downloads.download(md) + download_dataset(md) fts = FieldTimeSeries(md, surface_grid; time_indices_in_memory = length(md)) fill_gaps!(fts; max_gap = max_gap_hours) return fts diff --git a/src/DataWrangling/OSPapa/OSPapa_prescribed_fluxes.jl b/src/DataWrangling/OSPapa/OSPapa_prescribed_fluxes.jl index 9c525781..6547b460 100644 --- a/src/DataWrangling/OSPapa/OSPapa_prescribed_fluxes.jl +++ b/src/DataWrangling/OSPapa/OSPapa_prescribed_fluxes.jl @@ -44,7 +44,7 @@ function os_papa_prescribed_fluxes(architecture = CPU(), FT = Float64; function flux_fts(name) md = Metadata(name; mdkw...) - Downloads.download(md) + download_dataset(md) fts = FieldTimeSeries(md, surface_grid; time_indices_in_memory = length(md), time_indexing = Cyclical()) diff --git a/src/DataWrangling/OSPapa/OSPapa_prescribed_radiation.jl b/src/DataWrangling/OSPapa/OSPapa_prescribed_radiation.jl index b8e26caf..a8cfe910 100644 --- a/src/DataWrangling/OSPapa/OSPapa_prescribed_radiation.jl +++ b/src/DataWrangling/OSPapa/OSPapa_prescribed_radiation.jl @@ -27,7 +27,7 @@ function OSPapaPrescribedRadiation(architecture = CPU(), FT = Float32; function ospapa_fts(name) md = Metadata(name; mdkw...) - Downloads.download(md) + download_dataset(md) fts = FieldTimeSeries(md, surface_grid; time_indices_in_memory = length(md)) fill_gaps!(fts; max_gap = max_gap_hours) return fts diff --git a/src/DataWrangling/metadata.jl b/src/DataWrangling/metadata.jl index 35ebd948..cc1d878b 100644 --- a/src/DataWrangling/metadata.jl +++ b/src/DataWrangling/metadata.jl @@ -563,7 +563,7 @@ results of each per-variable `download` call (typically the file path(s)). """ function Downloads.download(mset::MetadataSet; kwargs...) 
names = getfield(mset, :names) - return NamedTuple{names}(map(n -> Downloads.download(mset[n]; kwargs...), names)) + return NamedTuple{names}(map(n -> download_dataset(mset[n]), names)) end """ diff --git a/src/NumericalEarth.jl b/src/NumericalEarth.jl index 2ef5330c..21562193 100644 --- a/src/NumericalEarth.jl +++ b/src/NumericalEarth.jl @@ -177,6 +177,33 @@ using .DataWrangling.OSPapa using PrecompileTools: @setup_workload, @compile_workload +""" +Auto-download datasets listed in `NumericalEarthDataManifest.toml` whenever a manifest sits next +to the active project's `Project.toml` and we're running in `:auto` mode. Cached files are skipped +by each dataset's per-dataset `Downloads.download` method, so subsequent runs are cheap. + +Skipped during precompilation, in `:strict` mode (the manifest is the user's promise that data is +already on disk, not a fetch request), and in `:pregenerate` mode (we're recording, not downloading). +""" +function __init__() + ccall(:jl_generating_output, Cint, ()) == 1 && return nothing + DataWrangling.DataModes.DATA_MODE[] === :auto || return nothing + + project = Base.active_project() + project === nothing && return nothing + project_dir = dirname(project) + manifest = joinpath(project_dir, DataWrangling.DataModes.MANIFEST_FILENAME) + isfile(manifest) || return nothing + + @info "NumericalEarth: auto-downloading datasets from manifest" manifest + try + DataWrangling.DataModes.download_datasets(; dir = project_dir) + catch err + @error "NumericalEarth: auto-download failed; continuing without it" manifest exception=(err, catch_backtrace()) + end + return nothing +end + @setup_workload begin Nx, Ny, Nz = 32, 32, 10 @compile_workload begin diff --git a/test/download_utils.jl b/test/download_utils.jl index 52928090..a7f6917e 100644 --- a/test/download_utils.jl +++ b/test/download_utils.jl @@ -10,6 +10,7 @@ function emit_ci_warning(title, message) end function download_from_artifacts(filepath::AbstractString; max_retries=3) + 
NumericalEarth.DataWrangling.DataModes.DATA_MODE[] === :pregenerate && return nothing filename = basename(filepath) fallback_url = ARTIFACTS_BASE_URL * filename @info "Downloading $filename from NumericalEarthArtifacts fallback..." @@ -41,9 +42,13 @@ end Try `download_fn()`. If it throws, download the required files from NumericalEarthArtifacts and retry. Emits a CI warning when the fallback is used. +In `:pregenerate` mode the fallback is skipped — `download_fn()` runs unguarded and any error +propagates to the script's per-statement wrapper, so the trace never reaches the network. + Returns the result of `download_fn()`. """ function download_dataset_with_fallback(download_fn, filepaths; dataset_name="dataset") + NumericalEarth.DataWrangling.DataModes.DATA_MODE[] === :pregenerate && return download_fn() try return download_fn() catch e diff --git a/test/runtests.jl b/test/runtests.jl index aba5f4fa..7ea5736a 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -70,7 +70,7 @@ function __init__() ETOPOmetadata = Metadatum(:bottom_height, dataset=NumericalEarth.ETOPO.ETOPO2022()) download_dataset_with_fallback(metadata_path(ETOPOmetadata); dataset_name="ETOPO2022") do - download(ETOPOmetadata) + download_dataset(ETOPOmetadata) end ##### @@ -109,14 +109,14 @@ function __init__() for md in ts_set download_dataset_with_fallback(metadata_path(md); dataset_name="$(typeof(dataset)) $(md.name)") do - download(md) + download_dataset(md) end end if dataset isa Union{ECCO2DarwinMonthly, ECCO4DarwinMonthly} PO₄_metadata = Metadata(:phosphate; dataset, dates) download_dataset_with_fallback(metadata_path(PO₄_metadata); dataset_name="$(typeof(dataset)) phosphate") do - download(PO₄_metadata) + download_dataset(PO₄_metadata) end end end diff --git a/test/runtests_setup.jl b/test/runtests_setup.jl index 0b98b8ba..86a6f1ed 100644 --- a/test/runtests_setup.jl +++ b/test/runtests_setup.jl @@ -117,7 +117,7 @@ function test_ocean_metadata_utilities(arch, dataset, dates, inpainting; 
metadata = Metadata(name; dates, dataset) filepaths = [metadata_path(datum) for datum in metadata] download_dataset_with_fallback(filepaths; dataset_name="$(typeof(dataset)) $name") do - download(metadata) + download_dataset(metadata) end restoring = DatasetRestoring(metadata, arch; rate=1/1000, inpainting) @@ -175,7 +175,7 @@ function test_dataset_restoring(arch, dataset, dates, inpainting; metadata = Metadata(name; dates, dataset) filepaths = [metadata_path(datum) for datum in metadata] download_dataset_with_fallback(filepaths; dataset_name="$(typeof(dataset)) $name") do - download(metadata) + download_dataset(metadata) end var_restoring = DatasetRestoring(metadata, arch; mask, inpainting, rate=1/1000) @@ -216,7 +216,7 @@ function test_timestepping_with_dataset_restoring(arch, dataset, dates, inpainti metadata = Metadata(varnames[end]; dates, dataset) filepaths = [metadata_path(datum) for datum in metadata] download_dataset_with_fallback(filepaths; dataset_name="$(typeof(dataset)) $(varnames[end])") do - download(metadata) + download_dataset(metadata) end restoring = DatasetRestoring(metadata, arch; inpainting, rate=1/1000) forcing = NamedTuple{tuple(fldnames[end])}(tuple(restoring)) diff --git a/test/test_bathymetry.jl b/test/test_bathymetry.jl index bf714aa7..26df4a40 100644 --- a/test/test_bathymetry.jl +++ b/test/test_bathymetry.jl @@ -18,7 +18,7 @@ using Statistics # Testing downloading download_dataset_with_fallback(filepath; dataset_name="ETOPO2022") do - download(ETOPOmetadata) + download_dataset(ETOPOmetadata) end @test isfile(filepath) diff --git a/test/test_cds_downloading.jl b/test/test_cds_downloading.jl index 4da3e63a..3a24a278 100644 --- a/test/test_cds_downloading.jl +++ b/test/test_cds_downloading.jl @@ -39,7 +39,7 @@ start_date = DateTime(2005, 2, 16, 12) # Download the data (falls back to NumericalEarthArtifacts if CDS is unreachable) download_dataset_with_fallback(filepath; dataset_name="ERA5Hourly $variable") do - download(metadatum) + 
download_dataset(metadatum) end @test isfile(filepath) @@ -299,7 +299,7 @@ start_date = DateTime(2005, 2, 16, 12) # Download if not present (falls back to NumericalEarthArtifacts if CDS is unreachable) filepath = metadata_path(metadatum) isfile(filepath) || download_dataset_with_fallback(filepath; dataset_name="ERA5Hourly $variable") do - download(metadatum) + download_dataset(metadatum) end # Create a Field from the downloaded data @@ -325,7 +325,7 @@ start_date = DateTime(2005, 2, 16, 12) # Download if not present (falls back to NumericalEarthArtifacts if CDS is unreachable) filepath = metadata_path(metadatum) isfile(filepath) || download_dataset_with_fallback(filepath; dataset_name="ERA5Hourly $variable") do - download(metadatum) + download_dataset(metadatum) end # Create a target grid matching the bounding box region @@ -361,7 +361,7 @@ start_date = DateTime(2005, 2, 16, 12) filepath = metadata_path(meta) isfile(filepath) && rm(filepath; force=true) - download(meta) + download_dataset(meta) @test isfile(filepath) # Verify the NetCDF has a pressure_level dimension and the right variable diff --git a/test/test_data_modes.jl b/test/test_data_modes.jl index 4308903d..fdc727d1 100644 --- a/test/test_data_modes.jl +++ b/test/test_data_modes.jl @@ -5,7 +5,8 @@ using NumericalEarth.DataWrangling: AbstractMetadata, Metadatum, Metadata, Metad download_dataset using NumericalEarth.DataWrangling.DataModes: DataModes, parse_data_mode, register_dataset!, write_manifest, read_manifest, download_datasets, - build_dataset_manifest, DryRunValue + pregenerate_dataset_manifest, DryRunValue, + MANIFEST_FILENAME, manifest_path_in using Downloads: Downloads using Dates: DateTime @@ -36,11 +37,12 @@ end @testset "parse_data_mode" begin @test parse_data_mode("auto") == (:auto, "") @test parse_data_mode("") == (:auto, "") - @test parse_data_mode("existing") == (:existing, "") - @test parse_data_mode("build:foo.toml") == (:build, "foo.toml") - @test 
parse_data_mode("build:path/to/manifest.toml") == (:build, "path/to/manifest.toml") + @test parse_data_mode("strict") == (:strict, "") + @test parse_data_mode("pregenerate") == (:pregenerate, "") + @test parse_data_mode("pregenerate:/tmp/m") == (:pregenerate, "/tmp/m") + @test parse_data_mode("pregenerate:relative/dir") == (:pregenerate, "relative/dir") - @test_throws ArgumentError parse_data_mode("build:") + @test_throws ArgumentError parse_data_mode("pregenerate:") @test_throws ArgumentError parse_data_mode("garbage") end @@ -56,7 +58,7 @@ end dates_vec = [DateTime(2020, 1, 1), DateTime(2020, 1, 2)] m_multi_missing = Metadata(:t, nothing, dates_vec, nothing, dir, - NumericalEarth.DataWrangling.DatewiseFilename(["a.nc", "b.nc"])) + DatewiseFilename(["a.nc", "b.nc"])) err = try DataModes.check_files_exist(m_multi_missing) nothing @@ -69,7 +71,7 @@ end end end -@testset "write_manifest TOML serialization" begin +@testset "write_manifest groups by dataset" begin register_dataset!(FakeDataset, "FakeDataset") md_um = Metadata(:bathymetry, FakeDataset(), nothing, nothing, "/tmp", "b.nc") @@ -87,41 +89,36 @@ end write_manifest(io, records) parsed = TOML.parse(String(take!(io))) - @test haskey(parsed, "metadatum") - @test haskey(parsed, "metadata") - @test haskey(parsed, "metadataset") - - @test any(e -> e["variable_name"] == "bathymetry" && e["dataset"] == "FakeDataset" - && !haskey(e, "date") && !haskey(e, "dir") && !haskey(e, "region"), - parsed["metadatum"]) - @test any(e -> e["variable_name"] == "temperature" && e["date"] == DateTime(2020, 1, 1), - parsed["metadatum"]) - @test any(e -> e["variable_name"] == "eastward_velocity" && haskey(e, "region") - && e["region"]["kind"] == "BoundingBox" - && e["region"]["longitude"] == [200.0, 220.0], - parsed["metadatum"]) - - @test any(e -> e["variable_name"] == "salinity" - && e["start_date"] == DateTime(2020, 1, 1) - && e["end_date"] == DateTime(2020, 12, 31), - parsed["metadata"]) + @test collect(keys(parsed)) == 
["FakeDataset"] + entries = parsed["FakeDataset"] + @test length(entries) == 5 + @test all(!haskey(e, "dataset") for e in entries) - @test any(e -> e["variable_names"] == ["T", "S"] - && e["dataset"] == "FakeDataset" - && e["start_date"] == DateTime(2020, 1, 1) - && e["end_date"] == DateTime(2020, 12, 31), - parsed["metadataset"]) + @test any(e -> get(e, "variable_name", nothing) == "bathymetry" && !haskey(e, "date") && !haskey(e, "region"), entries) + @test any(e -> get(e, "variable_name", nothing) == "temperature" && get(e, "date", nothing) == DateTime(2020, 1, 1), entries) + @test any(e -> get(e, "variable_name", nothing) == "eastward_velocity" + && haskey(e, "region") && e["region"]["kind"] == "BoundingBox" + && e["region"]["longitude"] == [200.0, 220.0], + entries) + @test any(e -> get(e, "variable_name", nothing) == "salinity" + && get(e, "start_date", nothing) == DateTime(2020, 1, 1) + && get(e, "end_date", nothing) == DateTime(2020, 12, 31), + entries) + @test any(e -> get(e, "variable_names", nothing) == ["T", "S"] + && get(e, "start_date", nothing) == DateTime(2020, 1, 1) + && get(e, "end_date", nothing) == DateTime(2020, 12, 31), + entries) - # Region: Column col = Column(45.0, 30.0; z=(-400.0, 0.0), interpolation=Nearest()) md_col = Metadata(:temperature, FakeDataset(), DateTime(2020, 1, 1), col, "/tmp", "t.nc") io2 = IOBuffer() write_manifest(io2, AbstractMetadata[md_col]) parsed2 = TOML.parse(String(take!(io2))) - @test parsed2["metadatum"][1]["region"]["kind"] == "Column" - @test parsed2["metadatum"][1]["region"]["longitude"] == 45.0 - @test parsed2["metadatum"][1]["region"]["latitude"] == 30.0 - @test parsed2["metadatum"][1]["region"]["interpolation"] == "Nearest" + col_entry = parsed2["FakeDataset"][1] + @test col_entry["region"]["kind"] == "Column" + @test col_entry["region"]["longitude"] == 45.0 + @test col_entry["region"]["latitude"] == 30.0 + @test col_entry["region"]["interpolation"] == "Nearest" end @testset "read_manifest round-trip" begin 
@@ -136,9 +133,9 @@ end start_date=DateTime(2020, 3, 1), end_date=DateTime(2020, 8, 1)) mktempdir() do dir - path = joinpath(dir, "DataManifest.toml") + path = manifest_path_in(dir) write_manifest(path, AbstractMetadata[md_one, md_range, md_region, mset]) - records = read_manifest(path) + records = read_manifest(; dir) @test length(records) == 4 rt = first(r for r in records if r isa Metadatum && r.name == :temperature && r.region === nothing) @@ -159,7 +156,7 @@ end end end -@testset "download_datasets varargs and manifest path" begin +@testset "download_datasets varargs and manifest dir" begin register_dataset!(FakeDataset, "FakeDataset") saved = DataModes.DATA_MODE[] REAL_DOWNLOAD_CALLS = Ref(0) @@ -173,10 +170,9 @@ end @test REAL_DOWNLOAD_CALLS[] == 2 mktempdir() do dir - path = joinpath(dir, "DataManifest.toml") - write_manifest(path, AbstractMetadata[m1, m2]) + write_manifest(manifest_path_in(dir), AbstractMetadata[m1, m2]) REAL_DOWNLOAD_CALLS[] = 0 - download_datasets(path) + download_datasets(; dir) @test REAL_DOWNLOAD_CALLS[] == 2 end finally @@ -188,7 +184,7 @@ end register_dataset!(FakeDataset, "FakeDataset") saved = DataModes.DATA_MODE[] try - DataModes.DATA_MODE[] = :build + DataModes.DATA_MODE[] = :pregenerate empty!(DataModes.RECORDED) library_constructor() = (Metadatum(:temperature; dataset=FakeDataset(), date=DateTime(2020, 6, 1)), @@ -211,9 +207,9 @@ end @test sprint(show, v) == "DryRunValue()" end -@testset "build_dataset_manifest append (overwrite_existing=false)" begin +@testset "pregenerate_dataset_manifest append (overwrite_existing=false)" begin mktempdir() do dir - manifest = joinpath(dir, "DataManifest.toml") + manifest = manifest_path_in(dir) script_one = joinpath(dir, "one.jl") write(script_one, """ @@ -229,9 +225,9 @@ end download_dataset(Metadatum(:temperature; dataset=AppendDataset(), date=DateTime(2020, 1, 1))) """) - build_dataset_manifest(script_one; manifest) + pregenerate_dataset_manifest(script_one; dir) parsed_after_one = 
TOML.parsefile(manifest) - @test length(parsed_after_one["metadatum"]) == 1 + @test length(parsed_after_one["AppendDataset"]) == 1 script_two = joinpath(dir, "two.jl") write(script_two, """ @@ -248,23 +244,23 @@ end download_dataset(Metadatum(:salinity; dataset=AppendDataset(), date=DateTime(2020, 1, 1))) """) - build_dataset_manifest(script_two; manifest, overwrite_existing = false) + pregenerate_dataset_manifest(script_two; dir, overwrite_existing = false) parsed_after_two = TOML.parsefile(manifest) - @test length(parsed_after_two["metadatum"]) == 2 - @test sort([e["variable_name"] for e in parsed_after_two["metadatum"]]) == ["salinity", "temperature"] + @test length(parsed_after_two["AppendDataset"]) == 2 + @test sort([e["variable_name"] for e in parsed_after_two["AppendDataset"]]) == ["salinity", "temperature"] - build_dataset_manifest(script_two; manifest, overwrite_existing = false) + pregenerate_dataset_manifest(script_two; dir, overwrite_existing = false) parsed_after_two_repeat = TOML.parsefile(manifest) - @test length(parsed_after_two_repeat["metadatum"]) == 2 + @test length(parsed_after_two_repeat["AppendDataset"]) == 2 - build_dataset_manifest(script_two; manifest, overwrite_existing = true) + pregenerate_dataset_manifest(script_two; dir, overwrite_existing = true) parsed_after_overwrite = TOML.parsefile(manifest) - @test length(parsed_after_overwrite["metadatum"]) == 1 - @test parsed_after_overwrite["metadatum"][1]["variable_name"] == "salinity" + @test length(parsed_after_overwrite["AppendDataset"]) == 1 + @test parsed_after_overwrite["AppendDataset"][1]["variable_name"] == "salinity" end end -@testset "build_dataset_manifest end-to-end" begin +@testset "pregenerate_dataset_manifest end-to-end" begin mktempdir() do dir script = joinpath(dir, "demo.jl") write(script, """ @@ -289,11 +285,10 @@ end helper() """) - manifest = joinpath(dir, "DataManifest.toml") - build_dataset_manifest(script; manifest) - parsed = TOML.parsefile(manifest) - @test 
length(get(parsed, "metadatum", [])) == 2 - @test sort([e["variable_name"] for e in parsed["metadatum"]]) == ["S", "T"] + pregenerate_dataset_manifest(script; dir) + parsed = TOML.parsefile(manifest_path_in(dir)) + @test length(get(parsed, "DemoDataset", [])) == 2 + @test sort([e["variable_name"] for e in parsed["DemoDataset"]]) == ["S", "T"] end end @@ -306,12 +301,12 @@ end download_dataset(md) @test MOCK_DOWNLOAD_CALLS[] == 1 - DataModes.DATA_MODE[] = :build + DataModes.DATA_MODE[] = :pregenerate MOCK_DOWNLOAD_CALLS[] = 0 download_dataset(md) @test MOCK_DOWNLOAD_CALLS[] == 0 - DataModes.DATA_MODE[] = :existing + DataModes.DATA_MODE[] = :strict @test_throws Exception download_dataset(md) finally DataModes.DATA_MODE[] = saved_mode diff --git a/test/test_distributed_utils.jl b/test/test_distributed_utils.jl index 1f29e49a..10bac060 100644 --- a/test/test_distributed_utils.jl +++ b/test/test_distributed_utils.jl @@ -59,7 +59,7 @@ metadata_filename(::TrivalBathymetry, name, date, region) = "trivial_bathymetry. 
@testset "Distributed ECCO download" begin dates = DateTimeProlepticGregorian(1992, 1, 1) : Month(1) : DateTimeProlepticGregorian(1994, 4, 1) metadata = Metadata(:u_velocity; dataset=ECCO4Monthly(), dates) - download(metadata) + download_dataset(metadata) @root for metadatum in metadata @test isfile(metadata_path(metadatum)) diff --git a/test/test_ecco2_daily.jl b/test/test_ecco2_daily.jl index 587ade4c..c0eed3f6 100644 --- a/test/test_ecco2_daily.jl +++ b/test/test_ecco2_daily.jl @@ -36,7 +36,7 @@ for arch in test_architectures # if the primary source is unreachable filepaths = [metadata_path(datum) for datum in metadata] download_dataset_with_fallback(filepaths; dataset_name="$D $name") do - download(metadata) + download_dataset(metadata) end for datum in metadata @test isfile(metadata_path(datum)) diff --git a/test/test_ecco2_monthly.jl b/test/test_ecco2_monthly.jl index c36cf456..8a88bd48 100644 --- a/test/test_ecco2_monthly.jl +++ b/test/test_ecco2_monthly.jl @@ -42,7 +42,7 @@ for arch in test_architectures, dataset in test_ecco_datasets # if the primary source is unreachable filepaths = [metadata_path(datum) for datum in metadata] download_dataset_with_fallback(filepaths; dataset_name="$D $name") do - download(metadata) + download_dataset(metadata) end for datum in metadata @test isfile(metadata_path(datum)) diff --git a/test/test_ecco4_en4.jl b/test/test_ecco4_en4.jl index 6d209312..0a008613 100644 --- a/test/test_ecco4_en4.jl +++ b/test/test_ecco4_en4.jl @@ -39,7 +39,7 @@ for arch in test_architectures, dataset in test_ecco_en4_datasets # if the primary source is unreachable filepaths = [metadata_path(datum) for datum in metadata] download_dataset_with_fallback(filepaths; dataset_name="$D $name") do - download(metadata) + download_dataset(metadata) end for datum in metadata @test isfile(metadata_path(datum)) diff --git a/test/test_ecco_atmosphere.jl b/test/test_ecco_atmosphere.jl index 01293380..0009b679 100644 --- a/test/test_ecco_atmosphere.jl +++ 
b/test/test_ecco_atmosphere.jl @@ -15,7 +15,7 @@ let dates = DateTime(1992, 1, 1):Month(1):DateTime(1992, 3, 1) for name in NumericalEarth.ECCO.ECCO_atmosphere_variables md = Metadata(name; dataset=ECCO4Monthly(), dates) download_dataset_with_fallback(metadata_path(md); dataset_name="ECCO4Monthly $name") do - download(md) + download_dataset(md) end end end diff --git a/test/test_glorys_downloading.jl b/test/test_glorys_downloading.jl index 6dcc23b3..a09995d9 100644 --- a/test/test_glorys_downloading.jl +++ b/test/test_glorys_downloading.jl @@ -19,7 +19,7 @@ using Oceananigans.Fields: location metadatum = Metadatum(variable; dataset, region) filepath = NumericalEarth.DataWrangling.metadata_path(metadatum) isfile(filepath) && rm(filepath; force=true) - download(metadatum) + download_dataset(metadatum) @test isfile(filepath) end end diff --git a/test/test_jra55_ecco_en4_etopo_downloading.jl b/test/test_jra55_ecco_en4_etopo_downloading.jl index 5ea6ed3f..960d45a2 100644 --- a/test/test_jra55_ecco_en4_etopo_downloading.jl +++ b/test/test_jra55_ecco_en4_etopo_downloading.jl @@ -35,7 +35,7 @@ end isfile(filepath) && rm(filepath; force=true) download_dataset_with_fallback(filepath; dataset_name="$(typeof(dataset)) $variable") do - download(metadata) + download_dataset(metadata) end @test isfile(filepath) rm(filepath; force=true) @@ -50,7 +50,7 @@ end isfile(filepath) && rm(filepath; force=true) download_dataset_with_fallback(filepath; dataset_name="ETOPO2022") do - download(metadata) + download_dataset(metadata) end @test isfile(filepath) end diff --git a/test/test_mangling.jl b/test/test_mangling.jl index 540869c1..22a32c2f 100644 --- a/test/test_mangling.jl +++ b/test/test_mangling.jl @@ -34,7 +34,7 @@ end @testset "ECCO v_velocity Field uses ShiftSouth mangling end-to-end" begin md = Metadatum(:v_velocity; dataset=ECCO4Monthly(), date=start_date) download_dataset_with_fallback([metadata_path(md)]; dataset_name="ECCO4Monthly v_velocity") do - download(md) + 
download_dataset(md) end for arch in test_architectures field = Field(md, arch) diff --git a/test/test_orca_grid.jl b/test/test_orca_grid.jl index b3d73552..60e0bf7b 100644 --- a/test/test_orca_grid.jl +++ b/test/test_orca_grid.jl @@ -16,7 +16,7 @@ using Test for name in (:mesh_mask, :bottom_height) md = Metadatum(name; dataset=ORCA1()) download_dataset_with_fallback(metadata_path(md); dataset_name="ORCA1 $name") do - download(md) + download_dataset(md) end end @@ -165,7 +165,7 @@ end @testset "ORCA1 bathymetry retrieval" begin bathy_md = Metadatum(:bottom_height; dataset=ORCA1()) - download(bathy_md) + download_dataset(bathy_md) path = metadata_path(bathy_md) @test isfile(path) diff --git a/test/test_woa.jl b/test/test_woa.jl index 7f9293ac..b3b6d607 100644 --- a/test/test_woa.jl +++ b/test/test_woa.jl @@ -16,7 +16,7 @@ inpainting = NearestNeighborInpainting(10) function ensure_woa_file(metadatum; label) filepath = metadata_path(metadatum) download_dataset_with_fallback(filepath; dataset_name=label) do - download(metadatum) + download_dataset(metadatum) end return filepath end From d17146c8565a098b1d3c47cc008b5b32e0d6772a Mon Sep 17 00:00:00 2001 From: Simone Silvestri Date: Fri, 22 May 2026 15:52:10 +0200 Subject: [PATCH 3/7] change manifest --- src/DataWrangling/JRA55/JRA55.jl | 1 - src/DataWrangling/JRA55/JRA55_metadata.jl | 1 - src/DataWrangling/metadata.jl | 19 +++++++++---------- src/NumericalEarth.jl | 5 +++++ test/runtests_setup.jl | 3 +-- test/test_diagnostics_1.jl | 4 ++-- test/test_distributed_utils.jl | 3 +-- test/test_jra55_ecco_en4_etopo_downloading.jl | 2 +- test/test_ocean_sea_ice_model.jl | 4 ++-- 9 files changed, 21 insertions(+), 21 deletions(-) diff --git a/src/DataWrangling/JRA55/JRA55.jl b/src/DataWrangling/JRA55/JRA55.jl index 08c3f336..ee3c796f 100644 --- a/src/DataWrangling/JRA55/JRA55.jl +++ b/src/DataWrangling/JRA55/JRA55.jl @@ -7,7 +7,6 @@ export JRA55PrescribedAtmosphere, MultiYearJRA55 using Adapt: Adapt -using CFTime: CFTime using 
Dates: Dates, DateTime, Day, Hour using Downloads: Downloads using Oceananigans: Oceananigans diff --git a/src/DataWrangling/JRA55/JRA55_metadata.jl b/src/DataWrangling/JRA55/JRA55_metadata.jl index cbf647a3..120a78ec 100644 --- a/src/DataWrangling/JRA55/JRA55_metadata.jl +++ b/src/DataWrangling/JRA55/JRA55_metadata.jl @@ -1,4 +1,3 @@ -using CFTime: CFTime using Dates: Dates, DateTime, Day, Hour using Downloads: Downloads using Oceananigans.DistributedComputations diff --git a/src/DataWrangling/metadata.jl b/src/DataWrangling/metadata.jl index cc1d878b..acedc0f3 100644 --- a/src/DataWrangling/metadata.jl +++ b/src/DataWrangling/metadata.jl @@ -1,4 +1,3 @@ -using CFTime: AbstractCFDateTime, CFTime using Dates: Dates, Date, DateTime using Base: @propagate_inbounds @@ -122,17 +121,17 @@ Keyword Arguments - `dataset`: Supported datasets are `ETOPO2022()`, `ECCO2Monthly()`, `ECCO2Daily()`, `ECCO4Monthly()`, `EN4Monthly()`, `GLORYSDaily()`, `GLORYSMonthly()`, `RepeatYearJRA55()`, and `MultiYearJRA55()`. -- `dates`: The dates of the dataset (`Dates.AbstractDateTime` or `CFTime.AbstractCFDateTime`). +- `dates`: The dates of the dataset (`Dates.AbstractDateTime`). Note that `dates` can either be a range or a vector of dates, representing a time-series. For a single date, use [`Metadatum`](@ref). - `start_date`: If `dates = nothing`, we can prescribe the first date of metadata as a date - (`Dates.AbstractDateTime` or `CFTime.AbstractCFDateTime`). If outside the - date range of the dataset, the first allowable date is chosen. Default: nothing. + (`Dates.AbstractDateTime`). If outside the date range of the dataset, the first + allowable date is chosen. Default: nothing. - `end_date`: If `dates = nothing`, we can prescribe the last date of metadata as a date - (`Dates.AbstractDateTime` or `CFTime.AbstractCFDateTime`). If outside the - date range of the dataset, the last allowable date is chosen. Default: nothing. + (`Dates.AbstractDateTime`). 
If outside the date range of the dataset, the last + allowable date is chosen. Default: nothing. - `region`: Specifies the spatial region of the dataset. Can be a [`BoundingBox`](@ref) for a rectangular region, a [`Column`](@ref) for a single horizontal location, @@ -171,7 +170,7 @@ function Metadata(variable_name; return Metadata(variable_name, dataset, dates, region, dir, filename) end -const AnyDateTime = Union{AbstractCFDateTime, Dates.AbstractDateTime} +const AnyDateTime = Dates.AbstractDateTime const Metadatum{V} = Metadata{V, <:Union{AnyDateTime, Nothing}} where V function Base.size(metadata::Metadata) @@ -207,7 +206,7 @@ function Metadatum(variable_name; end if !isnothing(date) && !(date isa AnyDateTime) - msg = "`date` must be `nothing`, a `Dates.AbstractDateTime`, or `CFTime.AbstractCFDateTime`, received $(typeof(date))" + msg = "`date` must be `nothing` or a `Dates.AbstractDateTime`, received $(typeof(date))" throw(ArgumentError(msg)) end @@ -348,7 +347,7 @@ Arguments Keyword Arguments ================= - `dataset`: the shared dataset (e.g. `ECCO4Monthly()`, `ERA5HourlyPressureLevels()`). -- `dates`: shared date axis. Either a single `AbstractDateTime`/`AbstractCFDateTime` +- `dates`: shared date axis. Either a single `AbstractDateTime` (yielding a [`MetadatumSet`](@ref)) or an `AbstractVector` of dates. Defaults to `all_dates(dataset, first(variable_names))`. - `date`: convenience scalar form; cannot be used together with `dates`. 
@@ -386,7 +385,7 @@ function MetadataSet(variable_names::Symbol...; end if !isnothing(date) && !(effective_dates isa AnyDateTime) - msg = "`date` must be a `Dates.AbstractDateTime` or `CFTime.AbstractCFDateTime`, received $(typeof(date))" + msg = "`date` must be a `Dates.AbstractDateTime`, received $(typeof(date))" throw(ArgumentError(msg)) end diff --git a/src/NumericalEarth.jl b/src/NumericalEarth.jl index 21562193..9c144d4b 100644 --- a/src/NumericalEarth.jl +++ b/src/NumericalEarth.jl @@ -186,8 +186,13 @@ Skipped during precompilation, in `:strict` mode (the manifest is the user's pro already on disk, not a fetch request), and in `:pregenerate` mode (we're recording, not downloading). """ function __init__() + # Skip during precompile / sysimage build subprocess. ccall(:jl_generating_output, Cint, ()) == 1 && return nothing + # Skip in :strict / :pregenerate — those modes have their own semantics. DataWrangling.DataModes.DATA_MODE[] === :auto || return nothing + # Skip in REPL / `julia -e ...` — auto-download is only for script-mode runs + # (`julia --project my_simulation.jl`) where blocking-on-fetch matches user intent. + (!isempty(Base.PROGRAM_FILE) && isfile(Base.PROGRAM_FILE)) || return nothing project = Base.active_project() project === nothing && return nothing diff --git a/test/runtests_setup.jl b/test/runtests_setup.jl index 86a6f1ed..27c277c9 100644 --- a/test/runtests_setup.jl +++ b/test/runtests_setup.jl @@ -15,7 +15,6 @@ using NumericalEarth.WOA using Oceananigans.Architectures: architecture, on_architecture using Oceananigans.OutputReaders: interpolate! -using CFTime using Dates using CUDA: @allowscalar @@ -23,7 +22,7 @@ using CUDA: @allowscalar gpu_test = parse(Bool, get(ENV, "GPU_TEST", "false")) test_architectures = gpu_test ? 
[GPU()] : [CPU()] -start_date = DateTimeProlepticGregorian(1993, 1, 1) +start_date = DateTime(1993, 1, 1) test_datasets = (ECCO2Monthly(), ECCO2Daily(), diff --git a/test/test_diagnostics_1.jl b/test/test_diagnostics_1.jl index 8e05ada0..42933d57 100644 --- a/test/test_diagnostics_1.jl +++ b/test/test_diagnostics_1.jl @@ -23,8 +23,8 @@ for arch in test_architectures, dataset in (ECCO4Monthly(),) grid = ImmersedBoundaryGrid(grid, GridFittedBottom(bottom_height)) - start = DateTimeProlepticGregorian(1993, 1, 1) - stop = DateTimeProlepticGregorian(1993, 2, 1) + start = DateTime(1993, 1, 1) + stop = DateTime(1993, 2, 1) dates = range(start; stop, step=Month(1)) Tmeta = Metadata(:temperature; dataset, dates) diff --git a/test/test_distributed_utils.jl b/test/test_distributed_utils.jl index 10bac060..eca34746 100644 --- a/test/test_distributed_utils.jl +++ b/test/test_distributed_utils.jl @@ -3,7 +3,6 @@ include("runtests_setup.jl") using MPI MPI.Init() -using CFTime using Dates using NCDatasets using NumericalEarth.DataWrangling: metadata_path @@ -57,7 +56,7 @@ latitude_interfaces(::TrivalBathymetry) = (0, 50) metadata_filename(::TrivalBathymetry, name, date, region) = "trivial_bathymetry.nc" @testset "Distributed ECCO download" begin - dates = DateTimeProlepticGregorian(1992, 1, 1) : Month(1) : DateTimeProlepticGregorian(1994, 4, 1) + dates = DateTime(1992, 1, 1) : Month(1) : DateTime(1994, 4, 1) metadata = Metadata(:u_velocity; dataset=ECCO4Monthly(), dates) download_dataset(metadata) diff --git a/test/test_jra55_ecco_en4_etopo_downloading.jl b/test/test_jra55_ecco_en4_etopo_downloading.jl index 960d45a2..ee6760da 100644 --- a/test/test_jra55_ecco_en4_etopo_downloading.jl +++ b/test/test_jra55_ecco_en4_etopo_downloading.jl @@ -30,7 +30,7 @@ end error("what am I supposed to download?") for variable in variables - metadata = Metadata(variable; dates=DateTimeProlepticGregorian(1993, 1, 1), dataset) + metadata = Metadata(variable; dates=DateTime(1993, 1, 1), dataset) 
filepath = metadata_path(metadata) isfile(filepath) && rm(filepath; force=true) diff --git a/test/test_ocean_sea_ice_model.jl b/test/test_ocean_sea_ice_model.jl index 20d60e32..555888f8 100644 --- a/test/test_ocean_sea_ice_model.jl +++ b/test/test_ocean_sea_ice_model.jl @@ -33,9 +33,9 @@ using ClimaSeaIce.Rheologies for dataset in [ECCO4Monthly(), EN4Monthly()] @info "Testing timestepping with $(typeof(dataset)) on $A" - start_date = DateTimeProlepticGregorian(1993, 1, 1) + start_date = DateTime(1993, 1, 1) time_resolution = dataset isa ECCO2Daily ? Day(1) : Month(1) - end_date = DateTimeProlepticGregorian(1993, 2, 1) + end_date = DateTime(1993, 2, 1) dates = start_date : time_resolution : end_date initial_state = MetadataSet(:temperature, :salinity; From 719e63cfc96ff6d8fb8dd5470a2664e43e228fee Mon Sep 17 00:00:00 2001 From: Simone Silvestri Date: Fri, 22 May 2026 15:52:39 +0200 Subject: [PATCH 4/7] add a test manifest --- test/NumericalEarthDataManifest.toml | 391 +++++++++++++++++++++++++++ 1 file changed, 391 insertions(+) create mode 100644 test/NumericalEarthDataManifest.toml diff --git a/test/NumericalEarthDataManifest.toml b/test/NumericalEarthDataManifest.toml new file mode 100644 index 00000000..a5394d66 --- /dev/null +++ b/test/NumericalEarthDataManifest.toml @@ -0,0 +1,391 @@ +[[ORCA12]] +filename = "bathy_eORCA12_noclosea_from_GEBCO2021_FillZero_S21TT_CloseaCopy.nc" +variable_name = "bottom_height" +[[ORCA12]] +filename = "grid_mask_eORCA12-GO6.nc" +variable_name = "mesh_mask" + +[[GEBCO2024]] +filename = "GEBCO_2024.nc" +variable_name = "bottom_height" + +[[IBCSOv2]] +filename = "IBCSO_v2_bed_WGS84.nc" +variable_name = "bottom_height" + +[[ETOPO2022]] +filename = "ETOPO_2022_v1_60s_N90W180_surface.nc" +variable_name = "bottom_height" + +[[WOAAnnual]] +filename = "woa_t_annual.nc" +variable_name = "temperature" +[[WOAAnnual]] +filename = "woa_s_annual.nc" +variable_name = "salinity" + +[[WOAMonthly]] +filename = "woa_t_monthly_01.nc" +date = 
2018-01-01T00:00:00.000Z +variable_name = "temperature" +[[WOAMonthly]] +filename = "woa_s_monthly_01.nc" +date = 2018-01-01T00:00:00.000Z +variable_name = "salinity" + +[[ERA5HourlySingleLevel]] +filename = "2m_temperature_ERA5HourlySingleLevel_2004-12-27T00_-110.0_30.0_-25.0_35.0.nc" +date = 2004-12-27T00:00:00.000Z +variable_name = "temperature" + + [ERA5HourlySingleLevel.region] + kind = "BoundingBox" + latitude = [-25, 35] + longitude = [-110, 30] + +[[ORCA1]] +filename = "eORCA1.2_mesh_mask.nc" +variable_name = "mesh_mask" +[[ORCA1]] +filename = "eORCA_R1_bathy_meter_v2.2.nc" +variable_name = "bottom_height" + +[[IBCAOv5]] +filename = "ibcao_v5_wgs84_0p01deg.nc" +variable_name = "bottom_height" + +[[ECCO2Daily]] +start_date = 1993-01-01T00:00:00.000Z +end_date = 1993-01-05T00:00:00.000Z +filename = ["SALT.1440x720x50.19930101.nc", "SALT.1440x720x50.19930102.nc", "SALT.1440x720x50.19930103.nc", "SALT.1440x720x50.19930104.nc", "SALT.1440x720x50.19930105.nc"] +variable_name = "salinity" +[[ECCO2Daily]] +start_date = 1993-01-01T00:00:00.000Z +end_date = 1993-01-05T00:00:00.000Z +filename = ["SALT.1440x720x50.19930101.nc", "SALT.1440x720x50.19930102.nc", "SALT.1440x720x50.19930103.nc", "SALT.1440x720x50.19930104.nc", "SALT.1440x720x50.19930105.nc"] +variable_name = "salinity" +[[ECCO2Daily]] +start_date = 1993-01-01T00:00:00.000Z +end_date = 1993-01-05T00:00:00.000Z +filename = ["THETA.1440x720x50.19930101.nc", "THETA.1440x720x50.19930102.nc", "THETA.1440x720x50.19930103.nc", "THETA.1440x720x50.19930104.nc", "THETA.1440x720x50.19930105.nc"] +variable_name = "temperature" +[[ECCO2Daily]] +filename = "SALT.1440x720x50.19930105.nc" +date = 1993-01-05T00:00:00.000Z +variable_name = "salinity" +[[ECCO2Daily]] +filename = "SALT.1440x720x50.19930104.nc" +date = 1993-01-04T00:00:00.000Z +variable_name = "salinity" +[[ECCO2Daily]] +filename = "SALT.1440x720x50.19930103.nc" +date = 1993-01-03T00:00:00.000Z +variable_name = "salinity" +[[ECCO2Daily]] +filename = 
"SALT.1440x720x50.19930102.nc" +date = 1993-01-02T00:00:00.000Z +variable_name = "salinity" +[[ECCO2Daily]] +start_date = 1993-01-01T00:00:00.000Z +end_date = 1993-01-05T00:00:00.000Z +filename = ["SALT.1440x720x50.19930101.nc", "SALT.1440x720x50.19930102.nc", "SALT.1440x720x50.19930103.nc", "SALT.1440x720x50.19930104.nc", "SALT.1440x720x50.19930105.nc"] +variable_name = "salinity" +[[ECCO2Daily]] +start_date = 1993-01-01T00:00:00.000Z +end_date = 1993-01-05T00:00:00.000Z +filename = ["THETA.1440x720x50.19930101.nc", "THETA.1440x720x50.19930102.nc", "THETA.1440x720x50.19930103.nc", "THETA.1440x720x50.19930104.nc", "THETA.1440x720x50.19930105.nc"] +variable_name = "temperature" +[[ECCO2Daily]] +filename = "SALT.1440x720x50.19930101.nc" +date = 1993-01-01T00:00:00.000Z +variable_name = "salinity" +[[ECCO2Daily]] +filename = "THETA.1440x720x50.19930105.nc" +date = 1993-01-05T00:00:00.000Z +variable_name = "temperature" +[[ECCO2Daily]] +filename = "THETA.1440x720x50.19930104.nc" +date = 1993-01-04T00:00:00.000Z +variable_name = "temperature" +[[ECCO2Daily]] +filename = "THETA.1440x720x50.19930103.nc" +date = 1993-01-03T00:00:00.000Z +variable_name = "temperature" +[[ECCO2Daily]] +filename = "THETA.1440x720x50.19930102.nc" +date = 1993-01-02T00:00:00.000Z +variable_name = "temperature" +[[ECCO2Daily]] +filename = "THETA.1440x720x50.19930101.nc" +date = 1993-01-01T00:00:00.000Z +variable_name = "temperature" +[[ECCO2Daily]] +start_date = 1993-01-01T00:00:00.000Z +end_date = 1993-01-05T00:00:00.000Z +filename = ["THETA.1440x720x50.19930101.nc", "THETA.1440x720x50.19930102.nc", "THETA.1440x720x50.19930103.nc", "THETA.1440x720x50.19930104.nc", "THETA.1440x720x50.19930105.nc"] +variable_name = "temperature" +[[ECCO2Daily]] +start_date = 1993-01-01T00:00:00.000Z +end_date = 1993-01-05T00:00:00.000Z +filename = ["SALT.1440x720x50.19930101.nc", "SALT.1440x720x50.19930102.nc", "SALT.1440x720x50.19930103.nc", "SALT.1440x720x50.19930104.nc", "SALT.1440x720x50.19930105.nc"] 
+variable_name = "salinity" +[[ECCO2Daily]] +start_date = 1993-01-01T00:00:00.000Z +end_date = 1993-01-05T00:00:00.000Z +filename = ["SALT.1440x720x50.19930101.nc", "SALT.1440x720x50.19930102.nc", "SALT.1440x720x50.19930103.nc", "SALT.1440x720x50.19930104.nc", "SALT.1440x720x50.19930105.nc"] +variable_name = "salinity" +[[ECCO2Daily]] +start_date = 1993-01-01T00:00:00.000Z +end_date = 1993-01-05T00:00:00.000Z +filename = ["THETA.1440x720x50.19930101.nc", "THETA.1440x720x50.19930102.nc", "THETA.1440x720x50.19930103.nc", "THETA.1440x720x50.19930104.nc", "THETA.1440x720x50.19930105.nc"] +variable_name = "temperature" +[[ECCO2Daily]] +start_date = 1993-01-01T00:00:00.000Z +end_date = 1993-01-05T00:00:00.000Z +filename = ["SALT.1440x720x50.19930101.nc", "SALT.1440x720x50.19930102.nc", "SALT.1440x720x50.19930103.nc", "SALT.1440x720x50.19930104.nc", "SALT.1440x720x50.19930105.nc"] +variable_name = "salinity" +[[ECCO2Daily]] +start_date = 1993-01-01T00:00:00.000Z +end_date = 1993-01-05T00:00:00.000Z +filename = ["THETA.1440x720x50.19930101.nc", "THETA.1440x720x50.19930102.nc", "THETA.1440x720x50.19930103.nc", "THETA.1440x720x50.19930104.nc", "THETA.1440x720x50.19930105.nc"] +variable_name = "temperature" +[[ECCO2Daily]] +start_date = 1993-01-01T00:00:00.000Z +end_date = 1993-01-05T00:00:00.000Z +filename = ["THETA.1440x720x50.19930101.nc", "THETA.1440x720x50.19930102.nc", "THETA.1440x720x50.19930103.nc", "THETA.1440x720x50.19930104.nc", "THETA.1440x720x50.19930105.nc"] +variable_name = "temperature" + +[[ECCO4Monthly]] +filename = "THETA_1992_03.nc" +date = 1992-03-01T00:00:00.000Z +variable_name = "temperature" + + [ECCO4Monthly.region] + kind = "Column" + latitude = 50.1 + longitude = 35.1 + interpolation = "Linear" +[[ECCO4Monthly]] +filename = "THETA_1992_02.nc" +date = 1992-02-01T00:00:00.000Z +variable_name = "temperature" + + [ECCO4Monthly.region] + kind = "Column" + latitude = 50.1 + longitude = 35.1 + interpolation = "Linear" +[[ECCO4Monthly]] +start_date = 
1992-01-01T00:00:00.000Z +end_date = 1992-03-01T00:00:00.000Z +filename = ["THETA_1992_01.nc", "THETA_1992_02.nc", "THETA_1992_03.nc"] +variable_name = "temperature" + + [ECCO4Monthly.region] + kind = "Column" + latitude = 50.1 + longitude = 35.1 + interpolation = "Linear" +[[ECCO4Monthly]] +filename = "THETA_1992_01.nc" +date = 1992-01-01T00:00:00.000Z +variable_name = "temperature" + + [ECCO4Monthly.region] + kind = "BoundingBox" + latitude = [-30, 30] +[[ECCO4Monthly]] +filename = "THETA_1992_01.nc" +date = 1992-01-01T00:00:00.000Z +variable_name = "temperature" + + [ECCO4Monthly.region] + kind = "BoundingBox" + latitude = [-30, 30] + longitude = [-180, 180] +[[ECCO4Monthly]] +filename = "THETA_1992_01.nc" +date = 1992-01-01T00:00:00.000Z +variable_name = "temperature" + + [ECCO4Monthly.region] + kind = "BoundingBox" + latitude = [0, 10] + longitude = [0, 10] +[[ECCO4Monthly]] +filename = "THETA_1992_01.nc" +date = 1992-01-01T00:00:00.000Z +variable_name = "temperature" +[[ECCO4Monthly]] +filename = "EVEL_1992_01.nc" +date = 1992-01-01T00:00:00.000Z +variable_name = "u_velocity" + + [ECCO4Monthly.region] + kind = "BoundingBox" + latitude = [0, 10] + longitude = [0, 10] +[[ECCO4Monthly]] +filename = "EVEL_1992_01.nc" +date = 1992-01-01T00:00:00.000Z +variable_name = "u_velocity" +[[ECCO4Monthly]] +filename = "SSH_1992_01.nc" +date = 1992-01-01T00:00:00.000Z +variable_name = "free_surface" + + [ECCO4Monthly.region] + kind = "Column" + latitude = 50.1 + longitude = 35.1 + interpolation = "Linear" +[[ECCO4Monthly]] +filename = "EVEL_1992_01.nc" +date = 1992-01-01T00:00:00.000Z +variable_name = "u_velocity" + + [ECCO4Monthly.region] + kind = "Column" + latitude = 50.1 + longitude = 35.1 + interpolation = "Linear" +[[ECCO4Monthly]] +filename = "THETA_1992_01.nc" +date = 1992-01-01T00:00:00.000Z +variable_name = "temperature" + + [ECCO4Monthly.region] + kind = "Column" + latitude = 50.1 + longitude = 35.1 + interpolation = "Linear" +[[ECCO4Monthly]] +start_date = 
1992-01-01T00:00:00.000Z +end_date = 1992-03-01T00:00:00.000Z +filename = ["THETA_1992_01.nc", "THETA_1992_02.nc", "THETA_1992_03.nc"] +variable_name = "temperature" + + [ECCO4Monthly.region] + kind = "Column" + latitude = 50.1 + longitude = 35.1 + interpolation = "Linear" +[[ECCO4Monthly]] +filename = "SALT_1993_02.nc" +date = 1993-02-01T00:00:00.000Z +variable_name = "salinity" +[[ECCO4Monthly]] +filename = "THETA_1993_02.nc" +date = 1993-02-01T00:00:00.000Z +variable_name = "temperature" +[[ECCO4Monthly]] +start_date = 1993-01-01T00:00:00.000Z +end_date = 1993-02-01T00:00:00.000Z +filename = ["SALT_1993_01.nc", "SALT_1993_02.nc"] +variable_name = "salinity" +[[ECCO4Monthly]] +start_date = 1993-01-01T00:00:00.000Z +end_date = 1993-02-01T00:00:00.000Z +filename = ["THETA_1993_01.nc", "THETA_1993_02.nc"] +variable_name = "temperature" +[[ECCO4Monthly]] +start_date = 1993-01-01T00:00:00.000Z +end_date = 1993-02-01T00:00:00.000Z +filename = ["THETA_1993_01.nc", "THETA_1993_02.nc"] +variable_name = "temperature" +[[ECCO4Monthly]] +start_date = 1993-01-01T00:00:00.000Z +end_date = 1993-02-01T00:00:00.000Z +filename = ["SALT_1993_01.nc", "SALT_1993_02.nc"] +variable_name = "salinity" +[[ECCO4Monthly]] +start_date = 1993-01-01T00:00:00.000Z +end_date = 1993-04-01T00:00:00.000Z +variable_names = ["temperature", "salinity"] +[[ECCO4Monthly]] +start_date = 1993-01-01T00:00:00.000Z +end_date = 1993-04-01T00:00:00.000Z +filename = ["THETA_1993_01.nc", "THETA_1993_02.nc", "THETA_1993_03.nc", "THETA_1993_04.nc"] +variable_name = "temperature" +[[ECCO4Monthly]] +start_date = 1993-01-01T00:00:00.000Z +end_date = 1993-04-01T00:00:00.000Z +variable_names = ["temperature", "salinity"] +[[ECCO4Monthly]] +start_date = 1993-01-01T00:00:00.000Z +end_date = 1993-04-01T00:00:00.000Z +filename = ["THETA_1993_01.nc", "THETA_1993_02.nc", "THETA_1993_03.nc", "THETA_1993_04.nc"] +variable_name = "temperature" +[[ECCO4Monthly]] +start_date = 1993-01-01T00:00:00.000Z +end_date = 
1993-04-01T00:00:00.000Z +filename = ["SALT_1993_01.nc", "SALT_1993_02.nc", "SALT_1993_03.nc", "SALT_1993_04.nc"] +variable_name = "salinity" +[[ECCO4Monthly]] +start_date = 1993-01-01T00:00:00.000Z +end_date = 1993-04-01T00:00:00.000Z +variable_names = ["temperature", "salinity"] +[[ECCO4Monthly]] +filename = "NVEL_1993_01.nc" +date = 1993-01-01T00:00:00.000Z +variable_name = "v_velocity" +[[ECCO4Monthly]] +date = 1993-01-01T00:00:00.000Z +variable_names = ["temperature", "salinity"] +[[ECCO4Monthly]] +filename = "THETA_1993_01.nc" +date = 1993-01-01T00:00:00.000Z +variable_name = "temperature" +[[ECCO4Monthly]] +start_date = 1993-01-01T00:00:00.000Z +end_date = 1993-04-01T00:00:00.000Z +variable_names = ["temperature", "salinity"] +[[ECCO4Monthly]] +start_date = 1993-01-01T00:00:00.000Z +end_date = 1993-04-01T00:00:00.000Z +filename = ["THETA_1993_01.nc", "THETA_1993_02.nc", "THETA_1993_03.nc", "THETA_1993_04.nc"] +variable_name = "temperature" +[[ECCO4Monthly]] +date = 1993-01-01T00:00:00.000Z +variable_names = ["temperature", "salinity"] + + [ECCO4Monthly.region] + kind = "BoundingBox" + latitude = [-10.0, 10.0] + longitude = [-20.0, 20.0] +[[ECCO4Monthly]] +filename = "SALT_1993_01.nc" +date = 1993-01-01T00:00:00.000Z +variable_name = "salinity" +[[ECCO4Monthly]] +start_date = 1993-01-01T00:00:00.000Z +end_date = 1993-04-01T00:00:00.000Z +variable_names = ["temperature", "salinity"] +[[ECCO4Monthly]] +start_date = 1993-01-01T00:00:00.000Z +end_date = 1993-04-01T00:00:00.000Z +filename = ["THETA_1993_01.nc", "THETA_1993_02.nc", "THETA_1993_03.nc", "THETA_1993_04.nc"] +variable_name = "temperature" +[[ECCO4Monthly]] +start_date = 1993-01-01T00:00:00.000Z +end_date = 1993-04-01T00:00:00.000Z +filename = ["SALT_1993_01.nc", "SALT_1993_02.nc", "SALT_1993_03.nc", "SALT_1993_04.nc"] +variable_name = "salinity" +[[ECCO4Monthly]] +date = 1993-01-01T00:00:00.000Z +variable_names = ["temperature", "sea_ice_thickness"] +[[ECCO4Monthly]] +filename = "SIheff_1993_01.nc" 
+date = 1993-01-01T00:00:00.000Z +variable_name = "sea_ice_thickness" +[[ECCO4Monthly]] +date = 1993-01-01T00:00:00.000Z +variable_names = ["temperature", "salinity", "sea_ice_thickness", "sea_ice_concentration"] +[[ECCO4Monthly]] +filename = "SIarea_1993_01.nc" +date = 1993-01-01T00:00:00.000Z +variable_name = "sea_ice_concentration" +[[ECCO4Monthly]] +start_date = 1993-01-01T00:00:00.000Z +end_date = 1993-04-01T00:00:00.000Z +variable_names = ["temperature", "salinity"] From 1b7197d7fb658ccf0aae4689789a099cb219cbc1 Mon Sep 17 00:00:00 2001 From: Simone Silvestri Date: Fri, 22 May 2026 15:57:56 +0200 Subject: [PATCH 5/7] add a freshness test --- test/test_data_manifest_freshness.jl | 54 ++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 test/test_data_manifest_freshness.jl diff --git a/test/test_data_manifest_freshness.jl b/test/test_data_manifest_freshness.jl new file mode 100644 index 00000000..22daaba7 --- /dev/null +++ b/test/test_data_manifest_freshness.jl @@ -0,0 +1,54 @@ +include("runtests_setup.jl") + +using NumericalEarth.DataWrangling.DataModes: pregenerate_dataset_manifest, manifest_path_in +using TOML + +# Regenerate the test-folder `NumericalEarthDataManifest.toml` by tracing every `test_*.jl` +# (excluding self) in pregenerate mode, and compare the result against the committed manifest. +# If they differ the manifest is stale — re-run the pregenerate command below and commit the +# resulting `test/NumericalEarthDataManifest.toml`. +function regenerate_manifest_in(out_dir) + test_dir = @__DIR__ + self = @__FILE__ + for f in sort(readdir(test_dir; join=true)) + endswith(f, ".jl") && startswith(basename(f), "test_") || continue + abspath(f) == abspath(self) && continue + try + pregenerate_dataset_manifest(f; dir = out_dir, overwrite_existing = false) + catch + # Per-file trace failures are non-fatal — same behavior as the manual pregenerate + # loop. 
Whatever records the trace captured before the failure are still in the + # manifest; subsequent files keep accumulating. + end + end + out = manifest_path_in(out_dir) + return isfile(out) ? TOML.parsefile(out) : Dict{String, Any}() +end + +@testset "DataManifest freshness" begin + committed_path = manifest_path_in(@__DIR__) + @test isfile(committed_path) + + committed = TOML.parsefile(committed_path) + regenerated = mktempdir(regenerate_manifest_in) + + if committed != regenerated + added = sort(collect(setdiff(keys(regenerated), keys(committed)))) + removed = sort(collect(setdiff(keys(committed), keys(regenerated)))) + isempty(added) || @info "Datasets added to the regenerated manifest" datasets=added + isempty(removed) || @info "Datasets missing from the regenerated manifest" datasets=removed + for k in sort(collect(intersect(keys(committed), keys(regenerated)))) + committed[k] == regenerated[k] && continue + @info "Entries differ for dataset" dataset=k committed=committed[k] regenerated=regenerated[k] + end + @info "Manifest is stale. 
To regenerate, run from the repo root:\n " * + "julia --project -e 'using NumericalEarth.DataWrangling.DataModes: " * + "pregenerate_dataset_manifest, manifest_path_in; dir = abspath(\"test\"); " * + "rm(manifest_path_in(dir); force=true); for f in sort(readdir(dir; join=true)); " * + "endswith(f, \".jl\") && startswith(basename(f), \"test_\") && " * + "basename(f) != \"test_data_manifest_freshness.jl\" || continue; " * + "try; pregenerate_dataset_manifest(f; dir, overwrite_existing=false); catch; end; end'" + end + + @test committed == regenerated +end From 6cb77adcc9652a89e4227dcecef8b3894516dcb4 Mon Sep 17 00:00:00 2001 From: Simone Silvestri Date: Fri, 22 May 2026 15:58:07 +0200 Subject: [PATCH 6/7] remove extra comments --- test/test_data_manifest_freshness.jl | 3 --- 1 file changed, 3 deletions(-) diff --git a/test/test_data_manifest_freshness.jl b/test/test_data_manifest_freshness.jl index 22daaba7..51bf75d7 100644 --- a/test/test_data_manifest_freshness.jl +++ b/test/test_data_manifest_freshness.jl @@ -16,9 +16,6 @@ function regenerate_manifest_in(out_dir) try pregenerate_dataset_manifest(f; dir = out_dir, overwrite_existing = false) catch - # Per-file trace failures are non-fatal — same behavior as the manual pregenerate - # loop. Whatever records the trace captured before the failure are still in the - # manifest; subsequent files keep accumulating. 
end end out = manifest_path_in(out_dir) From 71daf6cc02ec45d0c23c3e566f48bf50f7cc9a4d Mon Sep 17 00:00:00 2001 From: Simone Silvestri Date: Fri, 22 May 2026 16:52:28 +0200 Subject: [PATCH 7/7] update toml file and clean up a bit --- src/DataWrangling/DataModes/DataModes.jl | 45 +- .../DataModes/data_manifest_wrangling.jl | 102 +++-- src/DataWrangling/DataModes/dry_run_value.jl | 1 - .../DataModes/parse_and_rewrite_script.jl | 2 +- src/DataWrangling/DataWrangling.jl | 2 +- src/NumericalEarth.jl | 40 +- test/NumericalEarthDataManifest.toml | 433 +++++++----------- test/test_data_manifest_freshness.jl | 11 +- 8 files changed, 271 insertions(+), 365 deletions(-) diff --git a/src/DataWrangling/DataModes/DataModes.jl b/src/DataWrangling/DataModes/DataModes.jl index b8a95b10..351f296c 100644 --- a/src/DataWrangling/DataModes/DataModes.jl +++ b/src/DataWrangling/DataModes/DataModes.jl @@ -59,19 +59,11 @@ function download_datasets(; dir::AbstractString = pwd(), download_dir = nothing return nothing end -expected_paths(metadata::Metadatum) = String[metadata_path(metadata)] - -function expected_paths(metadata::Metadata) - p = metadata_path(metadata) - return p isa Vector ? collect(String, p) : String[p] -end - -function expected_paths(mset::MetadataSet) - paths = String[] - for name in mset.names - append!(paths, expected_paths(mset[name])) - end - return paths +function expected_paths(m::AbstractMetadata) + m isa MetadataSet && + return reduce(vcat, expected_paths(m[n]) for n in m.names; init = String[]) + p = metadata_path(m) + return p isa AbstractVector ? collect(String, p) : String[p] end """ @@ -89,37 +81,10 @@ function check_files_exist(metadata::AbstractMetadata) end function __init__() - # Skip everything if we're inside a precompile / sysimage build subprocess — - # `Base.PROGRAM_FILE` is "-" there and we'd write garbage manifests at exit. 
- ccall(:jl_generating_output, Cint, ()) == 1 && return nothing - env = get(ENV, "NUMERICALEARTH_DATA", "auto") mode, dir_from_env = parse_data_mode(env) DATA_MODE[] = mode MANIFEST_DIR[] = isempty(dir_from_env) ? pwd() : abspath(dir_from_env) - mode === :pregenerate || return nothing - - if !isempty(Base.PROGRAM_FILE) && isfile(Base.PROGRAM_FILE) - script = abspath(Base.PROGRAM_FILE) - atexit() do - try - manifest = pregenerate_dataset_manifest(script; dir = MANIFEST_DIR[]) - @info "NUMERICALEARTH_DATA=pregenerate: wrote manifest via AST trace" manifest script - catch err - @error "NUMERICALEARTH_DATA=pregenerate: trace failed" dir=MANIFEST_DIR[] script exception=(err, catch_backtrace()) - end - end - else - atexit() do - try - manifest = manifest_path_in(MANIFEST_DIR[]) - write_manifest(manifest, copy(RECORDED)) - @info "NUMERICALEARTH_DATA=pregenerate: wrote manifest" manifest entries=length(RECORDED) - catch err - @error "NUMERICALEARTH_DATA=pregenerate: failed to write manifest" dir=MANIFEST_DIR[] exception=(err, catch_backtrace()) - end - end - end return nothing end diff --git a/src/DataWrangling/DataModes/data_manifest_wrangling.jl b/src/DataWrangling/DataModes/data_manifest_wrangling.jl index b25d5885..1575424a 100644 --- a/src/DataWrangling/DataModes/data_manifest_wrangling.jl +++ b/src/DataWrangling/DataModes/data_manifest_wrangling.jl @@ -42,8 +42,10 @@ end """ $(TYPEDSIGNATURES) -Record `metadata` into [`RECORDED`](@ref) for later serialization to a `DataManifest.toml`. Deduplication -is by `metadata` equality on the recorded vector. Returns `nothing`. +Record `metadata` into [`RECORDED`](@ref) for later serialization to a +`NumericalEarthDataManifest.toml`. Deduplication here is by `metadata` fieldwise `==` (which +includes `dir`); [`write_manifest`](@ref) does a second, canonical dedup by serialized-dict +equality (which doesn't). Returns `nothing`. 
""" function record_for_manifest(metadata::AbstractMetadata) any(==(metadata), RECORDED) || push!(RECORDED, metadata) @@ -90,31 +92,25 @@ filename_to_toml(::Nothing) = nothing filename_to_toml(s::AbstractString) = String(s) filename_to_toml(f::DatewiseFilename) = collect(String, f.filenames) -function metadata_to_dict(m::Metadatum) - d = Dict{String, Any}("variable_name" => String(m.name)) - m.dates === nothing || (d["date"] = m.dates) - m.region === nothing || (d["region"] = region_to_dict(m.region)) - m.filename === nothing || (d["filename"] = filename_to_toml(m.filename)) - return d -end - -function metadata_to_dict(m::Metadata) - d = Dict{String, Any}("variable_name" => String(m.name), - "start_date" => first(m.dates), "end_date" => last(m.dates)) - m.region === nothing || (d["region"] = region_to_dict(m.region)) - m.filename === nothing || (d["filename"] = filename_to_toml(m.filename)) - return d -end - -function metadata_to_dict(mset::MetadataSet) - d = Dict{String, Any}("variable_names" => [String(n) for n in mset.names]) - if mset.dates isa AbstractVector - d["start_date"] = first(mset.dates) - d["end_date"] = last(mset.dates) - elseif mset.dates !== nothing - d["date"] = mset.dates +# Single source of truth for the TOML schema. Read [`from_toml`](@ref) for the inverse — +# the two functions must stay symmetric, and keeping them adjacent + linear makes drift visible. 
+function metadata_to_dict(m::AbstractMetadata) + d = Dict{String, Any}() + if m isa MetadataSet + d["variable_names"] = [String(n) for n in m.names] + else + d["variable_name"] = String(m.name) + end + if m.dates isa AbstractVector + d["start_date"] = first(m.dates) + d["end_date"] = last(m.dates) + elseif m.dates !== nothing + d["date"] = m.dates + end + m.region === nothing || (d["region"] = region_to_dict(m.region)) + if !(m isa MetadataSet) + m.filename === nothing || (d["filename"] = filename_to_toml(m.filename)) end - mset.region === nothing || (d["region"] = region_to_dict(mset.region)) return d end @@ -145,9 +141,10 @@ function write_manifest(io::IO, records::AbstractVector) grouped = Dict{String, Vector{Dict{String, Any}}}() for r in records entries = get!(() -> Dict{String, Any}[], grouped, dataset_name(r.dataset)) - push!(entries, metadata_to_dict(r)) + d = metadata_to_dict(r) + any(==(d), entries) || push!(entries, d) end - TOML.print(io, grouped) + TOML.print(io, grouped; sorted = true) return nothing end @@ -192,23 +189,24 @@ end ##### AbstractMetadata reconstruction ##### -function from_toml(dataset_name::AbstractString, entry::AbstractDict; download_dir = nothing) - dataset = lookup_dataset(dataset_name) +function from_toml(name::AbstractString, entry::AbstractDict; download_dir = nothing) + dataset = lookup_dataset(name) region = region_from_toml(get(entry, "region", nothing)) filename = filename_from_toml(get(entry, "filename", nothing)) dir = download_dir === nothing ? 
default_download_directory(dataset) : String(download_dir) if haskey(entry, "variable_names") - names = Tuple(Symbol(n) for n in entry["variable_names"]) + variable_names = Tuple(Symbol(n) for n in entry["variable_names"]) haskey(entry, "date") && - return MetadataSet(names...; dataset, region, dir, date = entry["date"]) - return MetadataSet(names...; dataset, region, dir, + return MetadataSet(variable_names...; dataset, region, dir, date = entry["date"]) + return MetadataSet(variable_names...; dataset, region, dir, start_date = entry["start_date"], end_date = entry["end_date"]) end - name = Symbol(entry["variable_name"]) + variable_name = Symbol(entry["variable_name"]) haskey(entry, "start_date") && - return Metadata(name; dataset, region, filename, dir, + return Metadata(variable_name; dataset, region, filename, dir, start_date = entry["start_date"], end_date = entry["end_date"]) - return Metadatum(name; dataset, region, filename, dir, date = get(entry, "date", nothing)) + return Metadatum(variable_name; dataset, region, filename, dir, + date = get(entry, "date", nothing)) end """ @@ -229,13 +227,27 @@ end read_manifest(io::IO; download_dir = nothing) = manifest_from_dict(TOML.parse(read(io, String)); download_dir) +""" + $(TYPEDSIGNATURES) + +Reconstruct every entry in a parsed manifest dict. Entries whose `dataset` key isn't currently in +[`DATASET_REGISTRY`](@ref) are skipped with a single grouped warning, so a manifest containing +records from a dataset module the current session hasn't loaded (e.g. `JRA55` when running a +`Bathymetry`-only script) doesn't abort the read. 
+""" function manifest_from_dict(raw::AbstractDict; download_dir = nothing) records = AbstractMetadata[] + unknown = String[] for (name, entries) in raw + if !haskey(DATASET_REGISTRY, name) + push!(unknown, name) + continue + end for entry in entries push!(records, Base.invokelatest(from_toml, name, entry; download_dir)) end end + isempty(unknown) || @warn "Skipping manifest entries for unregistered datasets; load the relevant dataset modules to include them" datasets=sort(unknown) return records end @@ -254,10 +266,16 @@ then serialized via [`write_manifest`](@ref). When `overwrite_existing = false` and a manifest already exists at `dir`, the existing records are read first and merged (deduplicated) with the newly recorded ones, so this call appends rather than replaces. Defaults to `true` (replace). + +`quiet = true` (the default) swallows everything the traced script writes to stdout/stderr — most +of which is noise (test-failure summaries, NetCDF "file not found" warnings, library `@warn`s) +because pregenerate mode deliberately skips the downloads those tests depend on. Pass +`quiet = false` to see all of it, e.g. when debugging an unexpected trace failure. 
""" function pregenerate_dataset_manifest(script::AbstractString; dir::AbstractString = pwd(), - overwrite_existing::Bool = true) + overwrite_existing::Bool = true, + quiet::Bool = true) script_abs = abspath(script) source = read(script_abs, String) parsed = Meta.parseall(source; filename = script_abs) @@ -274,7 +292,15 @@ function pregenerate_dataset_manifest(script::AbstractString; sandbox = Module(:DataModesSandbox) Core.eval(sandbox, :(eval(x) = Core.eval($sandbox, x))) Core.eval(sandbox, :(include(p) = Base.include($sandbox, p))) - Core.eval(sandbox, rewritten) + if quiet + redirect_stdout(devnull) do + redirect_stderr(devnull) do + Core.eval(sandbox, rewritten) + end + end + else + Core.eval(sandbox, rewritten) + end new_records = copy(RECORDED) finally DATA_MODE[] = saved_mode diff --git a/src/DataWrangling/DataModes/dry_run_value.jl b/src/DataWrangling/DataModes/dry_run_value.jl index e3affeb3..3995cef9 100644 --- a/src/DataWrangling/DataModes/dry_run_value.jl +++ b/src/DataWrangling/DataModes/dry_run_value.jl @@ -44,7 +44,6 @@ Base.IteratorSize(::Type{DryRunValue}) = Base.HasShape{0}() Base.IteratorEltype(::Type{DryRunValue}) = Base.HasEltype() Base.broadcastable(::DryRunValue) = Ref(DryRunValue()) -Base.materialize(::DryRunValue) = DryRunValue() Base.getindex(::DryRunValue, args...) = DryRunValue() Base.setindex!(::DryRunValue, args...) 
= DryRunValue() diff --git a/src/DataWrangling/DataModes/parse_and_rewrite_script.jl b/src/DataWrangling/DataModes/parse_and_rewrite_script.jl index 0082c83d..8b38b933 100644 --- a/src/DataWrangling/DataModes/parse_and_rewrite_script.jl +++ b/src/DataWrangling/DataModes/parse_and_rewrite_script.jl @@ -1,4 +1,4 @@ -const PASSTHROUGH_HEADS = Set([:using, :import, :export, :module, :struct, :abstract, :primitive, :macro, :macrocall, :const]) +const PASSTHROUGH_HEADS = (:using, :import, :export, :module, :struct, :abstract, :primitive, :macro, :macrocall, :const) function is_include_call(s) s isa Expr && s.head === :call && !isempty(s.args) || return false diff --git a/src/DataWrangling/DataWrangling.jl b/src/DataWrangling/DataWrangling.jl index 56fcad5b..0a909c8b 100644 --- a/src/DataWrangling/DataWrangling.jl +++ b/src/DataWrangling/DataWrangling.jl @@ -384,7 +384,7 @@ Per-dataset modules keep extending `Downloads.download` for the `:auto` branch o """ function download_dataset(metadata::AbstractMetadata) mode = DataModes.DATA_MODE[] - mode === :auto && return Downloads.download(metadata) + mode === :auto && return Downloads.download(metadata) mode === :strict && return DataModes.check_files_exist(metadata) mode === :pregenerate && return nothing error("Unknown NUMERICALEARTH_DATA mode: $(repr(mode))") diff --git a/src/NumericalEarth.jl b/src/NumericalEarth.jl index 9c144d4b..2d026eb5 100644 --- a/src/NumericalEarth.jl +++ b/src/NumericalEarth.jl @@ -178,22 +178,40 @@ using .DataWrangling.OSPapa using PrecompileTools: @setup_workload, @compile_workload """ -Auto-download datasets listed in `NumericalEarthDataManifest.toml` whenever a manifest sits next -to the active project's `Project.toml` and we're running in `:auto` mode. Cached files are skipped -by each dataset's per-dataset `Downloads.download` method, so subsequent runs are cheap. 
- -Skipped during precompilation, in `:strict` mode (the manifest is the user's promise that data is -already on disk, not a fetch request), and in `:pregenerate` mode (we're recording, not downloading). +Process-level entry point that fires once after every submodule's `__init__` has run. + +- In `:auto` mode (the default), auto-downloads datasets listed in `NumericalEarthDataManifest.toml` + whenever a manifest sits next to the active project's `Project.toml`. Cached files are skipped by + each dataset's per-dataset `Downloads.download` method, so subsequent runs are cheap. +- In `:pregenerate` mode (`NUMERICALEARTH_DATA=pregenerate` or `pregenerate:`), traces + `Base.PROGRAM_FILE` via `pregenerate_dataset_manifest` and `exit(0)` — the script's real + execution is skipped. The trace runs silently (`quiet = true`) so only the final + `wrote manifest` log appears. + +Both paths are no-ops during precompilation, in `:strict` mode, and when no real `PROGRAM_FILE` +is set (REPL / `julia -e ...`). """ function __init__() - # Skip during precompile / sysimage build subprocess. ccall(:jl_generating_output, Cint, ()) == 1 && return nothing - # Skip in :strict / :pregenerate — those modes have their own semantics. - DataWrangling.DataModes.DATA_MODE[] === :auto || return nothing - # Skip in REPL / `julia -e ...` — auto-download is only for script-mode runs - # (`julia --project my_simulation.jl`) where blocking-on-fetch matches user intent. (!isempty(Base.PROGRAM_FILE) && isfile(Base.PROGRAM_FILE)) || return nothing + mode = DataWrangling.DataModes.DATA_MODE[] + if mode === :pregenerate + script = abspath(Base.PROGRAM_FILE) + # `MANIFEST_DIR[]` is populated by `DataModes.__init__` from the env var; if that init + # somehow hasn't run (precompile workload edge case), fall back to the current directory. + dir = isempty(DataWrangling.DataModes.MANIFEST_DIR[]) ? 
pwd() : DataWrangling.DataModes.MANIFEST_DIR[] + try + manifest = DataWrangling.DataModes.pregenerate_dataset_manifest(script; dir) + @info "NUMERICALEARTH_DATA=pregenerate: wrote manifest via AST trace" manifest script + catch err + @error "NUMERICALEARTH_DATA=pregenerate: trace failed" dir script exception=(err, catch_backtrace()) + end + exit(0) + end + + mode === :auto || return nothing + project = Base.active_project() project === nothing && return nothing project_dir = dirname(project) diff --git a/test/NumericalEarthDataManifest.toml b/test/NumericalEarthDataManifest.toml index a5394d66..cd92e034 100644 --- a/test/NumericalEarthDataManifest.toml +++ b/test/NumericalEarthDataManifest.toml @@ -1,211 +1,173 @@ -[[ORCA12]] -filename = "bathy_eORCA12_noclosea_from_GEBCO2021_FillZero_S21TT_CloseaCopy.nc" -variable_name = "bottom_height" -[[ORCA12]] -filename = "grid_mask_eORCA12-GO6.nc" -variable_name = "mesh_mask" - -[[GEBCO2024]] -filename = "GEBCO_2024.nc" -variable_name = "bottom_height" - -[[IBCSOv2]] -filename = "IBCSO_v2_bed_WGS84.nc" -variable_name = "bottom_height" - -[[ETOPO2022]] -filename = "ETOPO_2022_v1_60s_N90W180_surface.nc" -variable_name = "bottom_height" - -[[WOAAnnual]] -filename = "woa_t_annual.nc" -variable_name = "temperature" -[[WOAAnnual]] -filename = "woa_s_annual.nc" -variable_name = "salinity" - -[[WOAMonthly]] -filename = "woa_t_monthly_01.nc" -date = 2018-01-01T00:00:00.000Z -variable_name = "temperature" -[[WOAMonthly]] -filename = "woa_s_monthly_01.nc" -date = 2018-01-01T00:00:00.000Z -variable_name = "salinity" - -[[ERA5HourlySingleLevel]] -filename = "2m_temperature_ERA5HourlySingleLevel_2004-12-27T00_-110.0_30.0_-25.0_35.0.nc" -date = 2004-12-27T00:00:00.000Z -variable_name = "temperature" - - [ERA5HourlySingleLevel.region] - kind = "BoundingBox" - latitude = [-25, 35] - longitude = [-110, 30] - -[[ORCA1]] -filename = "eORCA1.2_mesh_mask.nc" -variable_name = "mesh_mask" -[[ORCA1]] -filename = "eORCA_R1_bathy_meter_v2.2.nc" 
-variable_name = "bottom_height" - -[[IBCAOv5]] -filename = "ibcao_v5_wgs84_0p01deg.nc" -variable_name = "bottom_height" - -[[ECCO2Daily]] -start_date = 1993-01-01T00:00:00.000Z -end_date = 1993-01-05T00:00:00.000Z -filename = ["SALT.1440x720x50.19930101.nc", "SALT.1440x720x50.19930102.nc", "SALT.1440x720x50.19930103.nc", "SALT.1440x720x50.19930104.nc", "SALT.1440x720x50.19930105.nc"] -variable_name = "salinity" -[[ECCO2Daily]] -start_date = 1993-01-01T00:00:00.000Z -end_date = 1993-01-05T00:00:00.000Z -filename = ["SALT.1440x720x50.19930101.nc", "SALT.1440x720x50.19930102.nc", "SALT.1440x720x50.19930103.nc", "SALT.1440x720x50.19930104.nc", "SALT.1440x720x50.19930105.nc"] -variable_name = "salinity" [[ECCO2Daily]] -start_date = 1993-01-01T00:00:00.000Z -end_date = 1993-01-05T00:00:00.000Z -filename = ["THETA.1440x720x50.19930101.nc", "THETA.1440x720x50.19930102.nc", "THETA.1440x720x50.19930103.nc", "THETA.1440x720x50.19930104.nc", "THETA.1440x720x50.19930105.nc"] -variable_name = "temperature" -[[ECCO2Daily]] -filename = "SALT.1440x720x50.19930105.nc" date = 1993-01-05T00:00:00.000Z +filename = "SALT.1440x720x50.19930105.nc" variable_name = "salinity" [[ECCO2Daily]] -filename = "SALT.1440x720x50.19930104.nc" date = 1993-01-04T00:00:00.000Z +filename = "SALT.1440x720x50.19930104.nc" variable_name = "salinity" [[ECCO2Daily]] -filename = "SALT.1440x720x50.19930103.nc" date = 1993-01-03T00:00:00.000Z +filename = "SALT.1440x720x50.19930103.nc" variable_name = "salinity" [[ECCO2Daily]] -filename = "SALT.1440x720x50.19930102.nc" date = 1993-01-02T00:00:00.000Z +filename = "SALT.1440x720x50.19930102.nc" variable_name = "salinity" [[ECCO2Daily]] -start_date = 1993-01-01T00:00:00.000Z end_date = 1993-01-05T00:00:00.000Z filename = ["SALT.1440x720x50.19930101.nc", "SALT.1440x720x50.19930102.nc", "SALT.1440x720x50.19930103.nc", "SALT.1440x720x50.19930104.nc", "SALT.1440x720x50.19930105.nc"] -variable_name = "salinity" -[[ECCO2Daily]] start_date = 1993-01-01T00:00:00.000Z 
-end_date = 1993-01-05T00:00:00.000Z -filename = ["THETA.1440x720x50.19930101.nc", "THETA.1440x720x50.19930102.nc", "THETA.1440x720x50.19930103.nc", "THETA.1440x720x50.19930104.nc", "THETA.1440x720x50.19930105.nc"] -variable_name = "temperature" +variable_name = "salinity" [[ECCO2Daily]] -filename = "SALT.1440x720x50.19930101.nc" date = 1993-01-01T00:00:00.000Z +filename = "SALT.1440x720x50.19930101.nc" variable_name = "salinity" [[ECCO2Daily]] -filename = "THETA.1440x720x50.19930105.nc" date = 1993-01-05T00:00:00.000Z +filename = "THETA.1440x720x50.19930105.nc" variable_name = "temperature" [[ECCO2Daily]] -filename = "THETA.1440x720x50.19930104.nc" date = 1993-01-04T00:00:00.000Z +filename = "THETA.1440x720x50.19930104.nc" variable_name = "temperature" [[ECCO2Daily]] -filename = "THETA.1440x720x50.19930103.nc" date = 1993-01-03T00:00:00.000Z +filename = "THETA.1440x720x50.19930103.nc" variable_name = "temperature" [[ECCO2Daily]] -filename = "THETA.1440x720x50.19930102.nc" date = 1993-01-02T00:00:00.000Z +filename = "THETA.1440x720x50.19930102.nc" variable_name = "temperature" [[ECCO2Daily]] -filename = "THETA.1440x720x50.19930101.nc" date = 1993-01-01T00:00:00.000Z +filename = "THETA.1440x720x50.19930101.nc" variable_name = "temperature" [[ECCO2Daily]] -start_date = 1993-01-01T00:00:00.000Z end_date = 1993-01-05T00:00:00.000Z filename = ["THETA.1440x720x50.19930101.nc", "THETA.1440x720x50.19930102.nc", "THETA.1440x720x50.19930103.nc", "THETA.1440x720x50.19930104.nc", "THETA.1440x720x50.19930105.nc"] +start_date = 1993-01-01T00:00:00.000Z variable_name = "temperature" -[[ECCO2Daily]] + +[[ECCO4Monthly]] +date = 1993-01-01T00:00:00.000Z +filename = "SIarea_1993_01.nc" +variable_name = "sea_ice_concentration" +[[ECCO4Monthly]] +date = 1993-01-01T00:00:00.000Z +variable_names = ["temperature", "salinity", "sea_ice_thickness", "sea_ice_concentration"] +[[ECCO4Monthly]] +date = 1993-01-01T00:00:00.000Z +filename = "SIheff_1993_01.nc" +variable_name = "sea_ice_thickness" 
+[[ECCO4Monthly]] +date = 1993-01-01T00:00:00.000Z +variable_names = ["temperature", "sea_ice_thickness"] +[[ECCO4Monthly]] +end_date = 1993-04-01T00:00:00.000Z +filename = ["SALT_1993_01.nc", "SALT_1993_02.nc", "SALT_1993_03.nc", "SALT_1993_04.nc"] start_date = 1993-01-01T00:00:00.000Z -end_date = 1993-01-05T00:00:00.000Z -filename = ["SALT.1440x720x50.19930101.nc", "SALT.1440x720x50.19930102.nc", "SALT.1440x720x50.19930103.nc", "SALT.1440x720x50.19930104.nc", "SALT.1440x720x50.19930105.nc"] variable_name = "salinity" -[[ECCO2Daily]] -start_date = 1993-01-01T00:00:00.000Z -end_date = 1993-01-05T00:00:00.000Z -filename = ["SALT.1440x720x50.19930101.nc", "SALT.1440x720x50.19930102.nc", "SALT.1440x720x50.19930103.nc", "SALT.1440x720x50.19930104.nc", "SALT.1440x720x50.19930105.nc"] +[[ECCO4Monthly]] +date = 1993-01-01T00:00:00.000Z +filename = "SALT_1993_01.nc" variable_name = "salinity" -[[ECCO2Daily]] +[[ECCO4Monthly]] +date = 1993-01-01T00:00:00.000Z +variable_names = ["temperature", "salinity"] + + [ECCO4Monthly.region] + kind = "BoundingBox" + latitude = [-10.0, 10.0] + longitude = [-20.0, 20.0] +[[ECCO4Monthly]] +end_date = 1993-04-01T00:00:00.000Z +filename = ["THETA_1993_01.nc", "THETA_1993_02.nc", "THETA_1993_03.nc", "THETA_1993_04.nc"] start_date = 1993-01-01T00:00:00.000Z -end_date = 1993-01-05T00:00:00.000Z -filename = ["THETA.1440x720x50.19930101.nc", "THETA.1440x720x50.19930102.nc", "THETA.1440x720x50.19930103.nc", "THETA.1440x720x50.19930104.nc", "THETA.1440x720x50.19930105.nc"] variable_name = "temperature" -[[ECCO2Daily]] +[[ECCO4Monthly]] +end_date = 1993-04-01T00:00:00.000Z start_date = 1993-01-01T00:00:00.000Z -end_date = 1993-01-05T00:00:00.000Z -filename = ["SALT.1440x720x50.19930101.nc", "SALT.1440x720x50.19930102.nc", "SALT.1440x720x50.19930103.nc", "SALT.1440x720x50.19930104.nc", "SALT.1440x720x50.19930105.nc"] +variable_names = ["temperature", "salinity"] +[[ECCO4Monthly]] +date = 1993-01-01T00:00:00.000Z +filename = "THETA_1993_01.nc" 
+variable_name = "temperature" +[[ECCO4Monthly]] +date = 1993-01-01T00:00:00.000Z +variable_names = ["temperature", "salinity"] +[[ECCO4Monthly]] +date = 1993-01-01T00:00:00.000Z +filename = "NVEL_1993_01.nc" +variable_name = "v_velocity" +[[ECCO4Monthly]] +date = 1993-02-01T00:00:00.000Z +filename = "SALT_1993_02.nc" variable_name = "salinity" -[[ECCO2Daily]] -start_date = 1993-01-01T00:00:00.000Z -end_date = 1993-01-05T00:00:00.000Z -filename = ["THETA.1440x720x50.19930101.nc", "THETA.1440x720x50.19930102.nc", "THETA.1440x720x50.19930103.nc", "THETA.1440x720x50.19930104.nc", "THETA.1440x720x50.19930105.nc"] +[[ECCO4Monthly]] +date = 1993-02-01T00:00:00.000Z +filename = "THETA_1993_02.nc" variable_name = "temperature" -[[ECCO2Daily]] +[[ECCO4Monthly]] +end_date = 1993-02-01T00:00:00.000Z +filename = ["SALT_1993_01.nc", "SALT_1993_02.nc"] +start_date = 1993-01-01T00:00:00.000Z +variable_name = "salinity" +[[ECCO4Monthly]] +end_date = 1993-02-01T00:00:00.000Z +filename = ["THETA_1993_01.nc", "THETA_1993_02.nc"] start_date = 1993-01-01T00:00:00.000Z -end_date = 1993-01-05T00:00:00.000Z -filename = ["THETA.1440x720x50.19930101.nc", "THETA.1440x720x50.19930102.nc", "THETA.1440x720x50.19930103.nc", "THETA.1440x720x50.19930104.nc", "THETA.1440x720x50.19930105.nc"] variable_name = "temperature" - [[ECCO4Monthly]] -filename = "THETA_1992_03.nc" -date = 1992-03-01T00:00:00.000Z +date = 1992-01-01T00:00:00.000Z +filename = "THETA_1992_01.nc" variable_name = "temperature" [ECCO4Monthly.region] + interpolation = "Linear" kind = "Column" latitude = 50.1 longitude = 35.1 - interpolation = "Linear" [[ECCO4Monthly]] -filename = "THETA_1992_02.nc" -date = 1992-02-01T00:00:00.000Z -variable_name = "temperature" +date = 1992-01-01T00:00:00.000Z +filename = "EVEL_1992_01.nc" +variable_name = "u_velocity" [ECCO4Monthly.region] + interpolation = "Linear" kind = "Column" latitude = 50.1 longitude = 35.1 - interpolation = "Linear" [[ECCO4Monthly]] -start_date = 1992-01-01T00:00:00.000Z 
-end_date = 1992-03-01T00:00:00.000Z -filename = ["THETA_1992_01.nc", "THETA_1992_02.nc", "THETA_1992_03.nc"] -variable_name = "temperature" +date = 1992-01-01T00:00:00.000Z +filename = "SSH_1992_01.nc" +variable_name = "free_surface" [ECCO4Monthly.region] + interpolation = "Linear" kind = "Column" latitude = 50.1 longitude = 35.1 - interpolation = "Linear" [[ECCO4Monthly]] -filename = "THETA_1992_01.nc" date = 1992-01-01T00:00:00.000Z -variable_name = "temperature" - - [ECCO4Monthly.region] - kind = "BoundingBox" - latitude = [-30, 30] +filename = "EVEL_1992_01.nc" +variable_name = "u_velocity" [[ECCO4Monthly]] -filename = "THETA_1992_01.nc" date = 1992-01-01T00:00:00.000Z -variable_name = "temperature" +filename = "EVEL_1992_01.nc" +variable_name = "u_velocity" [ECCO4Monthly.region] kind = "BoundingBox" - latitude = [-30, 30] - longitude = [-180, 180] + latitude = [0, 10] + longitude = [0, 10] [[ECCO4Monthly]] +date = 1992-01-01T00:00:00.000Z filename = "THETA_1992_01.nc" +variable_name = "temperature" +[[ECCO4Monthly]] date = 1992-01-01T00:00:00.000Z +filename = "THETA_1992_01.nc" variable_name = "temperature" [ECCO4Monthly.region] @@ -213,179 +175,106 @@ variable_name = "temperature" latitude = [0, 10] longitude = [0, 10] [[ECCO4Monthly]] -filename = "THETA_1992_01.nc" date = 1992-01-01T00:00:00.000Z +filename = "THETA_1992_01.nc" variable_name = "temperature" -[[ECCO4Monthly]] -filename = "EVEL_1992_01.nc" -date = 1992-01-01T00:00:00.000Z -variable_name = "u_velocity" [ECCO4Monthly.region] kind = "BoundingBox" - latitude = [0, 10] - longitude = [0, 10] + latitude = [-30, 30] + longitude = [-180, 180] [[ECCO4Monthly]] -filename = "EVEL_1992_01.nc" date = 1992-01-01T00:00:00.000Z -variable_name = "u_velocity" -[[ECCO4Monthly]] -filename = "SSH_1992_01.nc" -date = 1992-01-01T00:00:00.000Z -variable_name = "free_surface" +filename = "THETA_1992_01.nc" +variable_name = "temperature" [ECCO4Monthly.region] - kind = "Column" - latitude = 50.1 - longitude = 35.1 - 
interpolation = "Linear" + kind = "BoundingBox" + latitude = [-30, 30] [[ECCO4Monthly]] -filename = "EVEL_1992_01.nc" -date = 1992-01-01T00:00:00.000Z -variable_name = "u_velocity" +end_date = 1992-03-01T00:00:00.000Z +filename = ["THETA_1992_01.nc", "THETA_1992_02.nc", "THETA_1992_03.nc"] +start_date = 1992-01-01T00:00:00.000Z +variable_name = "temperature" [ECCO4Monthly.region] + interpolation = "Linear" kind = "Column" latitude = 50.1 longitude = 35.1 - interpolation = "Linear" [[ECCO4Monthly]] -filename = "THETA_1992_01.nc" -date = 1992-01-01T00:00:00.000Z +date = 1992-02-01T00:00:00.000Z +filename = "THETA_1992_02.nc" variable_name = "temperature" [ECCO4Monthly.region] + interpolation = "Linear" kind = "Column" latitude = 50.1 longitude = 35.1 - interpolation = "Linear" [[ECCO4Monthly]] -start_date = 1992-01-01T00:00:00.000Z -end_date = 1992-03-01T00:00:00.000Z -filename = ["THETA_1992_01.nc", "THETA_1992_02.nc", "THETA_1992_03.nc"] +date = 1992-03-01T00:00:00.000Z +filename = "THETA_1992_03.nc" variable_name = "temperature" [ECCO4Monthly.region] + interpolation = "Linear" kind = "Column" latitude = 50.1 longitude = 35.1 - interpolation = "Linear" -[[ECCO4Monthly]] -filename = "SALT_1993_02.nc" -date = 1993-02-01T00:00:00.000Z -variable_name = "salinity" -[[ECCO4Monthly]] -filename = "THETA_1993_02.nc" -date = 1993-02-01T00:00:00.000Z -variable_name = "temperature" -[[ECCO4Monthly]] -start_date = 1993-01-01T00:00:00.000Z -end_date = 1993-02-01T00:00:00.000Z -filename = ["SALT_1993_01.nc", "SALT_1993_02.nc"] -variable_name = "salinity" -[[ECCO4Monthly]] -start_date = 1993-01-01T00:00:00.000Z -end_date = 1993-02-01T00:00:00.000Z -filename = ["THETA_1993_01.nc", "THETA_1993_02.nc"] -variable_name = "temperature" -[[ECCO4Monthly]] -start_date = 1993-01-01T00:00:00.000Z -end_date = 1993-02-01T00:00:00.000Z -filename = ["THETA_1993_01.nc", "THETA_1993_02.nc"] -variable_name = "temperature" -[[ECCO4Monthly]] -start_date = 1993-01-01T00:00:00.000Z -end_date = 
1993-02-01T00:00:00.000Z -filename = ["SALT_1993_01.nc", "SALT_1993_02.nc"] -variable_name = "salinity" -[[ECCO4Monthly]] -start_date = 1993-01-01T00:00:00.000Z -end_date = 1993-04-01T00:00:00.000Z -variable_names = ["temperature", "salinity"] -[[ECCO4Monthly]] -start_date = 1993-01-01T00:00:00.000Z -end_date = 1993-04-01T00:00:00.000Z -filename = ["THETA_1993_01.nc", "THETA_1993_02.nc", "THETA_1993_03.nc", "THETA_1993_04.nc"] -variable_name = "temperature" -[[ECCO4Monthly]] -start_date = 1993-01-01T00:00:00.000Z -end_date = 1993-04-01T00:00:00.000Z -variable_names = ["temperature", "salinity"] -[[ECCO4Monthly]] -start_date = 1993-01-01T00:00:00.000Z -end_date = 1993-04-01T00:00:00.000Z -filename = ["THETA_1993_01.nc", "THETA_1993_02.nc", "THETA_1993_03.nc", "THETA_1993_04.nc"] -variable_name = "temperature" -[[ECCO4Monthly]] -start_date = 1993-01-01T00:00:00.000Z -end_date = 1993-04-01T00:00:00.000Z -filename = ["SALT_1993_01.nc", "SALT_1993_02.nc", "SALT_1993_03.nc", "SALT_1993_04.nc"] -variable_name = "salinity" -[[ECCO4Monthly]] -start_date = 1993-01-01T00:00:00.000Z -end_date = 1993-04-01T00:00:00.000Z -variable_names = ["temperature", "salinity"] -[[ECCO4Monthly]] -filename = "NVEL_1993_01.nc" -date = 1993-01-01T00:00:00.000Z -variable_name = "v_velocity" -[[ECCO4Monthly]] -date = 1993-01-01T00:00:00.000Z -variable_names = ["temperature", "salinity"] -[[ECCO4Monthly]] -filename = "THETA_1993_01.nc" -date = 1993-01-01T00:00:00.000Z -variable_name = "temperature" -[[ECCO4Monthly]] -start_date = 1993-01-01T00:00:00.000Z -end_date = 1993-04-01T00:00:00.000Z -variable_names = ["temperature", "salinity"] -[[ECCO4Monthly]] -start_date = 1993-01-01T00:00:00.000Z -end_date = 1993-04-01T00:00:00.000Z -filename = ["THETA_1993_01.nc", "THETA_1993_02.nc", "THETA_1993_03.nc", "THETA_1993_04.nc"] + +[[ERA5HourlySingleLevel]] +date = 2004-12-27T00:00:00.000Z +filename = "2m_temperature_ERA5HourlySingleLevel_2004-12-27T00_-110.0_30.0_-25.0_35.0.nc" variable_name = 
"temperature" -[[ECCO4Monthly]] -date = 1993-01-01T00:00:00.000Z -variable_names = ["temperature", "salinity"] - [ECCO4Monthly.region] + [ERA5HourlySingleLevel.region] kind = "BoundingBox" - latitude = [-10.0, 10.0] - longitude = [-20.0, 20.0] -[[ECCO4Monthly]] -filename = "SALT_1993_01.nc" -date = 1993-01-01T00:00:00.000Z + latitude = [-25, 35] + longitude = [-110, 30] + +[[ETOPO2022]] +filename = "ETOPO_2022_v1_60s_N90W180_surface.nc" +variable_name = "bottom_height" + +[[GEBCO2024]] +filename = "GEBCO_2024.nc" +variable_name = "bottom_height" + +[[IBCAOv5]] +filename = "ibcao_v5_wgs84_0p01deg.nc" +variable_name = "bottom_height" + +[[IBCSOv2]] +filename = "IBCSO_v2_bed_WGS84.nc" +variable_name = "bottom_height" + +[[ORCA1]] +filename = "eORCA1.2_mesh_mask.nc" +variable_name = "mesh_mask" +[[ORCA1]] +filename = "eORCA_R1_bathy_meter_v2.2.nc" +variable_name = "bottom_height" + +[[ORCA12]] +filename = "bathy_eORCA12_noclosea_from_GEBCO2021_FillZero_S21TT_CloseaCopy.nc" +variable_name = "bottom_height" +[[ORCA12]] +filename = "grid_mask_eORCA12-GO6.nc" +variable_name = "mesh_mask" + +[[WOAAnnual]] +filename = "woa_t_annual.nc" +variable_name = "temperature" +[[WOAAnnual]] +filename = "woa_s_annual.nc" variable_name = "salinity" -[[ECCO4Monthly]] -start_date = 1993-01-01T00:00:00.000Z -end_date = 1993-04-01T00:00:00.000Z -variable_names = ["temperature", "salinity"] -[[ECCO4Monthly]] -start_date = 1993-01-01T00:00:00.000Z -end_date = 1993-04-01T00:00:00.000Z -filename = ["THETA_1993_01.nc", "THETA_1993_02.nc", "THETA_1993_03.nc", "THETA_1993_04.nc"] + +[[WOAMonthly]] +date = 2018-01-01T00:00:00.000Z +filename = "woa_t_monthly_01.nc" variable_name = "temperature" -[[ECCO4Monthly]] -start_date = 1993-01-01T00:00:00.000Z -end_date = 1993-04-01T00:00:00.000Z -filename = ["SALT_1993_01.nc", "SALT_1993_02.nc", "SALT_1993_03.nc", "SALT_1993_04.nc"] +[[WOAMonthly]] +date = 2018-01-01T00:00:00.000Z +filename = "woa_s_monthly_01.nc" variable_name = "salinity" -[[ECCO4Monthly]] 
-date = 1993-01-01T00:00:00.000Z -variable_names = ["temperature", "sea_ice_thickness"] -[[ECCO4Monthly]] -filename = "SIheff_1993_01.nc" -date = 1993-01-01T00:00:00.000Z -variable_name = "sea_ice_thickness" -[[ECCO4Monthly]] -date = 1993-01-01T00:00:00.000Z -variable_names = ["temperature", "salinity", "sea_ice_thickness", "sea_ice_concentration"] -[[ECCO4Monthly]] -filename = "SIarea_1993_01.nc" -date = 1993-01-01T00:00:00.000Z -variable_name = "sea_ice_concentration" -[[ECCO4Monthly]] -start_date = 1993-01-01T00:00:00.000Z -end_date = 1993-04-01T00:00:00.000Z -variable_names = ["temperature", "salinity"] diff --git a/test/test_data_manifest_freshness.jl b/test/test_data_manifest_freshness.jl index 51bf75d7..c33cd00e 100644 --- a/test/test_data_manifest_freshness.jl +++ b/test/test_data_manifest_freshness.jl @@ -1,6 +1,6 @@ include("runtests_setup.jl") -using NumericalEarth.DataWrangling.DataModes: pregenerate_dataset_manifest, manifest_path_in +using NumericalEarth.DataWrangling.DataModes: DataModes, pregenerate_dataset_manifest, manifest_path_in using TOML # Regenerate the test-folder `NumericalEarthDataManifest.toml` by tracing every `test_*.jl` @@ -23,6 +23,15 @@ function regenerate_manifest_in(out_dir) end @testset "DataManifest freshness" begin + # This test self-invokes `pregenerate_dataset_manifest` on every other `test_*.jl`. If + # we're already running inside a pregenerate trace (i.e. some outer loop is tracing this + # very file), recursing here both wastes work and corrupts per-process state — most + # notably MPI, which gets re-initialised across nested sandbox boundaries. + if DataModes.DATA_MODE[] === :pregenerate + @info "Skipping DataManifest freshness test inside a pregenerate trace" + return + end + committed_path = manifest_path_in(@__DIR__) @test isfile(committed_path)