Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions bindings/julia/src/binary_utils/BinaryUtils.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
module BinaryUtils

# This module contains higher-level utilities for working with binary data, implemented only in Julia.
# It relies only on the public API provided by the Binary module, and does not call any C functions directly.

using Quiver: Binary

include("utils.jl")
include("data_operations.jl")
include("file_operations.jl")

end
204 changes: 204 additions & 0 deletions bindings/julia/src/binary_utils/data_operations.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,204 @@
function apply_expression_over_files(
output_filename::String,
filenames::Vector{String},
operation::Function,
)
readers = [Binary.open_file(filename; mode = :read) for filename in filenames]
metadata = Binary.get_metadata(first(readers))
dimensions = Binary.get_dimensions(metadata)
dimension_names = [dim.name for dim in dimensions]

for reader in readers
if !is_equal(metadata, Binary.get_metadata(reader))
for r in readers
Binary.close!(r)
end
throw(ArgumentError("Cannot apply expression over files due to a metadata mismatch. Files: $(filenames)"))
end
end

initial_dimension_values = ones(Int, length(dimensions))
for (idx, dimension) in enumerate(dimensions)
if dimension.is_time_dimension
initial_dimension_values[idx] = dimension.initial_value
end
end

writer = Binary.open_file(output_filename; mode = :write, metadata = metadata)

maximum_number_of_iterations = prod([dim.size for dim in dimensions])
data_all_readers = [zeros(num_labels) for _ in 1:n_readers]
current_dimension_values = copy(initial_dimension_values)

for _ in 1:maximum_number_of_iterations
dims = dimension_names .=> current_dimension_values
for (idx, reader) in enumerate(readers)
data_all_readers[idx] = Binary.read(reader; dims...)
end
data_to_write = operation.(data_all_readers...)
Binary.write!(writer; data = data_to_write, dims...)

# TODO: next_dimensions implemented in another branch
current_dimension_values = Binary.next_dimensions(writer, current_dimension_values)
if current_dimension_values === initial_dimension_values
break
end
end

for reader in readers
Binary.close!(reader)
end
Binary.close!(writer)

return nothing
end

function apply_expression_over_agents(
output_filename::String,
filename::String,
operation::Function,
new_labels::Vector{String},
)
reader = Binary.open_file(filename; mode = :read)
metadata = Binary.get_metadata(reader)
dimensions = Binary.get_dimensions(metadata)
time_dimensions = filter(dim -> dim.is_time_dimension, dimensions)
labels = Binary.get_labels(metadata)

test_data = ones(length(labels))
result = operation(test_data)
if length(result) != length(new_labels)
Binary.close!(reader)
throw(
ArgumentError(
"Cannot apply expression over agents. The provided operation does not return the expected number of labels. Expected: $(number_of_new_labels), got: $(length(result)). File: $(filename)",
),
)
end

writer_metadata = Binary.Metadata(;
initial_datetime = get_initial_datetime(metadata),
unit = get_unit(metadata),
version = get_version(metadata),
labels = new_labels,
dimensions = [dim.name for dim in dimensions],
dimension_sizes = [dim.size for dim in dimensions],
time_dimensions = [dim.name for dim in time_dimensions],
frequencies = [dim.frequency for dim in time_dimensions],
)
writer = Binary.open_file(output_filename; mode = :write, metadata = writer_metadata)

initial_dimension_values = ones(Int, length(dimensions))
for (idx, dimension) in enumerate(dimensions)
if dimension.is_time_dimension
initial_dimension_values[idx] = dimension.initial_value
end
end

maximum_number_of_iterations = prod([dim.size for dim in dimensions])
current_dimension_values = copy(initial_dimension_values)

for _ in 1:maximum_number_of_iterations
dims = dimension_names .=> current_dimension_values
data = Binary.read(reader; dims...)
data_to_write = vcat(operation(data))
Binary.write!(writer; data = data_to_write, dims...)

# TODO: next_dimensions implemented in another branch
current_dimension_values = Binary.next_dimensions(writer, current_dimension_values)
if current_dimension_values === initial_dimension_values
break
end
end

Binary.close!(reader)
Binary.close!(writer)

return nothing
end

function apply_expression_over_dimensions(
output_filename::String,
filename::String,
operation::Function,
dim_to_operate::String;
suppress_dimension_order_warning::Bool = false,
)
reader = Binary.open_file(filename; mode = :read)
metadata = Binary.get_metadata(reader)
dimensions = Binary.get_dimensions(metadata)
time_dimensions = filter(dim -> dim.is_time_dimension, dimensions)
dimension_names = [dim.name for dim in dimensions]
dimension_sizes = [dim.size for dim in dimensions]

dim_index = findfirst(isequal(dim_to_operate), dimension_names)
if dim_index === nothing
Binary.close!(reader)
throw(
ArgumentError(
"Cannot apply expression over dimensions because the specified dimension '$dim_to_operate' was not found in the file. Available dimensions: $(dimension_names). File: $(filename)",
),
)
end

if dimensions[dim_index].is_time_dimension
Binary.close!(reader)
throw(
ArgumentError(
"Cannot apply expression over dimensions. This is not implemented for time dimensions. File: $(filename)",
),
)
end

if dim_index != length(dimension_names) && !suppress_dimension_order_warning
@warn "The specified dimension '$dim_to_operate' is not the last dimension in the file. This is not the most efficient way to iterate over dimensions."
end

writer_metadata = Binary.Metadata(;
initial_datetime = get_initial_datetime(metadata),
unit = get_unit(metadata),
version = get_version(metadata),
labels = get_labels(metadata),
dimensions = dimension_names[1:end .!= dim_index],
dimension_sizes = dimension_sizes[1:end .!= dim_index],
time_dimensions = [dim.name for dim in time_dimensions],
frequencies = [dim.frequency for dim in time_dimensions],
)
writer = Binary.open_file(output_filename; mode = :write, metadata = writer_metadata)

initial_dimension_values = ones(Int, length(dimensions))
for (idx, dimension) in enumerate(dimensions)
if dimension.is_time_dimension
initial_dimension_values[idx] = dimension.initial_value
end
end

maximum_number_of_iterations = prod([dim.size for (idx, dim) in dimensions if idx != dim_index])
data = zeros(length(get_labels(metadata)), dimension_sizes[dim_index])
current_dimension_values = copy(initial_dimension_values)

for _ in 1:maximum_number_of_iterations
for value_at_dim in 1:dimension_sizes[dim_index]
current_dimension_values[dim_index] = value_at_dim
dims = dimension_names .=> current_dimension_values
data[:, value_at_dim] = Binary.read(reader; dims...)
end

data_to_write = operation(data, dims = 2)[:, 1]
Binary.write!(writer; data = data_to_write, dims...)

# This ensures the outer loop doesn't waste time iterating through the dimension we're operating over, since we iterate on it in the inner loop.
# This would not work for time dimensions, as their size might not be constant and independent of the other dimensions.
current_dimension_values[dim_index] = dimension_sizes[dim_index]
# TODO: next_dimensions implemented in another branch
current_dimension_values = Binary.next_dimensions(writer, current_dimension_values)
if current_dimension_values === initial_dimension_values
break
end
end

Binary.close!(reader)
Binary.close!(writer)

return nothing
end
149 changes: 149 additions & 0 deletions bindings/julia/src/binary_utils/file_operations.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
function merge(
output_filename::String,
filenames::Vector{String},
)
readers = [Binary.open_file(filename; mode = :read) for filename in filenames]
metadata = Binary.get_metadata(first(readers))
dimensions = Binary.get_dimensions(metadata)
time_dimensions = filter(dim -> dim.is_time_dimension, dimensions)
dimension_names = [dim.name for dim in dimensions]

labels = String[]
number_of_labels_per_file = Int[]

for reader in readers
current_metadata = Binary.get_metadata(reader)
if !is_equal(metadata, current_metadata; ignore_labels = true)
for r in readers
Binary.close!(r)
end
throw(ArgumentError("Cannot merge files due to a metadata mismatch. Files: $(filenames)"))
end

current_labels = Binary.get_labels(current_metadata)
for label in current_labels
if label in labels
for r in readers
Binary.close!(r)
end
throw(ArgumentError("Cannot merge files due to duplicate labels. Label: $(label). Files: $(filenames)"))
end
push!(labels, label)
end
push!(number_of_labels_per_file, length(current_labels))
end

writer_metadata = Binary.Metadata(
unit = get_unit(metadata),
version = get_version(metadata),
labels = labels,
dimensions = dimension_names,
dimension_sizes = [dim.size for dim in dimensions],
time_dimensions = [dim.name for dim in time_dimensions],
frequencies = [dim.frequency for dim in time_dimensions],
)
writer = Binary.open_file(output_filename; mode = :write, metadata = writer_metadata)

initial_dimension_values = ones(Int, length(dimensions))
for (idx, dimension) in enumerate(dimensions)
if dimension.is_time_dimension
initial_dimension_values[idx] = dimension.initial_value
end
end

maximum_number_of_iterations = prod([dim.size for dim in dimensions])
current_dimension_values = copy(initial_dimension_values)

data_to_write = zeros(length(labels))
for _ in 1:maximum_number_of_iterations
dims = dimension_names .=> current_dimension_values
for (i, reader) in enumerate(readers)
if i == 1
initial_idx = 1
else
initial_idx = sum(number_of_labels_per_file[1:(i-1)]) + 1
end
final_idx = sum(number_of_labels_per_file[1:i])
data_to_write[initial_idx:final_idx] = Binary.read(reader; dims...)
end
Binary.write!(writer; data = data_to_write, dims...)

# TODO: next_dimensions implemented in another branch
current_dimension_values = Binary.next_dimensions(writer, current_dimension_values)
if current_dimension_values === initial_dimension_values
break
end
end

for reader in readers
Binary.close!(reader)
end
Binary.close!(writer)

return nothing
end

function file_to_array(
filename::String,
)
reader = Binary.open_file(filename; mode = :read)
metadata = Binary.get_metadata(reader)
number_of_labels = length(Binary.get_labels(metadata))
dimensions = Binary.get_dimensions(metadata)
dimension_names = [dim.name for dim in dimensions]
dimension_sizes = [dim.size for dim in dimensions]

data = zeros(Float64, number_of_labels, dimension_sizes...)

initial_dimension_values = ones(Int, length(dimensions))
for (idx, dimension) in enumerate(dimensions)
if dimension.is_time_dimension
initial_dimension_values[idx] = dimension.initial_value
end
end

maximum_number_of_iterations = prod([dim.size for dim in dimensions])
current_dimension_values = copy(initial_dimension_values)
for _ in 1:maximum_number_of_iterations
dims = dimension_names .=> current_dimension_values
data[:, current_dimension_values...] = Binary.read(reader; dims...)

# TODO: next_dimensions implemented in another branch
current_dimension_values = Binary.next_dimensions(writer, current_dimension_values)
if current_dimension_values === initial_dimension_values
break
end
end
return nothing
end

function array_to_file(
filename::String,
data::Array{T, N},
metadata::Binary.Metadata,
) where {T, N}
writer = Binary.open_file(filename; mode = :write, metadata = metadata)
dimensions = Binary.get_dimensions(metadata)
dimension_names = [dim.name for dim in dimensions]

initial_dimension_values = ones(Int, length(dimensions))
for (idx, dimension) in enumerate(dimensions)
if dimension.is_time_dimension
initial_dimension_values[idx] = dimension.initial_value
end
end

maximum_number_of_iterations = prod([dim.size for dim in dimensions])
current_dimension_values = copy(initial_dimension_values)
for _ in 1:maximum_number_of_iterations
dims = dimension_names .=> current_dimension_values
Binary.write(writer; data = data[:, current_dimension_values...], dims...)

# TODO: next_dimensions implemented in another branch
current_dimension_values = Binary.next_dimensions(writer, current_dimension_values)
if current_dimension_values === initial_dimension_values
break
end
end
return nothing
end
29 changes: 29 additions & 0 deletions bindings/julia/src/binary_utils/utils.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
function is_equal(
metadata1::Binary.Metadata,
metadata2::Binary.Metadata;
ignore_labels::Bool = false,
)::Bool
if get_initial_datetime(metadata1) != get_initial_datetime(metadata2)
return false
end
if !ignore_labels && get_labels(metadata1) != get_labels(metadata2)
return false
end
if get_unit(metadata1) != get_unit(metadata2)
return false
end
if get_version(metadata1) != get_version(metadata2)
return false
end
dimensions1 = Binary.get_dimensions(metadata1)
dimensions2 = Binary.get_dimensions(metadata2)
if length(dimensions1) != length(dimensions2)
return false
end
for (dim1, dim2) in zip(dimensions1, dimensions2)
if dim1 != dim2
return false
end
end
return true
end
Loading