From becec7f890d13ca34119ff1dc5c385f4aa621be8 Mon Sep 17 00:00:00 2001 From: Juniper Tyree <50025784+juntyr@users.noreply.github.com> Date: Thu, 17 Apr 2025 15:54:19 +0300 Subject: [PATCH 1/5] Draft JPEG2000 quantization fix --- .../compressor/compressors/jpeg2000.py | 20 ++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/src/climatebenchpress/compressor/compressors/jpeg2000.py b/src/climatebenchpress/compressor/compressors/jpeg2000.py index fd643fc..08187c5 100644 --- a/src/climatebenchpress/compressor/compressors/jpeg2000.py +++ b/src/climatebenchpress/compressor/compressors/jpeg2000.py @@ -17,28 +17,34 @@ class Jpeg2000(Compressor): @staticmethod def abs_bound_codec(dtype, error_bound): - # Currently, the input is transformed into the range - # round(min_pixel_val/ error_bound) <= x <= round(max_pixel_val / error_bound) - # This means any values outside this range will incur a larger error. - precision = error_bound max_pixel_val = 2**25 - 1 # maximum pixel value for our integer encoding. + data_range = float(np.amax(data)) - float(np.amin(data)) + # Here we use the formula for the PSNR (https://en.wikipedia.org/wiki/Peak_signal-to-noise_ratio) # to convert between the absolute error and the PSNR value. # The original PSNR formula uses the root mean square error (RMSE), # therefore JPEG does not guaruantee pointwise error bounds but only # average error bounds. - psnr = 20 * (math.log10(max_pixel_val) - math.log10(error_bound)) + psnr = 20 * (math.log10(data_range) - math.log10(error_bound)) return CodecStack( + # increase precision for better rounding during linear quantization + numcodecs.astype.AsType( + encode_dtype="float64", + decode_dtype="float32", + ), + # remap from [min, max] to [0, max_pixel_val] numcodecs_wasm_fixed_offset_scale.FixedOffsetScale( - offset=0, - scale=precision, + offset=float(np.amin(data)), + scale=data_range / max_pixel_val, ), + # round and truncate to integer values numcodecs_wasm_round.Round(precision=1), numcodecs.astype.AsType( encode_dtype="int32", decode_dtype="float32", ), + # apply the PSNR error bound numcodecs_wasm_jpeg2000.Jpeg2000(mode="psnr", psnr=psnr), ) From 6fe1c0c89e838f290bfecf1e8c0dda1b4bae1ee0 Mon Sep 17 00:00:00 2001 From: Tim Reichelt Date: Tue, 22 Apr 2025 12:26:57 +0200 Subject: [PATCH 2/5] Implementation of new interface to allow JPEG2000 quantization --- .../compressor/compressors/abc.py | 30 ++++++++++++++++--- .../compressor/compressors/bitround.py | 4 ++- .../compressor/compressors/bitround_pco.py | 4 ++- .../compressor/compressors/jpeg2000.py | 17 ++++++++--- .../compressor/compressors/stochround.py | 2 +- .../compressor/compressors/sz3.py | 4 +-- .../compressor/compressors/tthresh.py | 4 +-- .../compressor/compressors/zfp.py | 2 +- 8 files changed, 51 insertions(+), 16 deletions(-) diff --git a/src/climatebenchpress/compressor/compressors/abc.py b/src/climatebenchpress/compressor/compressors/abc.py index 1a16813..b9ab020 100644 --- a/src/climatebenchpress/compressor/compressors/abc.py +++ b/src/climatebenchpress/compressor/compressors/abc.py @@ -57,12 +57,24 @@ class Compressor(ABC): @staticmethod @abstractmethod - def abs_bound_codec(dtype: np.dtype, error_bound: float) -> Codec: + def abs_bound_codec( + error_bound: float, + *, + dtype: Optional[np.dtype] = None, + data_abs_min: Optional[float] = None, + data_abs_max: Optional[float] = None, + ) -> Codec: pass @staticmethod @abstractmethod - def rel_bound_codec(dtype: np.dtype, error_bound: float) -> Codec: + def rel_bound_codec( + error_bound: float, + *, + dtype: Optional[np.dtype] = None, + data_abs_min: Optional[float] = None, + data_abs_max: Optional[float] = None, + ) -> Codec: pass @classmethod @@ -116,9 +128,19 @@ def build( new_codecs: dict[VariableName, Codec] = dict() for var, eb in eb_per_var.items(): if eb.abs_error is not None and cls.has_abs_error_impl: - new_codecs[var] = cls.abs_bound_codec(dtypes[var], eb.abs_error) + new_codecs[var] = cls.abs_bound_codec( + eb.abs_error, + dtype=dtypes[var], + data_abs_min=data_abs_min[var], + data_abs_max=data_abs_max[var], + ) elif eb.rel_error is not None and cls.has_rel_error_impl: - new_codecs[var] = cls.rel_bound_codec(dtypes[var], eb.rel_error) + new_codecs[var] = cls.rel_bound_codec( + eb.rel_error, + dtype=dtypes[var], + data_abs_min=data_abs_min[var], + data_abs_max=data_abs_max[var], + ) else: # This should never happen as we have already transformed the error bounds. # If this happens, it means there is a bug in the implementation. diff --git a/src/climatebenchpress/compressor/compressors/bitround.py b/src/climatebenchpress/compressor/compressors/bitround.py index 599a7a9..836083a 100644 --- a/src/climatebenchpress/compressor/compressors/bitround.py +++ b/src/climatebenchpress/compressor/compressors/bitround.py @@ -13,7 +13,9 @@ class BitRound(Compressor): description = "Bit Rounding" @staticmethod - def rel_bound_codec(dtype, error_bound): + def rel_bound_codec(error_bound, *, dtype=None, **kwargs): + assert dtype is not None, "dtype must be provided" + keepbits = compute_keepbits(dtype, error_bound) return CodecStack( numcodecs_wasm_bit_round.BitRound(keepbits=keepbits), diff --git a/src/climatebenchpress/compressor/compressors/bitround_pco.py b/src/climatebenchpress/compressor/compressors/bitround_pco.py index 5fda857..e1f34aa 100644 --- a/src/climatebenchpress/compressor/compressors/bitround_pco.py +++ b/src/climatebenchpress/compressor/compressors/bitround_pco.py @@ -14,7 +14,9 @@ class BitRoundPco(Compressor): description = "Bit Rounding + PCodec" @staticmethod - def rel_bound_codec(dtype, error_bound): + def rel_bound_codec(error_bound, *, dtype=None, **kwargs): + assert dtype is not None, "dtype must be provided" + keepbits = compute_keepbits(dtype, error_bound) return CodecStack( numcodecs_wasm_bit_round.BitRound(keepbits=keepbits), diff --git a/src/climatebenchpress/compressor/compressors/jpeg2000.py b/src/climatebenchpress/compressor/compressors/jpeg2000.py index 08187c5..5faad74 100644 --- a/src/climatebenchpress/compressor/compressors/jpeg2000.py +++ b/src/climatebenchpress/compressor/compressors/jpeg2000.py @@ -16,10 +16,19 @@ class Jpeg2000(Compressor): description = "JPEG 2000" @staticmethod - def abs_bound_codec(dtype, error_bound): + def abs_bound_codec( + error_bound, + *, + data_abs_min=None, + data_abs_max=None, + **kwargs, + ): + assert data_abs_min is not None, "data_abs_min must be provided" + assert data_abs_max is not None, "data_abs_max must be provided" + max_pixel_val = 2**25 - 1 # maximum pixel value for our integer encoding. - data_range = float(np.amax(data)) - float(np.amin(data)) + data_range = data_abs_max - data_abs_min # Here we use the formula for the PSNR (https://en.wikipedia.org/wiki/Peak_signal-to-noise_ratio) # to convert between the absolute error and the PSNR value. @@ -36,14 +45,14 @@ def abs_bound_codec(dtype, error_bound): ), # remap from [min, max] to [0, max_pixel_val] numcodecs_wasm_fixed_offset_scale.FixedOffsetScale( - offset=float(np.amin(data)), + offset=data_abs_min, scale=data_range / max_pixel_val, ), # round and truncate to integer values numcodecs_wasm_round.Round(precision=1), numcodecs.astype.AsType( encode_dtype="int32", - decode_dtype="float32", + decode_dtype="float64", ), # apply the PSNR error bound numcodecs_wasm_jpeg2000.Jpeg2000(mode="psnr", psnr=psnr), diff --git a/src/climatebenchpress/compressor/compressors/stochround.py b/src/climatebenchpress/compressor/compressors/stochround.py index bcdab95..c7a756a 100644 --- a/src/climatebenchpress/compressor/compressors/stochround.py +++ b/src/climatebenchpress/compressor/compressors/stochround.py @@ -13,7 +13,7 @@ class StochRound(Compressor): description = "Stochastic Rounding" @staticmethod - def abs_bound_codec(dtype, error_bound): + def abs_bound_codec(error_bound, **kwargs): precision = error_bound return CodecStack( numcodecs_wasm_uniform_noise.UniformNoise(scale=precision / 2, seed=42), diff --git a/src/climatebenchpress/compressor/compressors/sz3.py b/src/climatebenchpress/compressor/compressors/sz3.py index ddc148b..b4f6c08 100644 --- a/src/climatebenchpress/compressor/compressors/sz3.py +++ b/src/climatebenchpress/compressor/compressors/sz3.py @@ -10,11 +10,11 @@ class Sz3(Compressor): description = "SZ3" @staticmethod - def abs_bound_codec(dtype, error_bound): + def abs_bound_codec(error_bound, **kwargs): return numcodecs_wasm_sz3.Sz3(eb_mode="abs", eb_abs=error_bound) @staticmethod - def rel_bound_codec(dtype, error_bound): + def rel_bound_codec(error_bound, **kwargs): # SZ3 will not ensure that the relative error bound is strictly met. # Internally, SZ3 transforms the relative error bound to an absolute error bound # based on the range of the input data: diff --git a/src/climatebenchpress/compressor/compressors/tthresh.py b/src/climatebenchpress/compressor/compressors/tthresh.py index 68ba853..401186f 100644 --- a/src/climatebenchpress/compressor/compressors/tthresh.py +++ b/src/climatebenchpress/compressor/compressors/tthresh.py @@ -10,9 +10,9 @@ class Tthresh(Compressor): description = "tthresh" @staticmethod - def abs_bound_codec(dtype, error_bound): + def abs_bound_codec(error_bound, **kwargs): return numcodecs_wasm_tthresh.Tthresh(eb_mode="rmse", eb_rmse=error_bound) @staticmethod - def rel_bound_codec(dtype, error_bound): + def rel_bound_codec(error_bound, **kwargs): return numcodecs_wasm_tthresh.Tthresh(eb_mode="eps", eb_rmse=error_bound) diff --git a/src/climatebenchpress/compressor/compressors/zfp.py b/src/climatebenchpress/compressor/compressors/zfp.py index a6611b8..a09f8a5 100644 --- a/src/climatebenchpress/compressor/compressors/zfp.py +++ b/src/climatebenchpress/compressor/compressors/zfp.py @@ -18,5 +18,5 @@ class Zfp(Compressor): # See https://zfp.readthedocs.io/en/release1.0.1/faq.html#q-relerr for more details. @staticmethod - def abs_bound_codec(dtype, error_bound): + def abs_bound_codec(error_bound, **kwargs): return numcodecs_wasm_zfp.Zfp(mode="fixed-accuracy", tolerance=error_bound) From 85e67ea251e0bf3d9b381a73d8497a5509dc9edf Mon Sep 17 00:00:00 2001 From: Tim Reichelt Date: Tue, 22 Apr 2025 18:35:29 +0200 Subject: [PATCH 3/5] Pass in data_min not data_abs_min to jpeg2000 --- .../compressor/compressors/abc.py | 22 ++++++++++++------- .../compressor/compressors/jpeg2000.py | 12 +++++----- .../compressor/scripts/compress.py | 20 +++++++++++------ 3 files changed, 33 insertions(+), 21 deletions(-) diff --git a/src/climatebenchpress/compressor/compressors/abc.py b/src/climatebenchpress/compressor/compressors/abc.py index b9ab020..5156afe 100644 --- a/src/climatebenchpress/compressor/compressors/abc.py +++ b/src/climatebenchpress/compressor/compressors/abc.py @@ -61,8 +61,8 @@ def abs_bound_codec( error_bound: float, *, dtype: Optional[np.dtype] = None, - data_abs_min: Optional[float] = None, - data_abs_max: Optional[float] = None, + data_min: Optional[float] = None, + data_max: Optional[float] = None, ) -> Codec: pass @@ -72,8 +72,8 @@ def rel_bound_codec( error_bound: float, *, dtype: Optional[np.dtype] = None, - data_abs_min: Optional[float] = None, - data_abs_max: Optional[float] = None, + data_min: Optional[float] = None, + data_max: Optional[float] = None, ) -> Codec: pass @@ -83,6 +83,8 @@ def build( dtypes: dict[VariableName, np.dtype], data_abs_min: dict[VariableName, float], data_abs_max: dict[VariableName, float], + data_min: dict[VariableName, float], + data_max: dict[VariableName, float], error_bounds: list[dict[VariableName, ErrorBound]], ) -> dict[VariantName, list[NamedPerVariableCodec]]: """ @@ -102,6 +104,10 @@ def build( Dict mapping from variable name to minimum absolute value for the variable. data_abs_max : dict[VariableName, float] Dict mapping from variable name to maximum absolute value for the variable. + data_min : dict[VariableName, float] + Dict mapping from variable name to minimum value for the variable. + data_max : dict[VariableName, float] + Dict mapping from variable name to maximum value for the variable. error_bounds: list[ErrorBound] List of error bounds to use for the compressor. @@ -131,15 +137,15 @@ def build( new_codecs[var] = cls.abs_bound_codec( eb.abs_error, dtype=dtypes[var], - data_abs_min=data_abs_min[var], - data_abs_max=data_abs_max[var], + data_min=data_min[var], + data_max=data_max[var], ) elif eb.rel_error is not None and cls.has_rel_error_impl: new_codecs[var] = cls.rel_bound_codec( eb.rel_error, dtype=dtypes[var], - data_abs_min=data_abs_min[var], - data_abs_max=data_abs_max[var], + data_min=data_min[var], + data_max=data_max[var], ) else: # This should never happen as we have already transformed the error bounds. diff --git a/src/climatebenchpress/compressor/compressors/jpeg2000.py b/src/climatebenchpress/compressor/compressors/jpeg2000.py index 5faad74..2a74834 100644 --- a/src/climatebenchpress/compressor/compressors/jpeg2000.py +++ b/src/climatebenchpress/compressor/compressors/jpeg2000.py @@ -19,16 +19,16 @@ class Jpeg2000(Compressor): def abs_bound_codec( error_bound, *, - data_abs_min=None, - data_abs_max=None, + data_min=None, + data_max=None, **kwargs, ): - assert data_abs_min is not None, "data_abs_min must be provided" - assert data_abs_max is not None, "data_abs_max must be provided" + assert data_min is not None, "data_min must be provided" + assert data_max is not None, "data_max must be provided" max_pixel_val = 2**25 - 1 # maximum pixel value for our integer encoding. - data_range = data_abs_max - data_abs_min + data_range = data_max - data_min # Here we use the formula for the PSNR (https://en.wikipedia.org/wiki/Peak_signal-to-noise_ratio) # to convert between the absolute error and the PSNR value. @@ -45,7 +45,7 @@ def abs_bound_codec( ), # remap from [min, max] to [0, max_pixel_val] numcodecs_wasm_fixed_offset_scale.FixedOffsetScale( - offset=data_abs_min, + offset=data_min, scale=data_range / max_pixel_val, ), # round and truncate to integer values diff --git a/src/climatebenchpress/compressor/scripts/compress.py b/src/climatebenchpress/compressor/scripts/compress.py index 251f05d..bd039e2 100644 --- a/src/climatebenchpress/compressor/scripts/compress.py +++ b/src/climatebenchpress/compressor/scripts/compress.py @@ -15,11 +15,7 @@ from numcodecs_observers.walltime import WalltimeObserver from numcodecs_wasm import WasmCodecInstructionCounterObserver -from ..compressors.abc import ( - Compressor, - ErrorBound, - NamedPerVariableCodec, -) +from ..compressors.abc import Compressor, ErrorBound, NamedPerVariableCodec from ..monitor import progress_bar @@ -49,11 +45,19 @@ def compress( continue ds = xr.open_dataset(dataset, chunks=dict(), engine="zarr") - ds_dtypes, ds_abs_mins, ds_abs_maxs = dict(), dict(), dict() + ds_dtypes, ds_abs_mins, ds_abs_maxs, ds_mins, ds_maxs = ( + dict(), + dict(), + dict(), + dict(), + dict(), + ) for v in ds: abs_vals = xr.ufuncs.abs(ds[v]) ds_abs_mins[v] = abs_vals.min().values.item() ds_abs_maxs[v] = abs_vals.max().values.item() + ds_mins[v] = ds[v].min().values.item() + ds_maxs[v] = ds[v].max().values.item() ds_dtypes[v] = ds[v].dtype error_bounds = get_error_bounds(datasets_error_bounds, dataset.parent.name) @@ -64,7 +68,9 @@ def compress( continue compressor_variants: dict[str, list[NamedPerVariableCodec]] = ( - compressor.build(ds_dtypes, ds_abs_mins, ds_abs_maxs, error_bounds) + compressor.build( + ds_dtypes, ds_abs_mins, ds_abs_maxs, ds_mins, ds_maxs, error_bounds + ) ) for compr_name, named_codecs in compressor_variants.items(): From 2373a7a613eb494cf559ce4bd65ea422ece5cad7 Mon Sep 17 00:00:00 2001 From: Tim Reichelt Date: Thu, 24 Apr 2025 12:56:26 +0200 Subject: [PATCH 4/5] Adjust max_pixel_value bound --- src/climatebenchpress/compressor/compressors/jpeg2000.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/climatebenchpress/compressor/compressors/jpeg2000.py b/src/climatebenchpress/compressor/compressors/jpeg2000.py index 2a74834..2f0a6b9 100644 --- a/src/climatebenchpress/compressor/compressors/jpeg2000.py +++ b/src/climatebenchpress/compressor/compressors/jpeg2000.py @@ -26,7 +26,7 @@ def abs_bound_codec( assert data_min is not None, "data_min must be provided" assert data_max is not None, "data_max must be provided" - max_pixel_val = 2**25 - 1 # maximum pixel value for our integer encoding. + max_pixel_val = 2**24 - 1 # maximum pixel value for our integer encoding. data_range = data_max - data_min From 44653bd0b4fbb613e8d30bd461f178550629df1c Mon Sep 17 00:00:00 2001 From: Tim Reichelt Date: Thu, 24 Apr 2025 13:41:59 +0200 Subject: [PATCH 5/5] Switch to unsigned integer --- src/climatebenchpress/compressor/compressors/jpeg2000.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/climatebenchpress/compressor/compressors/jpeg2000.py b/src/climatebenchpress/compressor/compressors/jpeg2000.py index 2f0a6b9..1d73bba 100644 --- a/src/climatebenchpress/compressor/compressors/jpeg2000.py +++ b/src/climatebenchpress/compressor/compressors/jpeg2000.py @@ -26,7 +26,7 @@ def abs_bound_codec( assert data_min is not None, "data_min must be provided" assert data_max is not None, "data_max must be provided" - max_pixel_val = 2**24 - 1 # maximum pixel value for our integer encoding. + max_pixel_val = 2**25 - 1 # maximum pixel value for our integer encoding. data_range = data_max - data_min @@ -51,7 +51,7 @@ def abs_bound_codec( # round and truncate to integer values numcodecs_wasm_round.Round(precision=1), numcodecs.astype.AsType( - encode_dtype="int32", + encode_dtype="uint32", decode_dtype="float64", ), # apply the PSNR error bound