diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index fbe602375..c02884076 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -128,6 +128,14 @@ Remove-Item -Recurse -Force "$env:USERPROFILE\.cache\winml" The next `winml build` will re-create the cache as needed. Use `--rebuild` to force a full rebuild without relying on cached intermediates. +When a build runs out of disk space mid-write, `winml` now stops with a clear message instead of a misleading downstream error: + +```text +ONNXSaveError: Insufficient disk space — unable to write ONNX model to . Free up disk space and try again. +``` + +The partially written file is removed automatically, so a later stage never reads a truncated model. (Previously this surfaced much later as a confusing `ValueError: Failed to find proper ai.onnx domain` during quantization.) Free up space using the command above and re-run the build. + --- ## General Tips diff --git a/src/winml/modelkit/commands/build.py b/src/winml/modelkit/commands/build.py index 288ffc3fe..73af2a750 100644 --- a/src/winml/modelkit/commands/build.py +++ b/src/winml/modelkit/commands/build.py @@ -809,8 +809,14 @@ def _patch_device(cfg: WinMLBuildConfig) -> None: # Map common errors to actionable hints err_str = str(e) + err_lower = err_str.lower() hint = None - if "Quantization failed" in err_str: + if "disk space" in err_lower or "no space left" in err_lower: + hint = ( + "Free up disk space (e.g. clear the HuggingFace cache or " + "~/.cache/winml) and rebuild." + ) + elif "Quantization failed" in err_str: hint = "Try: --no-quant to skip quantization" elif "Compilation failed" in err_str: hint = "Try: --no-compile to skip compilation" diff --git a/src/winml/modelkit/onnx/__init__.py b/src/winml/modelkit/onnx/__init__.py index 3b067e224..4c7fe312d 100644 --- a/src/winml/modelkit/onnx/__init__.py +++ b/src/winml/modelkit/onnx/__init__.py @@ -20,7 +20,7 @@ from .external_data import copy_onnx_model, get_onnx_model_hash from .io import InputTensorSpec, OutputTensorSpec, generate_inputs_from_onnx, get_io_config from .metadata import capture_metadata, restore_metadata -from .persistence import cleanup_onnx, load_onnx, save_onnx +from .persistence import ONNXSaveError, cleanup_onnx, load_onnx, save_onnx from .shape import infer_onnx_shapes, infer_shapes from .utils import EXTERNAL_DATA_THRESHOLD, check_onnx_model, get_model_size @@ -29,6 +29,7 @@ "EXTERNAL_DATA_THRESHOLD", "InputTensorSpec", "ONNXDomain", + "ONNXSaveError", "OutputTensorSpec", "SupportedONNXType", "capture_metadata", diff --git a/src/winml/modelkit/onnx/external_data.py b/src/winml/modelkit/onnx/external_data.py index 6fe61c74c..5363a1985 100644 --- a/src/winml/modelkit/onnx/external_data.py +++ b/src/winml/modelkit/onnx/external_data.py @@ -26,7 +26,7 @@ import onnx from onnx import external_data_helper -from .persistence import load_onnx, save_onnx +from .persistence import _cleanup_partial_save, _raise_save_error, load_onnx, save_onnx logger = logging.getLogger(__name__) @@ -219,23 +219,30 @@ def copy_onnx_model( dst.parent.mkdir(parents=True, exist_ok=True) try: - external_files = get_external_data_files(src) - except Exception: - # Not a valid ONNX file or can't parse — fall back to simple copy - shutil.copy2(src, dst) - return - - if not external_files: - # No external data — simple copy - shutil.copy2(src, dst) - return - - if len(external_files) == 1: - # Single external data file — copy .data + patch .onnx - _copy_single_external(src, dst, external_files[0]) - else: - # Multiple files — consolidate into one - _copy_consolidate(src, dst) + try: + external_files = get_external_data_files(src) + except Exception: + # Not a valid ONNX file or can't parse — fall back to simple copy + shutil.copy2(src, dst) + return + + if not external_files: + # No external data — simple copy + shutil.copy2(src, dst) + return + + if len(external_files) == 1: + # Single external data file — copy .data + patch .onnx + _copy_single_external(src, dst, external_files[0]) + else: + # Multiple files — consolidate into one + _copy_consolidate(src, dst) + except OSError as e: + # A failed copy (commonly disk-full) can leave a truncated destination + # and/or .data sidecar behind. Remove them and surface a clear error + # instead of letting a later stage load the corrupt model. + _cleanup_partial_save(dst, dst.parent / f"{dst.name}.data") + _raise_save_error(e, dst) logger.debug( "Copied ONNX model with external data: %s -> %s (%d data files)", diff --git a/src/winml/modelkit/onnx/persistence.py b/src/winml/modelkit/onnx/persistence.py index c4c71cad0..fab8b638f 100644 --- a/src/winml/modelkit/onnx/persistence.py +++ b/src/winml/modelkit/onnx/persistence.py @@ -12,9 +12,11 @@ from __future__ import annotations +import errno import logging import os from pathlib import Path +from typing import NoReturn import onnx from onnx.external_data_helper import _get_all_tensors, uses_external_data @@ -25,6 +27,75 @@ logger = logging.getLogger(__name__) +# Windows ERROR_DISK_FULL. Python usually maps this to errno.ENOSPC via the CRT, +# but we check the raw winerror too so a disk-full write is always recognised. +_WINDOWS_ERROR_DISK_FULL = 112 + + +class ONNXSaveError(OSError): + """Raised when an ONNX model cannot be written to disk. + + Subclasses :class:`OSError` so existing ``except OSError`` handlers keep + working and ``errno`` is preserved, while surfacing a clear, actionable + message. This matters most for disk-full conditions: without it, a failed + write leaves a truncated/zero-byte ``.onnx`` behind and the real cause only + shows up much later as an opaque opset-parsing error in a downstream stage. + + Attributes: + path: Destination path that could not be written. + disk_full: ``True`` when the failure was caused by insufficient disk + space (``errno.ENOSPC`` / Windows ``ERROR_DISK_FULL``). + """ + + def __init__( + self, + message: str, + *, + path: str | Path | None = None, + disk_full: bool = False, + ) -> None: + super().__init__(message) + self.path = path + self.disk_full = disk_full + + +def _is_disk_full_error(error: OSError) -> bool: + """Return ``True`` when *error* represents an out-of-disk-space condition.""" + return ( + error.errno == errno.ENOSPC + or getattr(error, "winerror", None) == _WINDOWS_ERROR_DISK_FULL + ) + + +def _cleanup_partial_save(*paths: Path | None) -> None: + """Best-effort removal of partial artifacts left by a failed write. + + A failed ``onnx.save_model`` / copy can leave a zero-byte or truncated + ``.onnx`` file (and ``.data`` sidecar) behind. Removing them prevents a + later stage from loading a corrupt model and reporting a misleading error. + """ + for partial in paths: + if partial is None: + continue + try: + Path(partial).unlink(missing_ok=True) + except OSError: + logger.debug("Could not remove partial artifact: %s", partial, exc_info=True) + + +def _raise_save_error(error: OSError, path: Path) -> NoReturn: + """Translate a write ``OSError`` into a clear :class:`ONNXSaveError`.""" + disk_full = _is_disk_full_error(error) + if disk_full: + message = ( + f"Insufficient disk space — unable to write ONNX model to {path}. " + "Free up disk space and try again." + ) + else: + message = f"Failed to write ONNX model to {path}: {error}" + raise ONNXSaveError(message, path=path, disk_full=disk_full) from error + + def load_onnx( path: str | Path, *, @@ -127,20 +198,31 @@ def save_onnx( # path.parent is guaranteed to exist: mkdir() was called above. original_cwd = Path.cwd() try: - os.chdir(path.parent) - onnx.save_model( - model, - path.name, - save_as_external_data=True, - all_tensors_to_one_file=True, - location=ext_location, - size_threshold=1024, - ) - finally: - os.chdir(original_cwd) + try: + os.chdir(path.parent) + onnx.save_model( + model, + path.name, + save_as_external_data=True, + all_tensors_to_one_file=True, + location=ext_location, + size_threshold=1024, + ) + finally: + os.chdir(original_cwd) + except OSError as e: + # A failed external-data write can leave a truncated .onnx and/or + # .data sidecar behind; remove them so a later stage never loads a + # corrupt model and reports a misleading error. + _cleanup_partial_save(path, ext_path) + _raise_save_error(e, path) else: logger.debug("Saving ONNX model inline to %s", path) - onnx.save_model(model, str(path)) + try: + onnx.save_model(model, str(path)) + except OSError as e: + _cleanup_partial_save(path) + _raise_save_error(e, path) def cleanup_onnx(path: str | Path) -> list[Path]: diff --git a/src/winml/modelkit/quant/quantizer.py b/src/winml/modelkit/quant/quantizer.py index 8b9ba033f..3323434ee 100644 --- a/src/winml/modelkit/quant/quantizer.py +++ b/src/winml/modelkit/quant/quantizer.py @@ -22,6 +22,41 @@ logger = logging.getLogger(__name__) +def _check_input_model_opset(model_path: Path) -> str | None: + """Return a clear error message if *model_path* is empty/corrupt, else None. + + Mirrors ORT's ``get_opset_version`` requirement: a usable model must declare + a default (``""`` / ``ai.onnx``) opset import. A zero-byte or truncated file + parses into an (almost) empty ModelProto with no such opset import — the + signature of a previous stage that failed to finish writing (most commonly + because it ran out of disk space). Detecting it here lets us surface the + real cause instead of ORT's opaque "Failed to find proper ai.onnx domain". + + Reads only the graph (no external weights) directly via ``onnx.load_model`` + so the check stays cheap and never trips over a missing ``.data`` sidecar. + """ + from onnx import load_model + + try: + model = load_model(str(model_path), load_external_data=False) + except Exception as e: + return ( + f"Input ONNX model could not be parsed: {model_path} ({e}). " + "The file may be truncated or corrupt — for example, a previous " + "build stage may have run out of disk space. Free up disk space " + "and rebuild." + ) + + has_default_opset = any(opset.domain in ("", "ai.onnx") for opset in model.opset_import) + if not has_default_opset: + return ( + f"Input ONNX model is empty or corrupt (no ai.onnx opset import): " + f"{model_path}. It may have been truncated by a previous failed " + "write (e.g. insufficient disk space). Free up disk space and rebuild." + ) + return None + + def quantize_onnx( model_path: str | Path, output_path: str | Path | None = None, @@ -97,6 +132,18 @@ def _quantize_single_pass( errors=[f"Model not found: {model_path}"], ) + # Guard against an empty/corrupt input model. A previous stage that ran out + # of disk space can leave a truncated/zero-byte .onnx behind; without this + # check ORT fails deep inside quantization with the opaque + # "Failed to find proper ai.onnx domain". Surface the real cause instead. + opset_error = _check_input_model_opset(model_path) + if opset_error is not None: + return QuantizeResult( + success=False, + output_path=None, + errors=[opset_error], + ) + errors: list[str] = [] warnings: list[str] = [] diff --git a/tests/unit/onnx/test_external_data.py b/tests/unit/onnx/test_external_data.py index 0d118f475..630f75f4a 100644 --- a/tests/unit/onnx/test_external_data.py +++ b/tests/unit/onnx/test_external_data.py @@ -6,12 +6,16 @@ from __future__ import annotations +import errno +import shutil from typing import TYPE_CHECKING import numpy as np import onnx +import pytest from onnx import TensorProto, external_data_helper, helper, numpy_helper +from winml.modelkit.onnx import ONNXSaveError from winml.modelkit.onnx.external_data import ( copy_onnx_model, get_external_data_files, @@ -258,3 +262,32 @@ def test_copy_overwrites_existing_dst_with_external_data(self, tmp_path: Path) - src_arr = numpy_helper.to_array(src_full.graph.initializer[0]) dst_arr = numpy_helper.to_array(dst_full.graph.initializer[0]) assert np.array_equal(src_arr, dst_arr) + + +class TestCopyOnnxModelDiskFull: + """copy_onnx_model surfaces a clear error and cleans up on a failed write.""" + + def test_copy_disk_full_raises_and_cleans_dst( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + src = tmp_path / "src.onnx" + dst = tmp_path / "out" / "dst.onnx" + onnx.save(_make_small_model(), str(src)) # valid, no external data + + def _failing_copy2(_s: object, d: object, *_a: object, **_k: object) -> None: + from pathlib import Path as _Path + + _Path(d).write_bytes(b"") # partial/truncated destination + raise OSError(errno.ENOSPC, "simulated write failure") + + monkeypatch.setattr(shutil, "copy2", _failing_copy2) + + with pytest.raises(ONNXSaveError) as exc_info: + copy_onnx_model(src, dst) + + err = exc_info.value + assert err.disk_full is True + assert isinstance(err, OSError) + assert "disk space" in str(err).lower() + # The truncated destination must not be left behind. + assert not dst.exists() diff --git a/tests/unit/onnx/test_persistence.py b/tests/unit/onnx/test_persistence.py index 189991d81..8a08601ba 100644 --- a/tests/unit/onnx/test_persistence.py +++ b/tests/unit/onnx/test_persistence.py @@ -11,6 +11,7 @@ from __future__ import annotations +import errno from typing import TYPE_CHECKING import numpy as np @@ -21,6 +22,7 @@ from winml.modelkit.onnx import EXTERNAL_DATA_THRESHOLD from winml.modelkit.onnx.persistence import ( + ONNXSaveError, cleanup_onnx, load_onnx, save_onnx, @@ -622,3 +624,80 @@ class TestConstants: def test_threshold_is_100mib(self) -> None: assert EXTERNAL_DATA_THRESHOLD == 100 * 1024 * 1024 + + +# --------------------------------------------------------------------------- +# Disk-full / failed-write handling (issue #259) +# --------------------------------------------------------------------------- + + +def _make_failing_save_model(errno_code: int): + """Build a fake ``onnx.save_model`` that simulates a failed disk write. + + Mirrors the OS behaviour: ``open(path, "wb")`` truncates/creates the target + first, then the write fails — leaving a partial artifact behind that the + real code must clean up. + """ + + def _fake_save_model(_proto: object, f: str, **kwargs: object) -> None: + from pathlib import Path as _Path + + _Path(f).write_bytes(b"") # 0-byte partial file, like a truncated write + location = kwargs.get("location") + if isinstance(location, str): + _Path(location).write_bytes(b"") + raise OSError(errno_code, "simulated write failure") + + return _fake_save_model + + +class TestSaveOnnxDiskFull: + """save_onnx surfaces a clear error and removes partial files on failure.""" + + def test_inline_disk_full_raises_clear_error( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + model = _make_tiny_model() + model_path = tmp_path / "out.onnx" + monkeypatch.setattr(onnx, "save_model", _make_failing_save_model(errno.ENOSPC)) + + with pytest.raises(ONNXSaveError) as exc_info: + save_onnx(model, model_path, use_external_data=False) + + err = exc_info.value + assert err.disk_full is True + assert isinstance(err, OSError) # backward-compatible with except OSError + assert "disk space" in str(err).lower() + # The truncated 0-byte file must not be left behind for a later stage. + assert not model_path.exists() + + def test_external_disk_full_raises_and_cleans_sidecar( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + model = _make_model_with_initializer() + model_path = tmp_path / "out.onnx" + sidecar = tmp_path / "out.onnx.data" + monkeypatch.setattr(onnx, "save_model", _make_failing_save_model(errno.ENOSPC)) + + with pytest.raises(ONNXSaveError) as exc_info: + save_onnx(model, model_path, threshold_size=0) # force external data + + assert exc_info.value.disk_full is True + assert "disk space" in str(exc_info.value).lower() + assert not model_path.exists() + assert not sidecar.exists() + + def test_non_enospc_oserror_raises_generic_error( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + model = _make_tiny_model() + model_path = tmp_path / "out.onnx" + monkeypatch.setattr(onnx, "save_model", _make_failing_save_model(errno.EACCES)) + + with pytest.raises(ONNXSaveError) as exc_info: + save_onnx(model, model_path, use_external_data=False) + + err = exc_info.value + assert err.disk_full is False + assert "Failed to write ONNX model" in str(err) + assert not model_path.exists() diff --git a/tests/unit/test_quantizer.py b/tests/unit/test_quantizer.py index 927956c87..95bfcf3e9 100644 --- a/tests/unit/test_quantizer.py +++ b/tests/unit/test_quantizer.py @@ -30,6 +30,22 @@ def rewind(self) -> None: return None +def _write_minimal_onnx_model(path: Path) -> None: + """Write a tiny but valid ONNX model (with an ai.onnx opset) to *path*. + + The quantizer's input guard parses the file and requires a default opset + import, so tests that exercise the quantize flow need a real model on disk. + """ + import onnx + + x = onnx.helper.make_tensor_value_info("X", onnx.TensorProto.FLOAT, [1, 3]) + y = onnx.helper.make_tensor_value_info("Y", onnx.TensorProto.FLOAT, [1, 3]) + node = onnx.helper.make_node("Relu", ["X"], ["Y"]) + graph = onnx.helper.make_graph([node], "g", [x], [y]) + model = onnx.helper.make_model(graph, opset_imports=[onnx.helper.make_opsetid("", 17)]) + onnx.save_model(model, str(path)) + + class _FakeOrtModule(ModuleType): quantization: ModuleType @@ -94,7 +110,7 @@ def test_quantize_onnx_removes_only_exact_external_data_sidecar( ) -> None: """Cleanup should remove only the exact .data sidecar for the output model.""" model_path = tmp_path / "model.onnx" - model_path.write_text("input") + _write_minimal_onnx_model(model_path) output_path = tmp_path / "quantized.onnx" exact_sidecar = tmp_path / f"{output_path.name}.data" extra_suffix_sidecar = tmp_path / f"{output_path.name}.data.bak" @@ -143,3 +159,75 @@ def fake_quantize(*, model_input, model_output: str, quant_config) -> None: assert result.success is True assert extra_suffix_sidecar.exists() + + +def _write_opsetless_onnx_model(path: Path) -> None: + """Write a parseable ONNX model that declares no default (ai.onnx) opset.""" + import onnx + + x = onnx.helper.make_tensor_value_info("X", onnx.TensorProto.FLOAT, [1, 3]) + y = onnx.helper.make_tensor_value_info("Y", onnx.TensorProto.FLOAT, [1, 3]) + node = onnx.helper.make_node("Relu", ["X"], ["Y"]) + graph = onnx.helper.make_graph([node], "g", [x], [y]) + # Only a custom-domain opset — no "" / "ai.onnx" import, which is the + # signature ORT's get_opset_version() rejects. + model = onnx.helper.make_model( + graph, opset_imports=[onnx.helper.make_opsetid("com.example", 1)] + ) + onnx.save_model(model, str(path)) + + +def test_quantize_empty_input_model_surfaces_clear_error(tmp_path: Path) -> None: + """A zero-byte input model yields a clear disk-space/corruption error. + + Regression for #259: a truncated optimize output must not surface as ORT's + opaque "Failed to find proper ai.onnx domain". + """ + model_path = tmp_path / "optimized.onnx" + model_path.write_bytes(b"") # truncated/zero-byte write left by disk-full + + result = quantize_onnx(model_path, output_path=tmp_path / "quantized.onnx") + + assert result.success is False + assert result.output_path is None + joined = " ".join(result.errors).lower() + assert "disk space" in joined + assert "failed to find proper ai.onnx domain" not in joined + + +def test_quantize_opsetless_input_model_surfaces_clear_error(tmp_path: Path) -> None: + """A model with no ai.onnx opset import yields a clear, specific error.""" + model_path = tmp_path / "optimized.onnx" + _write_opsetless_onnx_model(model_path) + + result = quantize_onnx(model_path, output_path=tmp_path / "quantized.onnx") + + assert result.success is False + assert result.output_path is None + joined = " ".join(result.errors) + assert "no ai.onnx opset import" in joined + assert "Failed to find proper ai.onnx domain" not in joined + + +def test_quantize_unparseable_input_model_surfaces_clear_error( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """An input model that fails to parse yields a clear error, not a traceback.""" + import onnx + + model_path = tmp_path / "optimized.onnx" + _write_minimal_onnx_model(model_path) + + def _raise_parse_error(*_args: Any, **_kwargs: Any) -> Any: + raise RuntimeError("protobuf parse error") + + monkeypatch.setattr(onnx, "load_model", _raise_parse_error) + + result = quantize_onnx(model_path, output_path=tmp_path / "quantized.onnx") + + assert result.success is False + assert result.output_path is None + joined = " ".join(result.errors) + assert "could not be parsed" in joined + assert "disk space" in joined.lower()