Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions docs/troubleshooting.md
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,14 @@ Remove-Item -Recurse -Force "$env:USERPROFILE\.cache\winml"

The next `winml build` will re-create the cache as needed. Use `--rebuild` to force a full rebuild without relying on cached intermediates.

When a build runs out of disk space mid-write, `winml` now stops with a clear message instead of a misleading downstream error:

```text
ONNXSaveError: Insufficient disk space — unable to write ONNX model to <path>. Free up disk space and try again.
```

The partially written file is removed automatically, so a later stage never reads a truncated model. (Previously this surfaced much later as a confusing `ValueError: Failed to find proper ai.onnx domain` during quantization.) Free up space using the command above and re-run the build.

---

## General Tips
Expand Down
8 changes: 7 additions & 1 deletion src/winml/modelkit/commands/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -809,8 +809,14 @@ def _patch_device(cfg: WinMLBuildConfig) -> None:

# Map common errors to actionable hints
err_str = str(e)
err_lower = err_str.lower()
hint = None
if "Quantization failed" in err_str:
if "disk space" in err_lower or "no space left" in err_lower:
hint = (
"Free up disk space (e.g. clear the HuggingFace cache or "
"~/.cache/winml) and rebuild."
)
elif "Quantization failed" in err_str:
hint = "Try: --no-quant to skip quantization"
elif "Compilation failed" in err_str:
hint = "Try: --no-compile to skip compilation"
Expand Down
3 changes: 2 additions & 1 deletion src/winml/modelkit/onnx/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from .external_data import copy_onnx_model, get_onnx_model_hash
from .io import InputTensorSpec, OutputTensorSpec, generate_inputs_from_onnx, get_io_config
from .metadata import capture_metadata, restore_metadata
from .persistence import cleanup_onnx, load_onnx, save_onnx
from .persistence import ONNXSaveError, cleanup_onnx, load_onnx, save_onnx
from .shape import infer_onnx_shapes, infer_shapes
from .utils import EXTERNAL_DATA_THRESHOLD, check_onnx_model, get_model_size

Expand All @@ -29,6 +29,7 @@
"EXTERNAL_DATA_THRESHOLD",
"InputTensorSpec",
"ONNXDomain",
"ONNXSaveError",
"OutputTensorSpec",
"SupportedONNXType",
"capture_metadata",
Expand Down
43 changes: 25 additions & 18 deletions src/winml/modelkit/onnx/external_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
import onnx
from onnx import external_data_helper

from .persistence import load_onnx, save_onnx
from .persistence import _cleanup_partial_save, _raise_save_error, load_onnx, save_onnx


logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -219,23 +219,30 @@ def copy_onnx_model(
dst.parent.mkdir(parents=True, exist_ok=True)

try:
external_files = get_external_data_files(src)
except Exception:
# Not a valid ONNX file or can't parse — fall back to simple copy
shutil.copy2(src, dst)
return

if not external_files:
# No external data — simple copy
shutil.copy2(src, dst)
return

if len(external_files) == 1:
# Single external data file — copy .data + patch .onnx
_copy_single_external(src, dst, external_files[0])
else:
# Multiple files — consolidate into one
_copy_consolidate(src, dst)
try:
external_files = get_external_data_files(src)
except Exception:
# Not a valid ONNX file or can't parse — fall back to simple copy
shutil.copy2(src, dst)
return

if not external_files:
# No external data — simple copy
shutil.copy2(src, dst)
return

if len(external_files) == 1:
# Single external data file — copy .data + patch .onnx
_copy_single_external(src, dst, external_files[0])
else:
# Multiple files — consolidate into one
_copy_consolidate(src, dst)
except OSError as e:
# A failed copy (commonly disk-full) can leave a truncated destination
# and/or .data sidecar behind. Remove them and surface a clear error
# instead of letting a later stage load the corrupt model.
_cleanup_partial_save(dst, dst.parent / f"{dst.name}.data")
_raise_save_error(e, dst)

logger.debug(
"Copied ONNX model with external data: %s -> %s (%d data files)",
Expand Down
106 changes: 94 additions & 12 deletions src/winml/modelkit/onnx/persistence.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,11 @@

from __future__ import annotations

import errno
import logging
import os
from pathlib import Path
from typing import NoReturn

import onnx
from onnx.external_data_helper import _get_all_tensors, uses_external_data
Expand All @@ -25,6 +27,75 @@
logger = logging.getLogger(__name__)


# Windows ERROR_DISK_FULL. Python usually maps this to errno.ENOSPC via the CRT,
# but we check the raw winerror too so a disk-full write is always recognised.
_WINDOWS_ERROR_DISK_FULL = 112


class ONNXSaveError(OSError):
    """Raised when an ONNX model cannot be written to disk.

    Subclasses :class:`OSError` so existing ``except OSError`` handlers keep
    working, while surfacing a clear, actionable message. This matters most
    for disk-full conditions: without it, a failed write leaves a
    truncated/zero-byte ``.onnx`` behind and the real cause only shows up much
    later as an opaque opset-parsing error in a downstream stage.

    ``OSError`` only fills in ``errno`` when it is constructed with two or
    more positional arguments. This class forwards just the message, so the
    originating error code is kept only when it is passed via ``errno_code``
    (or assigned to ``errno`` afterwards); otherwise ``errno`` stays ``None``.

    Args:
        message: Human-readable description; becomes ``str(exc)``.
        path: Destination path that could not be written.
        disk_full: Whether the failure was an out-of-disk-space condition.
        errno_code: Optional OS error code to expose as ``errno``.

    Attributes:
        path: Destination path that could not be written.
        disk_full: ``True`` when the failure was caused by insufficient disk
            space (``errno.ENOSPC`` / Windows ``ERROR_DISK_FULL``).
        errno: The value of ``errno_code`` when one was supplied, else ``None``.
    """

    def __init__(
        self,
        message: str,
        *,
        path: str | Path | None = None,
        disk_full: bool = False,
        errno_code: int | None = None,
    ) -> None:
        # Single-argument form on purpose: it keeps str(exc) equal to the
        # message instead of OSError's "[Errno N] ..." rendering.
        super().__init__(message)
        self.path = path
        self.disk_full = disk_full
        if errno_code is not None:
            # OSError.errno is a writable attribute; setting it here (without
            # strerror) leaves the string form of the exception untouched.
            self.errno = errno_code


def _is_disk_full_error(error: OSError) -> bool:
    """Tell whether *error* signals that the target volume has no space left.

    Two signals are accepted: an ``errno`` equal to ``errno.ENOSPC``, or a
    ``winerror`` attribute (when the exception has one) equal to the module's
    ``_WINDOWS_ERROR_DISK_FULL`` code.
    """
    if error.errno == errno.ENOSPC:
        return True
    # ``winerror`` is looked up defensively because not every OSError has it.
    windows_code = getattr(error, "winerror", None)
    return windows_code == _WINDOWS_ERROR_DISK_FULL


def _cleanup_partial_save(*paths: Path | None) -> None:
    """Delete leftover files from an interrupted write, ignoring failures.

    When ``onnx.save_model`` or a copy fails part-way, an empty or truncated
    ``.onnx`` file (and its ``.data`` sidecar) may remain on disk. Deleting
    those leftovers keeps a later stage from loading a corrupt model and
    reporting a misleading error.

    Args:
        *paths: Candidate files to delete. ``None`` entries are skipped and
            files that do not exist are ignored.
    """
    leftovers = (entry for entry in paths if entry is not None)
    for entry in leftovers:
        try:
            Path(entry).unlink(missing_ok=True)
        except OSError:
            # Cleanup is best-effort: record the failure at debug level and
            # carry on with the remaining paths.
            logger.debug("Could not remove partial artifact: %s", entry, exc_info=True)


def _raise_save_error(error: OSError, path: Path) -> NoReturn:
    """Translate a write ``OSError`` into a clear :class:`ONNXSaveError`.

    Args:
        error: The original failure raised while writing or copying the model.
        path: Destination the model was being written to.

    Raises:
        ONNXSaveError: Always. It is chained from *error*, flags disk-full
            conditions via ``disk_full`` and carries the original ``errno``.
    """
    disk_full = _is_disk_full_error(error)
    if disk_full:
        message = (
            f"Insufficient disk space — unable to write ONNX model to {path}. "
            "Free up disk space and try again."
        )
    else:
        message = f"Failed to write ONNX model to {path}: {error}"

    save_error = ONNXSaveError(message, path=path, disk_full=disk_full)
    # ONNXSaveError forwards only the message to OSError, which leaves errno
    # unset. Copy the code across so handlers that catch OSError and branch on
    # ``errno`` still see the real cause; str(save_error) is unaffected.
    save_error.errno = error.errno
    raise save_error from error


def load_onnx(
path: str | Path,
*,
Expand Down Expand Up @@ -127,20 +198,31 @@ def save_onnx(
# path.parent is guaranteed to exist: mkdir() was called above.
original_cwd = Path.cwd()
try:
os.chdir(path.parent)
onnx.save_model(
model,
path.name,
save_as_external_data=True,
all_tensors_to_one_file=True,
location=ext_location,
size_threshold=1024,
)
finally:
os.chdir(original_cwd)
try:
os.chdir(path.parent)
onnx.save_model(
model,
path.name,
save_as_external_data=True,
all_tensors_to_one_file=True,
location=ext_location,
size_threshold=1024,
)
finally:
os.chdir(original_cwd)
except OSError as e:
# A failed external-data write can leave a truncated .onnx and/or
# .data sidecar behind; remove them so a later stage never loads a
# corrupt model and reports a misleading error.
_cleanup_partial_save(path, ext_path)
_raise_save_error(e, path)
else:
logger.debug("Saving ONNX model inline to %s", path)
onnx.save_model(model, str(path))
try:
onnx.save_model(model, str(path))
except OSError as e:
_cleanup_partial_save(path)
_raise_save_error(e, path)


def cleanup_onnx(path: str | Path) -> list[Path]:
Expand Down
47 changes: 47 additions & 0 deletions src/winml/modelkit/quant/quantizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,41 @@
logger = logging.getLogger(__name__)


def _check_input_model_opset(model_path: Path) -> str | None:
    """Describe why *model_path* is unusable as quantization input, if it is.

    The check follows the requirement of ORT's ``get_opset_version``: a usable
    model has to declare a default (``""`` / ``ai.onnx``) opset import. A
    zero-byte or truncated file parses into an (almost) empty ModelProto
    without that import, which is what a previous stage leaves behind when it
    fails to finish writing (most commonly for lack of disk space). Catching
    it here reports the real cause rather than ORT's opaque
    "Failed to find proper ai.onnx domain".

    Only the graph is read (external weights are not loaded), going straight
    through ``onnx.load_model``, so the check is cheap and is not affected by
    a missing ``.data`` sidecar.

    Returns:
        An error message when the file cannot be parsed or declares no default
        opset import; ``None`` when the model looks usable.
    """
    from onnx import load_model

    try:
        proto = load_model(str(model_path), load_external_data=False)
    except Exception as exc:
        # Any parse failure is reported to the caller as a message, not raised.
        return (
            f"Input ONNX model could not be parsed: {model_path} ({exc}). "
            "The file may be truncated or corrupt — for example, a previous "
            "build stage may have run out of disk space. Free up disk space "
            "and rebuild."
        )

    # One default-domain opset import is enough to consider the model usable.
    for opset_entry in proto.opset_import:
        if opset_entry.domain in ("", "ai.onnx"):
            return None

    return (
        f"Input ONNX model is empty or corrupt (no ai.onnx opset import): "
        f"{model_path}. It may have been truncated by a previous failed "
        "write (e.g. insufficient disk space). Free up disk space and rebuild."
    )


def quantize_onnx(
model_path: str | Path,
output_path: str | Path | None = None,
Expand Down Expand Up @@ -97,6 +132,18 @@ def _quantize_single_pass(
errors=[f"Model not found: {model_path}"],
)

# Guard against an empty/corrupt input model. A previous stage that ran out
# of disk space can leave a truncated/zero-byte .onnx behind; without this
# check ORT fails deep inside quantization with the opaque
# "Failed to find proper ai.onnx domain". Surface the real cause instead.
opset_error = _check_input_model_opset(model_path)
if opset_error is not None:
return QuantizeResult(
success=False,
output_path=None,
errors=[opset_error],
)

errors: list[str] = []
warnings: list[str] = []

Expand Down
33 changes: 33 additions & 0 deletions tests/unit/onnx/test_external_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,16 @@

from __future__ import annotations

import errno
import shutil
from typing import TYPE_CHECKING

import numpy as np
import onnx
import pytest
from onnx import TensorProto, external_data_helper, helper, numpy_helper

from winml.modelkit.onnx import ONNXSaveError
from winml.modelkit.onnx.external_data import (
copy_onnx_model,
get_external_data_files,
Expand Down Expand Up @@ -258,3 +262,32 @@ def test_copy_overwrites_existing_dst_with_external_data(self, tmp_path: Path) -
src_arr = numpy_helper.to_array(src_full.graph.initializer[0])
dst_arr = numpy_helper.to_array(dst_full.graph.initializer[0])
assert np.array_equal(src_arr, dst_arr)


class TestCopyOnnxModelDiskFull:
    """A failed write in copy_onnx_model raises a clear error and leaves no partial file."""

    def test_copy_disk_full_raises_and_cleans_dst(
        self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch
    ) -> None:
        """An ENOSPC raised by ``shutil.copy2`` becomes a disk-full ONNXSaveError."""
        source_model = tmp_path / "src.onnx"
        target_model = tmp_path / "out" / "dst.onnx"
        # Valid model with no external data.
        onnx.save(_make_small_model(), str(source_model))

        def _copy2_out_of_space(
            _origin: object, destination: object, *_args: object, **_kwargs: object
        ) -> None:
            from pathlib import Path as _Path

            # Leave an empty destination behind, as an interrupted copy would,
            # then fail the way a full disk does.
            _Path(destination).write_bytes(b"")
            raise OSError(errno.ENOSPC, "simulated write failure")

        monkeypatch.setattr(shutil, "copy2", _copy2_out_of_space)

        with pytest.raises(ONNXSaveError) as caught:
            copy_onnx_model(source_model, target_model)

        raised = caught.value
        assert raised.disk_full is True
        assert isinstance(raised, OSError)
        assert "disk space" in str(raised).lower()
        # The truncated destination must have been cleaned up.
        assert not target_model.exists()
Loading
Loading