From 0d839a84ce191e419a17b84c0f797f1cedf8f1e4 Mon Sep 17 00:00:00 2001
From: Zhipeng Wang <zhiwang@microsoft.com>
Date: Fri, 26 Jun 2026 14:58:29 +0800
Subject: [PATCH 1/2] Fix #724: reconcile task registry into a single source of
 truth

KNOWN_TASKS (validate_task / --list-tasks) and TASK_ABBREV (cache keys)
were maintained independently and drifted: 13 task names lived only in
TASK_ABBREV and were rejected by validate_task. Derive both from one
_TASK_REGISTRY so they can no longer desync.

- Add the 5 wired tasks (video-classification, keypoint-matching,
  table-question-answering, zero-shot-audio-classification, any-to-any)
  to the canonical set, keeping their existing abbreviations.
- Drop the 8 unreferenced/stale task names from the abbreviation table.
- Keep the 5 collapsed aliases out of KNOWN_TASKS but retain their
  abbreviations for cache-key / serve reverse-decode stability.
- Add drift guards: registry/abbrev consistency, wired-task regression,
  stale-task audit lock, and an inference.tasks cross-source check.

No abbreviation changes for any task still in use, so no cache churn.
---
 src/winml/modelkit/loader/task.py     | 125 +++++++++++++------------
 tests/unit/loader/test_known_tasks.py | 127 +++++++++++++++++++++++++-
 2 files changed, 185 insertions(+), 67 deletions(-)

diff --git a/src/winml/modelkit/loader/task.py b/src/winml/modelkit/loader/task.py
index dc32998dc..f5cb32b12 100644
--- a/src/winml/modelkit/loader/task.py
+++ b/src/winml/modelkit/loader/task.py
@@ -26,9 +26,26 @@
 
 logger = logging.getLogger(__name__)
 
-# Task abbreviations for cache keys (47 tasks from HuggingFace Transformers)
-TASK_ABBREV: dict[str, str] = {
-    # Vision tasks
+# =============================================================================
+# Task registry — single source of truth
+# =============================================================================
+#
+# ``_TASK_REGISTRY`` is the one authoritative table of canonical task names that
+# ``winml`` recognizes, each paired with its cache-key abbreviation (``None`` ->
+# ``get_task_abbrev`` falls back to an 8-char truncation). Both public views are
+# *derived* from it, so they can no longer drift apart (the root cause of #724):
+#
+#   * ``KNOWN_TASKS``  (validation / ``inspect --list-tasks``)  = the names
+#   * ``TASK_ABBREV``  (cache-key abbreviations)                = the (name, abbrev) pairs
+#
+# Kept hand-curated (not imported from optimum) so the ``--list-tasks`` fast path
+# stays import-cheap — importing ``optimum.exporters`` would transitively pull in
+# ``transformers`` and cost ~10s. ``tests/unit/loader/test_known_tasks.py`` guards
+# this set against optimum's task list, ``HF_TASK_DEFAULTS``,
+# ``HF_MODEL_CLASS_MAPPING`` and ``inference.tasks`` so any task added there fails
+# CI until it is mirrored here.
+_TASK_REGISTRY: dict[str, str | None] = {
+    # Vision
     "image-classification": "imgcls",
     "image-segmentation": "imgseg",
     "image-feature-extraction": "imgfeat",
@@ -37,96 +54,76 @@
     "image-text-to-text": "imgtxt2t",
     "object-detection": "objdet",
     "depth-estimation": "depth",
-    "instance-segmentation": "instseg",
     "semantic-segmentation": "semseg",
-    "universal-segmentation": "uniseg",
     "keypoint-detection": "kptdet",
     "keypoint-matching": "kptmtch",
     "mask-generation": "maskgen",
-    "masked-image-modeling": "mskim",
+    "masked-im": None,
     "video-classification": "vidcls",
     "zero-shot-image-classification": "zsimg",
     "zero-shot-object-detection": "zsobj",
-    # NLP tasks
+    "inpainting": None,
+    "text-to-image": None,
+    # NLP
     "text-classification": "txtcls",
-    "sequence-classification": "seqcls",
     "token-classification": "tokcls",
     "question-answering": "qa",
     "text-generation": "txtgen",
     "text2text-generation": "txt2txt",
     "fill-mask": "mask",
     "feature-extraction": "feat",
-    "text-encoding": "txtenc",
-    "summarization": "summ",
-    "translation": "transl",
     "multiple-choice": "mltchs",
     "next-sentence-prediction": "nsp",
-    "pretraining": "pretrain",
     "table-question-answering": "tabqa",
     "document-question-answering": "docqa",
-    "zero-shot-classification": "zscls",
-    # Audio tasks
+    "sentence-similarity": None,
+    # Audio
     "audio-classification": "audiocls",
     "audio-frame-classification": "audfrm",
-    "audio-tokenization": "audtok",
     "audio-xvector": "audxvc",
     "automatic-speech-recognition": "asr",
     "text-to-audio": "txt2aud",
     "zero-shot-audio-classification": "zsaud",
-    # Multimodal tasks
+    # Multimodal
     "visual-question-answering": "vqa",
     "any-to-any": "a2a",
-    "multimodal-lm": "mmlm",
-    # Other tasks
-    "backbone": "bkbone",
-    "time-series-prediction": "tseries",
+    # Other
+    "reinforcement-learning": None,
+    "time-series-forecasting": None,
 }
 
 
-# Canonical set of task names recognized by `winml inspect`.
-# Hand-coded so that `winml inspect --list-tasks` does not need to import
-# optimum.exporters (which transitively imports transformers and costs ~10s).
-# Synced with optimum.exporters.tasks.TasksManager.get_all_tasks() plus our
-# own HF_TASK_DEFAULTS entries; add new tasks here when optimum gains them.
-KNOWN_TASKS: frozenset[str] = frozenset(
-    {
-        "audio-classification",
-        "audio-frame-classification",
-        "audio-xvector",
-        "automatic-speech-recognition",
-        "depth-estimation",
-        "document-question-answering",
-        "feature-extraction",
-        "fill-mask",
-        "image-classification",
-        "image-feature-extraction",
-        "image-segmentation",
-        "image-text-to-text",
-        "image-to-image",
-        "image-to-text",
-        "inpainting",
-        "keypoint-detection",
-        "mask-generation",
-        "masked-im",
-        "multiple-choice",
-        "next-sentence-prediction",
-        "object-detection",
-        "question-answering",
-        "reinforcement-learning",
-        "semantic-segmentation",
-        "sentence-similarity",
-        "text-classification",
-        "text-generation",
-        "text-to-audio",
-        "text-to-image",
-        "text2text-generation",
-        "time-series-forecasting",
-        "token-classification",
-        "visual-question-answering",
-        "zero-shot-image-classification",
-        "zero-shot-object-detection",
-    }
-)
+# Aliases that ``normalize_task()`` / ``to_optimum_task()`` collapse to canonical
+# forms, so they are deliberately *excluded* from ``KNOWN_TASKS``. They keep a
+# stable cache-key abbreviation because a few callers still use the alias name
+# directly as the resolved task — composite models register ``summarization`` /
+# ``translation`` (see ``models/hf/bart.py``, ``t5.py``, ``marian.py``) and
+# ``inference.tasks`` defines a ``TaskInputSpec`` for ``zero-shot-classification``
+# — so existing cache directories and the ``serve`` reverse-decode map
+# (``app.py``: ``{v: k for k, v in TASK_ABBREV.items()}``) must round-trip.
+_TASK_ALIAS_ABBREV: dict[str, str] = {
+    "pretraining": "pretrain",
+    "sequence-classification": "seqcls",
+    "summarization": "summ",
+    "translation": "transl",
+    "zero-shot-classification": "zscls",
+}
+
+
+# Canonical set of task names recognized by `winml inspect` (names only).
+# Derived from `_TASK_REGISTRY` above — do not hand-edit; add tasks to the
+# registry instead so `KNOWN_TASKS` and `TASK_ABBREV` stay in lockstep.
+KNOWN_TASKS: frozenset[str] = frozenset(_TASK_REGISTRY)
+
+
+# Task -> abbreviation for cache keys. Derived from `_TASK_REGISTRY`: canonical
+# tasks whose abbreviation is `None` are omitted here (and truncated to 8 chars
+# by `get_task_abbrev`); the collapsed aliases are appended for cache-key and
+# `serve` reverse-decode stability.
+TASK_ABBREV: dict[str, str] = {
+    **{task: abbrev for task, abbrev in _TASK_REGISTRY.items() if abbrev is not None},
+    **_TASK_ALIAS_ABBREV,
+}
 
 
 # =============================================================================
diff --git a/tests/unit/loader/test_known_tasks.py b/tests/unit/loader/test_known_tasks.py
index b78a8532a..4738ce4e4 100644
--- a/tests/unit/loader/test_known_tasks.py
+++ b/tests/unit/loader/test_known_tasks.py
@@ -17,7 +17,40 @@
 
 from __future__ import annotations
 
+import pytest
+
 from winml.modelkit.loader import KNOWN_TASKS
+from winml.modelkit.loader.task import (
+    _TASK_ALIAS_ABBREV,
+    _TASK_REGISTRY,
+    TASK_ABBREV,
+)
+
+
+# Tasks #724 confirmed are wired into the codebase (inference TaskInputSpec,
+# composite-model registration, or e2e testset model tag) and therefore must be
+# accepted by validate_task and advertised by ``--list-tasks``.
+WIRED_TASKS = (
+    "video-classification",
+    "keypoint-matching",
+    "table-question-answering",
+    "zero-shot-audio-classification",
+    "any-to-any",
+)
+
+# Entries audited as stale in #724 — they only ever existed in the old
+# TASK_ABBREV table (no inference spec, model registration, test, or doc) and
+# were dropped. Locked here so they are not silently re-added.
+DROPPED_TASKS = (
+    "instance-segmentation",
+    "universal-segmentation",
+    "masked-image-modeling",
+    "text-encoding",
+    "audio-tokenization",
+    "multimodal-lm",
+    "backbone",
+    "time-series-prediction",
+)
 
 
 class TestKnownTasksShape:
@@ -47,7 +80,7 @@ def test_covers_hf_task_defaults(self) -> None:
         missing = set(HF_TASK_DEFAULTS) - KNOWN_TASKS
         assert not missing, (
             f"HF_TASK_DEFAULTS contains tasks not in KNOWN_TASKS: {sorted(missing)}. "
-            "Add them to KNOWN_TASKS in src/winml/modelkit/loader/task.py."
+            "Add them to _TASK_REGISTRY in src/winml/modelkit/loader/task.py."
         )
 
     def test_covers_hf_model_class_mapping(self) -> None:
@@ -57,7 +90,7 @@ def test_covers_hf_model_class_mapping(self) -> None:
         missing = registered - KNOWN_TASKS
         assert not missing, (
             f"HF_MODEL_CLASS_MAPPING uses tasks not in KNOWN_TASKS: {sorted(missing)}. "
-            "Add them to KNOWN_TASKS in src/winml/modelkit/loader/task.py."
+            "Add them to _TASK_REGISTRY in src/winml/modelkit/loader/task.py."
         )
 
     def test_covers_optimum_tasks(self) -> None:
@@ -74,5 +107,93 @@ def test_covers_optimum_tasks(self) -> None:
         missing = optimum_tasks - KNOWN_TASKS
         assert not missing, (
             f"optimum exposes tasks not in KNOWN_TASKS: {sorted(missing)}. "
-            "Add them to KNOWN_TASKS in src/winml/modelkit/loader/task.py."
+            "Add them to _TASK_REGISTRY in src/winml/modelkit/loader/task.py."
+        )
+
+
+class TestTaskRegistrySingleSourceOfTruth:
+    """KNOWN_TASKS and TASK_ABBREV must derive from one registry (#724).
+
+    The root cause of #724 was two hand-maintained tables (``TASK_ABBREV`` and
+    ``KNOWN_TASKS``) that drifted apart. They now both derive from
+    ``_TASK_REGISTRY``; these tests lock that invariant.
+    """
+
+    def test_known_tasks_equals_registry_keys(self) -> None:
+        assert frozenset(_TASK_REGISTRY) == KNOWN_TASKS
+
+    def test_every_canonical_abbrev_key_is_known(self) -> None:
+        """Every non-alias key in TASK_ABBREV must be a known task."""
+        non_alias = {task for task in TASK_ABBREV if task not in _TASK_ALIAS_ABBREV}
+        missing = non_alias - KNOWN_TASKS
+        assert not missing, f"TASK_ABBREV canonical keys not in KNOWN_TASKS: {sorted(missing)}"
+
+    def test_aliases_excluded_from_known_tasks(self) -> None:
+        """Collapsed aliases keep a cache-key abbrev but are not canonical tasks."""
+        overlap = set(_TASK_ALIAS_ABBREV) & KNOWN_TASKS
+        assert not overlap, f"Aliases must not appear in KNOWN_TASKS: {sorted(overlap)}"
+
+    def test_abbreviations_are_unique(self) -> None:
+        """serve/app.py inverts TASK_ABBREV to {abbrev: task}; collisions lose entries."""
+        values = list(TASK_ABBREV.values())
+        assert len(values) == len(set(values)), "Duplicate abbreviations in TASK_ABBREV"
+
+
+class TestWiredTasksAccepted:
+    """Regression for #724: the wired tasks must pass validation, not be rejected."""
+
+    @pytest.mark.parametrize("task", WIRED_TASKS)
+    def test_wired_task_in_known_tasks(self, task: str) -> None:
+        assert task in KNOWN_TASKS
+
+    @pytest.mark.parametrize("task", WIRED_TASKS)
+    def test_resolver_validate_task_accepts(self, task: str) -> None:
+        from winml.modelkit.inspect.resolver import validate_task
+
+        validate_task(task)  # must not raise
+
+    @pytest.mark.parametrize("task", WIRED_TASKS)
+    def test_click_callback_accepts(self, task: str) -> None:
+        from winml.modelkit.commands.inspect import _validate_task
+
+        assert _validate_task(None, None, task) == task  # must not raise
+
+
+class TestStaleTasksDropped:
+    """Audit lock for #724: the 8 stale names stay out of every task view."""
+
+    @pytest.mark.parametrize("task", DROPPED_TASKS)
+    def test_dropped_from_known_tasks(self, task: str) -> None:
+        assert task not in KNOWN_TASKS
+
+    @pytest.mark.parametrize("task", DROPPED_TASKS)
+    def test_dropped_from_task_abbrev(self, task: str) -> None:
+        assert task not in TASK_ABBREV
+
+
+class TestInferenceTasksAreKnown:
+    """Every inference task must be reachable from the loader task registry.
+
+    This is the cross-source guard #724 was missing: a ``TaskInputSpec`` for a
+    genuinely new canonical task (as ``video-classification`` was) added to
+    ``inference.tasks`` but not to ``_TASK_REGISTRY`` would be silently rejected
+    by ``validate_task``.
+    """
+
+    # Pipeline-sugar aliases that live only in inference.tasks (they share another
+    # task's input schema; there is no separate canonical task). A NEW canonical
+    # task belongs in _TASK_REGISTRY (loader/task.py), NOT in this allowlist.
+    INFERENCE_ONLY_ALIASES = frozenset(
+        {"sentiment-analysis", "ner", "vqa", "text-to-speech"}
+    )
+
+    def test_inference_tasks_are_known_or_alias(self) -> None:
+        from winml.modelkit.inference.tasks import TASK_REGISTRY as INFERENCE_TASKS
+
+        recognized = KNOWN_TASKS | set(_TASK_ALIAS_ABBREV) | self.INFERENCE_ONLY_ALIASES
+        missing = set(INFERENCE_TASKS) - recognized
+        assert not missing, (
+            f"inference.tasks defines tasks unknown to the loader registry: {sorted(missing)}. "
+            "If these are real canonical tasks, add them to _TASK_REGISTRY in "
+            "src/winml/modelkit/loader/task.py (do not extend INFERENCE_ONLY_ALIASES)."
         )

From d4701d0e6a9d45c0dc33e057e73767145f41e02c Mon Sep 17 00:00:00 2001
From: Zhipeng Wang <zhiwang@microsoft.com>
Date: Fri, 26 Jun 2026 15:02:42 +0800
Subject: [PATCH 2/2] test: document new task-registry drift guards in module
 docstring

---
 tests/unit/loader/test_known_tasks.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/tests/unit/loader/test_known_tasks.py b/tests/unit/loader/test_known_tasks.py
index 4738ce4e4..69eba2fc5 100644
--- a/tests/unit/loader/test_known_tasks.py
+++ b/tests/unit/loader/test_known_tasks.py
@@ -2,17 +2,22 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # Licensed under the MIT License.
 # --------------------------------------------------------------------------
-"""Tests for the hand-coded KNOWN_TASKS constant.
+"""Tests for the hand-coded task registry and its derived views.
 
-KNOWN_TASKS is hand-coded so that ``winml inspect --list-tasks`` does not
-need to import ``optimum.exporters`` (which transitively imports
-``transformers`` and adds ~10 s of startup latency).
+``_TASK_REGISTRY`` (and the ``KNOWN_TASKS`` / ``TASK_ABBREV`` views derived
+from it) is hand-coded so that ``winml inspect --list-tasks`` does not need to
+import ``optimum.exporters`` (which transitively imports ``transformers`` and
+adds ~10 s of startup latency).
 
 These tests guard against drift:
   * KNOWN_TASKS must be a superset of optimum's TasksManager task set, so
     no canonical task disappears from ``--list-tasks`` when optimum adds one.
   * KNOWN_TASKS must include every task registered in our own
     HF_TASK_DEFAULTS / HF_MODEL_CLASS_MAPPING.
+  * KNOWN_TASKS and TASK_ABBREV must stay derived from the single
+    ``_TASK_REGISTRY`` so the two views cannot desync (the root cause of #724).
+  * Every task wired into ``inference.tasks`` must be a known task, so a newly
+    added canonical task can never be silently rejected by ``validate_task``.
 """
 
 from __future__ import annotations