diff --git a/ci/run_mypy.sh b/ci/run_mypy.sh index fdb3e026..d3f818d1 100755 --- a/ci/run_mypy.sh +++ b/ci/run_mypy.sh @@ -14,6 +14,7 @@ set -e -E -u -o pipefail +mypy --version mypy \ --config-file ./pyproject.toml \ --exclude=legateboost/test \ diff --git a/conda/environments/all_cuda-122.yaml b/conda/environments/all_cuda-122.yaml index 68eb3824..6d4e98af 100644 --- a/conda/environments/all_cuda-122.yaml +++ b/conda/environments/all_cuda-122.yaml @@ -30,6 +30,7 @@ dependencies: - numpy - onnx>=1.10 - onnxmltools>=1.10 +- onnxruntime - openblas - pydata-sphinx-theme>=0.16 - pytest>=7,<8 diff --git a/dependencies.yaml b/dependencies.yaml index 9cd93fba..3700d52e 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -178,3 +178,4 @@ dependencies: - xgboost>=2.0 - onnx>=1.10 - onnxmltools>=1.10 + - onnxruntime diff --git a/legateboost/legateboost.py b/legateboost/legateboost.py index 80c8beb2..a7e1b50c 100644 --- a/legateboost/legateboost.py +++ b/legateboost/legateboost.py @@ -16,7 +16,14 @@ from .input_validation import _lb_check_X, _lb_check_X_y, check_sample_weight from .metrics import BaseMetric, metrics from .models import BaseModel, Tree -from .objectives import BaseObjective, objectives +from .objectives import OBJECTIVES_MAP, BaseObjective +from .onnx_utils import ( + init_predictions, + make_model, + merge_model_graphs, + mirror_predict_proba_output, + reshape_predictions, +) from .shapley import global_shapley_attributions, local_shapley_attributions from .utils import AddableMixin, AddMember, PickleCupynumericMixin @@ -422,7 +429,7 @@ def fit( # setup objective if isinstance(self.objective, str): - self._objective_instance = objectives[self.objective]() + self._objective_instance = OBJECTIVES_MAP[self.objective]() elif isinstance(self.objective, BaseObjective): self._objective_instance = self.objective else: @@ -528,6 +535,26 @@ def _predict(self, X: cn.ndarray) -> cn.ndarray: pred += Type.batch_predict(models, X) return pred + def predict_raw(self, 
X: cn.ndarray) -> cn.ndarray: + """Predict pre-transformed values for samples in X. E.g. before applying a + sigmoid function. + + Parameters + ---------- + + X : + The input samples. + + Returns + ------- + + y : + The predicted raw values for each sample in X. + """ + X = _lb_check_X(X) + validate_data(self, X, reset=False, skip_check_array=True) + return self._predict(X) + def dump_models(self) -> str: """Dumps the models in the current instance to a string. @@ -540,6 +567,29 @@ def dump_models(self) -> str: text += str(m) return text + def _to_onnx_predict_raw(self, X: cn.ndarray) -> Any: + init_graph = init_predictions(self.model_init_, X.dtype) + graph = merge_model_graphs([init_graph] + [m.to_onnx(X) for m in self.models_]) + # remove the X_out output, we only need the predictions + graph.output.remove(graph.output[0]) + return graph + + def _to_onnx_predict_transformed(self, X: cn.ndarray) -> Any: + import onnx + + graph = onnx.compose.merge_graphs( + self._to_onnx_predict_raw(X), + self._objective_instance.onnx_transform(self.predict_raw(X[0:1])), + io_map=[ + ( + "model_{}_predictions_out".format(len(self.models_) - 1), + "predictions_in", + ) + ], + prefix2="transform_", + ) + return graph + def global_attributions( self, X: cn.array, @@ -701,10 +751,10 @@ class LBRegressor(RegressorMixin, LBBase): Examples -------- >>> import cupynumeric as cn - >>> import legateboost as lbst + >>> import legateboost as lb >>> X = cn.random.random((1000, 10)) >>> y = cn.random.random(X.shape[0]) - >>> model = lbst.LBRegressor(n_estimators=5).fit(X, y) + >>> model = lb.LBRegressor(n_estimators=5).fit(X, y) >>> """ @@ -827,6 +877,58 @@ def predict(self, X: cn.ndarray) -> cn.ndarray: pred = pred.squeeze(axis=1) return pred + def to_onnx(self, X: cn.ndarray, predict_function: str = "predict") -> Any: + """Converts the estimator to an ONNX model which is expected to produce + equivalent predictions to `predict_function` up to reasonable floating + point tolerance. 
The ONNX model is hard coded to the X input data type, + separate models should be generated for float and double. The ONNX model + takes "X_in" as input and produces "predictions_out" as output. + + Parameters + ---------- + X: + Example input data. Use to infer input data characteristics. + A model produced for float32 will not accept float64 input and vice versa. + predict_function : str + The serialised ONNX model can produce output equivalent to 'predict' or + 'predict_raw'. + The default is "predict". + Returns + ------- + Any + The ONNX model. + + Examples + -------- + >>> import numpy as np + >>> import legateboost as lb + >>> X = np.random.random((1000, 10)) + >>> y = np.random.random(X.shape[0]) + >>> model = lb.LBRegressor(n_estimators=5).fit(X, y) + >>> import onnxruntime as ort + >>> sess = ort.InferenceSession(model.to_onnx(X).SerializeToString()) + >>> onnx_pred = sess.run(None, {"X_in": X})[0] + >>> assert np.allclose(model.predict(X), onnx_pred, atol=1e-6) + >>> + """ + import onnx + + if predict_function not in ["predict", "predict_raw"]: + raise ValueError( + "predict_function should be one of ['predict', 'predict_raw']" + ) + if predict_function == "predict": + graph = self._to_onnx_predict_transformed(X) + else: + graph = self._to_onnx_predict_raw(X) + + # coerce the output shape to be the same as the equivalent predict function + test_pred = getattr(self, predict_function)(X[0:1]) + graph = reshape_predictions(graph, test_pred) + model = make_model(graph) + onnx.checker.check_model(model, full_check=True) + return model + class LBClassifier(ClassifierMixin, LBBase): """Implements a gradient boosting algorithm for classification problems. @@ -1031,26 +1133,6 @@ def fit( ) return self - def predict_raw(self, X: cn.ndarray) -> cn.ndarray: - """Predict pre-transformed values for samples in X. E.g. before applying a - sigmoid function. - - Parameters - ---------- - - X : - The input samples. 
- - Returns - ------- - - y : - The predicted raw values for each sample in X. - """ - X = _lb_check_X(X) - validate_data(self, X, reset=False, skip_check_array=True) - return super()._predict(X) - def predict_proba(self, X: cn.ndarray) -> cn.ndarray: """Predict class probabilities for samples in X. @@ -1092,3 +1174,74 @@ def predict(self, X: cn.ndarray) -> cn.ndarray: """ check_is_fitted(self) return self._objective_instance.output_class(self.predict_proba(X)) + + def to_onnx(self, X: cn.ndarray, predict_function: str = "predict") -> Any: + """Converts the estimator to an ONNX model which is expected to produce + equivalent predictions to `predict_function` up to reasonable floating + point tolerance. The ONNX model is hard coded to the X input data type, + separate models should be generated for float and double. The ONNX model + takes "X_in" as input and produces "predictions_out" as output. + + Parameters + ---------- + X: + Example input data. Use to infer input data characteristics. + A model produced for float32 will not accept float64 input and vice versa. + predict_function : str + The serialised ONNX model can produce output equivalent to 'predict', + 'predict_proba', or 'predict_raw'. + The default is "predict". + Returns + ------- + Any + The ONNX model. + + Examples + -------- + >>> import numpy as np + >>> import legateboost as lb + >>> X = np.random.random((1000, 10)) + >>> y = np.random.randint(0, 2, X.shape[0]) + >>> model = lb.LBClassifier(n_estimators=5).fit(X, y) + >>> import onnxruntime as ort + >>> sess = ort.InferenceSession(model.to_onnx(X, + ... 
predict_function="predict_proba").SerializeToString()) + >>> onnx_pred = sess.run(None, {"X_in": X})[0] + >>> assert np.allclose(model.predict_proba(X), onnx_pred, atol=1e-6) + >>> + """ + import onnx + + if predict_function not in ["predict", "predict_proba", "predict_raw"]: + raise ValueError( + "predict_function should be one of ['predict'," + " 'predict_proba', 'predict_raw']" + ) + if predict_function in ["predict_proba", "predict"]: + graph = self._to_onnx_predict_transformed(X) + # need to mirror the output when we only output one target + if self.predict_raw(X[0:1]).shape[1] == 1: + graph = mirror_predict_proba_output(graph) + if predict_function == "predict": + # argmax the predict_proba output + argmax = self._objective_instance.onnx_output_class( + self.predict_proba(X[0:1]) + ) + graph = onnx.compose.merge_graphs( + graph, + argmax, + io_map=[ + (graph.output[0].name, "predictions_in"), + ], + prefix2="classifier_predict_", + ) + + elif predict_function == "predict_raw": + graph = self._to_onnx_predict_raw(X) + + # coerce the output shape to be the same as the equivalent predict function + test_pred = getattr(self, predict_function)(X[0:1]) + graph = reshape_predictions(graph, test_pred) + model = make_model(graph) + onnx.checker.check_model(model, full_check=True) + return model diff --git a/legateboost/models/base_model.py b/legateboost/models/base_model.py index a1e88011..38e37cb7 100644 --- a/legateboost/models/base_model.py +++ b/legateboost/models/base_model.py @@ -127,12 +127,25 @@ def __mul__(self, scalar: Any) -> "BaseModel": def __hash__(self) -> int: return hash(str(self)) - def to_onnx(self) -> Any: - """Convert the model to an ONNX model. + def to_onnx(self, X: cn.array) -> Any: + """Convert the model to an ONNX graph. + The implemented ONNX graph should accept the following two inputs: + - "X_in" : 2D tensor of shape (n_samples, n_features) and type `X_dtype`. 
+ - "predictions_in" : 2D tensor of shape (n_samples, n_outputs) and type double. + The graph should output: + - "predictions_out" : 2D tensor of shape (n_samples, n_outputs) and type double. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Example input X matrix. Used to infer type and shape of the input. + + Note: the produced graph should add this model's contribution + to the incoming "predictions_in" tensor to form "predictions_out". Returns ------- Any - The ONNX model. + The ONNX graph. """ raise NotImplementedError diff --git a/legateboost/models/krr.py b/legateboost/models/krr.py index 31af8d5a..0a41788d 100644 --- a/legateboost/models/krr.py +++ b/legateboost/models/krr.py @@ -243,115 +243,40 @@ def __mul__(self, scalar: Any) -> "KRR": self.betas_ *= scalar return new - def to_onnx(self) -> Any: - from onnx import numpy_helper - from onnx.checker import check_model - from onnx.helper import ( - make_graph, - make_model, - make_node, - make_tensor_value_info, - np_dtype_to_tensor_dtype, - ) - - assert self.X_train.dtype == self.betas_.dtype - - def make_constant_node(value: cn.array, name: str) -> Any: - return make_node( - "Constant", - inputs=[], - value=numpy_helper.from_array(value, name=name), - outputs=[name], - ) - - nodes = [] + def to_onnx(self, X: cn.array) -> Any: + import onnx - # model constants - betas = numpy_helper.from_array(self.betas_.__array__(), name="betas") - X_train = numpy_helper.from_array(self.X_train.__array__(), name="X_train") - - # pred inputs - X = make_tensor_value_info( - "X", - np_dtype_to_tensor_dtype(self.betas_.dtype), - [None, self.X_train.shape[1]], - ) - pred = make_tensor_value_info( - "pred", - np_dtype_to_tensor_dtype(self.betas_.dtype), - [None, self.betas_.shape[1]], - ) - - # exanded l2 distance - # distance = np.sum(X**2, axis=1)[:, np.newaxis] - 2 * np.dot(X, self.X_train.T) - # + np.sum(self.X_train**2, axis=1) - make_tensor_value_info( - "XX", np_dtype_to_tensor_dtype(self.betas_.dtype), [None] - ) - make_tensor_value_info( - "YY", 
np_dtype_to_tensor_dtype(self.betas_.dtype), - [self.X_train.shape[0], 1], - ) - make_tensor_value_info( - "XY_reshaped", - np_dtype_to_tensor_dtype(self.betas_.dtype), - [1, self.X_train.shape[0]], - ) - make_tensor_value_info( - "XY", - np_dtype_to_tensor_dtype(self.betas_.dtype), - [None, self.X_train.shape[0]], - ) - nodes.append(make_constant_node(np.array([1]), "axis1")) - nodes.append(make_node("ReduceSumSquare", ["X", "axis1"], ["XX"])) - nodes.append(make_node("Gemm", ["X", "X_train"], ["XY"], alpha=-2.0, transB=1)) - nodes.append(make_node("ReduceSumSquare", ["X_train", "axis1"], ["YY"])) - nodes.append(make_constant_node(np.array([1, -1]), "reshape")) - nodes.append(make_node("Reshape", ["YY", "reshape"], ["YY_reshaped"])) - nodes.append(make_node("Add", ["XX", "XY"], ["add0"])) - make_tensor_value_info( - "l2", - np_dtype_to_tensor_dtype(self.betas_.dtype), - [None, self.X_train.shape[0]], - ) - nodes.append(make_node("Add", ["YY_reshaped", "add0"], ["l2"])) - nodes.append(make_constant_node(np.array([0.0], self.betas_.dtype), "zero")) - make_tensor_value_info( - "l2_clipped", - np_dtype_to_tensor_dtype(self.betas_.dtype), - [None, self.X_train.shape[0]], - ) - nodes.append(make_node("Max", ["l2", "zero"], ["l2_clipped"])) - - # RBF kernel - # K = np.exp(-distance / (2 * self.sigma**2)) - make_tensor_value_info( - "rbf0", - np_dtype_to_tensor_dtype(self.betas_.dtype), - [None, self.X_train.shape[0]], - ) + X_type_text = "double" if X.dtype == cn.float64 else "float" if self.sigma is None: - raise ValueError("sigma is None. 
Has fit been called?") - nodes.append( - make_constant_node( - np.array([-2.0 * self.sigma**2], self.betas_.dtype), "denominator" - ) - ) - nodes.append(make_node("Div", ["l2_clipped", "denominator"], ["rbf0"])) - make_tensor_value_info( - "K", - np_dtype_to_tensor_dtype(self.betas_.dtype), - [None, self.X_train.shape[0]], - ) - nodes.append(make_node("Exp", ["rbf0"], ["K"])) - - # prediction - # pred = np.dot(K, self.betas_) - nodes.append(make_node("MatMul", ["K", "betas"], ["pred"])) - graph = make_graph( - nodes, "legateboost.model.KRR", [X], [pred], [betas, X_train] + raise ValueError("Model has not been trained. Cannot export to ONNX.") + denominator = -2.0 * self.sigma**2 + onnx_text = f""" + KRRModel ({X_type_text}[N, M] X_in, double[N, K] predictions_in) => ({X_type_text}[N, M] X_out, double[N, K] predictions_out) + {{ + X_out = Identity(X_in) + axis1 = Constant() + XX = ReduceSumSquare(X_in, axis1) + XY = Gemm(X_in, X_train) + YY = ReduceSumSquare(X_train, axis1) + reshape = Constant() + YY_reshaped = Reshape(YY, reshape) + add0 = Add(XX, XY) + l2 = Add(YY_reshaped, add0) + zero = Constant() + l2_clipped = Max(l2, zero) + denominator = Constant() + rbf0 = Div(l2_clipped, denominator) + K = Exp(rbf0) + dot = MatMul(K, betas) + dot_double = Cast(dot) + predictions_out = Add(dot_double, predictions_in) + }} + """ # noqa: E501 + graph = onnx.parser.parse_graph(onnx_text) + graph.initializer.extend( + [ + onnx.numpy_helper.from_array(self.betas_.__array__(), name="betas"), + onnx.numpy_helper.from_array(self.X_train.__array__(), name="X_train"), + ] ) - onnx_model = make_model(graph) - check_model(onnx_model) - return onnx_model + return graph diff --git a/legateboost/models/linear.py b/legateboost/models/linear.py index ec34594e..1319d2de 100644 --- a/legateboost/models/linear.py +++ b/legateboost/models/linear.py @@ -152,36 +152,27 @@ def __mul__(self, scalar: Any) -> "Linear": new.betas_ *= scalar return new - def to_onnx(self) -> Any: - from onnx import 
numpy_helper - from onnx.checker import check_model - from onnx.helper import ( - make_graph, - make_model, - make_node, - make_tensor_value_info, - np_dtype_to_tensor_dtype, + def to_onnx(self, X: cn.array) -> Any: + import onnx + + X_type_text = "double" if X.dtype == cn.float64 else "float" + onnx_text = f""" + LinearModel ({X_type_text}[N, M] X_in, double[N, K] predictions_in) => ({X_type_text}[N, M] X_out, double[N, K] predictions_out) + {{ + X_out = Identity(X_in) + mult = MatMul(X_in, betas) + result = Add(mult, intercept) + result_double = Cast(result) + predictions_out = Add(result_double, predictions_in) + }} + """ # noqa: E501 + graph = onnx.parser.parse_graph(onnx_text) + graph.initializer.extend( + [ + onnx.numpy_helper.from_array(self.betas_[1:].__array__(), name="betas"), + onnx.numpy_helper.from_array( + self.betas_[0].__array__(), name="intercept" + ), + ] ) - - # model constants - betas = numpy_helper.from_array(self.betas_[1:].__array__(), name="betas") - intercept = numpy_helper.from_array( - self.betas_[0].__array__(), name="intercept" - ) - - # pred inputs - X = make_tensor_value_info( - "X", np_dtype_to_tensor_dtype(self.betas_.dtype), [None, None] - ) - pred = make_tensor_value_info( - "pred", np_dtype_to_tensor_dtype(self.betas_.dtype), [None] - ) - - node1 = make_node("MatMul", ["X", "betas"], ["XBeta"]) - node2 = make_node("Add", ["XBeta", "intercept"], ["pred"]) - graph = make_graph( - [node1, node2], "legateboost.model.Linear", [X], [pred], [betas, intercept] - ) - onnx_model = make_model(graph) - check_model(onnx_model) - return onnx_model + return graph diff --git a/legateboost/models/nn.py b/legateboost/models/nn.py index 356f0264..969dc0df 100644 --- a/legateboost/models/nn.py +++ b/legateboost/models/nn.py @@ -182,12 +182,10 @@ def __mul__(self, scalar: Any) -> "NN": new.biases_[-1] *= scalar return new - def to_onnx(self) -> Any: - from onnx import numpy_helper - from onnx.checker import check_model + def to_onnx(self, X: 
cn.array) -> Any: + from onnx import TensorProto, numpy_helper from onnx.helper import ( make_graph, - make_model, make_node, make_tensor_value_info, np_dtype_to_tensor_dtype, @@ -204,64 +202,74 @@ def to_onnx(self) -> Any: ] # pred inputs - X = make_tensor_value_info( - "X", + n_outputs = self.coefficients_[-1].shape[1] + n_features = self.coefficients_[0].shape[0] + X_in = make_tensor_value_info( + "X_in", np_dtype_to_tensor_dtype(self.coefficients_[0].dtype), [None, self.coefficients_[0].shape[0]], ) - + predictions_in = make_tensor_value_info( + "predictions_in", + TensorProto.DOUBLE, + [None, n_outputs], + ) nodes = [] - make_tensor_value_info( - "activations0", - np_dtype_to_tensor_dtype(self.coefficients_[0].dtype), - [None, None], - ) - nodes.append(make_node("MatMul", ["X", "coefficients0"], ["activations0"])) - activations_with_bias = make_tensor_value_info( - "activations0withbias", - np_dtype_to_tensor_dtype(self.coefficients_[0].dtype), - [None, None], - ) + nodes.append(make_node("MatMul", ["X_in", "coefficients0"], ["activations0"])) nodes.append( make_node("Add", ["activations0", "bias0"], ["activations0withbias"]) ) for i in range(1, len(coefficients)): - make_tensor_value_info( - f"tanh{i}", - np_dtype_to_tensor_dtype(self.coefficients_[0].dtype), - [None, None], - ) nodes.append(make_node("Tanh", [f"activations{i-1}withbias"], [f"tanh{i}"])) - make_tensor_value_info( - f"activations{i}", - np_dtype_to_tensor_dtype(self.coefficients_[0].dtype), - [None, None], - ) nodes.append( make_node( "MatMul", [f"tanh{i}", f"coefficients{i}"], [f"activations{i}"] ) ) - activations_with_bias = make_tensor_value_info( - f"activations{i}withbias", - np_dtype_to_tensor_dtype(self.coefficients_[0].dtype), - [None, None], - ) nodes.append( make_node( "Add", [f"activations{i}", f"bias{i}"], [f"activations{i}withbias"] ) ) + # outputs + X_out = make_tensor_value_info( + "X_out", + np_dtype_to_tensor_dtype(self.coefficients_[0].dtype), + [None, n_features], + ) + 
nodes.append(make_node("Identity", ["X_in"], ["X_out"])) + predictions_out = make_tensor_value_info( + "predictions_out", + TensorProto.DOUBLE, + [None, n_outputs], + ) + nodes.append( + make_node( + "Cast", + ["activations{}withbias".format(len(self.coefficients_) - 1)], + ["casted"], + to=TensorProto.DOUBLE, + ) + ) + nodes.append( + make_node( + "Add", + [ + "casted", + "predictions_in", + ], + ["predictions_out"], + ) + ) + graph = make_graph( nodes, "legateboost.model.NN", - [X], - [activations_with_bias], + [X_in, predictions_in], + [X_out, predictions_out], biases + coefficients, ) - onnx_model = make_model(graph) - check_model(onnx_model) - return onnx_model + return graph diff --git a/legateboost/models/tree.py b/legateboost/models/tree.py index 38cfda93..b5827f13 100644 --- a/legateboost/models/tree.py +++ b/legateboost/models/tree.py @@ -1,9 +1,11 @@ import copy import warnings +from dataclasses import dataclass from enum import IntEnum -from typing import Any, Callable, List, Sequence, Union, cast +from typing import Any, Callable, Dict, List, Sequence, Union, cast import numpy as np +import numpy.typing as npt import cupynumeric as cn from legate.core import TaskTarget, get_legate_runtime, types @@ -316,123 +318,176 @@ def __mul__(self, scalar: Any) -> "Tree": new.leaf_value *= scalar return new - def to_onnx(self) -> Any: - import onnx - from onnx import numpy_helper - from onnx.checker import check_model + # copy the tree structure to numpy arrays + # cupynumeric element access is very slow + @dataclass + class TreeAsNumpy: + leaf_value: npt.NDArray[np.float64] + feature: npt.NDArray[np.int32] + split_value: npt.NDArray[np.float64] + gain: npt.NDArray[np.float64] + hessian: npt.NDArray[np.float64] + is_leaf: npt.NDArray[np.bool_] + + # structure of arrays for tree structure expected by onnx + # this container is a convenience to not have 7 function arguments + class OnnxSoa: + def __init__(self, size: int, n_outputs: int) -> None: + self.nodes_modes: 
npt.NDArray[str] = np.full(size, "BRANCH_LEQ") + self.nodes_featureids: npt.NDArray[np.int32] = np.full( + size, -1, dtype=np.int32 + ) + self.nodes_truenodeids: npt.NDArray[np.int32] = np.full( + size, -1, dtype=np.int32 + ) + self.nodes_falsenodeids: npt.NDArray[np.int32] = np.full( + size, -1, dtype=np.int32 + ) + self.nodes_nodeids: npt.NDArray[np.int32] = np.arange(size, dtype=np.int32) + self.nodes_values: npt.NDArray[np.float64] = np.full( + size, -1.0, dtype=np.float64 + ) + self.leaf_weights: npt.NDArray[np.float64] = np.full( + (size, n_outputs), -1.0, dtype=np.float64 + ) + + def recurse_tree( + self, tree: TreeAsNumpy, soa: OnnxSoa, old_node_idx: int, new_node_idx: int + ) -> int: + # new_node_idx is sparse + if tree.is_leaf[old_node_idx]: + soa.nodes_modes[new_node_idx] = "LEAF" + soa.leaf_weights[new_node_idx] = tree.leaf_value[old_node_idx] + return new_node_idx + else: + soa.nodes_modes[new_node_idx] = "BRANCH_LEQ" + soa.nodes_featureids[new_node_idx] = tree.feature[old_node_idx] + soa.nodes_values[new_node_idx] = tree.split_value[old_node_idx] + left_child_idx = new_node_idx + 1 + soa.nodes_truenodeids[new_node_idx] = left_child_idx + node_idx_counter = self.recurse_tree( + tree, soa, self.left_child(old_node_idx), left_child_idx + ) + right_child_idx = node_idx_counter + 1 + soa.nodes_falsenodeids[new_node_idx] = right_child_idx + node_idx_counter = self.recurse_tree( + tree, soa, self.right_child(old_node_idx), right_child_idx + ) + + return node_idx_counter + + def to_onnx(self, X: cn.array) -> Any: + from onnx import TensorProto, numpy_helper from onnx.helper import ( make_graph, - make_model, - make_tensor, + make_node, make_tensor_value_info, + np_dtype_to_tensor_dtype, + ) + + num_sparse_nodes = (self.hessian[:, 0] > 0.0).sum() + num_outputs = self.leaf_value.shape[1] + # copy the tree as numpy because single element + # access with cupynumeric is very slow + tree = Tree.TreeAsNumpy( + self.leaf_value.__array__(), + self.feature.__array__(), 
+ self.split_value.__array__(), + self.gain.__array__(), + self.hessian.__array__(), + self.feature.__array__() == -1, ) + soa = Tree.OnnxSoa(num_sparse_nodes, num_outputs) + # This recursive function could become a bottleneck for large trees + # In this case consider implmenting a C++ legate task for this conversion + # Cython could also work + self.recurse_tree(tree, soa, 0, 0) onnx_nodes = [] - # We map the legate-boost tree representation to the TreeEnsemble ONNX operator - # the features array, splits array, and leaf weights can be passed unchanged - # ONNX then requires some extra arrays to represent the tree structure - # - nodes_truenodeidx is the index of the left child for a given node - # - nodes_falsenodeidx is the index of the right child for a given node - # - nodes_modes indicates that nodes use a <= comparison operator - # - nodes_trueleafs indicates that the left child is a leaf node - # - nodes_falseleafs indicates that the right child is a leaf node - # - leaf_targetids indicates which output the leaf node corresponds to - # ONNX does not support vector leaf so we will repeat the tree n_outputs - # times, each time with a different constant for leaf_targetids - # This is not ideal but I don't see a better way - - tree_max_nodes = self.feature.size - all_nodes_idx = np.arange(tree_max_nodes) - nodes_featureids = self.feature.__array__() - nodes_splits = numpy_helper.from_array(self.split_value.__array__()) - nodes_truenodeids = self.left_child(all_nodes_idx) - # get the left child of each node and check if it is a leaf - # if the node is already leaf then its child can go off the end of the array - # use np.minimum to avoid this - nodes_trueleafs = self.is_leaf( - np.minimum(tree_max_nodes - 1, self.left_child(all_nodes_idx)) - ).astype(int) - nodes_falsenodeids = self.right_child(all_nodes_idx) - nodes_falseleafs = self.is_leaf( - np.minimum(tree_max_nodes - 1, self.right_child(all_nodes_idx)) - ).astype(int) - - for output_idx in range(0, 
self.leaf_value.shape[1]): - leaf_targetids = np.full(self.feature.size, output_idx, dtype=np.int64) - leaf_weights = numpy_helper.from_array( - self.leaf_value[:, output_idx].__array__() + kwargs: Dict[str, Any] = {} + # TreeEnsembleRegressor asks us to pass these as tensors when X.dtype is double + # we simply pass a set of indices as leaf weights and then add a node later to + # look up the (vector valued) leaf weights + if X.dtype == np.float32: + kwargs["nodes_values"] = soa.nodes_values.astype(np.float32) + kwargs["target_weights"] = soa.nodes_nodeids.astype(np.float32) + else: + kwargs["nodes_values_as_tensor"] = numpy_helper.from_array( + soa.nodes_values, name="nodes_values" + ) + kwargs["target_weights_as_tensor"] = numpy_helper.from_array( + soa.nodes_nodeids.astype(np.float64), name="target_weights" ) - onnx_nodes.append( - onnx.helper.make_node( - "TreeEnsemble", - ["X"], - ["pred" + str(output_idx)], - domain="ai.onnx.ml", - n_targets=self.leaf_value.shape[1], - membership_values=None, - nodes_missing_value_tracks_true=None, - nodes_hitrates=None, - aggregate_function=1, - post_transform=0, - tree_roots=[0], - nodes_modes=make_tensor( - "nodes_modes", - onnx.TensorProto.UINT8, - self.feature.shape, - np.zeros_like(self.feature, dtype=np.uint8), - ), - nodes_featureids=nodes_featureids, - nodes_splits=nodes_splits, - nodes_truenodeids=nodes_truenodeids, - nodes_trueleafs=nodes_trueleafs, - nodes_falsenodeids=nodes_falsenodeids, - nodes_falseleafs=nodes_falseleafs, - leaf_targetids=leaf_targetids, - leaf_weights=leaf_weights, - ) + # TreeEnsembleRegressor is deprecated, but its successor TreeEnsemble + # is at the time of writing not available from onnxruntime on conda-forge + # This can be updated at some point without too much trouble + onnx_nodes.append( + make_node( + "TreeEnsembleRegressor", + ["X_in"], + ["predicted_leaf_index"], + domain="ai.onnx.ml", + n_targets=1, + membership_values=None, + nodes_missing_value_tracks_true=None, + 
nodes_hitrates=None, + nodes_modes=soa.nodes_modes, + nodes_featureids=soa.nodes_featureids, + nodes_truenodeids=soa.nodes_truenodeids, + nodes_falsenodeids=soa.nodes_falsenodeids, + nodes_nodeids=soa.nodes_nodeids, + nodes_treeids=np.zeros_like(soa.nodes_nodeids, dtype=np.int64), + target_ids=np.zeros_like(soa.nodes_nodeids, dtype=np.int64), + target_nodeids=soa.nodes_nodeids, + target_treeids=np.zeros_like(soa.nodes_nodeids, dtype=np.int64), + **kwargs, ) + ) - if output_idx == 0: - accumulated_pred = make_tensor_value_info( - "accumulated_pred0", onnx.TensorProto.DOUBLE, [None, None] - ) - onnx_nodes.append( - onnx.helper.make_node( - "Identity", - ["pred" + str(output_idx)], - ["accumulated_pred0"], - ) - ) - else: - accumulated_pred = make_tensor_value_info( - "accumulated_pred" + str(output_idx), - onnx.TensorProto.DOUBLE, - [None, None], - ) - onnx_nodes.append( - onnx.helper.make_node( - "Add", - [ - "accumulated_pred" + str(output_idx - 1), - "pred" + str(output_idx), - ], - ["accumulated_pred" + str(output_idx)], - ) - ) + leaf_weights = numpy_helper.from_array(soa.leaf_weights, name="leaf_weights") + predictions_out = make_tensor_value_info( + "predictions_out", TensorProto.DOUBLE, [None, num_outputs] + ) + # make indices 1-d + onnx_nodes.append( + make_node( + "Squeeze", ["predicted_leaf_index"], ["predicted_leaf_index_squeezed"] + ) + ) + onnx_nodes.append( + make_node( + "Cast", + ["predicted_leaf_index_squeezed"], + ["predicted_leaf_index_int"], + to=TensorProto.INT32, + ) + ) + onnx_nodes.append( + make_node( + "Gather", ["leaf_weights", "predicted_leaf_index_int"], ["gathered"] + ) + ) + predictions_in = make_tensor_value_info( + "predictions_in", TensorProto.DOUBLE, [None, num_outputs] + ) + onnx_nodes.append( + make_node("Add", ["predictions_in", "gathered"], ["predictions_out"]) + ) - # pred inputs - X = make_tensor_value_info("X", onnx.TensorProto.DOUBLE, [None, None]) - graph = make_graph( - onnx_nodes, "legateboost.models.Tree", [X], 
[accumulated_pred] + X_in = make_tensor_value_info( + "X_in", np_dtype_to_tensor_dtype(X.dtype), [None, None] ) - model = make_model( - graph, - opset_imports=[ - onnx.helper.make_opsetid("ai.onnx.ml", 5), - onnx.helper.make_opsetid("", 14), - ], + X_out = make_tensor_value_info( + "X_out", np_dtype_to_tensor_dtype(X.dtype), [None, None] + ) + onnx_nodes.append(make_node("Identity", ["X_in"], ["X_out"])) + graph = make_graph( + onnx_nodes, + "legateboost.models.Tree", + [X_in, predictions_in], + [X_out, predictions_out], + [leaf_weights], ) - check_model(model) - return model + return graph diff --git a/legateboost/objectives.py b/legateboost/objectives.py index ea389660..08219f29 100644 --- a/legateboost/objectives.py +++ b/legateboost/objectives.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from typing import Tuple +from typing import Any, Tuple from scipy.stats import norm from typing_extensions import TypeAlias, override @@ -43,6 +43,7 @@ class BaseObjective(ABC): # utility constant one = cn.ones(1, dtype=cn.float64) + half = cn.array(0.5, dtype=cn.float64) @abstractmethod def gradient(self, y: cn.ndarray, pred: cn.ndarray) -> GradPair: @@ -70,6 +71,30 @@ def transform(self, pred: cn.ndarray) -> cn.ndarray: """ return pred + def onnx_transform(self, pred: cn.ndarray) -> Any: + """Returns an ONNX graph that accepts + - "predictions_in" : 2D tensor of shape (n_samples, n_outputs) and type double. + And outputs the transformed predictions. + - "predictions_out" : arbitrary tensor depending on the objective. + + Is by default the identity transform. + + The ONNX transform should produce the same output as the transform + method for each objective. + + Returns: + Onnx graph that transforms the predictions. 
+ """ + import onnx + + onnx_text = """ + BaseObjective (double[N, M] predictions_in) => (double[N, M] predictions_out) + { + predictions_out = Identity(predictions_in) + } + """ + return onnx.parser.parse_graph(onnx_text) + @abstractmethod def metric(self) -> BaseMetric: """Returns the default error metric for the objective function. @@ -115,6 +140,25 @@ def output_class(self, pred: cn.ndarray) -> cn.ndarray: """ return cn.argmax(pred, axis=-1) + def onnx_output_class(self, pred: cn.ndarray) -> Any: + """Returns an ONNX model that accepts + - "predictions_in" : 2D tensor of shape (n_samples, n_outputs) and type double. + And outputs the predicted class labels. + - "predictions_out" : 1D tensor of shape (n_samples,) and type int32. + + Returns: + Onnx model that converts probabilities into class labels. + """ + import onnx + + onnx_text = """ + BaseModelOutputClass (double[N, M] predictions_in) => (int64[N, M] predictions_out) + { + predictions_out = ArgMax(predictions_in) + } + """ # noqa: E501 + return onnx.parser.parse_graph(onnx_text) + class SquaredErrorObjective(BaseObjective): """The Squared Error objective function for regression problems. 
@@ -243,6 +287,29 @@ def transform(self, pred: cn.ndarray) -> cn.ndarray: pred[:, :, 1] = cn.clip(pred[:, :, 1], -5, 5) return pred + def onnx_transform(self, pred: cn.ndarray) -> Any: + import onnx + + onnx_text = """ + NormalObjective (double[N, M] predictions_in) => (double[N, M] predictions_out) + { + out_shape = Constant() + var_starts = Constant() + mean_starts = Constant() + axis = Constant() + steps = Constant() + min = Constant() + max = Constant() + reshaped = Reshape(predictions_in, out_shape) + new_shape = Shape(reshaped) + variance = Slice(reshaped, var_starts, new_shape, axis, steps) + mean = Slice(reshaped, mean_starts, new_shape, axis, steps) + clipped_variance = Clip(variance, min, max) + predictions_out = Concat(mean, clipped_variance) + } + """ + return onnx.parser.parse_graph(onnx_text) + @override def mean(self, param: cn.ndarray) -> cn.ndarray: """Return the mean for the Normal distribution.""" @@ -321,6 +388,18 @@ def transform(self, pred: cn.ndarray) -> cn.ndarray: """Inverse log link.""" return cn.exp(pred) + @override + def onnx_transform(self, pred: cn.ndarray) -> Any: + import onnx + + onnx_text = """ + GammaDevianceTransform (double[N, M] predictions_in) => (double[N, M] predictions_out) + { + predictions_out = Exp(predictions_in) + } + """ # noqa: E501 + return onnx.parser.parse_graph(onnx_text) + def initialise_prediction( self, y: cn.ndarray, w: cn.ndarray, boost_from_average: bool ) -> cn.ndarray: @@ -365,6 +444,18 @@ def transform(self, pred: cn.ndarray) -> cn.ndarray: assert pred.ndim == 3 return cn.exp(pred) + @override + def onnx_transform(self, pred: cn.ndarray) -> Any: + import onnx + + onnx_text = """ + GammaTransform (double[N, M] predictions_in) => (double[N, M] predictions_out) + { + predictions_out = Exp(predictions_in) + } + """ + return onnx.parser.parse_graph(onnx_text) + @override def metric(self) -> GammaLLMetric: return GammaLLMetric() @@ -421,7 +512,7 @@ def var(self, param: cn.ndarray) -> cn.ndarray: class 
QuantileObjective(BaseObjective): """Minimises the quantile loss, otherwise known as check loss or pinball loss. - :math:`L(y_i, p_i) = \\frac{1}{k}\\sum_{j=1}^{k} (q_j - \\mathbb{1})(y_i - p_{i, j})` + :math:`L(y_i, p_i) = \\frac{}{k}\\sum_{j=1}^{k} (q_j - \\mathbb{1})(y_i - p_{i, j})` where @@ -511,6 +602,18 @@ def transform(self, pred: cn.ndarray) -> cn.ndarray: div = cn.sum(e_x, axis=1) return e_x / div[:, cn.newaxis] + def onnx_transform(self, pred: cn.ndarray) -> Any: + import onnx + + operator_to_use = "Sigmoid" if pred.shape[1] == 1 else "Softmax" + onnx_text = f""" + LogLossObjective (double[N, M] predictions_in) => (double[N, M] predictions_out) + {{ + predictions_out = {operator_to_use}(predictions_in) + }} + """ + return onnx.parser.parse_graph(onnx_text) + def metric(self) -> LogLossMetric: return LogLossMetric() @@ -547,8 +650,32 @@ def gradient(self, y: cn.ndarray, pred: cn.ndarray) -> GradPair: def transform(self, pred: cn.ndarray) -> cn.ndarray: return self.one / (self.one + cn.exp(-pred)) + def onnx_transform(self, pred: cn.ndarray) -> Any: + import onnx + + onnx_text = """ + MultiLabelObjective (double[N, M] predictions_in) => (double[N, M] predictions_out) + { + predictions_out = Sigmoid(predictions_in) + } + """ # noqa: E501 + return onnx.parser.parse_graph(onnx_text) + def output_class(self, pred: cn.ndarray) -> cn.ndarray: - return cn.array(pred > 0.5, dtype=cn.int32).squeeze() + return cn.array(pred > 0.5, dtype=cn.int64) + + def onnx_output_class(self, pred: cn.ndarray) -> Any: + import onnx + + onnx_text = """ + MultiLabelOutputClass (double[N, M] predictions_in) => (int64[N, M] predictions_out) + { + half = Constant() + greater = Greater(predictions_in, half) + predictions_out = Cast(greater) + } + """ # noqa: E501 + return onnx.parser.parse_graph(onnx_text) def metric(self) -> MultiLabelMetric: return MultiLabelMetric() @@ -614,6 +741,33 @@ def transform(self, pred: cn.ndarray) -> cn.ndarray: K = pred.shape[1] # number of classes return 
logloss.transform((1 / (K - 1)) * pred) + def onnx_transform(self, pred: cn.ndarray) -> Any: + import onnx + + onnx_text = """ + LogLossObjective (double[N, M] predictions_in) => (double[N, M] predictions_out) + """ # noqa: E501 + + if pred.shape[1] == 1: + onnx_text += """ + { + constant = Constant() + a = Mul(predictions_in, constant) + predictions_out = Sigmoid(a) + } + """ + return onnx.parser.parse_graph(onnx_text) + + constant = 1 / (pred.shape[1] - 1) + onnx_text += f""" + {{ + constant = Constant() + a = Mul(predictions_in, constant) + predictions_out = Softmax(a) + }} + """ + return onnx.parser.parse_graph(onnx_text) + def metric(self) -> ExponentialMetric: return ExponentialMetric() @@ -628,7 +782,7 @@ def initialise_prediction( return self.one_step_newton(y, w, boost_from_average, init) -objectives = { +OBJECTIVES_MAP = { "squared_error": SquaredErrorObjective, "normal": NormalObjective, "log_loss": LogLossObjective, @@ -638,3 +792,11 @@ def initialise_prediction( "gamma_deviance": GammaDevianceObjective, "gamma": GammaObjective, } + +REGRESSION_OBJECTIVES = ["squared_error", "normal", "gamma_deviance", "gamma"] + +CLASSIFICATION_OBJECTIVES = [ + "log_loss", + "multi_label", + "exp", +] diff --git a/legateboost/onnx_utils.py b/legateboost/onnx_utils.py new file mode 100644 index 00000000..3b2f72fb --- /dev/null +++ b/legateboost/onnx_utils.py @@ -0,0 +1,104 @@ +from typing import Any, List + +import numpy as np + +import cupynumeric as cn + +# onnx is imported only if needed - keep this a soft dependency +try: + import onnx +except ImportError: + pass + + +def make_model(graph: Any) -> Any: + # make model with appropriate opset imports for legate-boost + LEGATEBOOST_ONNX_OPSET_IMPORTS = [ + onnx.helper.make_opsetid("ai.onnx.ml", 3), + onnx.helper.make_opsetid("", 21), + ] + return onnx.helper.make_model(graph, opset_imports=LEGATEBOOST_ONNX_OPSET_IMPORTS) + + +def reshape_predictions(graph: Any, pred: cn.ndarray) -> Any: + # àppend an onnx graph that 
shapes the predictions equivalently to pred + shape = list(pred.shape) + shape[0] = -1 + out_type = "int64" if pred.dtype == cn.int64 else "double" + onnx_text = f""" + ReshapePredictions ({out_type}[N, M] predictions_in) => ({out_type}{shape} predictions_out) + {{ + shape = Constant() + predictions_out = Reshape(predictions_in, shape) + }} + """ # noqa: E501 + reshape_graph = onnx.parser.parse_graph(onnx_text) + graph = onnx.compose.merge_graphs( + graph, + reshape_graph, + io_map=[ + (graph.output[0].name, "predictions_in"), + ], + prefix2="reshape_", + ) + return graph + + +def mirror_predict_proba_output(graph: Any) -> Any: + # where model outputs only true probability we need to add the false probability + onnx_text = """ + MirrorPredict (double[N, M] predictions_in) => (double[N, 2] predictions_out) + { + one = Constant() + false_probability = Sub(one, predictions_in) + predictions_out = Concat(false_probability, predictions_in) + } + """ # noqa: E501 + new_graph = onnx.parser.parse_graph(onnx_text) + new_graph = onnx.compose.merge_graphs( + graph, + new_graph, + io_map=[ + (graph.output[0].name, "predictions_in"), + ], + prefix2="mirror_", + ) + return new_graph + + +def init_predictions(model_init: cn.array, X_dtype: Any) -> Any: + # form a graph that takes X_in and model_init as input and outputs + # model_init repeated n_rows times + + X_type_text = "double" if X_dtype == cn.float64 else "float" + onnx_text = f""" + InitPredictions ({X_type_text}[N, M] X_in) => ({X_type_text}[N, M] X_out, double[N, K] predictions_out) + {{ + X_out = Identity(X_in) + n_rows = Shape(X_in) + one = Constant() + tile_repeat = Concat(n_rows, one) + predictions_out = Tile(init, tile_repeat) + }} + """ # noqa: E501 + graph = onnx.parser.parse_graph(onnx_text) + graph.initializer.append( + onnx.numpy_helper.from_array(np.atleast_2d(model_init.__array__()), name="init") + ) + return graph + + +def merge_model_graphs(graphs: List[Any]) -> Any: + # merge a list of graphs into a 
single graph + combined = graphs[0] + for i, g in enumerate(graphs[1:]): + combined = onnx.compose.merge_graphs( + combined, + g, + io_map=[ + (combined.output[0].name, "X_in"), + (combined.output[1].name, "predictions_in"), + ], + prefix2="model_{}_".format(i), + ) + return combined diff --git a/legateboost/test/test_objective.py b/legateboost/test/test_objective.py index aaf8823c..a78ecb80 100644 --- a/legateboost/test/test_objective.py +++ b/legateboost/test/test_objective.py @@ -1,3 +1,5 @@ +import numpy as np +import onnxruntime as ort import pytest import cupynumeric as cn @@ -5,12 +7,26 @@ from legateboost.testing.utils import non_increasing +def compare_onnx_transform(obj, pred): + sess = ort.InferenceSession(obj.onnx_transform(pred).SerializeToString()) + feeds = { + "predictions_in": pred, + } + onnx_transform = sess.run(None, feeds)[0] + assert onnx_transform.dtype == np.float64 + transform = obj.transform(cn.array(pred)) + assert transform.shape == onnx_transform.shape + assert np.allclose(onnx_transform, transform, atol=1e-6) + + def test_normal() -> None: obj = lb.NormalObjective() y = cn.array([[1.0], [2.0], [3.0]]) init = obj.initialise_prediction(y, cn.array([1.0, 1.0, 1.0]), True) assert cn.allclose(init, cn.array([y.mean(), cn.log(y.std())])) + compare_onnx_transform(obj, np.arange(12).reshape(2, 6).astype(np.float64)) + def test_gamma_deviance() -> None: obj = lb.GammaDevianceObjective() @@ -35,6 +51,8 @@ def test_gamma_deviance() -> None: reg.fit(X, y1, eval_set=[(X, y1)], eval_result=eval_result) assert non_increasing(eval_result["train"]["deviance_gamma"]) + compare_onnx_transform(obj, np.arange(12).reshape(2, 6).astype(np.float64)) + def test_gamma() -> None: import numpy as np @@ -51,6 +69,8 @@ def test_gamma() -> None: reg.fit(X, y, eval_set=[(X, y)], eval_result=eval_result) assert non_increasing(eval_result["train"]["gamma_neg_ll"]) + compare_onnx_transform(obj, np.arange(12).reshape(2, 6).astype(np.float64)) + def test_log_loss() -> 
None: obj = lb.LogLossObjective() @@ -95,6 +115,9 @@ def test_log_loss() -> None: False, ) + compare_onnx_transform(obj, np.arange(12).reshape(2, 6).astype(np.float64)) + compare_onnx_transform(obj, np.arange(4).reshape(4, 1).astype(np.float64)) + def test_exp(): obj = lb.ExponentialObjective() @@ -127,6 +150,9 @@ def test_exp(): False, ) + compare_onnx_transform(obj, np.arange(12).reshape(2, 6).astype(np.float64)) + compare_onnx_transform(obj, np.arange(4).reshape(4, 1).astype(np.float64)) + def test_multi_label(): obj = lb.MultiLabelObjective() @@ -139,3 +165,5 @@ def test_multi_label(): with pytest.raises(ValueError, match=r"Expected labels to be in \[0, 1\]"): obj.initialise_prediction(cn.array([[1], [2]]), cn.array([[1.0], [1.0]]), False) + + compare_onnx_transform(obj, np.arange(12).reshape(2, 6).astype(np.float64)) diff --git a/legateboost/test/test_onnx.py b/legateboost/test/test_onnx.py index 8710eaf4..57e75d75 100644 --- a/legateboost/test/test_onnx.py +++ b/legateboost/test/test_onnx.py @@ -1,35 +1,180 @@ import numpy as np +import onnxruntime as ort import pytest -from onnx.reference import ReferenceEvaluator import cupynumeric as cn import legateboost as lb +from legateboost.onnx_utils import make_model -@pytest.mark.parametrize( - "Model", [M for M in lb.models.BaseModel.__subclasses__() if hasattr(M, "to_onnx")] -) -@pytest.mark.parametrize("n_outputs", [1, 5]) -@pytest.mark.parametrize("dtype", [np.float32, np.float64]) -def test_onnx(Model, n_outputs, dtype): +def compare_model_predictions(model, X): + onnx_model = make_model(model.to_onnx(X)) + sess = ort.InferenceSession(onnx_model.SerializeToString()) + feeds = { + "X_in": X, + } + pred = model.predict(cn.array(X)) + feeds["predictions_in"] = np.zeros((X.shape[0], pred.shape[1])) + onnx_pred = sess.run(None, feeds)[1] + assert onnx_pred.dtype == np.float64 + assert pred.shape == onnx_pred.shape + assert np.allclose( + onnx_pred, pred, atol=1e-2 if X.dtype == np.float32 else 1e-6 + ), 
np.linalg.norm(pred - onnx_pred) + + +def compare_estimator_predictions(estimator, X, predict_function, allowed_wrong=0): + sess = ort.InferenceSession( + estimator.to_onnx(X, predict_function).SerializeToString() + ) + feeds = { + "X_in": X, + } + pred_method = getattr(estimator, predict_function) + pred = pred_method(cn.array(X)) + onnx_pred = sess.run(None, feeds)[0] + + assert onnx_pred.dtype == pred.dtype + assert pred.shape == onnx_pred.shape + number_wrong = np.sum( + np.abs(pred - onnx_pred) > (1e-2 if X.dtype == np.float32 else 1e-5) + ) + assert number_wrong <= allowed_wrong, ( + np.linalg.norm(pred - onnx_pred), + number_wrong, + ) + + +@pytest.fixture +def model_dataset(dtype, n_outputs): rs = np.random.RandomState(0) X = rs.random((1000, 10)).astype(dtype) g = rs.normal(size=(X.shape[0], n_outputs)) h = rs.random(g.shape) + 0.1 + return X, g, h + + +@pytest.mark.parametrize("Model", [M for M in lb.models.BaseModel.__subclasses__()]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +@pytest.mark.parametrize("n_outputs", [1, 5]) +def test_models(Model, model_dataset): + X, g, h = model_dataset model = ( Model() .set_random_state(np.random.RandomState(2)) .fit(cn.array(X), cn.array(g), cn.array(h)) ) - def pred_onnx(onnx, X): - sess = ReferenceEvaluator(onnx) - pred = np.empty(X.shape[0], dtype=dtype) - feeds = {"X": X, "pred": pred} - return sess.run(None, feeds) + compare_model_predictions(model, X) - assert np.allclose( - model.predict(cn.array(X)), - pred_onnx(model.to_onnx(), X)[0], - atol=1e-3 if dtype == np.float32 else 1e-6, + +@pytest.mark.parametrize("n_outputs", [1, 5]) +def test_init(n_outputs): + # ONNX correctly outputs model init + X = np.array([[1, 2], [3, 4], [5, 6]], dtype=np.float32) + y = np.full((3, n_outputs), 5.0, dtype=np.float32) + estimator = lb.LBRegressor(n_estimators=0, random_state=0).fit(X, y) + assert np.all(estimator.model_init_ == 5.0) + compare_estimator_predictions(estimator, X, "predict_raw") + + 
+@pytest.fixture +def regression_dataset(dtype, n_outputs): + from sklearn.datasets import make_regression + + X, y = make_regression( + n_samples=1000, + n_features=10, + n_informative=5, + n_targets=n_outputs, + random_state=0, + ) + # make labels strictly positive for certain objectives + return X.astype(dtype), np.abs(y.astype(dtype)) + + +@pytest.mark.parametrize("Model", [M for M in lb.models.BaseModel.__subclasses__()]) +@pytest.mark.parametrize("objective", lb.objectives.REGRESSION_OBJECTIVES) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +@pytest.mark.parametrize("n_outputs", [1, 5]) +def test_regressor(Model, objective, regression_dataset): + X, y = regression_dataset + if ( + Model in [lb.models.KRR, lb.models.NN] + and objective == "gamma" + and X.dtype == np.float32 + ): + pytest.skip("Skipping as numerically unstable") + if objective in [ + "quantile", + "gamma_deviance", + "gamma", + ] and (y.ndim > 1 and y.shape[1] > 1): + pytest.skip("skipping quantile, gamma and gamma_deviance for multiple outputs") + model = lb.LBRegressor( + n_estimators=2, + objective=objective, + base_models=(Model(),), + random_state=0, + ).fit(X, y) + + compare_estimator_predictions(model, X, "predict_raw") + compare_estimator_predictions(model, X, "predict") + + +@pytest.fixture +def classification_dataset(dtype, n_outputs): + from sklearn.datasets import make_classification + + X, y = make_classification( + n_samples=1000, + n_features=10, + n_informative=5, + n_classes=n_outputs, + random_state=0, ) + return X.astype(dtype), np.abs(y.astype(dtype)) + + +@pytest.mark.parametrize("Model", [M for M in lb.models.BaseModel.__subclasses__()]) +@pytest.mark.parametrize("objective", lb.objectives.CLASSIFICATION_OBJECTIVES) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +@pytest.mark.parametrize("n_outputs", [2, 5]) +def test_classifier(Model, objective, classification_dataset): + X, y = classification_dataset + if objective == "multi_label": + # 
encode labels as one-hot + encoded = np.zeros((y.shape[0], int(y.max() + 1))) + encoded[np.arange(y.shape[0]), y.astype(int)] = 1 + y = encoded + model = lb.LBClassifier( + n_estimators=2, + objective=objective, + base_models=(Model(),), + random_state=0, + ).fit(X, y) + + compare_estimator_predictions(model, X, "predict_raw") + compare_estimator_predictions(model, X, "predict_proba") + # softmax has numerical differences with float32 + # allow a very small number of different class predictions + # this is fine so long as the probabilities are close + allowed_wrong = 5 if y.max() > 1 and X.dtype == np.float32 else 0 + compare_estimator_predictions(model, X, "predict", allowed_wrong) + + +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +@pytest.mark.parametrize("n_outputs", [1, 5]) +@pytest.mark.parametrize("max_depth", list(range(0, 12, 3))) +def test_tree(regression_dataset, max_depth): + # test tree depths more exhaustively + # some edge cases e.g. max_depth=0 + X, y = regression_dataset + model = lb.LBRegressor( + init=None, + n_estimators=2, + base_models=(lb.models.Tree(max_depth=max_depth),), + random_state=0, + ).fit(X, y) + + compare_estimator_predictions(model, X, "predict_raw") diff --git a/legateboost/test/test_with_hypothesis.py b/legateboost/test/test_with_hypothesis.py index 22275f67..c9312079 100644 --- a/legateboost/test/test_with_hypothesis.py +++ b/legateboost/test/test_with_hypothesis.py @@ -1,4 +1,5 @@ import numpy as np +import onnxruntime as ort from hypothesis import HealthCheck, Verbosity, assume, given, settings, strategies as st from sklearn.preprocessing import StandardScaler @@ -25,15 +26,15 @@ @st.composite def tree_strategy(draw): if get_legate_runtime().machine.count(TaskTarget.GPU) > 0: - max_depth = draw(st.integers(1, 8)) + max_depth = draw(st.integers(0, 8)) else: - max_depth = draw(st.integers(1, 6)) - alpha = draw(st.floats(0.0, 1.0)) + max_depth = draw(st.integers(0, 6)) + l2_regularization = draw(st.floats(0.0, 
1.0)) split_samples = draw(st.integers(1, 500)) feature_fraction = draw(st.sampled_from([0.5, 1.0])) return lb.models.Tree( max_depth=max_depth, - alpha=alpha, + l2_regularization=l2_regularization, split_samples=split_samples, feature_fraction=feature_fraction, ) @@ -41,20 +42,22 @@ def tree_strategy(draw): @st.composite def nn_strategy(draw): - alpha = draw(st.floats(0.0, 1.0)) + l2_regularization = draw(st.floats(0.0, 1.0)) hidden_layer_sizes = draw(st.sampled_from([(), (100,), (100, 100), (10, 10, 10)])) # max iter needs to be sufficiently large, otherwise the models can make the loss # worse (from a bad initialization) max_iter = 200 return lb.models.NN( - alpha=alpha, hidden_layer_sizes=hidden_layer_sizes, max_iter=max_iter + l2_regularization=l2_regularization, + hidden_layer_sizes=hidden_layer_sizes, + max_iter=max_iter, ) @st.composite def linear_strategy(draw): - alpha = draw(st.floats(0.0, 1.0)) - return lb.models.Linear(alpha=alpha) + l2_regularization = draw(st.floats(0.0, 1.0)) + return lb.models.Linear(l2_regularization=l2_regularization) @st.composite @@ -63,9 +66,11 @@ def krr_strategy(draw): sigma = draw(st.floats(0.1, 1.0)) else: sigma = None - alpha = draw(st.floats(0.0, 1.0)) + l2_regularization = draw(st.floats(0.0, 1.0)) components = draw(st.integers(2, 10)) - return lb.models.KRR(n_components=components, alpha=alpha, sigma=sigma) + return lb.models.KRR( + n_components=components, l2_regularization=l2_regularization, sigma=sigma + ) @st.composite @@ -161,11 +166,20 @@ def test_regressor(model_params, regression_params, regression_dataset): model = lb.LBRegressor(**model_params, **regression_params, verbose=True).fit( X, y, sample_weight=w, eval_result=eval_result ) - model.predict(X) loss = next(iter(eval_result["train"].values())) assert non_increasing(loss, tol=1e-1) sanity_check_models(model) + # check onnx + # for now reshape legate-boost predict to 2-D + # eventually onnx should match the output shape exactly + predict_raw = 
model.predict_raw(X) + onnx_predict_raw = pred_onnx(model.to_onnx(X.dtype), X) + onnx_predict_raw = onnx_predict_raw.reshape(predict_raw.shape) + assert np.allclose( + predict_raw, onnx_predict_raw, atol=1e-3 if X.dtype == np.float32 else 1e-6 + ), np.linalg.norm(predict_raw - onnx_predict_raw) + classification_param_strategy = st.fixed_dictionaries( { @@ -240,12 +254,18 @@ def classification_dataset_strategy(draw): return X, y, w, name +def pred_onnx(onnx, X): + sess = ort.InferenceSession(onnx.SerializeToString()) + return sess.run(None, {"X_in": X})[0] + + @given( general_model_param_strategy, classification_param_strategy, classification_dataset_strategy(), ) @cn.errstate(divide="raise", invalid="raise") +@settings(print_blob=True) def test_classifier( model_params: dict, classification_params: dict, classification_dataset: tuple ) -> None: @@ -256,8 +276,15 @@ def test_classifier( ) model.predict(X) model.predict_proba(X) - model.predict_raw(X) + predict_raw = model.predict_raw(X) loss = next(iter(eval_result["train"].values())) # multiclass models with higher learning rates don't always converge if len(model.classes_) == 2: assert non_increasing(loss, 1e-1) + + # check onnx + onnx_predict_raw = pred_onnx(model.to_onnx(X.dtype), X) + onnx_predict_raw = onnx_predict_raw.reshape(predict_raw.shape) + assert np.allclose( + predict_raw, onnx_predict_raw, atol=1e-3 if X.dtype == np.float32 else 1e-6 + ), np.linalg.norm(predict_raw - onnx_predict_raw) diff --git a/pyproject.toml b/pyproject.toml index a69041c6..e8269341 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,6 +51,7 @@ test = [ "notebook>=7", "onnx>=1.10", "onnxmltools>=1.10", + "onnxruntime", "pytest>=7,<8", "seaborn>=0.13", "xgboost>=2.0",