diff --git a/ci/run_mypy.sh b/ci/run_mypy.sh index fdb3e026..d3f818d1 100755 --- a/ci/run_mypy.sh +++ b/ci/run_mypy.sh @@ -14,6 +14,7 @@ set -e -E -u -o pipefail +mypy --version mypy \ --config-file ./pyproject.toml \ --exclude=legateboost/test \ diff --git a/conda/environments/all_cuda-122.yaml b/conda/environments/all_cuda-122.yaml index 68eb3824..6d4e98af 100644 --- a/conda/environments/all_cuda-122.yaml +++ b/conda/environments/all_cuda-122.yaml @@ -30,6 +30,7 @@ dependencies: - numpy - onnx>=1.10 - onnxmltools>=1.10 +- onnxruntime - openblas - pydata-sphinx-theme>=0.16 - pytest>=7,<8 diff --git a/dependencies.yaml b/dependencies.yaml index 9cd93fba..3700d52e 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -178,3 +178,4 @@ dependencies: - xgboost>=2.0 - onnx>=1.10 - onnxmltools>=1.10 + - onnxruntime diff --git a/legateboost/legateboost.py b/legateboost/legateboost.py index 80c8beb2..a7e1b50c 100644 --- a/legateboost/legateboost.py +++ b/legateboost/legateboost.py @@ -16,7 +16,14 @@ from .input_validation import _lb_check_X, _lb_check_X_y, check_sample_weight from .metrics import BaseMetric, metrics from .models import BaseModel, Tree -from .objectives import BaseObjective, objectives +from .objectives import OBJECTIVES_MAP, BaseObjective +from .onnx_utils import ( + init_predictions, + make_model, + merge_model_graphs, + mirror_predict_proba_output, + reshape_predictions, +) from .shapley import global_shapley_attributions, local_shapley_attributions from .utils import AddableMixin, AddMember, PickleCupynumericMixin @@ -422,7 +429,7 @@ def fit( # setup objective if isinstance(self.objective, str): - self._objective_instance = objectives[self.objective]() + self._objective_instance = OBJECTIVES_MAP[self.objective]() elif isinstance(self.objective, BaseObjective): self._objective_instance = self.objective else: @@ -528,6 +535,26 @@ def _predict(self, X: cn.ndarray) -> cn.ndarray: pred += Type.batch_predict(models, X) return pred + def predict_raw(self, 
X: cn.ndarray) -> cn.ndarray: + """Predict pre-transformed values for samples in X. E.g. before applying a + sigmoid function. + + Parameters + ---------- + + X : + The input samples. + + Returns + ------- + + y : + The predicted raw values for each sample in X. + """ + X = _lb_check_X(X) + validate_data(self, X, reset=False, skip_check_array=True) + return self._predict(X) + def dump_models(self) -> str: """Dumps the models in the current instance to a string. @@ -540,6 +567,29 @@ def dump_models(self) -> str: text += str(m) return text + def _to_onnx_predict_raw(self, X: cn.ndarray) -> Any: + init_graph = init_predictions(self.model_init_, X.dtype) + graph = merge_model_graphs([init_graph] + [m.to_onnx(X) for m in self.models_]) + # remove the X_out output, we only need the predictions + graph.output.remove(graph.output[0]) + return graph + + def _to_onnx_predict_transformed(self, X: cn.ndarray) -> Any: + import onnx + + graph = onnx.compose.merge_graphs( + self._to_onnx_predict_raw(X), + self._objective_instance.onnx_transform(self.predict_raw(X[0:1])), + io_map=[ + ( + "model_{}_predictions_out".format(len(self.models_) - 1), + "predictions_in", + ) + ], + prefix2="transform_", + ) + return graph + def global_attributions( self, X: cn.array, @@ -701,10 +751,10 @@ class LBRegressor(RegressorMixin, LBBase): Examples -------- >>> import cupynumeric as cn - >>> import legateboost as lbst + >>> import legateboost as lb >>> X = cn.random.random((1000, 10)) >>> y = cn.random.random(X.shape[0]) - >>> model = lbst.LBRegressor(n_estimators=5).fit(X, y) + >>> model = lb.LBRegressor(n_estimators=5).fit(X, y) >>> """ @@ -827,6 +877,58 @@ def predict(self, X: cn.ndarray) -> cn.ndarray: pred = pred.squeeze(axis=1) return pred + def to_onnx(self, X: cn.ndarray, predict_function: str = "predict") -> Any: + """Converts the estimator to an ONNX model which is expected to produce + equivalent predictions to `predict_function` up to reasonable floating + point tolerance. 
The ONNX model is hard coded to the X input data type, + separate models should be generated for float and double. The ONNX model + takes "X_in" as input and produces "predictions_out" as output. + + Parameters + ---------- + X: + Example input data. Use to infer input data characteristics. + A model produced for float32 will not accept float64 input and vice versa. + predict_function : str + The serialised ONNX model can produce output equivalent to 'predict' or + 'predict_raw'. + The default is "predict". + Returns + ------- + Any + The ONNX model. + + Examples + -------- + >>> import numpy as np + >>> import legateboost as lb + >>> X = np.random.random((1000, 10)) + >>> y = np.random.random(X.shape[0]) + >>> model = lb.LBRegressor(n_estimators=5).fit(X, y) + >>> import onnxruntime as ort + >>> sess = ort.InferenceSession(model.to_onnx(X).SerializeToString()) + >>> onnx_pred = sess.run(None, {"X_in": X})[0] + >>> assert np.allclose(model.predict(X), onnx_pred, atol=1e-6) + >>> + """ + import onnx + + if predict_function not in ["predict", "predict_raw"]: + raise ValueError( + "predict_function should be one of ['predict', 'predict_raw']" + ) + if predict_function == "predict": + graph = self._to_onnx_predict_transformed(X) + else: + graph = self._to_onnx_predict_raw(X) + + # coerce the output shape to be the same as the equivalent predict function + test_pred = getattr(self, predict_function)(X[0:1]) + graph = reshape_predictions(graph, test_pred) + model = make_model(graph) + onnx.checker.check_model(model, full_check=True) + return model + class LBClassifier(ClassifierMixin, LBBase): """Implements a gradient boosting algorithm for classification problems. @@ -1031,26 +1133,6 @@ def fit( ) return self - def predict_raw(self, X: cn.ndarray) -> cn.ndarray: - """Predict pre-transformed values for samples in X. E.g. before applying a - sigmoid function. - - Parameters - ---------- - - X : - The input samples. 
- - Returns - ------- - - y : - The predicted raw values for each sample in X. - """ - X = _lb_check_X(X) - validate_data(self, X, reset=False, skip_check_array=True) - return super()._predict(X) - def predict_proba(self, X: cn.ndarray) -> cn.ndarray: """Predict class probabilities for samples in X. @@ -1092,3 +1174,74 @@ def predict(self, X: cn.ndarray) -> cn.ndarray: """ check_is_fitted(self) return self._objective_instance.output_class(self.predict_proba(X)) + + def to_onnx(self, X: cn.ndarray, predict_function: str = "predict") -> Any: + """Converts the estimator to an ONNX model which is expected to produce + equivalent predictions to `predict_function` up to reasonable floating + point tolerance. The ONNX model is hard coded to the X input data type, + separate models should be generated for float and double. The ONNX model + takes "X_in" as input and produces "predictions_out" as output. + + Parameters + ---------- + X: + Example input data. Use to infer input data characteristics. + A model produced for float32 will not accept float64 input and vice versa. + predict_function : str + The serialised ONNX model can produce output equivalent to 'predict', + 'predict_proba', or 'predict_raw'. + The default is "predict". + Returns + ------- + Any + The ONNX model. + + Examples + -------- + >>> import numpy as np + >>> import legateboost as lb + >>> X = np.random.random((1000, 10)) + >>> y = np.random.randint(0, 2, X.shape[0]) + >>> model = lb.LBClassifier(n_estimators=5).fit(X, y) + >>> import onnxruntime as ort + >>> sess = ort.InferenceSession(model.to_onnx(X, + ... 
predict_function="predict_proba").SerializeToString()) + >>> onnx_pred = sess.run(None, {"X_in": X})[0] + >>> assert np.allclose(model.predict_proba(X), onnx_pred, atol=1e-6) + >>> + """ + import onnx + + if predict_function not in ["predict", "predict_proba", "predict_raw"]: + raise ValueError( + "predict_function should be one of ['predict'," + " 'predict_proba', 'predict_raw']" + ) + if predict_function in ["predict_proba", "predict"]: + graph = self._to_onnx_predict_transformed(X) + # need to mirror the output when we only output one target + if self.predict_raw(X[0:1]).shape[1] == 1: + graph = mirror_predict_proba_output(graph) + if predict_function == "predict": + # argmax the predict_proba output + argmax = self._objective_instance.onnx_output_class( + self.predict_proba(X[0:1]) + ) + graph = onnx.compose.merge_graphs( + graph, + argmax, + io_map=[ + (graph.output[0].name, "predictions_in"), + ], + prefix2="classifier_predict_", + ) + + elif predict_function == "predict_raw": + graph = self._to_onnx_predict_raw(X) + + # coerce the output shape to be the same as the equivalent predict function + test_pred = getattr(self, predict_function)(X[0:1]) + graph = reshape_predictions(graph, test_pred) + model = make_model(graph) + onnx.checker.check_model(model, full_check=True) + return model diff --git a/legateboost/models/base_model.py b/legateboost/models/base_model.py index a1e88011..38e37cb7 100644 --- a/legateboost/models/base_model.py +++ b/legateboost/models/base_model.py @@ -127,12 +127,25 @@ def __mul__(self, scalar: Any) -> "BaseModel": def __hash__(self) -> int: return hash(str(self)) - def to_onnx(self) -> Any: - """Convert the model to an ONNX model. + def to_onnx(self, X: cn.array) -> Any: + """Convert the model to an ONNX graph. + The implemented ONNX graph should accept the following two inputs: + - "X_in" : 2D tensor of shape (n_samples, n_features) and type `X_dtype`. 
+ - "predictions_in" : 2D tensor of shape (n_samples, n_outputs) and type double. + The graph should output: + - "predictions_out" : 2D tensor of shape (n_samples, n_outputs) and type double. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Example input X matrix. Used to infer type and shape of the input. + + Note: the produced graph should add this model's contribution + to the incoming "predictions_in" tensor to form "predictions_out". Returns ------- Any - The ONNX model. + The ONNX graph. """ raise NotImplementedError diff --git a/legateboost/models/krr.py b/legateboost/models/krr.py index 31af8d5a..0a41788d 100644 --- a/legateboost/models/krr.py +++ b/legateboost/models/krr.py @@ -243,115 +243,40 @@ def __mul__(self, scalar: Any) -> "KRR": self.betas_ *= scalar return new - def to_onnx(self) -> Any: - from onnx import numpy_helper - from onnx.checker import check_model - from onnx.helper import ( - make_graph, - make_model, - make_node, - make_tensor_value_info, - np_dtype_to_tensor_dtype, - ) - - assert self.X_train.dtype == self.betas_.dtype - - def make_constant_node(value: cn.array, name: str) -> Any: - return make_node( - "Constant", - inputs=[], - value=numpy_helper.from_array(value, name=name), - outputs=[name], - ) - - nodes = [] + def to_onnx(self, X: cn.array) -> Any: + import onnx - # model constants - betas = numpy_helper.from_array(self.betas_.__array__(), name="betas") - X_train = numpy_helper.from_array(self.X_train.__array__(), name="X_train") - - # pred inputs - X = make_tensor_value_info( - "X", - np_dtype_to_tensor_dtype(self.betas_.dtype), - [None, self.X_train.shape[1]], - ) - pred = make_tensor_value_info( - "pred", - np_dtype_to_tensor_dtype(self.betas_.dtype), - [None, self.betas_.shape[1]], - ) - - # exanded l2 distance - # distance = np.sum(X**2, axis=1)[:, np.newaxis] - 2 * np.dot(X, self.X_train.T) - # + np.sum(self.X_train**2, axis=1) - make_tensor_value_info( - "XX", np_dtype_to_tensor_dtype(self.betas_.dtype), [None] - ) - make_tensor_value_info( - "YY", 
np_dtype_to_tensor_dtype(self.betas_.dtype), - [self.X_train.shape[0], 1], - ) - make_tensor_value_info( - "XY_reshaped", - np_dtype_to_tensor_dtype(self.betas_.dtype), - [1, self.X_train.shape[0]], - ) - make_tensor_value_info( - "XY", - np_dtype_to_tensor_dtype(self.betas_.dtype), - [None, self.X_train.shape[0]], - ) - nodes.append(make_constant_node(np.array([1]), "axis1")) - nodes.append(make_node("ReduceSumSquare", ["X", "axis1"], ["XX"])) - nodes.append(make_node("Gemm", ["X", "X_train"], ["XY"], alpha=-2.0, transB=1)) - nodes.append(make_node("ReduceSumSquare", ["X_train", "axis1"], ["YY"])) - nodes.append(make_constant_node(np.array([1, -1]), "reshape")) - nodes.append(make_node("Reshape", ["YY", "reshape"], ["YY_reshaped"])) - nodes.append(make_node("Add", ["XX", "XY"], ["add0"])) - make_tensor_value_info( - "l2", - np_dtype_to_tensor_dtype(self.betas_.dtype), - [None, self.X_train.shape[0]], - ) - nodes.append(make_node("Add", ["YY_reshaped", "add0"], ["l2"])) - nodes.append(make_constant_node(np.array([0.0], self.betas_.dtype), "zero")) - make_tensor_value_info( - "l2_clipped", - np_dtype_to_tensor_dtype(self.betas_.dtype), - [None, self.X_train.shape[0]], - ) - nodes.append(make_node("Max", ["l2", "zero"], ["l2_clipped"])) - - # RBF kernel - # K = np.exp(-distance / (2 * self.sigma**2)) - make_tensor_value_info( - "rbf0", - np_dtype_to_tensor_dtype(self.betas_.dtype), - [None, self.X_train.shape[0]], - ) + X_type_text = "double" if X.dtype == cn.float64 else "float" if self.sigma is None: - raise ValueError("sigma is None. 
Has fit been called?") - nodes.append( - make_constant_node( - np.array([-2.0 * self.sigma**2], self.betas_.dtype), "denominator" - ) - ) - nodes.append(make_node("Div", ["l2_clipped", "denominator"], ["rbf0"])) - make_tensor_value_info( - "K", - np_dtype_to_tensor_dtype(self.betas_.dtype), - [None, self.X_train.shape[0]], - ) - nodes.append(make_node("Exp", ["rbf0"], ["K"])) - - # prediction - # pred = np.dot(K, self.betas_) - nodes.append(make_node("MatMul", ["K", "betas"], ["pred"])) - graph = make_graph( - nodes, "legateboost.model.KRR", [X], [pred], [betas, X_train] + raise ValueError("Model has not been trained. Cannot export to ONNX.") + denominator = -2.0 * self.sigma**2 + onnx_text = f""" + KRRModel ({X_type_text}[N, M] X_in, double[N, K] predictions_in) => ({X_type_text}[N, M] X_out, double[N, K] predictions_out) + {{ + X_out = Identity(X_in) + axis1 = Constant() + XX = ReduceSumSquare(X_in, axis1) + XY = Gemm(X_in, X_train) + YY = ReduceSumSquare(X_train, axis1) + reshape = Constant() + YY_reshaped = Reshape(YY, reshape) + add0 = Add(XX, XY) + l2 = Add(YY_reshaped, add0) + zero = Constant() + l2_clipped = Max(l2, zero) + denominator = Constant() + rbf0 = Div(l2_clipped, denominator) + K = Exp(rbf0) + dot = MatMul(K, betas) + dot_double = Cast(dot) + predictions_out = Add(dot_double, predictions_in) + }} + """ # noqa: E501 + graph = onnx.parser.parse_graph(onnx_text) + graph.initializer.extend( + [ + onnx.numpy_helper.from_array(self.betas_.__array__(), name="betas"), + onnx.numpy_helper.from_array(self.X_train.__array__(), name="X_train"), + ] ) - onnx_model = make_model(graph) - check_model(onnx_model) - return onnx_model + return graph diff --git a/legateboost/models/linear.py b/legateboost/models/linear.py index ec34594e..1319d2de 100644 --- a/legateboost/models/linear.py +++ b/legateboost/models/linear.py @@ -152,36 +152,27 @@ def __mul__(self, scalar: Any) -> "Linear": new.betas_ *= scalar return new - def to_onnx(self) -> Any: - from onnx import 
numpy_helper - from onnx.checker import check_model - from onnx.helper import ( - make_graph, - make_model, - make_node, - make_tensor_value_info, - np_dtype_to_tensor_dtype, + def to_onnx(self, X: cn.array) -> Any: + import onnx + + X_type_text = "double" if X.dtype == cn.float64 else "float" + onnx_text = f""" + LinearModel ({X_type_text}[N, M] X_in, double[N, K] predictions_in) => ({X_type_text}[N, M] X_out, double[N, K] predictions_out) + {{ + X_out = Identity(X_in) + mult = MatMul(X_in, betas) + result = Add(mult, intercept) + result_double = Cast(result) + predictions_out = Add(result_double, predictions_in) + }} + """ # noqa: E501 + graph = onnx.parser.parse_graph(onnx_text) + graph.initializer.extend( + [ + onnx.numpy_helper.from_array(self.betas_[1:].__array__(), name="betas"), + onnx.numpy_helper.from_array( + self.betas_[0].__array__(), name="intercept" + ), + ] ) - - # model constants - betas = numpy_helper.from_array(self.betas_[1:].__array__(), name="betas") - intercept = numpy_helper.from_array( - self.betas_[0].__array__(), name="intercept" - ) - - # pred inputs - X = make_tensor_value_info( - "X", np_dtype_to_tensor_dtype(self.betas_.dtype), [None, None] - ) - pred = make_tensor_value_info( - "pred", np_dtype_to_tensor_dtype(self.betas_.dtype), [None] - ) - - node1 = make_node("MatMul", ["X", "betas"], ["XBeta"]) - node2 = make_node("Add", ["XBeta", "intercept"], ["pred"]) - graph = make_graph( - [node1, node2], "legateboost.model.Linear", [X], [pred], [betas, intercept] - ) - onnx_model = make_model(graph) - check_model(onnx_model) - return onnx_model + return graph diff --git a/legateboost/models/nn.py b/legateboost/models/nn.py index 356f0264..969dc0df 100644 --- a/legateboost/models/nn.py +++ b/legateboost/models/nn.py @@ -182,12 +182,10 @@ def __mul__(self, scalar: Any) -> "NN": new.biases_[-1] *= scalar return new - def to_onnx(self) -> Any: - from onnx import numpy_helper - from onnx.checker import check_model + def to_onnx(self, X: 
cn.array) -> Any: + from onnx import TensorProto, numpy_helper from onnx.helper import ( make_graph, - make_model, make_node, make_tensor_value_info, np_dtype_to_tensor_dtype, @@ -204,64 +202,74 @@ def to_onnx(self) -> Any: ] # pred inputs - X = make_tensor_value_info( - "X", + n_outputs = self.coefficients_[-1].shape[1] + n_features = self.coefficients_[0].shape[0] + X_in = make_tensor_value_info( + "X_in", np_dtype_to_tensor_dtype(self.coefficients_[0].dtype), [None, self.coefficients_[0].shape[0]], ) - + predictions_in = make_tensor_value_info( + "predictions_in", + TensorProto.DOUBLE, + [None, n_outputs], + ) nodes = [] - make_tensor_value_info( - "activations0", - np_dtype_to_tensor_dtype(self.coefficients_[0].dtype), - [None, None], - ) - nodes.append(make_node("MatMul", ["X", "coefficients0"], ["activations0"])) - activations_with_bias = make_tensor_value_info( - "activations0withbias", - np_dtype_to_tensor_dtype(self.coefficients_[0].dtype), - [None, None], - ) + nodes.append(make_node("MatMul", ["X_in", "coefficients0"], ["activations0"])) nodes.append( make_node("Add", ["activations0", "bias0"], ["activations0withbias"]) ) for i in range(1, len(coefficients)): - make_tensor_value_info( - f"tanh{i}", - np_dtype_to_tensor_dtype(self.coefficients_[0].dtype), - [None, None], - ) nodes.append(make_node("Tanh", [f"activations{i-1}withbias"], [f"tanh{i}"])) - make_tensor_value_info( - f"activations{i}", - np_dtype_to_tensor_dtype(self.coefficients_[0].dtype), - [None, None], - ) nodes.append( make_node( "MatMul", [f"tanh{i}", f"coefficients{i}"], [f"activations{i}"] ) ) - activations_with_bias = make_tensor_value_info( - f"activations{i}withbias", - np_dtype_to_tensor_dtype(self.coefficients_[0].dtype), - [None, None], - ) nodes.append( make_node( "Add", [f"activations{i}", f"bias{i}"], [f"activations{i}withbias"] ) ) + # outputs + X_out = make_tensor_value_info( + "X_out", + np_dtype_to_tensor_dtype(self.coefficients_[0].dtype), + [None, n_features], + ) + 
nodes.append(make_node("Identity", ["X_in"], ["X_out"])) + predictions_out = make_tensor_value_info( + "predictions_out", + TensorProto.DOUBLE, + [None, n_outputs], + ) + nodes.append( + make_node( + "Cast", + ["activations{}withbias".format(len(self.coefficients_) - 1)], + ["casted"], + to=TensorProto.DOUBLE, + ) + ) + nodes.append( + make_node( + "Add", + [ + "casted", + "predictions_in", + ], + ["predictions_out"], + ) + ) + graph = make_graph( nodes, "legateboost.model.NN", - [X], - [activations_with_bias], + [X_in, predictions_in], + [X_out, predictions_out], biases + coefficients, ) - onnx_model = make_model(graph) - check_model(onnx_model) - return onnx_model + return graph diff --git a/legateboost/models/tree.py b/legateboost/models/tree.py index 38cfda93..b5827f13 100644 --- a/legateboost/models/tree.py +++ b/legateboost/models/tree.py @@ -1,9 +1,11 @@ import copy import warnings +from dataclasses import dataclass from enum import IntEnum -from typing import Any, Callable, List, Sequence, Union, cast +from typing import Any, Callable, Dict, List, Sequence, Union, cast import numpy as np +import numpy.typing as npt import cupynumeric as cn from legate.core import TaskTarget, get_legate_runtime, types @@ -316,123 +318,176 @@ def __mul__(self, scalar: Any) -> "Tree": new.leaf_value *= scalar return new - def to_onnx(self) -> Any: - import onnx - from onnx import numpy_helper - from onnx.checker import check_model + # copy the tree structure to numpy arrays + # cupynumeric element access is very slow + @dataclass + class TreeAsNumpy: + leaf_value: npt.NDArray[np.float64] + feature: npt.NDArray[np.int32] + split_value: npt.NDArray[np.float64] + gain: npt.NDArray[np.float64] + hessian: npt.NDArray[np.float64] + is_leaf: npt.NDArray[np.bool_] + + # structure of arrays for tree structure expected by onnx + # this container is a convenience to not have 7 function arguments + class OnnxSoa: + def __init__(self, size: int, n_outputs: int) -> None: + self.nodes_modes: 
npt.NDArray[str] = np.full(size, "BRANCH_LEQ") + self.nodes_featureids: npt.NDArray[np.int32] = np.full( + size, -1, dtype=np.int32 + ) + self.nodes_truenodeids: npt.NDArray[np.int32] = np.full( + size, -1, dtype=np.int32 + ) + self.nodes_falsenodeids: npt.NDArray[np.int32] = np.full( + size, -1, dtype=np.int32 + ) + self.nodes_nodeids: npt.NDArray[np.int32] = np.arange(size, dtype=np.int32) + self.nodes_values: npt.NDArray[np.float64] = np.full( + size, -1.0, dtype=np.float64 + ) + self.leaf_weights: npt.NDArray[np.float64] = np.full( + (size, n_outputs), -1.0, dtype=np.float64 + ) + + def recurse_tree( + self, tree: TreeAsNumpy, soa: OnnxSoa, old_node_idx: int, new_node_idx: int + ) -> int: + # new_node_idx is sparse + if tree.is_leaf[old_node_idx]: + soa.nodes_modes[new_node_idx] = "LEAF" + soa.leaf_weights[new_node_idx] = tree.leaf_value[old_node_idx] + return new_node_idx + else: + soa.nodes_modes[new_node_idx] = "BRANCH_LEQ" + soa.nodes_featureids[new_node_idx] = tree.feature[old_node_idx] + soa.nodes_values[new_node_idx] = tree.split_value[old_node_idx] + left_child_idx = new_node_idx + 1 + soa.nodes_truenodeids[new_node_idx] = left_child_idx + node_idx_counter = self.recurse_tree( + tree, soa, self.left_child(old_node_idx), left_child_idx + ) + right_child_idx = node_idx_counter + 1 + soa.nodes_falsenodeids[new_node_idx] = right_child_idx + node_idx_counter = self.recurse_tree( + tree, soa, self.right_child(old_node_idx), right_child_idx + ) + + return node_idx_counter + + def to_onnx(self, X: cn.array) -> Any: + from onnx import TensorProto, numpy_helper from onnx.helper import ( make_graph, - make_model, - make_tensor, + make_node, make_tensor_value_info, + np_dtype_to_tensor_dtype, + ) + + num_sparse_nodes = (self.hessian[:, 0] > 0.0).sum() + num_outputs = self.leaf_value.shape[1] + # copy the tree as numpy because single element + # access with cupynumeric is very slow + tree = Tree.TreeAsNumpy( + self.leaf_value.__array__(), + self.feature.__array__(), 
+ self.split_value.__array__(), + self.gain.__array__(), + self.hessian.__array__(), + self.feature.__array__() == -1, ) + soa = Tree.OnnxSoa(num_sparse_nodes, num_outputs) + # This recursive function could become a bottleneck for large trees + # In this case consider implmenting a C++ legate task for this conversion + # Cython could also work + self.recurse_tree(tree, soa, 0, 0) onnx_nodes = [] - # We map the legate-boost tree representation to the TreeEnsemble ONNX operator - # the features array, splits array, and leaf weights can be passed unchanged - # ONNX then requires some extra arrays to represent the tree structure - # - nodes_truenodeidx is the index of the left child for a given node - # - nodes_falsenodeidx is the index of the right child for a given node - # - nodes_modes indicates that nodes use a <= comparison operator - # - nodes_trueleafs indicates that the left child is a leaf node - # - nodes_falseleafs indicates that the right child is a leaf node - # - leaf_targetids indicates which output the leaf node corresponds to - # ONNX does not support vector leaf so we will repeat the tree n_outputs - # times, each time with a different constant for leaf_targetids - # This is not ideal but I don't see a better way - - tree_max_nodes = self.feature.size - all_nodes_idx = np.arange(tree_max_nodes) - nodes_featureids = self.feature.__array__() - nodes_splits = numpy_helper.from_array(self.split_value.__array__()) - nodes_truenodeids = self.left_child(all_nodes_idx) - # get the left child of each node and check if it is a leaf - # if the node is already leaf then its child can go off the end of the array - # use np.minimum to avoid this - nodes_trueleafs = self.is_leaf( - np.minimum(tree_max_nodes - 1, self.left_child(all_nodes_idx)) - ).astype(int) - nodes_falsenodeids = self.right_child(all_nodes_idx) - nodes_falseleafs = self.is_leaf( - np.minimum(tree_max_nodes - 1, self.right_child(all_nodes_idx)) - ).astype(int) - - for output_idx in range(0, 
self.leaf_value.shape[1]): - leaf_targetids = np.full(self.feature.size, output_idx, dtype=np.int64) - leaf_weights = numpy_helper.from_array( - self.leaf_value[:, output_idx].__array__() + kwargs: Dict[str, Any] = {} + # TreeEnsembleRegressor asks us to pass these as tensors when X.dtype is double + # we simply pass a set of indices as leaf weights and then add a node later to + # look up the (vector valued) leaf weights + if X.dtype == np.float32: + kwargs["nodes_values"] = soa.nodes_values.astype(np.float32) + kwargs["target_weights"] = soa.nodes_nodeids.astype(np.float32) + else: + kwargs["nodes_values_as_tensor"] = numpy_helper.from_array( + soa.nodes_values, name="nodes_values" + ) + kwargs["target_weights_as_tensor"] = numpy_helper.from_array( + soa.nodes_nodeids.astype(np.float64), name="target_weights" ) - onnx_nodes.append( - onnx.helper.make_node( - "TreeEnsemble", - ["X"], - ["pred" + str(output_idx)], - domain="ai.onnx.ml", - n_targets=self.leaf_value.shape[1], - membership_values=None, - nodes_missing_value_tracks_true=None, - nodes_hitrates=None, - aggregate_function=1, - post_transform=0, - tree_roots=[0], - nodes_modes=make_tensor( - "nodes_modes", - onnx.TensorProto.UINT8, - self.feature.shape, - np.zeros_like(self.feature, dtype=np.uint8), - ), - nodes_featureids=nodes_featureids, - nodes_splits=nodes_splits, - nodes_truenodeids=nodes_truenodeids, - nodes_trueleafs=nodes_trueleafs, - nodes_falsenodeids=nodes_falsenodeids, - nodes_falseleafs=nodes_falseleafs, - leaf_targetids=leaf_targetids, - leaf_weights=leaf_weights, - ) + # TreeEnsembleRegressor is deprecated, but its successor TreeEnsemble + # is at the time of writing not available from onnxruntime on conda-forge + # This can be updated at some point without too much trouble + onnx_nodes.append( + make_node( + "TreeEnsembleRegressor", + ["X_in"], + ["predicted_leaf_index"], + domain="ai.onnx.ml", + n_targets=1, + membership_values=None, + nodes_missing_value_tracks_true=None, + 
nodes_hitrates=None, + nodes_modes=soa.nodes_modes, + nodes_featureids=soa.nodes_featureids, + nodes_truenodeids=soa.nodes_truenodeids, + nodes_falsenodeids=soa.nodes_falsenodeids, + nodes_nodeids=soa.nodes_nodeids, + nodes_treeids=np.zeros_like(soa.nodes_nodeids, dtype=np.int64), + target_ids=np.zeros_like(soa.nodes_nodeids, dtype=np.int64), + target_nodeids=soa.nodes_nodeids, + target_treeids=np.zeros_like(soa.nodes_nodeids, dtype=np.int64), + **kwargs, ) + ) - if output_idx == 0: - accumulated_pred = make_tensor_value_info( - "accumulated_pred0", onnx.TensorProto.DOUBLE, [None, None] - ) - onnx_nodes.append( - onnx.helper.make_node( - "Identity", - ["pred" + str(output_idx)], - ["accumulated_pred0"], - ) - ) - else: - accumulated_pred = make_tensor_value_info( - "accumulated_pred" + str(output_idx), - onnx.TensorProto.DOUBLE, - [None, None], - ) - onnx_nodes.append( - onnx.helper.make_node( - "Add", - [ - "accumulated_pred" + str(output_idx - 1), - "pred" + str(output_idx), - ], - ["accumulated_pred" + str(output_idx)], - ) - ) + leaf_weights = numpy_helper.from_array(soa.leaf_weights, name="leaf_weights") + predictions_out = make_tensor_value_info( + "predictions_out", TensorProto.DOUBLE, [None, num_outputs] + ) + # make indices 1-d + onnx_nodes.append( + make_node( + "Squeeze", ["predicted_leaf_index"], ["predicted_leaf_index_squeezed"] + ) + ) + onnx_nodes.append( + make_node( + "Cast", + ["predicted_leaf_index_squeezed"], + ["predicted_leaf_index_int"], + to=TensorProto.INT32, + ) + ) + onnx_nodes.append( + make_node( + "Gather", ["leaf_weights", "predicted_leaf_index_int"], ["gathered"] + ) + ) + predictions_in = make_tensor_value_info( + "predictions_in", TensorProto.DOUBLE, [None, num_outputs] + ) + onnx_nodes.append( + make_node("Add", ["predictions_in", "gathered"], ["predictions_out"]) + ) - # pred inputs - X = make_tensor_value_info("X", onnx.TensorProto.DOUBLE, [None, None]) - graph = make_graph( - onnx_nodes, "legateboost.models.Tree", [X], 
[accumulated_pred] + X_in = make_tensor_value_info( + "X_in", np_dtype_to_tensor_dtype(X.dtype), [None, None] ) - model = make_model( - graph, - opset_imports=[ - onnx.helper.make_opsetid("ai.onnx.ml", 5), - onnx.helper.make_opsetid("", 14), - ], + X_out = make_tensor_value_info( + "X_out", np_dtype_to_tensor_dtype(X.dtype), [None, None] + ) + onnx_nodes.append(make_node("Identity", ["X_in"], ["X_out"])) + graph = make_graph( + onnx_nodes, + "legateboost.models.Tree", + [X_in, predictions_in], + [X_out, predictions_out], + [leaf_weights], ) - check_model(model) - return model + return graph diff --git a/legateboost/objectives.py b/legateboost/objectives.py index ea389660..08219f29 100644 --- a/legateboost/objectives.py +++ b/legateboost/objectives.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from typing import Tuple +from typing import Any, Tuple from scipy.stats import norm from typing_extensions import TypeAlias, override @@ -43,6 +43,7 @@ class BaseObjective(ABC): # utility constant one = cn.ones(1, dtype=cn.float64) + half = cn.array(0.5, dtype=cn.float64) @abstractmethod def gradient(self, y: cn.ndarray, pred: cn.ndarray) -> GradPair: @@ -70,6 +71,30 @@ def transform(self, pred: cn.ndarray) -> cn.ndarray: """ return pred + def onnx_transform(self, pred: cn.ndarray) -> Any: + """Returns an ONNX graph that accepts + - "predictions_in" : 2D tensor of shape (n_samples, n_outputs) and type double. + And outputs the transformed predictions. + - "predictions_out" : arbitrary tensor depending on the objective. + + Is by default the identity transform. + + The ONNX transform should produce the same output as the transform + method for each objective. + + Returns: + Onnx graph that transforms the predictions. 
+ """ + import onnx + + onnx_text = """ + BaseObjective (double[N, M] predictions_in) => (double[N, M] predictions_out) + { + predictions_out = Identity(predictions_in) + } + """ + return onnx.parser.parse_graph(onnx_text) + @abstractmethod def metric(self) -> BaseMetric: """Returns the default error metric for the objective function. @@ -115,6 +140,25 @@ def output_class(self, pred: cn.ndarray) -> cn.ndarray: """ return cn.argmax(pred, axis=-1) + def onnx_output_class(self, pred: cn.ndarray) -> Any: + """Returns an ONNX model that accepts + - "predictions_in" : 2D tensor of shape (n_samples, n_outputs) and type double. + And outputs the predicted class labels. + - "predictions_out" : 1D tensor of shape (n_samples,) and type int32. + + Returns: + Onnx model that converts probabilities into class labels. + """ + import onnx + + onnx_text = """ + BaseModelOutputClass (double[N, M] predictions_in) => (int64[N, M] predictions_out) + { + predictions_out = ArgMax(predictions_in) + } + """ # noqa: E501 + return onnx.parser.parse_graph(onnx_text) + class SquaredErrorObjective(BaseObjective): """The Squared Error objective function for regression problems. 
@@ -243,6 +287,29 @@ def transform(self, pred: cn.ndarray) -> cn.ndarray: pred[:, :, 1] = cn.clip(pred[:, :, 1], -5, 5) return pred + def onnx_transform(self, pred: cn.ndarray) -> Any: + import onnx + + onnx_text = """ + NormalObjective (double[N, M] predictions_in) => (double[N, M] predictions_out) + { + out_shape = Constant() + var_starts = Constant() + mean_starts = Constant() + axis = Constant() + steps = Constant() + min = Constant() + max = Constant() + reshaped = Reshape(predictions_in, out_shape) + new_shape = Shape(reshaped) + variance = Slice(reshaped, var_starts, new_shape, axis, steps) + mean = Slice(reshaped, mean_starts, new_shape, axis, steps) + clipped_variance = Clip(variance, min, max) + predictions_out = Concat(mean, clipped_variance) + } + """ + return onnx.parser.parse_graph(onnx_text) + @override def mean(self, param: cn.ndarray) -> cn.ndarray: """Return the mean for the Normal distribution.""" @@ -321,6 +388,18 @@ def transform(self, pred: cn.ndarray) -> cn.ndarray: """Inverse log link.""" return cn.exp(pred) + @override + def onnx_transform(self, pred: cn.ndarray) -> Any: + import onnx + + onnx_text = """ + GammaDevianceTransform (double[N, M] predictions_in) => (double[N, M] predictions_out) + { + predictions_out = Exp(predictions_in) + } + """ # noqa: E501 + return onnx.parser.parse_graph(onnx_text) + def initialise_prediction( self, y: cn.ndarray, w: cn.ndarray, boost_from_average: bool ) -> cn.ndarray: @@ -365,6 +444,18 @@ def transform(self, pred: cn.ndarray) -> cn.ndarray: assert pred.ndim == 3 return cn.exp(pred) + @override + def onnx_transform(self, pred: cn.ndarray) -> Any: + import onnx + + onnx_text = """ + GammaTransform (double[N, M] predictions_in) => (double[N, M] predictions_out) + { + predictions_out = Exp(predictions_in) + } + """ + return onnx.parser.parse_graph(onnx_text) + @override def metric(self) -> GammaLLMetric: return GammaLLMetric() @@ -421,7 +512,7 @@ def var(self, param: cn.ndarray) -> cn.ndarray: class 
QuantileObjective(BaseObjective): """Minimises the quantile loss, otherwise known as check loss or pinball loss. - :math:`L(y_i, p_i) = \\frac{1}{k}\\sum_{j=1}^{k} (q_j - \\mathbb{1})(y_i - p_{i, j})` + :math:`L(y_i, p_i) = \\frac{}{k}\\sum_{j=1}^{k} (q_j - \\mathbb{1})(y_i - p_{i, j})` where @@ -511,6 +602,18 @@ def transform(self, pred: cn.ndarray) -> cn.ndarray: div = cn.sum(e_x, axis=1) return e_x / div[:, cn.newaxis] + def onnx_transform(self, pred: cn.ndarray) -> Any: + import onnx + + operator_to_use = "Sigmoid" if pred.shape[1] == 1 else "Softmax" + onnx_text = f""" + LogLossObjective (double[N, M] predictions_in) => (double[N, M] predictions_out) + {{ + predictions_out = {operator_to_use}(predictions_in) + }} + """ + return onnx.parser.parse_graph(onnx_text) + def metric(self) -> LogLossMetric: return LogLossMetric() @@ -547,8 +650,32 @@ def gradient(self, y: cn.ndarray, pred: cn.ndarray) -> GradPair: def transform(self, pred: cn.ndarray) -> cn.ndarray: return self.one / (self.one + cn.exp(-pred)) + def onnx_transform(self, pred: cn.ndarray) -> Any: + import onnx + + onnx_text = """ + MultiLabelObjective (double[N, M] predictions_in) => (double[N, M] predictions_out) + { + predictions_out = Sigmoid(predictions_in) + } + """ # noqa: E501 + return onnx.parser.parse_graph(onnx_text) + def output_class(self, pred: cn.ndarray) -> cn.ndarray: - return cn.array(pred > 0.5, dtype=cn.int32).squeeze() + return cn.array(pred > 0.5, dtype=cn.int64) + + def onnx_output_class(self, pred: cn.ndarray) -> Any: + import onnx + + onnx_text = """ + MultiLabelOutputClass (double[N, M] predictions_in) => (int64[N, M] predictions_out) + { + half = Constant() + greater = Greater(predictions_in, half) + predictions_out = Cast(greater) + } + """ # noqa: E501 + return onnx.parser.parse_graph(onnx_text) def metric(self) -> MultiLabelMetric: return MultiLabelMetric() @@ -614,6 +741,33 @@ def transform(self, pred: cn.ndarray) -> cn.ndarray: K = pred.shape[1] # number of classes return 
logloss.transform((1 / (K - 1)) * pred) + def onnx_transform(self, pred: cn.ndarray) -> Any: + import onnx + + onnx_text = """ + LogLossObjective (double[N, M] predictions_in) => (double[N, M] predictions_out) + """ # noqa: E501 + + if pred.shape[1] == 1: + onnx_text += """ + { + constant = Constant() + a = Mul(predictions_in, constant) + predictions_out = Sigmoid(a) + } + """ + return onnx.parser.parse_graph(onnx_text) + + constant = 1 / (pred.shape[1] - 1) + onnx_text += f""" + {{ + constant = Constant() + a = Mul(predictions_in, constant) + predictions_out = Softmax(a) + }} + """ + return onnx.parser.parse_graph(onnx_text) + def metric(self) -> ExponentialMetric: return ExponentialMetric() @@ -628,7 +782,7 @@ def initialise_prediction( return self.one_step_newton(y, w, boost_from_average, init) -objectives = { +OBJECTIVES_MAP = { "squared_error": SquaredErrorObjective, "normal": NormalObjective, "log_loss": LogLossObjective, @@ -638,3 +792,11 @@ def initialise_prediction( "gamma_deviance": GammaDevianceObjective, "gamma": GammaObjective, } + +REGRESSION_OBJECTIVES = ["squared_error", "normal", "gamma_deviance", "gamma"] + +CLASSIFICATION_OBJECTIVES = [ + "log_loss", + "multi_label", + "exp", +] diff --git a/legateboost/onnx_utils.py b/legateboost/onnx_utils.py new file mode 100644 index 00000000..3b2f72fb --- /dev/null +++ b/legateboost/onnx_utils.py @@ -0,0 +1,104 @@ +from typing import Any, List + +import numpy as np + +import cupynumeric as cn + +# onnx is imported only if needed - keep this a soft dependency +try: + import onnx +except ImportError: + pass + + +def make_model(graph: Any) -> Any: + # make model with appropriate opset imports for legate-boost + LEGATEBOOST_ONNX_OPSET_IMPORTS = [ + onnx.helper.make_opsetid("ai.onnx.ml", 3), + onnx.helper.make_opsetid("", 21), + ] + return onnx.helper.make_model(graph, opset_imports=LEGATEBOOST_ONNX_OPSET_IMPORTS) + + +def reshape_predictions(graph: Any, pred: cn.ndarray) -> Any: + # àppend an onnx graph that 
shapes the predictions equivalently to pred + shape = list(pred.shape) + shape[0] = -1 + out_type = "int64" if pred.dtype == cn.int64 else "double" + onnx_text = f""" + ReshapePredictions ({out_type}[N, M] predictions_in) => ({out_type}{shape} predictions_out) + {{ + shape = Constant() + predictions_out = Reshape(predictions_in, shape) + }} + """ # noqa: E501 + reshape_graph = onnx.parser.parse_graph(onnx_text) + graph = onnx.compose.merge_graphs( + graph, + reshape_graph, + io_map=[ + (graph.output[0].name, "predictions_in"), + ], + prefix2="reshape_", + ) + return graph + + +def mirror_predict_proba_output(graph: Any) -> Any: + # where model outputs only true probability we need to add the false probability + onnx_text = """ + MirrorPredict (double[N, M] predictions_in) => (double[N, 2] predictions_out) + { + one = Constant() + false_probability = Sub(one, predictions_in) + predictions_out = Concat(false_probability, predictions_in) + } + """ # noqa: E501 + new_graph = onnx.parser.parse_graph(onnx_text) + new_graph = onnx.compose.merge_graphs( + graph, + new_graph, + io_map=[ + (graph.output[0].name, "predictions_in"), + ], + prefix2="mirror_", + ) + return new_graph + + +def init_predictions(model_init: cn.array, X_dtype: Any) -> Any: + # form a graph that takes X_in and model_init as input and outputs + # model_init repeated n_rows times + + X_type_text = "double" if X_dtype == cn.float64 else "float" + onnx_text = f""" + InitPredictions ({X_type_text}[N, M] X_in) => ({X_type_text}[N, M] X_out, double[N, K] predictions_out) + {{ + X_out = Identity(X_in) + n_rows = Shape(X_in) + one = Constant() + tile_repeat = Concat(n_rows, one) + predictions_out = Tile(init, tile_repeat) + }} + """ # noqa: E501 + graph = onnx.parser.parse_graph(onnx_text) + graph.initializer.append( + onnx.numpy_helper.from_array(np.atleast_2d(model_init.__array__()), name="init") + ) + return graph + + +def merge_model_graphs(graphs: List[Any]) -> Any: + # merge a list of graphs into a 
single graph + combined = graphs[0] + for i, g in enumerate(graphs[1:]): + combined = onnx.compose.merge_graphs( + combined, + g, + io_map=[ + (combined.output[0].name, "X_in"), + (combined.output[1].name, "predictions_in"), + ], + prefix2="model_{}_".format(i), + ) + return combined diff --git a/legateboost/test/test_objective.py b/legateboost/test/test_objective.py index aaf8823c..a78ecb80 100644 --- a/legateboost/test/test_objective.py +++ b/legateboost/test/test_objective.py @@ -1,3 +1,5 @@ +import numpy as np +import onnxruntime as ort import pytest import cupynumeric as cn @@ -5,12 +7,26 @@ from legateboost.testing.utils import non_increasing +def compare_onnx_transform(obj, pred): + sess = ort.InferenceSession(obj.onnx_transform(pred).SerializeToString()) + feeds = { + "predictions_in": pred, + } + onnx_transform = sess.run(None, feeds)[0] + assert onnx_transform.dtype == np.float64 + transform = obj.transform(cn.array(pred)) + assert transform.shape == onnx_transform.shape + assert np.allclose(onnx_transform, transform, atol=1e-6) + + def test_normal() -> None: obj = lb.NormalObjective() y = cn.array([[1.0], [2.0], [3.0]]) init = obj.initialise_prediction(y, cn.array([1.0, 1.0, 1.0]), True) assert cn.allclose(init, cn.array([y.mean(), cn.log(y.std())])) + compare_onnx_transform(obj, np.arange(12).reshape(2, 6).astype(np.float64)) + def test_gamma_deviance() -> None: obj = lb.GammaDevianceObjective() @@ -35,6 +51,8 @@ def test_gamma_deviance() -> None: reg.fit(X, y1, eval_set=[(X, y1)], eval_result=eval_result) assert non_increasing(eval_result["train"]["deviance_gamma"]) + compare_onnx_transform(obj, np.arange(12).reshape(2, 6).astype(np.float64)) + def test_gamma() -> None: import numpy as np @@ -51,6 +69,8 @@ def test_gamma() -> None: reg.fit(X, y, eval_set=[(X, y)], eval_result=eval_result) assert non_increasing(eval_result["train"]["gamma_neg_ll"]) + compare_onnx_transform(obj, np.arange(12).reshape(2, 6).astype(np.float64)) + def test_log_loss() -> 
None: obj = lb.LogLossObjective() @@ -95,6 +115,9 @@ def test_log_loss() -> None: False, ) + compare_onnx_transform(obj, np.arange(12).reshape(2, 6).astype(np.float64)) + compare_onnx_transform(obj, np.arange(4).reshape(4, 1).astype(np.float64)) + def test_exp(): obj = lb.ExponentialObjective() @@ -127,6 +150,9 @@ def test_exp(): False, ) + compare_onnx_transform(obj, np.arange(12).reshape(2, 6).astype(np.float64)) + compare_onnx_transform(obj, np.arange(4).reshape(4, 1).astype(np.float64)) + def test_multi_label(): obj = lb.MultiLabelObjective() @@ -139,3 +165,5 @@ def test_multi_label(): with pytest.raises(ValueError, match=r"Expected labels to be in \[0, 1\]"): obj.initialise_prediction(cn.array([[1], [2]]), cn.array([[1.0], [1.0]]), False) + + compare_onnx_transform(obj, np.arange(12).reshape(2, 6).astype(np.float64)) diff --git a/legateboost/test/test_onnx.py b/legateboost/test/test_onnx.py index 8710eaf4..57e75d75 100644 --- a/legateboost/test/test_onnx.py +++ b/legateboost/test/test_onnx.py @@ -1,35 +1,180 @@ import numpy as np +import onnxruntime as ort import pytest -from onnx.reference import ReferenceEvaluator import cupynumeric as cn import legateboost as lb +from legateboost.onnx_utils import make_model -@pytest.mark.parametrize( - "Model", [M for M in lb.models.BaseModel.__subclasses__() if hasattr(M, "to_onnx")] -) -@pytest.mark.parametrize("n_outputs", [1, 5]) -@pytest.mark.parametrize("dtype", [np.float32, np.float64]) -def test_onnx(Model, n_outputs, dtype): +def compare_model_predictions(model, X): + onnx_model = make_model(model.to_onnx(X)) + sess = ort.InferenceSession(onnx_model.SerializeToString()) + feeds = { + "X_in": X, + } + pred = model.predict(cn.array(X)) + feeds["predictions_in"] = np.zeros((X.shape[0], pred.shape[1])) + onnx_pred = sess.run(None, feeds)[1] + assert onnx_pred.dtype == np.float64 + assert pred.shape == onnx_pred.shape + assert np.allclose( + onnx_pred, pred, atol=1e-2 if X.dtype == np.float32 else 1e-6 + ), 
np.linalg.norm(pred - onnx_pred) + + +def compare_estimator_predictions(estimator, X, predict_function, allowed_wrong=0): + sess = ort.InferenceSession( + estimator.to_onnx(X, predict_function).SerializeToString() + ) + feeds = { + "X_in": X, + } + pred_method = getattr(estimator, predict_function) + pred = pred_method(cn.array(X)) + onnx_pred = sess.run(None, feeds)[0] + + assert onnx_pred.dtype == pred.dtype + assert pred.shape == onnx_pred.shape + number_wrong = np.sum( + np.abs(pred - onnx_pred) > (1e-2 if X.dtype == np.float32 else 1e-5) + ) + assert number_wrong <= allowed_wrong, ( + np.linalg.norm(pred - onnx_pred), + number_wrong, + ) + + +@pytest.fixture +def model_dataset(dtype, n_outputs): rs = np.random.RandomState(0) X = rs.random((1000, 10)).astype(dtype) g = rs.normal(size=(X.shape[0], n_outputs)) h = rs.random(g.shape) + 0.1 + return X, g, h + + +@pytest.mark.parametrize("Model", [M for M in lb.models.BaseModel.__subclasses__()]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +@pytest.mark.parametrize("n_outputs", [1, 5]) +def test_models(Model, model_dataset): + X, g, h = model_dataset model = ( Model() .set_random_state(np.random.RandomState(2)) .fit(cn.array(X), cn.array(g), cn.array(h)) ) - def pred_onnx(onnx, X): - sess = ReferenceEvaluator(onnx) - pred = np.empty(X.shape[0], dtype=dtype) - feeds = {"X": X, "pred": pred} - return sess.run(None, feeds) + compare_model_predictions(model, X) - assert np.allclose( - model.predict(cn.array(X)), - pred_onnx(model.to_onnx(), X)[0], - atol=1e-3 if dtype == np.float32 else 1e-6, + +@pytest.mark.parametrize("n_outputs", [1, 5]) +def test_init(n_outputs): + # ONNX correctly outputs model init + X = np.array([[1, 2], [3, 4], [5, 6]], dtype=np.float32) + y = np.full((3, n_outputs), 5.0, dtype=np.float32) + estimator = lb.LBRegressor(n_estimators=0, random_state=0).fit(X, y) + assert np.all(estimator.model_init_ == 5.0) + compare_estimator_predictions(estimator, X, "predict_raw") + + 
+@pytest.fixture +def regression_dataset(dtype, n_outputs): + from sklearn.datasets import make_regression + + X, y = make_regression( + n_samples=1000, + n_features=10, + n_informative=5, + n_targets=n_outputs, + random_state=0, + ) + # make labels strictly positive for certain objectives + return X.astype(dtype), np.abs(y.astype(dtype)) + + +@pytest.mark.parametrize("Model", [M for M in lb.models.BaseModel.__subclasses__()]) +@pytest.mark.parametrize("objective", lb.objectives.REGRESSION_OBJECTIVES) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +@pytest.mark.parametrize("n_outputs", [1, 5]) +def test_regressor(Model, objective, regression_dataset): + X, y = regression_dataset + if ( + Model in [lb.models.KRR, lb.models.NN] + and objective == "gamma" + and X.dtype == np.float32 + ): + pytest.skip("Skipping as numerically unstable") + if objective in [ + "quantile", + "gamma_deviance", + "gamma", + ] and (y.ndim > 1 and y.shape[1] > 1): + pytest.skip("skipping quantile, gamma and gamma_deviance for multiple outputs") + model = lb.LBRegressor( + n_estimators=2, + objective=objective, + base_models=(Model(),), + random_state=0, + ).fit(X, y) + + compare_estimator_predictions(model, X, "predict_raw") + compare_estimator_predictions(model, X, "predict") + + +@pytest.fixture +def classification_dataset(dtype, n_outputs): + from sklearn.datasets import make_classification + + X, y = make_classification( + n_samples=1000, + n_features=10, + n_informative=5, + n_classes=n_outputs, + random_state=0, ) + return X.astype(dtype), np.abs(y.astype(dtype)) + + +@pytest.mark.parametrize("Model", [M for M in lb.models.BaseModel.__subclasses__()]) +@pytest.mark.parametrize("objective", lb.objectives.CLASSIFICATION_OBJECTIVES) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +@pytest.mark.parametrize("n_outputs", [2, 5]) +def test_classifier(Model, objective, classification_dataset): + X, y = classification_dataset + if objective == "multi_label": + # 
encode labels as one-hot + encoded = np.zeros((y.shape[0], int(y.max() + 1))) + encoded[np.arange(y.shape[0]), y.astype(int)] = 1 + y = encoded + model = lb.LBClassifier( + n_estimators=2, + objective=objective, + base_models=(Model(),), + random_state=0, + ).fit(X, y) + + compare_estimator_predictions(model, X, "predict_raw") + compare_estimator_predictions(model, X, "predict_proba") + # softmax has numerical differences with float32 + # allow a very small number of different class predictions + # this is fine so long as the probabilities are close + allowed_wrong = 5 if y.max() > 1 and X.dtype == np.float32 else 0 + compare_estimator_predictions(model, X, "predict", allowed_wrong) + + +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +@pytest.mark.parametrize("n_outputs", [1, 5]) +@pytest.mark.parametrize("max_depth", list(range(0, 12, 3))) +def test_tree(regression_dataset, max_depth): + # test tree depths more exhaustively + # some edge cases e.g. max_depth=0 + X, y = regression_dataset + model = lb.LBRegressor( + init=None, + n_estimators=2, + base_models=(lb.models.Tree(max_depth=max_depth),), + random_state=0, + ).fit(X, y) + + compare_estimator_predictions(model, X, "predict_raw") diff --git a/legateboost/test/test_with_hypothesis.py b/legateboost/test/test_with_hypothesis.py index 22275f67..c9312079 100644 --- a/legateboost/test/test_with_hypothesis.py +++ b/legateboost/test/test_with_hypothesis.py @@ -1,4 +1,5 @@ import numpy as np +import onnxruntime as ort from hypothesis import HealthCheck, Verbosity, assume, given, settings, strategies as st from sklearn.preprocessing import StandardScaler @@ -25,15 +26,15 @@ @st.composite def tree_strategy(draw): if get_legate_runtime().machine.count(TaskTarget.GPU) > 0: - max_depth = draw(st.integers(1, 8)) + max_depth = draw(st.integers(0, 8)) else: - max_depth = draw(st.integers(1, 6)) - alpha = draw(st.floats(0.0, 1.0)) + max_depth = draw(st.integers(0, 6)) + l2_regularization = draw(st.floats(0.0, 
1.0)) split_samples = draw(st.integers(1, 500)) feature_fraction = draw(st.sampled_from([0.5, 1.0])) return lb.models.Tree( max_depth=max_depth, - alpha=alpha, + l2_regularization=l2_regularization, split_samples=split_samples, feature_fraction=feature_fraction, ) @@ -41,20 +42,22 @@ def tree_strategy(draw): @st.composite def nn_strategy(draw): - alpha = draw(st.floats(0.0, 1.0)) + l2_regularization = draw(st.floats(0.0, 1.0)) hidden_layer_sizes = draw(st.sampled_from([(), (100,), (100, 100), (10, 10, 10)])) # max iter needs to be sufficiently large, otherwise the models can make the loss # worse (from a bad initialization) max_iter = 200 return lb.models.NN( - alpha=alpha, hidden_layer_sizes=hidden_layer_sizes, max_iter=max_iter + l2_regularization=l2_regularization, + hidden_layer_sizes=hidden_layer_sizes, + max_iter=max_iter, ) @st.composite def linear_strategy(draw): - alpha = draw(st.floats(0.0, 1.0)) - return lb.models.Linear(alpha=alpha) + l2_regularization = draw(st.floats(0.0, 1.0)) + return lb.models.Linear(l2_regularization=l2_regularization) @st.composite @@ -63,9 +66,11 @@ def krr_strategy(draw): sigma = draw(st.floats(0.1, 1.0)) else: sigma = None - alpha = draw(st.floats(0.0, 1.0)) + l2_regularization = draw(st.floats(0.0, 1.0)) components = draw(st.integers(2, 10)) - return lb.models.KRR(n_components=components, alpha=alpha, sigma=sigma) + return lb.models.KRR( + n_components=components, l2_regularization=l2_regularization, sigma=sigma + ) @st.composite @@ -161,11 +166,20 @@ def test_regressor(model_params, regression_params, regression_dataset): model = lb.LBRegressor(**model_params, **regression_params, verbose=True).fit( X, y, sample_weight=w, eval_result=eval_result ) - model.predict(X) loss = next(iter(eval_result["train"].values())) assert non_increasing(loss, tol=1e-1) sanity_check_models(model) + # check onnx + # for now reshape legate-boost predict to 2-D + # eventually onnx should match the output shape exactly + predict_raw = 
model.predict_raw(X) + onnx_predict_raw = pred_onnx(model.to_onnx(X.dtype), X) + onnx_predict_raw = onnx_predict_raw.reshape(predict_raw.shape) + assert np.allclose( + predict_raw, onnx_predict_raw, atol=1e-3 if X.dtype == np.float32 else 1e-6 + ), np.linalg.norm(predict_raw - onnx_predict_raw) + classification_param_strategy = st.fixed_dictionaries( { @@ -240,12 +254,18 @@ def classification_dataset_strategy(draw): return X, y, w, name +def pred_onnx(onnx, X): + sess = ort.InferenceSession(onnx.SerializeToString()) + return sess.run(None, {"X_in": X})[0] + + @given( general_model_param_strategy, classification_param_strategy, classification_dataset_strategy(), ) @cn.errstate(divide="raise", invalid="raise") +@settings(print_blob=True) def test_classifier( model_params: dict, classification_params: dict, classification_dataset: tuple ) -> None: @@ -256,8 +276,15 @@ def test_classifier( ) model.predict(X) model.predict_proba(X) - model.predict_raw(X) + predict_raw = model.predict_raw(X) loss = next(iter(eval_result["train"].values())) # multiclass models with higher learning rates don't always converge if len(model.classes_) == 2: assert non_increasing(loss, 1e-1) + + # check onnx + onnx_predict_raw = pred_onnx(model.to_onnx(X.dtype), X) + onnx_predict_raw = onnx_predict_raw.reshape(predict_raw.shape) + assert np.allclose( + predict_raw, onnx_predict_raw, atol=1e-3 if X.dtype == np.float32 else 1e-6 + ), np.linalg.norm(predict_raw - onnx_predict_raw) diff --git a/pyproject.toml b/pyproject.toml index a69041c6..e8269341 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,6 +51,7 @@ test = [ "notebook>=7", "onnx>=1.10", "onnxmltools>=1.10", + "onnxruntime", "pytest>=7,<8", "seaborn>=0.13", "xgboost>=2.0",