pjaselin · pjaselin · May 31, 2026 · May 17, 2026 · May 17, 2026 · May 17, 2026
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
@@ -14,6 +14,9 @@ on:
     types:
       - published
 
+env:
+  UV_VERSION: "0.11.17"
+
 jobs:
   build-wheels:
     name: Build wheels on ${{ matrix.os }}
@@ -35,7 +38,7 @@ jobs:
       - name: Install uv
         uses: astral-sh/setup-uv@v6
         with:
-          version: "0.9.8"
+          version: ${{ env.UV_VERSION }}
 
       - name: Enable Developer Command Prompt for Windows
         if: matrix.os == 'windows-latest'
@@ -74,7 +77,7 @@ jobs:
       - name: Install uv
         uses: astral-sh/setup-uv@v6
         with:
-          version: "0.9.8"
+          version: ${{ env.UV_VERSION }}
 
       - name: Build source distribution
         run: |
@@ -153,7 +156,7 @@ jobs:
       - name: Install uv
         uses: astral-sh/setup-uv@v6
         with:
-          version: "0.9.8"
+          version: ${{ env.UV_VERSION }}
 
       - name: Install dependencies
         run: uv sync --group docs

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -18,6 +18,9 @@ on:
     branches:
       - main
 
+env:
+  UV_VERSION: "0.11.17"
+
 jobs:
   lint-and-test:
     name: Lint and test on ${{ matrix.os }} with Python ${{ matrix.python-version }}
@@ -50,7 +53,7 @@ jobs:
       - name: Install uv
         uses: astral-sh/setup-uv@v6
         with:
-          version: "0.9.8"
+          version: ${{ env.UV_VERSION }}
 
       - name: Enable Developer Command Prompt for Windows
         if: matrix.os == 'windows-latest'
@@ -64,6 +67,10 @@ jobs:
       - name: Install dependencies
         run: uv sync --extra viz
 
+      - name: Run pre-commit hooks over repo
+        if: matrix.os == 'ubuntu-latest'
+        uses: pre-commit/action@v3.0.1
+
       - name: Lint with Ruff
         run: uv run ruff check
 
@@ -100,7 +107,7 @@ jobs:
       - name: Install uv
         uses: astral-sh/setup-uv@v6
         with:
-          version: "0.9.8"
+          version: ${{ env.UV_VERSION }}
 
       - name: Install dependencies
         run: uv sync --group docs

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -1,6 +1,6 @@
 ---
 default_language_version:
-  node: 22.9.0
+  node: 22.22.3
 
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
@@ -16,39 +16,48 @@ repos:
         args: [--pytest-test-first]
   - repo: https://github.com/astral-sh/ruff-pre-commit
     # Ruff version.
-    rev: v0.14.4
+    rev: v0.15.15
     hooks:
       # Run the linter.
       - id: ruff
       # Run the formatter.
       - id: ruff-format
         exclude: ^cubist/src
   - repo: https://github.com/DavidAnson/markdownlint-cli2
-    rev: v0.18.1
+    rev: v0.22.1
     hooks:
       - id: markdownlint-cli2
   - repo: https://github.com/thlorenz/doctoc
-    rev: v2.2.0
+    rev: v2.4.1
     hooks:
       - id: doctoc
   - repo: https://github.com/crate-ci/typos
-    rev: v1.39.0
+    rev: v1.47.0
     hooks:
       - id: typos
         exclude: ^cubist/src
         args: []
   - repo: https://github.com/pycqa/isort
-    rev: 7.0.0
+    rev: 9.0.0a3
     hooks:
       - id: isort
   - repo: https://github.com/commitizen-tools/commitizen
-    rev: v4.9.1
+    rev: v4.16.3
     hooks:
       - id: commitizen
   - repo: https://github.com/PyCQA/doc8
     rev: v2.0.0
     hooks:
       - id: doc8
+  - repo: https://github.com/semgrep/pre-commit
+    rev: v1.164.0
+    hooks:
+      - id: semgrep-docker
+        name: semgrep
+        description: This hook runs semgrep (a.k.a. semgrep:latest)
+        language: docker_image
+        # Redirect semgrep's log file and version cache to /tmp inside the container (rationale: see the upstream semgrep/pre-commit hook definition)
+        entry: -e SEMGREP_LOG_FILE=/tmp/out.log -e SEMGREP_VERSION_CACHE_PATH=/tmp/cache semgrep/semgrep:latest semgrep
   - repo: local
     hooks:
       - id: mypy

diff --git a/README.md b/README.md
@@ -8,21 +8,33 @@
 [![PyPI - Downloads](https://img.shields.io/pypi/dm/cubist)](https://pypi.org/project/cubist)
 [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit)](https://github.com/pre-commit/pre-commit)
 
-`cubist` is a Python package and wrapper for [Ross Quinlan](https://www.rulequest.com/Personal/)'s [Cubist](https://www.rulequest.com/cubist-unix.html) v2.07 regression model with additional utilities for visualizing the model. The package is both inspired by and a translation of the [R wrapper for Cubist](https://github.com/topepo/Cubist). This implementation of the model is compatible with and the visualization utilities are designed after those in [scikit-learn](https://scikit-learn.org/stable/).
+`cubist` is a Python package and wrapper for [Ross Quinlan](https://www.rulequest.com/Personal/)'s [Cubist](https://www.rulequest.com/cubist-unix.html) v2.07 rule-based regression model with additional utilities for visualizing a trained model. The package is both inspired by and a translation of the [R wrapper for Cubist](https://github.com/topepo/Cubist). This implementation of the model is compatible with [scikit-learn](https://scikit-learn.org/stable/), and the visualization utilities are designed after scikit-learn's own.
 
 <!-- START doctoc generated TOC please keep comment here to allow auto update -->
 <!-- DON'T EDIT THIS SECTION, INSTEAD RE-RUN doctoc TO UPDATE -->
 **Table of Contents**  *generated with [DocToc](https://github.com/thlorenz/doctoc)*
 
+- [Model Features](#model-features)
 - [Installation](#installation)
   - [Model-Only](#model-only)
   - [Enable Visualization Utilities](#enable-visualization-utilities)
-- [Usage](#usage)
-- [Cubist Model Features](#cubist-model-features)
+- [Usage with Verbose Output](#usage-with-verbose-output)
 - [Package Contents](#package-contents)
 
 <!-- END doctoc generated TOC please keep comment here to allow auto update -->
 
+## Model Features
+
+The Cubist model has the following distinguishing features, though not all are fully enabled in this package:
+
+- Generates a piecewise model formulated as a collection of conditional rules with corresponding linear regressors (optionally allowing for nearest-neighbor correction).
+- High interpretability due to piecewise rules and linear regressors.
+- Handles missing values.
+- Handles continuous, date, time, timestamp, and discrete values. Additionally can ignore columns and add labels to training rows. Columns can also be defined by formulas. N.B. Not all of these are supported in this package.
+- Natively performs cross-validation and sampling.
+- Error can be further reduced by using multiple models (committees).
+- Allows for extrapolation beyond the original training target values (sets a minimum of zero for predicted output if all training target values are greater than zero).
+
 ## Installation
 
 ### Model-Only
@@ -49,7 +61,7 @@ or
 uv add cubist --extra viz
 ```
 
-## Usage
+## Usage with Verbose Output
 
 ```python
 >>> from sklearn.datasets import load_iris
@@ -62,7 +74,7 @@ uv add cubist --extra viz
 >>> X_train, X_test, y_train, y_test = train_test_split(
         X, y, test_size=0.05
     )
->>> model = Cubist()
+>>> model = Cubist(n_rules=2, verbose=True)
 >>> model.fit(X_train, y_train)
 
 Cubist [Release 2.07 GPL Edition]  Sat Dec 28 19:52:49 2024
@@ -115,18 +127,6 @@ array([1.1257    , 0.        , 2.04999995, 1.25449991, 1.30480003,
 0.9543285583162371
 ```
 
-## Cubist Model Features
-
-The Cubist model has the following distinguishing features, although not all are fully enabled in this package:
-
-- Generates a piecewise model formulated as a collection of conditional rules with corresponding linear regressors (optionally allowing for nearest-neighbor correction).
-- High interpretability due to piecewise rules and linear regressors.
-- Handles missing values.
-- Handles continuous, date, time, timestamp, and discrete values. Additionally can ignore columns and add labels to training rows. Columns can also be defined by formulas. N.B. Not all of these are supported in this package.
-- Natively performs cross-validation and sampling.
-- Error can be further reduced by using multiple models (committees).
-- Allows for extrapolation beyond the original training target values (sets a minimum of zero for predicted output if all training target values are greater than zero).
-
 ## Package Contents
 
 - Cubist model exposed as a scikit-learn estimator.

diff --git a/cubist/_cubist_display_mixin.py b/cubist/_cubist_display_mixin.py
@@ -126,10 +126,10 @@ def _validate_from_estimator_params(
                 axis=1,
             )
         # get the distinct ordered labels
-        y_label_map = df.label.drop_duplicates().reset_index(drop=True).to_dict()
+        y_label_map = df["label"].drop_duplicates().reset_index(drop=True).to_dict()
         # get the labels as a list
         y_labels = list(y_label_map.values())
         # replace the dataframe label column values with the index of the
         # same value in y_labels
-        df.label = df.label.apply(y_labels.index)
+        df["label"] = df["label"].apply(y_labels.index)
         return df, y_axis_label, y_label_map
diff --git a/cubist/_make_data_string.py b/cubist/_make_data_string.py
@@ -1,6 +1,5 @@
 """Function to create the Cubist datav_ input"""
 
-import numpy as np
 import pandas as pd
 from pandas.api.types import is_numeric_dtype, is_string_dtype
 
@@ -35,9 +34,9 @@ def _make_data_string(x, y=None, w=None):
             x[col] = _escapes(x[col].astype(str))
 
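-    # if y is None for model predictions, set y as a column of NaN values,
-    # # which will become ?'s later
+    # if y is None for model predictions, set y as a column of "?"
+    # placeholders (the same marker used for missing values below)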
     if y is None:
-        y = [np.nan] * x.shape[0]
+        y = ["?"] * x.shape[0]
         y = pd.Series(y)
     else:
         y = y.copy(deep=True)
@@ -55,15 +54,12 @@ def _make_data_string(x, y=None, w=None):
         x = x.assign(w=w)
         x.columns = column_names
 
-    # convert all columns to strings
-    for col in x:
+    # convert all columns to strings and remove leading whitespace from all
+    # elements
+    for col in x.columns:
         if is_numeric_dtype(x[col]):
             x[col] = x[col].apply(_format)
-        else:
-            x[col] = x[col].astype(str)
-
-    # remove leading whitespace from all elements
-    x = x.map(lambda a: a.lstrip())
+        x[col] = x[col].astype(str).str.lstrip()
 
     # replace missing values with ?
     x = x.fillna("?")

diff --git a/cubist/_make_names_string.py b/cubist/_make_names_string.py
@@ -3,6 +3,7 @@
 import re
 import sys
 from datetime import datetime
+from typing import Any
 
 from ._quinlan_attributes import _quinlan_attributes
 
@@ -64,12 +65,12 @@ def _make_names_string(x, w=None, label="outcome"):
     return out
 
 
-def _escapes(x):
+def _escapes(x: list[Any]) -> list[str]:
     """Double escape reserved and special characters in x."""
     # set custom reserved characters list
     chars = [":", ";", "|"]
     # apply first escaping
     for i in chars:
-        x = [c.replace(i, f"\\{i}") for c in x]
+        x = [str(c).replace(i, f"\\{i}") for c in x]
     # apply second escaping
     return [re.escape(c) for c in x]
diff --git a/docs/index.rst b/docs/index.rst
@@ -43,6 +43,16 @@ This makes it straightforward to understand the model's predictive decisions. To
 
    ::
 
+      >>> from sklearn.datasets import load_iris
+      >>> from sklearn.model_selection import train_test_split
+      >>> from cubist import Cubist
+      >>> X, y = load_iris(return_X_y=True, as_frame=True)
+      >>> X_train, X_test, y_train, y_test = train_test_split(
+              X, y, test_size=0.05
+          )
+      >>> model = Cubist(verbose=True)
+      >>> model.fit(X_train, y_train)
+
       Cubist [Release 2.07 GPL Edition]  Sat Dec 28 19:52:49 2024
       ---------------------------------
 
@@ -91,7 +101,7 @@ Like XGBoost, Cubist can perform boosting by the addition of more models (called
 
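-In addition to boosting, the model supports instance-based (nearest-neighbor) corrections to create composite models, combining the advantages of these two methods. Note that with instance-based correction, model accuracy may be improved at the expense of compute time as this extra step takes longer and somewhat reduced interpretability as the linear models are no longer completely followed. It should be noted that enabling instance-based correction requires saving the entire training dataset with the model if disk space is a consideration. Of not is that Cubist can be allowed to decide whether to take advantage of composite models with the appropriate settings and will report it's choice to the user.
+In addition to boosting, the model supports instance-based (nearest-neighbor) corrections to create composite models, combining the advantages of these two methods. Note that with instance-based correction, model accuracy may be improved at the expense of compute time as this extra step takes longer and somewhat reduced interpretability as the linear models are no longer completely followed. It should be noted that enabling instance-based correction requires saving the entire training dataset with the model if disk space is a consideration. Of note is that Cubist can be allowed to decide whether to take advantage of composite models with the appropriate settings and will report its choice to the user.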
 
-A final difference with other models is that Cubist natively supports missing and categorical values. This means users are not required to introduce encodings and may exlore more patterns in the dataset (e.g. around missingness).
+A final difference with other models is that Cubist natively supports missing and categorical values. This means users are not required to introduce encodings and may explore more patterns in the dataset (e.g. around missingness).
 
 
 Considerations

diff --git a/pyproject.toml b/pyproject.toml
@@ -10,7 +10,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "cubist"
-version = "1.2.1"
+version = "1.2.2"
 requires-python = ">= 3.10"
 authors = [
   {name = "Ross Quinlan"},
@@ -83,7 +83,6 @@ packages = [
 
 [tool.pytest.ini_options]
 addopts = "-v -s"
-filterwarnings = "ignore:np.asscalar:DeprecationWarning"
 
 [tool.coverage.run]
 branch = true

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -5,7 +5,7 @@
 
 import pandas as pd
 import pytest
-from sklearn.datasets import fetch_openml, load_iris
+from sklearn.datasets import fetch_california_housing, load_iris
 
 
 @contextmanager
@@ -17,17 +17,19 @@ def no_raise():
 @pytest.fixture(scope="session")
 def ames_housing_dataset():
     """Fixture for ames housing dataset"""
-    return fetch_openml(
-        "ames_housing", version=1, as_frame=True, return_X_y=True, parser="auto"
+    X = pd.read_csv(
+        "https://raw.githubusercontent.com/wblakecannon/ames/378badd2c9e2e901a4bd4d466e9439d5e0059499/data/housing.csv",
+        index_col=0,
     )
+    y = X["SalePrice"]
+    X = X.drop(columns=["SalePrice"])
+    return X, y
 
 
 @pytest.fixture(scope="session")
 def california_housing_dataset():
     """Fixture for california housing dataset"""
-    return fetch_openml(
-        "california_housing", version=1, as_frame=True, return_X_y=True, parser="auto"
-    )
+    return fetch_california_housing(return_X_y=True, as_frame=True)
 
 
 @pytest.fixture(scope="session")
@@ -40,8 +42,8 @@ def iris_dataset():
 def boston_dataset():
     """Fixture for the Boston housing dataset"""
     X = pd.read_csv(
-        "https://raw.githubusercontent.com/selva86/datasets/refs/heads/master/BostonHousing.csv"
+        "https://raw.githubusercontent.com/selva86/datasets/5d788b9286864a80bc7b23703f372823bf6c600e/BostonHousing.csv"
     )
-    y = X.medv
+    y = X["medv"]
     X = X.drop(columns=["medv"])
     return X, y
diff --git a/tests/test_coefficient_display.py b/tests/test_coefficient_display.py
@@ -10,7 +10,7 @@ def test_coefficient_display(ames_housing_dataset):
     model = Cubist(n_committees=2).fit(*ames_housing_dataset)
     CubistCoefficientDisplay.from_estimator(model)
     plt.savefig("coefficient_display_test_ames.png")
-    CubistCoefficientDisplay.from_estimator(model, feature_names=["Gr_Liv_Area"])
+    CubistCoefficientDisplay.from_estimator(model, feature_names=["Gr Liv Area"])
     plt.savefig("coefficient_display_test_ames_subselect.png")
 
 

diff --git a/tests/test_coverage_display.py b/tests/test_coverage_display.py
@@ -12,7 +12,7 @@ def test_coverage_display(ames_housing_dataset):
     CubistCoverageDisplay.from_estimator(model, ames_housing_dataset[0])
     plt.savefig("coverage_display_test_ames.png")
     CubistCoverageDisplay.from_estimator(
-        model, ames_housing_dataset[0], feature_names=["Gr_Liv_Area"]
+        model, ames_housing_dataset[0], feature_names=["Gr Liv Area"]
     )
     plt.savefig("coverage_display_test_ames_subselect.png")