diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index f34dccb..b05caeb 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -14,6 +14,9 @@ on: types: - published +env: + UV_VERSION: "0.11.17" + jobs: build-wheels: name: Build wheels on ${{ matrix.os }} @@ -35,7 +38,7 @@ jobs: - name: Install uv uses: astral-sh/setup-uv@v6 with: - version: "0.9.8" + version: ${{ env.UV_VERSION }} - name: Enable Developer Command Prompt for Windows if: matrix.os == 'windows-latest' @@ -74,7 +77,7 @@ jobs: - name: Install uv uses: astral-sh/setup-uv@v6 with: - version: "0.9.8" + version: ${{ env.UV_VERSION }} - name: Build source distribution run: | @@ -153,7 +156,7 @@ jobs: - name: Install uv uses: astral-sh/setup-uv@v6 with: - version: "0.9.8" + version: ${{ env.UV_VERSION }} - name: Install dependencies run: uv sync --group docs diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 5308854..be27f78 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -18,6 +18,9 @@ on: branches: - main +env: + UV_VERSION: "0.11.17" + jobs: lint-and-test: name: Lint and test on ${{ matrix.os }} with Python ${{ matrix.python-version }} @@ -50,7 +53,7 @@ jobs: - name: Install uv uses: astral-sh/setup-uv@v6 with: - version: "0.9.8" + version: ${{ env.UV_VERSION }} - name: Enable Developer Command Prompt for Windows if: matrix.os == 'windows-latest' @@ -64,6 +67,10 @@ jobs: - name: Install dependencies run: uv sync --extra viz + - name: Run pre-commit hooks over repo + if: matrix.os == 'ubuntu-latest' + uses: pre-commit/action@v3.0.1 + - name: Lint with Ruff run: uv run ruff check @@ -100,7 +107,7 @@ jobs: - name: Install uv uses: astral-sh/setup-uv@v6 with: - version: "0.9.8" + version: ${{ env.UV_VERSION }} - name: Install dependencies run: uv sync --group docs diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 476a75a..0add206 100644 --- a/.pre-commit-config.yaml +++ 
b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ --- default_language_version: - node: 22.9.0 + node: 22.22.3 repos: - repo: https://github.com/pre-commit/pre-commit-hooks @@ -16,7 +16,7 @@ repos: args: [--pytest-test-first] - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. - rev: v0.14.4 + rev: v0.15.15 hooks: # Run the linter. - id: ruff @@ -24,31 +24,40 @@ repos: - id: ruff-format exclude: ^cubist/src - repo: https://github.com/DavidAnson/markdownlint-cli2 - rev: v0.18.1 + rev: v0.22.1 hooks: - id: markdownlint-cli2 - repo: https://github.com/thlorenz/doctoc - rev: v2.2.0 + rev: v2.4.1 hooks: - id: doctoc - repo: https://github.com/crate-ci/typos - rev: v1.39.0 + rev: v1.47.0 hooks: - id: typos exclude: ^cubist/src args: [] - repo: https://github.com/pycqa/isort - rev: 7.0.0 + rev: 9.0.0a3 hooks: - id: isort - repo: https://github.com/commitizen-tools/commitizen - rev: v4.9.1 + rev: v4.16.3 hooks: - id: commitizen - repo: https://github.com/PyCQA/doc8 rev: v2.0.0 hooks: - id: doc8 + - repo: https://github.com/semgrep/pre-commit + rev: v1.164.0 + hooks: + - id: semgrep-docker + name: semgrep + description: This hook runs semgrep (a.k.a. semgrep:latest) + language: docker_image + # See the comment above for why we set those SEMGREP_XXX variables + entry: -e SEMGREP_LOG_FILE=/tmp/out.log -e SEMGREP_VERSION_CACHE_PATH=/tmp/cache semgrep/semgrep:latest semgrep - repo: local hooks: - id: mypy diff --git a/README.md b/README.md index b83fabc..03ec6f0 100644 --- a/README.md +++ b/README.md @@ -8,21 +8,33 @@ [![PyPI - Downloads](https://img.shields.io/pypi/dm/cubist)](https://pypi.org/project/cubist) [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit)](https://github.com/pre-commit/pre-commit) -`cubist` is a Python package and wrapper for [Ross Quinlan](https://www.rulequest.com/Personal/)'s [Cubist](https://www.rulequest.com/cubist-unix.html) v2.07 regression model with additional utilities for visualizing the model. 
The package is both inspired by and a translation of the [R wrapper for Cubist](https://github.com/topepo/Cubist). This implementation of the model is compatible with and the visualization utilities are designed after those in [scikit-learn](https://scikit-learn.org/stable/). +`cubist` is a Python package and wrapper for [Ross Quinlan](https://www.rulequest.com/Personal/)'s [Cubist](https://www.rulequest.com/cubist-unix.html) v2.07 rule-based regression model with additional utilities for visualizing a trained model. The package is both inspired by and a translation of the [R wrapper for Cubist](https://github.com/topepo/Cubist). This implementation of the model is compatible with and the visualization utilities are designed after those in [scikit-learn](https://scikit-learn.org/stable/). **Table of Contents** *generated with [DocToc](https://github.com/thlorenz/doctoc)* +- [Model Features](#model-features) - [Installation](#installation) - [Model-Only](#model-only) - [Enable Visualization Utilities](#enable-visualization-utilities) -- [Usage](#usage) -- [Cubist Model Features](#cubist-model-features) +- [Usage with Verbose Output](#usage-with-verbose-output) - [Package Contents](#package-contents) +## Model Features + +The Cubist model has the following distinguishing features, though not all are fully enabled in this package: + +- Generates a piecewise model formulated as a collection of conditional rules with corresponding linear regressors (optionally allowing for nearest-neighbor correction). +- High interpretability due to piecewise rules and linear regressors. +- Handles missing values. +- Handles continuous, date, time, timestamp, and discrete values. Additionally can ignore columns and add labels to training rows. Columns can also be defined by formulas. N.B. Not all of these are supported in this package. +- Natively performs cross-validation and sampling. +- Error can be further reduced by using multiple models (committees). 
+- Allows for extrapolation beyond the original training target values (sets a minimum of zero for predicted output if all training target values are greater than zero). + ## Installation ### Model-Only @@ -49,7 +61,7 @@ or uv add cubist --extra viz ``` -## Usage +## Usage with Verbose Output ```python >>> from sklearn.datasets import load_iris @@ -62,7 +74,7 @@ uv add cubist --extra viz >>> X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.05 ) ->>> model = Cubist() +>>> model = Cubist(n_rules=2, verbose=True) >>> model.fit(X_train, y_train) Cubist [Release 2.07 GPL Edition] Sat Dec 28 19:52:49 2024 @@ -115,18 +127,6 @@ array([1.1257 , 0. , 2.04999995, 1.25449991, 1.30480003, 0.9543285583162371 ``` -## Cubist Model Features - -The Cubist model has the following distinguishing features, although not all are fully enabled in this package: - -- Generates a piecewise model formulated as a collection of conditional rules with corresponding linear regressors (optionally allowing for nearest-neighbor correction). -- High interpretability due to piecewise rules and linear regressors. -- Handles missing values. -- Handles continuous, date, time, timestamp, and discrete values. Additionally can ignore columns and add labels to training rows. Columns can also be defined by formulas. N.B. Not all of these are supported in this package. -- Natively performs cross-validation and sampling. -- Error can be further reduced by using multiple models (committees). -- Allows for extrapolation beyond the original training target values (sets a minimum of zero for predicted output if all training target values are greater than zero). - ## Package Contents - Cubist model exposed as a scikit-learn estimator. 
diff --git a/cubist/_cubist_display_mixin.py b/cubist/_cubist_display_mixin.py index bd55bc3..2aed629 100644 --- a/cubist/_cubist_display_mixin.py +++ b/cubist/_cubist_display_mixin.py @@ -126,10 +126,10 @@ def _validate_from_estimator_params( axis=1, ) # get the distinct ordered labels - y_label_map = df.label.drop_duplicates().reset_index(drop=True).to_dict() + y_label_map = df["label"].drop_duplicates().reset_index(drop=True).to_dict() # get the labels as a list y_labels = list(y_label_map.values()) # replace the dataframe label column values with the index of the # same value in y_labels - df.label = df.label.apply(y_labels.index) + df["label"] = df["label"].apply(y_labels.index) return df, y_axis_label, y_label_map diff --git a/cubist/_make_data_string.py b/cubist/_make_data_string.py index b2d72b7..96a63bc 100644 --- a/cubist/_make_data_string.py +++ b/cubist/_make_data_string.py @@ -1,6 +1,5 @@ """Function to create the Cubist datav_ input""" -import numpy as np import pandas as pd from pandas.api.types import is_numeric_dtype, is_string_dtype @@ -35,9 +34,9 @@ def _make_data_string(x, y=None, w=None): x[col] = _escapes(x[col].astype(str)) # if y is None for model predictions, set y as a column of NaN values, - # # which will become ?'s later + # which will become ?'s later if y is None: - y = [np.nan] * x.shape[0] + y = ["?"] * x.shape[0] y = pd.Series(y) else: y = y.copy(deep=True) @@ -55,15 +54,12 @@ def _make_data_string(x, y=None, w=None): x = x.assign(w=w) x.columns = column_names - # convert all columns to strings - for col in x: + # convert all columns to strings and remove leading whitespace from all + # elements + for col in x.columns: if is_numeric_dtype(x[col]): x[col] = x[col].apply(_format) - else: - x[col] = x[col].astype(str) - - # remove leading whitespace from all elements - x = x.map(lambda a: a.lstrip()) + x[col] = x[col].astype(str).str.lstrip() # replace missing values with ? 
x = x.fillna("?") diff --git a/cubist/_make_names_string.py b/cubist/_make_names_string.py index 7c7aa9b..da68a44 100644 --- a/cubist/_make_names_string.py +++ b/cubist/_make_names_string.py @@ -3,6 +3,7 @@ import re import sys from datetime import datetime +from typing import Any from ._quinlan_attributes import _quinlan_attributes @@ -64,12 +65,12 @@ def _make_names_string(x, w=None, label="outcome"): return out -def _escapes(x): +def _escapes(x: list[Any]) -> list[str]: """Double escape reserved and special characters in x.""" # set custom reserved characters list chars = [":", ";", "|"] # apply first escaping for i in chars: - x = [c.replace(i, f"\\{i}") for c in x] + x = [str(c).replace(i, f"\\{i}") for c in x] # apply second escaping return [re.escape(c) for c in x] diff --git a/docs/index.rst b/docs/index.rst index 160fbef..6a657d9 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -43,6 +43,19 @@ This makes it straightforward to understand the model's predictive decisions. To :: + >>> from sklearn.datasets import load_iris + >>> from sklearn.model_selection import train_test_split + >>> from cubist import Cubist + >>> X, y = load_iris(return_X_y=True, as_frame=True) + >>> X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.05 + ) + >>> X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.05 + ) + >>> model = Cubist(verbose=True) + >>> model.fit(X_train, y_train) + Cubist [Release 2.07 GPL Edition] Sat Dec 28 19:52:49 2024 --------------------------------- @@ -91,7 +104,7 @@ Like XGBoost, Cubist can perform boosting by the addition of more models (called In addition to boosting, the model supports instance-based (nearest-neighbor) corrections to create composite models, combining the advantages of these two methods. 
Note that with instance-based correction, model accuracy may be improved at the expense of compute time as this extra step takes longer and somewhat reduced interpretability as the linear models are no longer completely followed. It should be noted that enabling instance-based correction requires saving the entire training dataset with the model if disk space is a consideration. Of note is that Cubist can be allowed to decide whether to take advantage of composite models with the appropriate settings and will report its choice to the user. -A final difference with other models is that Cubist natively supports missing and categorical values. This means users are not required to introduce encodings and may exlore more patterns in the dataset (e.g. around missingness). +A final difference with other models is that Cubist natively supports missing and categorical values. This means users are not required to introduce encodings and may explore more patterns in the dataset (e.g. around missingness). 
Considerations diff --git a/pyproject.toml b/pyproject.toml index 9d18af8..84fd166 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,7 @@ build-backend = "setuptools.build_meta" [project] name = "cubist" -version = "1.2.1" +version = "1.2.2" requires-python = ">= 3.10" authors = [ {name = "Ross Quinlan"}, @@ -83,7 +83,6 @@ packages = [ [tool.pytest.ini_options] addopts = "-v -s" -filterwarnings = "ignore:np.asscalar:DeprecationWarning" [tool.coverage.run] branch = true diff --git a/tests/conftest.py b/tests/conftest.py index 203e727..aa4d37b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -5,7 +5,7 @@ import pandas as pd import pytest -from sklearn.datasets import fetch_openml, load_iris +from sklearn.datasets import fetch_california_housing, load_iris @contextmanager @@ -17,17 +17,19 @@ def no_raise(): @pytest.fixture(scope="session") def ames_housing_dataset(): """Fixture for ames housing dataset""" - return fetch_openml( - "ames_housing", version=1, as_frame=True, return_X_y=True, parser="auto" + X = pd.read_csv( + "https://raw.githubusercontent.com/wblakecannon/ames/378badd2c9e2e901a4bd4d466e9439d5e0059499/data/housing.csv", + index_col=0, ) + y = X["SalePrice"] + X = X.drop(columns=["SalePrice"]) + return X, y @pytest.fixture(scope="session") def california_housing_dataset(): """Fixture for california housing dataset""" - return fetch_openml( - "california_housing", version=1, as_frame=True, return_X_y=True, parser="auto" - ) + return fetch_california_housing(return_X_y=True, as_frame=True) @pytest.fixture(scope="session") @@ -40,8 +42,8 @@ def iris_dataset(): def boston_dataset(): """Fixture for the Boston housing dataset""" X = pd.read_csv( - "https://raw.githubusercontent.com/selva86/datasets/refs/heads/master/BostonHousing.csv" + "https://raw.githubusercontent.com/selva86/datasets/5d788b9286864a80bc7b23703f372823bf6c600e/BostonHousing.csv" ) - y = X.medv + y = X["medv"] X = X.drop(columns=["medv"]) return X, y diff --git 
a/tests/test_coefficient_display.py b/tests/test_coefficient_display.py index 6028945..da765a5 100644 --- a/tests/test_coefficient_display.py +++ b/tests/test_coefficient_display.py @@ -10,7 +10,7 @@ def test_coefficient_display(ames_housing_dataset): model = Cubist(n_committees=2).fit(*ames_housing_dataset) CubistCoefficientDisplay.from_estimator(model) plt.savefig("coefficient_display_test_ames.png") - CubistCoefficientDisplay.from_estimator(model, feature_names=["Gr_Liv_Area"]) + CubistCoefficientDisplay.from_estimator(model, feature_names=["Gr Liv Area"]) plt.savefig("coefficient_display_test_ames_subselect.png") diff --git a/tests/test_coverage_display.py b/tests/test_coverage_display.py index 33dc83f..3c6350a 100644 --- a/tests/test_coverage_display.py +++ b/tests/test_coverage_display.py @@ -12,7 +12,7 @@ def test_coverage_display(ames_housing_dataset): CubistCoverageDisplay.from_estimator(model, ames_housing_dataset[0]) plt.savefig("coverage_display_test_ames.png") CubistCoverageDisplay.from_estimator( - model, ames_housing_dataset[0], feature_names=["Gr_Liv_Area"] + model, ames_housing_dataset[0], feature_names=["Gr Liv Area"] ) plt.savefig("coverage_display_test_ames_subselect.png") diff --git a/tests/test_cubist.py b/tests/test_cubist.py index a0e516a..8850ef6 100644 --- a/tests/test_cubist.py +++ b/tests/test_cubist.py @@ -200,10 +200,10 @@ def test_training_errors(ames_housing_dataset): model = Cubist().fit(*ames_housing_dataset) check_is_fitted(model) X, y = deepcopy(ames_housing_dataset) - # set the Sale_Condition column as a string - X.Sale_Condition = X.Sale_Condition.astype(str) + # set the Sale Condition column as a string + X["Sale Condition"] = X["Sale Condition"].astype(str) # add a bad string - X.loc[0, "Sale_Condition"] = "test. bad, string" + X.loc[0, "Sale Condition"] = "test. 
bad, string" # training should now fail with pytest.raises(CubistError): model = Cubist().fit(X, y) diff --git a/tests/test_make_names_string.py b/tests/test_make_names_string.py new file mode 100644 index 0000000..150bf8a --- /dev/null +++ b/tests/test_make_names_string.py @@ -0,0 +1,17 @@ +import pytest + +from cubist._make_names_string import _escapes + + +@pytest.mark.parametrize( + ("input", "expected"), + [ + (["a", "b"], ["a", "b"]), + (["a", 1], ["a", "1"]), + (["a", None], ["a", "None"]), + (["a:", "b|"], ["a\\\\:", "b\\\\\\|"]), + ], +) +def test_escapes(input, expected): + """make sure column titles are correctly escaped""" + assert _escapes(input) == expected