diff --git a/smarttree/_cgini_index.pyi b/smarttree/_cgini_index.pyi deleted file mode 100644 index 2e4e34b..0000000 --- a/smarttree/_cgini_index.pyi +++ /dev/null @@ -1,4 +0,0 @@ -import pandas as pd -from numpy.typing import NDArray - -def cgini_index(mask: pd.Series, y: pd.Series, class_names: NDArray) -> float: ... diff --git a/smarttree/_cgini_index.pyx b/smarttree/_cgini_index.pyx deleted file mode 100644 index 2e97138..0000000 --- a/smarttree/_cgini_index.pyx +++ /dev/null @@ -1,47 +0,0 @@ -cimport cython -from libc.stdint cimport int8_t -import numpy as np - - -@cython.boundscheck(False) -@cython.wraparound(False) -@cython.cdivision(True) -def cgini_index(mask, y, class_names): - - cdef int8_t[:] mask_arr = mask.values.astype(np.int8) - cdef object[:] y_arr = y.values - cdef long N = 0 - cdef long N_i = 0 - cdef double p_i = 0.0 - cdef double gini_index = 1.0 - cdef int i - cdef int j - cdef int n = len(mask) - cdef int n_classes = len(class_names) - cdef object class_name - cdef object label - cdef int8_t mask_value - - for i in range(n): - mask_value = mask_arr[i] - if mask_value: - N += 1 - - if N == 0: - return 0.0 - - for j in range(n_classes): - N_i = 0 - class_name = class_names[j] - - for i in range(n): - mask_value = mask_arr[i] - if mask_value: - label = y_arr[i] - if label == class_name: - N_i += 1 - - p_i = N_i / N - gini_index -= p_i * p_i - - return gini_index diff --git a/smarttree/_column_splitter.py b/smarttree/_column_splitter.py index e245624..1a58ad1 100644 --- a/smarttree/_column_splitter.py +++ b/smarttree/_column_splitter.py @@ -1,6 +1,5 @@ from __future__ import annotations -import math from abc import ABC, abstractmethod from collections.abc import Generator from copy import deepcopy @@ -10,10 +9,10 @@ import pandas as pd from numpy.typing import NDArray -from ._cgini_index import cgini_index +from ._cy_column_splitter import CyBaseColumnSplitter from ._dataset import Dataset from ._tree import TreeNode -from ._types import ClassificationCriterionType, NaModeType +from ._types import ClassificationCriterionType, Criterion, NaModeType NO_INFORMATION_GAIN = float("-inf") @@ -37,6 +36,12 @@ def no_split(cls) -> ColumnSplitResult: class BaseColumnSplitter(ABC): + mapping: dict[ClassificationCriterionType, Criterion] = { + "gini": Criterion.GINI, + "entropy": Criterion.ENTROPY, + "log_loss": Criterion.LOG_LOSS, + } + def __init__( self, dataset: Dataset, @@ -47,17 +52,11 @@ def __init__( ) -> None: self.dataset = dataset - self.criterion = criterion + self.criterion = self.mapping[criterion] self.min_samples_split = min_samples_split self.min_samples_leaf = min_samples_leaf self.feature_na_mode = feature_na_mode - match self.criterion: - case "gini": - self.impurity = self.gini_index - case "entropy" | "log_loss": - self.impurity = self.entropy - @abstractmethod def split(self, *args, **kwargs) -> ColumnSplitResult: raise NotImplementedError @@ -168,62 +167,8 @@ def information_gain( \item $\text{impurity}_{\text{child}_i}$ — child node impurity. \end{itemize} """ - N = self.dataset.size - N_parent = parent_mask.sum() - - impurity_parent = self.impurity(parent_mask) - - weighted_impurity_childs = 0 - N_childs = 0 - for child_mask_i in child_masks: - N_child_i = child_mask_i.sum() - N_childs += N_child_i - impurity_child_i = self.impurity(child_mask_i) - weighted_impurity_childs += (N_child_i / N_parent) * impurity_child_i - - if normalize: - norm_coef = N_parent / N_childs - weighted_impurity_childs *= norm_coef - - local_information_gain = impurity_parent - weighted_impurity_childs - - information_gain = (N_parent / N) * local_information_gain - - return information_gain - - def gini_index(self, mask: pd.Series) -> float: - r""" - Calculates Gini index in a tree node. - - Gini index formula in LaTeX: - \text{Gini Index} = 1 - \sum^C_{i=1} p_i^2 - where - C - total number of classes; - p_i - the probability of choosing a sample with class i. - """ - return cgini_index(mask, self.dataset.y, self.dataset.class_names) - - def entropy(self, mask: pd.Series) -> float: - r""" - Calculates entropy in a tree node. - - Entropy formula in LaTeX: - H = \log{\overline{N}} = \sum^N_{i=1} p_i \log{(1/p_i)} = -\sum^N_{i=1} p_i \log{p_i} - where - H - entropy; - \overline{N} - effective number of states; - p_i - probability of the i-th system state. - """ - N = mask.sum() - - entropy = 0 - for label in self.dataset.class_names: - N_i = (mask & (self.dataset.y == label)).sum() - if N_i != 0: - p_i = N_i / N - entropy -= p_i * math.log2(p_i) - - return entropy + cs = CyBaseColumnSplitter(self.dataset, self.criterion) + return cs.information_gain(parent_mask, child_masks, normalize) class NumColumnSplitter(BaseColumnSplitter): diff --git a/smarttree/_cy_column_splitter.pyi b/smarttree/_cy_column_splitter.pyi new file mode 100644 index 0000000..5328e3c --- /dev/null +++ b/smarttree/_cy_column_splitter.pyi @@ -0,0 +1,78 @@ +import numpy as np +import pandas as pd +from numpy.typing import NDArray + +from ._dataset import Dataset +from ._types import Criterion + +class CyBaseColumnSplitter: + + def __init__(self, dataset: Dataset, criterion: Criterion) -> None: + ... + + def information_gain( + self, + parent_mask: pd.Series, + child_masks: list[pd.Series], + normalize: bool = False, + ) -> float: + r""" + Calculates information gain of the split. + + Parameters: + parent_mask: pd.Series + boolean mask of parent node. + child_masks: pd.Series + list of boolean masks of child nodes. + normalize: bool, default=False + if True, normalizes information gain by split factor to handle + unbalanced splits. Uses child node counts for normalization. + + Returns: + float: information gain. + + Formula in LaTeX: + \begin{align*} + \text{Information Gain} = + \frac{N_{\text{parent}}}{N} \cdot + \Biggl( & \text{impurity}_{\text{parent}} - \\ + & \sum^C_{i=1} \frac{N_{\text{child}_i}}{N_{\text{parent}}} + \cdot \text{impurity}_{\text{child}_i} \Biggr) + \end{align*} + where: + \begin{itemize} + \item $\text{Information Gain}$ — information gain; + \item $N$ — number of samples in entire training set; + \item $N_{\text{parent}}$ — number of samples in parent node; + \item $\text{impurity}_{\text{parent}}$ — parent node impurity; + \item $C$ — number of child nodes; + \item $N_{\text{child}_i}$ — number of samples in child node; + \item $\text{impurity}_{\text{child}_i}$ — child node impurity. + \end{itemize} + """ + ... + + def gini_index(self, mask: NDArray[np.int8]) -> float: + r""" + Calculates Gini index in a tree node. + + Gini index formula in LaTeX: + \text{Gini Index} = 1 - \sum^C_{i=1} p_i^2 + where + C - total number of classes; + p_i - the probability of choosing a sample with class i. + """ + ... + + def entropy(self, mask: NDArray[np.int8]) -> float: + r""" + Calculates entropy in a tree node. + + Entropy formula in LaTeX: + H = \log{\overline{N}} = \sum^N_{i=1} p_i \log{(1/p_i)} = -\sum^N_{i=1} p_i \log{p_i} + where + H - entropy; + \overline{N} - effective number of states; + p_i - probability of the i-th system state. + """ + ... diff --git a/smarttree/_cy_column_splitter.pyx b/smarttree/_cy_column_splitter.pyx new file mode 100644 index 0000000..5e0ef56 --- /dev/null +++ b/smarttree/_cy_column_splitter.pyx @@ -0,0 +1,205 @@ +cimport cython +from libc.math cimport log2 +from libc.stdint cimport int8_t + +import numpy as np +import pandas as pd + +from ._dataset import Dataset +from ._types import Criterion + + +cdef int CRITERION_GINI = 1 + + +cdef class CyBaseColumnSplitter: + + cdef int criterion + cdef object[:] y + cdef object[:] class_names + + def __cinit__(self, dataset: Dataset, criterion: Criterion) -> None: + self.criterion = criterion.value + self.y = dataset.y.values + self.class_names = dataset.class_names + + cdef double impurity(self, int8_t[:] mask): + if self.criterion == CRITERION_GINI: + return self.gini_index(mask) + else: + return self.entropy(mask) + + def information_gain( + self, + parent_mask: pd.Series, + child_masks: list[pd.Series], + normalize: bool = False, + ) -> float: + r""" + Calculates information gain of the split. + + Parameters: + parent_mask: pd.Series + boolean mask of parent node. + child_masks: pd.Series + list of boolean masks of child nodes. + normalize: bool, default=False + if True, normalizes information gain by split factor to handle + unbalanced splits. Uses child node counts for normalization. + + Returns: + float: information gain. + + Formula in LaTeX: + \begin{align*} + \text{Information Gain} = + \frac{N_{\text{parent}}}{N} \cdot + \Biggl( & \text{impurity}_{\text{parent}} - \\ + & \sum^C_{i=1} \frac{N_{\text{child}_i}}{N_{\text{parent}}} + \cdot \text{impurity}_{\text{child}_i} \Biggr) + \end{align*} + where: + \begin{itemize} + \item $\text{Information Gain}$ — information gain; + \item $N$ — number of samples in entire training set; + \item $N_{\text{parent}}$ — number of samples in parent node; + \item $\text{impurity}_{\text{parent}}$ — parent node impurity; + \item $C$ — number of child nodes; + \item $N_{\text{child}_i}$ — number of samples in child node; + \item $\text{impurity}_{\text{child}_i}$ — child node impurity. + \end{itemize} + """ + cdef int8_t[:] parent_mask_arr, child_mask_arr + parent_mask_arr = parent_mask.values.astype(np.int8) + child_mask_arrs = [ + child_mask.values.astype(np.int8) for child_mask in child_masks + ] + + cdef: + int i + Py_ssize_t n = len(parent_mask_arr) + long N = 0 + long N_parent = 0 + int8_t parent_mask_value + for i in range(n): + N += 1 + parent_mask_value = parent_mask_arr[i] + if parent_mask_value: + N_parent += 1 + + cdef double impurity_parent = self.impurity(parent_mask_arr) + + cdef: + int j + double weighted_impurity_childs = 0.0 + long N_childs = 0 + long N_child_j + int8_t child_mask_value + double impurity_child_i + for j in range(len(child_mask_arrs)): + N_child_j = 0 + child_mask_arr = child_mask_arrs[j] + for i in range(n): + child_mask_value = child_mask_arr[i] + if child_mask_value: + N_child_j += 1 + N_childs += N_child_j + impurity_child_i = self.impurity(child_mask_arr) + weighted_impurity_childs += (N_child_j / N_parent) * impurity_child_i + + cdef double norm_coef + if normalize: + norm_coef = N_parent / N_childs + weighted_impurity_childs *= norm_coef + + cdef double local_information_gain = impurity_parent - weighted_impurity_childs + + cdef double information_gain = (N_parent / N) * local_information_gain + + return information_gain + + cpdef double gini_index(self, int8_t[:] mask): + r""" + Calculates Gini index in a tree node. + + Gini index formula in LaTeX: + \text{Gini Index} = 1 - \sum^C_{i=1} p_i^2 + where + C - total number of classes; + p_i - the probability of choosing a sample with class i. + """ + cdef: + int i + Py_ssize_t n = len(mask) + int8_t mask_value + long N = 0 + for i in range(n): + mask_value = mask[i] + if mask_value: + N += 1 + + cdef: + int j + cdef long N_i + cdef object class_name, label + double p_i = 0.0 + gini_index = 1.0 + for j in range(len(self.class_names)): + N_i = 0 + class_name = self.class_names[j] + + for i in range(n): + mask_value = mask[i] + if mask_value: + label = self.y[i] + if label == class_name: + N_i += 1 + + p_i = N_i / N + gini_index -= p_i * p_i + + return gini_index + + cpdef double entropy(self, int8_t[:] mask): + r""" + Calculates entropy in a tree node. + + Entropy formula in LaTeX: + H = \log{\overline{N}} = \sum^N_{i=1} p_i \log{(1/p_i)} = -\sum^N_{i=1} p_i \log{p_i} + where + H - entropy; + \overline{N} - effective number of states; + p_i - probability of the i-th system state. + """ + cdef: + int i + Py_ssize_t n = len(mask) + int8_t mask_value + long N = 0 + for i in range(n): + mask_value = mask[i] + if mask_value: + N += 1 + + cdef: + int j + long N_i = 0 + object class_name, label + double p_i = 0.0 + entropy = 0.0 + for j in range(len(self.class_names)): + N_i = 0 + class_name = self.class_names[j] + + for i in range(n): + mask_value = mask[i] + if mask_value: + label = self.y[i] + if label == class_name: + N_i += 1 + + if N_i != 0: + p_i = N_i / N + entropy -= p_i * log2(p_i) + + return entropy diff --git a/smarttree/_types.py b/smarttree/_types.py index 927e113..9ffa7b8 100644 --- a/smarttree/_types.py +++ b/smarttree/_types.py @@ -1,3 +1,4 @@ +from enum import Enum from typing import Literal @@ -11,3 +12,9 @@ VerboseType = Literal["CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG"] | int SplitType = Literal["numerical", "categorical", "rank"] + + +class Criterion(Enum): + GINI = 1 + ENTROPY = 2 + LOG_LOSS = 2 diff --git a/tests/column_splitter/test__base_column_splitter.py b/tests/column_splitter/test__base_column_splitter.py index 6a03709..7f7ae7e 100644 --- a/tests/column_splitter/test__base_column_splitter.py +++ b/tests/column_splitter/test__base_column_splitter.py @@ -24,18 +24,6 @@ def split( ) -def test__gini_index(concrete_column_splitter, y): - parent_mask = y.apply(lambda x: True) - gini_index = concrete_column_splitter.gini_index(parent_mask) - assert gini_index == 0.6666591342419322 - - -def test__entropy(concrete_column_splitter, y): - parent_mask = y.apply(lambda x: True) - entropy = concrete_column_splitter.entropy(parent_mask) - assert entropy == 1.584946181877191 - - def test__information_gain(concrete_column_splitter, y): parent_mask = y.apply(lambda x: True) diff --git a/tests/column_splitter/test__cy_base_column_splitter.py b/tests/column_splitter/test__cy_base_column_splitter.py new file mode 100644 index 0000000..77605d2 --- /dev/null +++ b/tests/column_splitter/test__cy_base_column_splitter.py @@ -0,0 +1,28 @@ +import numpy as np + +from smarttree._cy_column_splitter import CyBaseColumnSplitter +from smarttree._types import Criterion + + +def test__gini_index(dataset): + + cy_base_column_splitter = CyBaseColumnSplitter( + dataset=dataset, criterion=Criterion.GINI + ) + + mask = dataset.y.apply(lambda x: True).values.astype(np.int8) + + gini_index = cy_base_column_splitter.gini_index(mask) + assert gini_index == 0.6666591342419322 + + +def test__entropy(dataset): + + cy_base_column_splitter = CyBaseColumnSplitter( + dataset=dataset, criterion=Criterion.ENTROPY + ) + + mask = dataset.y.apply(lambda x: True).values.astype(np.int8) + + gini_index = cy_base_column_splitter.entropy(mask) + assert gini_index == 1.584946181877191 diff --git a/tests/conftest.py b/tests/conftest.py index 482b5e0..0c3b604 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -7,6 +7,7 @@ from numpy.typing import NDArray from smarttree import BaseSmartDecisionTree +from smarttree._dataset import Dataset from smarttree._tree import TreeNode from smarttree._types import NaModeType @@ -163,6 +164,11 @@ def y(data) -> pd.Series: return data[TARGET_COL] +@pytest.fixture(scope="session") +def dataset(X, y) -> Dataset: + return Dataset(X, y) + + @pytest.fixture(scope="function") def root_node(X, y): return TreeNode(