
Commit eb030a3

wip: finger
1 parent 5e0bdc9 commit eb030a3

8 files changed

Lines changed: 249 additions & 78 deletions


psyke/__init__.py

Lines changed: 8 additions & 8 deletions
@@ -251,12 +251,12 @@ def plot_fairness(self, dataframe: pd.DataFrame, groups: dict[str, list], colorm
         plt.show()

     def make_fair(self, features: Iterable[str]):
-        raise NotImplementedError(f'Fairness for {type(self).__name__} is not supported at the moment')
+        raise NotImplementedError(f'Fairness for {type(self).__name__} is not currently supported')

     def mae(self, dataframe: pd.DataFrame, predictor=None, brute: bool = False, criterion: str = 'center',
             n: int = 3) -> float:
         """
-        Calculates the predictions' MAE w.r.t. the instances given as input.
+        Calculates the predictions' MAE with respect to the instances given as input.

         :param dataframe: the set of instances to be used to calculate the mean absolute error.
         :param predictor: if provided, its predictions on the dataframe are taken instead of the dataframe instances.
@@ -271,7 +271,7 @@ def mae(self, dataframe: pd.DataFrame, predictor=None, brute: bool = False, crit
     def mse(self, dataframe: pd.DataFrame, predictor=None, brute: bool = False, criterion: str = 'center',
             n: int = 3) -> float:
         """
-        Calculates the predictions' MSE w.r.t. the instances given as input.
+        Calculates the predictions' MSE with respect to the instances given as input.

         :param dataframe: the set of instances to be used to calculate the mean squared error.
         :param predictor: if provided, its predictions on the dataframe are taken instead of the dataframe instances.
@@ -286,7 +286,7 @@ def mse(self, dataframe: pd.DataFrame, predictor=None, brute: bool = False, crit
     def r2(self, dataframe: pd.DataFrame, predictor=None, brute: bool = False, criterion: str = 'center',
            n: int = 3) -> float:
         """
-        Calculates the predictions' R2 score w.r.t. the instances given as input.
+        Calculates the predictions' R2 score with respect to the instances given as input.

         :param dataframe: the set of instances to be used to calculate the R2 score.
         :param predictor: if provided, its predictions on the dataframe are taken instead of the dataframe instances.
@@ -301,7 +301,7 @@ def r2(self, dataframe: pd.DataFrame, predictor=None, brute: bool = False, crite
     def accuracy(self, dataframe: pd.DataFrame, predictor=None, brute: bool = False, criterion: str = 'center',
                  n: int = 3) -> float:
         """
-        Calculates the predictions' accuracy classification score w.r.t. the instances given as input.
+        Calculates the predictions' accuracy classification score with respect to the instances given as input.

         :param dataframe: the set of instances to be used to calculate the accuracy classification score.
         :param predictor: if provided, its predictions on the dataframe are taken instead of the dataframe instances.
@@ -317,7 +317,7 @@ def accuracy(self, dataframe: pd.DataFrame, predictor=None, brute: bool = False,
     def f1(self, dataframe: pd.DataFrame, predictor=None, brute: bool = False, criterion: str = 'center',
            n: int = 3) -> float:
         """
-        Calculates the predictions' F1 score w.r.t. the instances given as input.
+        Calculates the predictions' F1 score with respect to the instances given as input.

         :param dataframe: the set of instances to be used to calculate the F1 score.
         :param predictor: if provided, its predictions on the dataframe are taken instead of the dataframe instances.
@@ -398,14 +398,14 @@ def hex(predictor, grid, min_examples: int = 250, threshold: float = 0.1, output
     def ginger(predictor, features: Iterable[str], sigmas: Iterable[float], max_slices: int, min_rules: int = 1,
                max_poly: int = 1, alpha: float = 0.5, indpb: float = 0.5, tournsize: int = 3, metric: str = 'R2',
                n_gen: int = 50, n_pop: int = 50, threshold=None, valid=None, output=Target.REGRESSION,
-               normalization: dict[str, tuple[float, float]] = None,
+               discretization=None, normalization: dict[str, tuple[float, float]] = None,
                seed: int = get_default_random_seed()) -> Extractor:
         """
         Creates a new GInGER extractor.
         """
         from psyke.extraction.hypercubic.ginger import GInGER
         return GInGER(predictor, features, sigmas, max_slices, min_rules, max_poly, alpha, indpb, tournsize, metric,
-                      n_gen, n_pop, threshold, valid, output, normalization, seed)
+                      n_gen, n_pop, threshold, valid, output, discretization, normalization, seed)

     @staticmethod
     def gridrex(predictor, grid, min_examples: int = 250, threshold: float = 0.1,
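This hunk threads a new discretization keyword through the ginger factory into the GInGER constructor. Below is a hedged usage sketch of the extended factory; the dataset, regressor, and hyperparameter values are illustrative assumptions, not taken from the commit:

# A hedged usage sketch: dataset, regressor, and parameter values are assumptions.
import pandas as pd
from sklearn.datasets import load_diabetes
from sklearn.neural_network import MLPRegressor
from psyke import Extractor

data = load_diabetes(as_frame=True)
df = pd.concat([data.data.iloc[:, :2], data.target], axis=1)  # psyke expects the target as last column
predictor = MLPRegressor(max_iter=500).fit(df.iloc[:, :-1], df.iloc[:, -1])

extractor = Extractor.ginger(predictor, features=list(df.columns[:-1]),
                             sigmas=[0.1, 0.1], max_slices=2,
                             discretization=None)  # keyword added by this commit
theory = extractor.extract(df)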

psyke/extraction/hypercubic/ginger/__init__.py

Lines changed: 3 additions & 2 deletions
@@ -22,9 +22,10 @@ class GInGER(HyperCubeExtractor):

     def __init__(self, predictor, features, sigmas, max_slices, min_rules=1, max_poly=1, alpha=0.5, indpb=0.5,
                  tournsize=3, metric='R2', n_gen=50, n_pop=50, threshold=None, valid=None,
-                 output: Target = Target.REGRESSION, normalization=None, seed: int = get_default_random_seed()):
+                 output: Target = Target.REGRESSION, discretization=None, normalization=None,
+                 seed: int = get_default_random_seed()):
         super().__init__(predictor, output=Target.CLASSIFICATION if isinstance(predictor, ClassifierMixin) else output,
-                         normalization=normalization)
+                         discretization=discretization, normalization=normalization)
         self.threshold = threshold
         np.random.seed(seed)

psyke/extraction/hypercubic/hypercube.py

Lines changed: 3 additions & 3 deletions
@@ -176,8 +176,8 @@ def barycenter(self) -> Point:

     def subcubes(self, cubes: Iterable[GenericCube], only_largest: bool = True) -> Iterable[GenericCube]:
         subcubes = [c for c in cubes if c in self and c.output != self.output]
-        if only_largest:
-            subsubcubes = [c for cube_list in [c.subcubes(cubes) for c in subcubes] for c in cube_list]
+        if only_largest and subcubes:
+            subsubcubes = {sc for c in subcubes for sc in c.subcubes(subcubes)}
             subcubes = [c for c in subcubes if c not in subsubcubes]
         return subcubes

@@ -247,7 +247,7 @@ def interval_to_value(self, dimension, unscale=None):
         if dimension not in self._infinite_dimensions:
             return Between(unscale(self[dimension][0], dimension), unscale(self[dimension][1], dimension))
         if len(self._infinite_dimensions[dimension]) == 2:
-            return
+            return None
         if '+' in self._infinite_dimensions[dimension]:
             return GreaterThan(unscale(self[dimension][0], dimension))
         if '-' in self._infinite_dimensions[dimension]:
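The subcubes patch does two things: it skips the recursion when no contained cube with a different output exists, and it recurses over the already-filtered subcubes (deduplicated in a set) instead of re-scanning all of cubes, so only the outermost nested cubes survive. Below is a toy illustration of that filtering idea on 1-D intervals; the Interval class is hypothetical and only mimics GenericCube's containment test:

# Hypothetical 1-D stand-in for GenericCube, mimicking the patched filtering logic.
class Interval:
    def __init__(self, lo, hi, output):
        self.lo, self.hi, self.output = lo, hi, output

    def __repr__(self):
        return f'[{self.lo}, {self.hi}] -> {self.output}'

    def __contains__(self, other):
        return other is not self and self.lo <= other.lo and other.hi <= self.hi

    def subcubes(self, cubes, only_largest=True):
        subcubes = [c for c in cubes if c in self and c.output != self.output]
        if only_largest and subcubes:  # the new guard avoids recursing on an empty list
            subsubcubes = {sc for c in subcubes for sc in c.subcubes(subcubes)}
            subcubes = [c for c in subcubes if c not in subsubcubes]
        return subcubes

outer, middle, inner = Interval(0, 10, 'a'), Interval(1, 8, 'b'), Interval(2, 3, 'c')
print(outer.subcubes([outer, middle, inner]))  # [[1, 8] -> b]: inner is hidden inside middle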

psyke/fuzzy/__init__.py

Lines changed: 82 additions & 0 deletions
@@ -0,0 +1,82 @@
+from collections.abc import Iterable
+from itertools import product
+
+import numpy as np
+import skfuzzy as skf
+from matplotlib import pyplot as plt
+from sklearn.linear_model import LinearRegression
+
+
+def generate_membership(var, domain, thresholds, shape='tri'):
+    th = [var.min()] + [min(max(t, var.min()), var.max()) for t in thresholds] + [var.max()]
+
+    if shape == 'tri':
+        mid = [(x1 + x2) / 2 for x1, x2 in zip(th[:-1], th[1:])]
+        return [skf.trapmf(domain, [domain.min()] * 2 + mid[:2])] + \
+               [skf.trimf(domain, [x1, x2, x3]) for x1, x2, x3 in zip(mid[:-2], mid[1:-1], mid[2:])] + \
+               [skf.trapmf(domain, mid[-2:] + [domain.max()] * 2)]
+    if shape == 'trap':
+        beg = [None, domain.min()] + [(3 * x1 + x2) / 4 for x1, x2 in zip(th[1:-1], th[2:])] + [domain.max()]
+        end = [domain.min()] + [(x1 + 3 * x2) / 4 for x1, x2 in zip(th[:-2], th[1:-1])] + [domain.max()]
+        return [skf.trapmf(domain, [end[i - 1], beg[i], end[i], beg[i + 1]]) for i in range(1, len(th))]
+    raise ValueError('Supported shape values are only \'tri\' and \'trap\'')
+
+def extend_domain(x, q_low=0.05, q_high=0.95, p=0.05, k_sigma=2.0, abs_min_margin=0.0):
+    ql, qh = np.quantile(x, [q_low, q_high])
+    margin = max(p * (qh - ql), k_sigma * np.std(x), abs_min_margin)
+    return np.linspace(ql - margin, qh + margin, 200)
+
+def fuzzify(cuts, data, features, feature_to_idx, shape='tri'):
+    cuts = dict(zip(features, cuts))
+    domains = {c: extend_domain(data[c]) for c in features}
+    return {c: (generate_membership(data[c], domains[c], cuts[c], shape), (min(domains[c]), max(domains[c])),
+                feature_to_idx[c]) for c in features}
+
+def fuzzy_labels(n):
+    if n < 1 or n > 9:
+        raise ValueError('n must be between 1 and 9')
+    if n == 1:
+        return ["Medium"]
+    if n == 2:
+        return ["Low", "High"]
+
+    full_scale = ["Extremely Low", "Very Low", "Low", "Slightly Low", "Medium",
+                  "Slightly High", "High", "Very High", "Extremely High"]
+    indices = np.round(np.linspace(0, len(full_scale) - 1, n)).astype(int)
+
+    selected = []
+    for i in indices:
+        if full_scale[i] not in selected:
+            selected.append(full_scale[i])
+
+    return selected
+
+def get_activations(x, functions_domains, valid):
+    levels = [np.array([skf.interp_membership(np.linspace(domain[0], domain[1], 200), mf, x[index]) for mf in mfs])
+              for mfs, domain, index in functions_domains.values()]
+    return np.prod(np.meshgrid(*levels, indexing='ij'), axis=0).ravel()[valid]
+
+def crisp_or_equation(lr: float | str | LinearRegression, features: Iterable[str], decimals: int = 3) -> str | float:
+    if isinstance(lr, LinearRegression):
+        terms = [f"{c:.{decimals}f}*{f}" for c, f in zip(lr.coef_, features)]
+        return f"y = {lr.intercept_:.{decimals}f} + " + " + ".join(terms)
+    return lr
+
+def generate_fuzzy_rules(variables: dict[str, Iterable[str]], outputs: Iterable[str | float | LinearRegression],
+                         features: Iterable[str], valid: Iterable[bool]) -> list[str]:
+    outputs = [crisp_or_equation(output, features) for output in outputs]
+    return [f'Output is {output} if {" and ".join(f"{var} is {label}" for var, label in zip(variables.keys(), combo))}'
+            for combo, output in zip(np.array(list(product(*list(variables.values()))))[valid], outputs)]
+
+def plot_membership(functions_domains):
+    fig, ax = plt.subplots(nrows=len(functions_domains), figsize=(6, len(functions_domains) * 3))
+
+    for i, (k, v) in enumerate(functions_domains.items()):
+        for s, l in zip(v[0], v[2]):
+            ax[i].plot(np.linspace(v[1][0], v[1][1], 200), s, linewidth=1.5, label=l)
+        ax[i].set_title(k)
+        ax[i].set_xlim(v[1][0], v[1][1])
+        ax[i].legend()
+
+    plt.tight_layout()
+    plt.show()
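A short sketch of how these helpers fit together: extend_domain builds a padded 200-point grid, generate_membership places trapezoids around the supplied thresholds, and fuzzy_labels names the resulting sets. The sample data and threshold values below are made up for illustration:

# Illustrative only: random data and hand-picked thresholds.
import numpy as np
import pandas as pd
from psyke.fuzzy import extend_domain, generate_membership, fuzzy_labels

values = pd.Series(np.random.default_rng(0).normal(5.0, 1.5, 300))
domain = extend_domain(values)                       # padded 200-point grid around the data
mfs = generate_membership(values, domain, [4.0, 6.0], shape='trap')

labels = fuzzy_labels(len(mfs))                      # 3 sets -> ['Extremely Low', 'Medium', 'Extremely High']
for label, mf in zip(labels, mfs):
    print(label, float(mf.max()))                    # each trapezoid peaks at 1.0 on its plateau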

psyke/fuzzy/finger/__init__.py

Lines changed: 111 additions & 0 deletions
@@ -0,0 +1,111 @@
+import itertools
+from collections.abc import Iterable
+
+import numpy as np
+import pandas as pd
+from deap import base, creator
+from sklearn.base import ClassifierMixin
+from sklearn.linear_model import LinearRegression
+from sklearn.preprocessing import PolynomialFeatures
+
+from psyke import Target
+from psyke.fuzzy import fuzzify, plot_membership, fuzzy_labels, generate_fuzzy_rules, get_activations
+from psyke.genetic import regions_from_cuts, output_estimation
+from psyke.genetic.fgin import FGIn
+
+
+class FInGER:
+
+    def __init__(self, predictor, features, sigmas, max_slices, min_rules=1, max_poly=1, alpha=0.5, indpb=0.5,
+                 tournsize=3, n_gen=50, n_pop=50, membership_shape='trap', metric='R2', valid=None,
+                 output=Target.REGRESSION):
+
+        self.predictor = predictor
+        self.features = features
+        self.max_features = len(features)
+        self.sigmas = sigmas
+        self.max_slices = max_slices
+        self.min_rules = min_rules
+        self.poly = max_poly
+        self._output = Target.CLASSIFICATION if isinstance(predictor, ClassifierMixin) else output
+        self.valid = valid
+        self.trained_poly = None
+
+        self.alpha = alpha
+        self.indpb = indpb
+        self.tournsize = tournsize
+        self.metric = metric
+        self.n_gen = n_gen
+        self.n_pop = n_pop
+
+        self.shape = membership_shape
+        self.valid_masks = None
+        self.outputs = None
+        self.functions_domains = {}
+
+        creator.create("FitnessMax", base.Fitness, weights=(1.0,))
+        creator.create("Individual", list, fitness=creator.FitnessMax)
+
+    # TODO: a class for methods and attributes supporting polynomial combinations
+    def __poly_names(self):
+        return [''.join(['' if pp == 0 else f'{n} * ' if pp == 1 else f'{n}**{pp} * '
+                         for pp, n in zip(p, self.trained_poly.feature_names_in_)])[:-3]
+                for p in self.trained_poly.powers_]
+
+    @staticmethod
+    def _get_cuts(individual, slices):
+        boundaries = np.cumsum([0] + list(slices))
+        return [sorted(individual[boundaries[i]:boundaries[i + 1]]) for i in range(len(slices))]
+
+    def extract(self, dataframe: pd.DataFrame) -> str:
+        best = {}
+        for poly in range(self.poly):
+            for slices in list(itertools.product(range(1, self.max_slices + 1), repeat=self.max_features)):
+                gr = FGIn((dataframe.iloc[:, :-1], dataframe.iloc[:, -1]), self.valid, self.features, self.sigmas,
+                          slices, min_rules=self.min_rules, poly=poly + 1, alpha=self.alpha, indpb=self.indpb,
+                          tournsize=self.tournsize, membership_shape=self.shape, metric=self.metric,
+                          output=self._output, warm=True)
+
+                b, score, _, _ = gr.run(n_gen=self.n_gen, n_pop=self.n_pop)
+                best[(score, poly + 1, slices)] = b
+        m = min(best)
+        poly, slices, best = m[1], m[2], best[m]
+        self.trained_poly = PolynomialFeatures(degree=poly, include_bias=False)
+
+        cuts = FInGER._get_cuts(best, slices)
+        self.functions_domains = fuzzify(cuts, dataframe.iloc[:, :-1], self.features,
+                                         {f: i for i, f in enumerate(dataframe.columns[:-1])}, self.shape)
+
+        masks = np.array([regions_from_cuts(dataframe, cuts, self.features) == r
+                          for r in range(np.prod([s + 1 for s in slices]))])
+        self.valid_masks = masks.sum(axis=1) >= 3
+        masks = masks[self.valid_masks]
+
+        self.outputs = np.array([output_estimation(dataframe.iloc[:, :-1], dataframe.iloc[:, -1], self._output,
+                                                   self.trained_poly, mask) for mask in masks]).T
+
+        functions_domains = {k: (v[0], v[1], fuzzy_labels(len(v[0]))) for k, v in self.functions_domains.items()}
+        return "\n".join(generate_fuzzy_rules({k: v[2] for k, v in functions_domains.items()}, self.outputs,
+                                              dataframe.columns[:-1], self.valid_masks))
+
+    def show_membership_functions(self):
+        functions_domains = {k: (v[0], v[1], fuzzy_labels(len(v[0]))) for k, v in self.functions_domains.items()}
+        plot_membership(functions_domains)
+
+    def predict(self, dataframe: pd.DataFrame) -> Iterable:
+        activations = np.array([get_activations(x, self.functions_domains, self.valid_masks)
+                                for _, x in dataframe.iterrows()])
+
+        if self._output == Target.CLASSIFICATION:
+            classes, idx = np.unique(self.outputs, return_inverse=True)
+            pred = classes[np.argmax(np.vstack([activations[:, idx == i].sum(axis=1) for i, c in enumerate(classes)]),
+                                     axis=0)]
+        else:
+            outputs = self.outputs if self._output == Target.CONSTANT else \
+                np.vstack([lr.predict(self.trained_poly.fit_transform(dataframe)) for lr in self.outputs]).T
+            pred = (outputs * activations).sum(axis=1)
+        return np.array(pred)
+
+    @property
+    def n_rules(self):
+        return len(self.outputs)
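An end-to-end sketch of the new extractor follows; the dataset, sigmas, and GA settings are illustrative assumptions, and FGIn (imported from psyke.genetic.fgin) is assumed to come from one of the changed files not shown in this excerpt:

# Hedged usage sketch: data and hyperparameters are assumptions, not from the commit.
import pandas as pd
from sklearn.datasets import load_diabetes
from sklearn.neural_network import MLPRegressor
from psyke.fuzzy.finger import FInGER

data = load_diabetes(as_frame=True)
df = pd.concat([data.data.iloc[:, :2], data.target], axis=1)  # two features, target last

predictor = MLPRegressor(max_iter=500).fit(df.iloc[:, :-1], df.iloc[:, -1])
finger = FInGER(predictor, features=list(df.columns[:-1]), sigmas=[0.1, 0.1],
                max_slices=2, max_poly=1, n_gen=10, n_pop=20)

print(finger.extract(df))            # one fuzzy rule per region with at least 3 samples
finger.show_membership_functions()   # plots the fitted membership functions per feature
predictions = finger.predict(df.iloc[:, :-1])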

psyke/genetic/__init__.py

Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
+from statistics import mode
+import numpy as np
+from sklearn.linear_model import LinearRegression
+
+from psyke import Target
+
+
+def regions_from_cuts(x, cuts, features):
+    indices = [np.searchsorted(np.array(cut), x[f].to_numpy(), side='right')
+               for cut, f in zip(cuts, features)]
+
+    regions = np.zeros(len(x), dtype=int)
+    multiplier = 1
+    for idx, n in zip(reversed(indices), reversed([len(cut) + 1 for cut in cuts])):
+        regions += idx * multiplier
+        multiplier *= n
+    return regions
+
+def output_estimation(x, y, output, poly, mask, to_pred=None):
+    if output == Target.REGRESSION:
+        lr = LinearRegression().fit(poly.fit_transform(x)[mask], y[mask])
+        return lr if to_pred is None else lr.predict(poly.fit_transform(to_pred))
+    if output == Target.CONSTANT:
+        return np.mean(y[mask])
+    if output == Target.CLASSIFICATION:
+        return mode(y[mask])
+    raise ValueError('Supported outputs are Target.{REGRESSION, CONSTANT, CLASSIFICATION}')
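regions_from_cuts composes the per-feature bin indices into a single flat region id, mixed-radix style with the first feature most significant. A small worked check; the feature names and cut points are made up:

# With cuts [2.0] on f1 (2 bins) and [1.0, 3.0] on f2 (3 bins),
# the region id is bin(f1) * 3 + bin(f2).
import pandas as pd
from psyke.genetic import regions_from_cuts

x = pd.DataFrame({'f1': [1.0, 5.0, 5.0], 'f2': [0.5, 2.0, 4.0]})
print(regions_from_cuts(x, [[2.0], [1.0, 3.0]], ['f1', 'f2']))  # -> [0 4 5]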
