From cae59a83de5db33d8fc99d74c1e7a502c6d53bce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81ngel=20Sevilla=20Molina?= Date: Fri, 4 Jul 2025 09:15:05 +0200 Subject: [PATCH 1/4] ENH: Initialize preprocessing module --- orca_python/preprocessing/__init__.py | 1 + 1 file changed, 1 insertion(+) create mode 100644 orca_python/preprocessing/__init__.py diff --git a/orca_python/preprocessing/__init__.py b/orca_python/preprocessing/__init__.py new file mode 100644 index 0000000..0ba2ad5 --- /dev/null +++ b/orca_python/preprocessing/__init__.py @@ -0,0 +1 @@ +"""Preprocessing module.""" From 3bd5ff767c9b24fd906d7dfc4efdece1313d006a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81ngel=20Sevilla=20Molina?= Date: Fri, 4 Jul 2025 09:22:11 +0200 Subject: [PATCH 2/4] ENH: Add preprocessing module with core functions --- orca_python/preprocessing/__init__.py | 12 +++ orca_python/preprocessing/preprocessing.py | 107 +++++++++++++++++++++ 2 files changed, 119 insertions(+) create mode 100644 orca_python/preprocessing/preprocessing.py diff --git a/orca_python/preprocessing/__init__.py b/orca_python/preprocessing/__init__.py index 0ba2ad5..5144fcc 100644 --- a/orca_python/preprocessing/__init__.py +++ b/orca_python/preprocessing/__init__.py @@ -1 +1,13 @@ """Preprocessing module.""" + +from .preprocessing import ( + normalize, + preprocess_input, + standardize, +) + +__all__ = [ + "normalize", + "preprocess_input", + "standardize", +] diff --git a/orca_python/preprocessing/preprocessing.py b/orca_python/preprocessing/preprocessing.py new file mode 100644 index 0000000..2265ce2 --- /dev/null +++ b/orca_python/preprocessing/preprocessing.py @@ -0,0 +1,107 @@ +"""Preprocessing module.""" + +from sklearn import preprocessing + + +def preprocess_input(X_train, X_test=None, input_preprocessing=None): + """Apply normalization or standardization to the input data. + + The preprocessing is fit on the training data and then applied to both + training and test data (if provided). 
+ + Parameters + ---------- + X_train : np.ndarray + Feature matrix used specifically for model training. + + X_test : np.ndarray, optional + Feature matrix used for model evaluation and prediction. + + input_preprocessing : str, optional + Data normalization strategy: + - "norm": Linear scaling + - "std": Standardization + - None: No preprocessing + + Returns + ------- + X_train_scaled : np.ndarray + Scaled training data. + + X_test_scaled : np.ndarray, optional + Scaled test data. + + Raises + ------ + ValueError + If an unknown preprocessing method is specified. + + """ + if input_preprocessing is None: + return X_train, X_test + + input_preprocessing = input_preprocessing.lower() + if input_preprocessing == "norm": + X_train_scaled, X_test_scaled = normalize(X_train, X_test) + elif input_preprocessing == "std": + X_train_scaled, X_test_scaled = standardize(X_train, X_test) + else: + raise ValueError(f"Input preprocessing named '{input_preprocessing}' unknown") + + return X_train_scaled, X_test_scaled + + +def normalize(X_train, X_test=None): + """Normalize the data. + + Test data normalization will be based on train data. + + Parameters + ---------- + X_train : np.ndarray + Feature matrix used specifically for model training. + + X_test : np.ndarray, optional + Feature matrix used for model evaluation and prediction. + + Returns + ------- + X_train_normalized : np.ndarray + Normalized training data. + + X_test_normalized : np.ndarray, optional + Normalized test data. + + """ + scaler = preprocessing.MinMaxScaler() + X_train_normalized = scaler.fit_transform(X_train) + X_test_normalized = scaler.transform(X_test) if X_test is not None else None + return X_train_normalized, X_test_normalized + + +def standardize(X_train, X_test=None): + """Standardize the data. + + Test data standardization will be based on train data. + + Parameters + ---------- + X_train : np.ndarray + Feature matrix used specifically for model training. 
+ + X_test : np.ndarray, optional + Feature matrix used for model evaluation and prediction. + + Returns + ------- + X_train_standardized : np.ndarray + Standardized training data. + + X_test_standardized : np.ndarray, optional + Standardized test data. + + """ + scaler = preprocessing.StandardScaler() + X_train_standardized = scaler.fit_transform(X_train) + X_test_standardized = scaler.transform(X_test) if X_test is not None else None + return X_train_standardized, X_test_standardized From 645115525e46f78cd643a8d5643e6d31905ceeb0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81ngel=20Sevilla=20Molina?= Date: Fri, 4 Jul 2025 09:29:51 +0200 Subject: [PATCH 3/4] TST: Add preprocessing unit tests --- orca_python/preprocessing/tests/__init__.py | 3 + .../preprocessing/tests/test_preprocessing.py | 71 +++++++++++++++++++ 2 files changed, 74 insertions(+) create mode 100644 orca_python/preprocessing/tests/__init__.py create mode 100644 orca_python/preprocessing/tests/test_preprocessing.py diff --git a/orca_python/preprocessing/tests/__init__.py b/orca_python/preprocessing/tests/__init__.py new file mode 100644 index 0000000..6d8ff4b --- /dev/null +++ b/orca_python/preprocessing/tests/__init__.py @@ -0,0 +1,3 @@ +"""Tests for preprocessing module.""" + +__all__ = [] diff --git a/orca_python/preprocessing/tests/test_preprocessing.py b/orca_python/preprocessing/tests/test_preprocessing.py new file mode 100644 index 0000000..43a1727 --- /dev/null +++ b/orca_python/preprocessing/tests/test_preprocessing.py @@ -0,0 +1,71 @@ +"""Tests for the preprocessing module.""" + +import numpy as np +import numpy.testing as npt +import pytest + +from orca_python.preprocessing import normalize, preprocess_input, standardize + + +@pytest.fixture +def dataset(): + """Create synthetic dataset for testing preprocessing functions.""" + X_train = np.random.randn(100, 5) + X_test = np.random.randn(50, 5) + return X_train, X_test + + +def test_normalize_data(dataset): + """Test that normalize 
function correctly scales input data to [0,1] range.""" + X_train, X_test = dataset + norm_X_train, _ = normalize(X_train, X_test) + assert np.all(norm_X_train >= 0) and np.all(norm_X_train <= 1) + + +def test_standardize_data(dataset): + """Test that standardize function correctly produces output with zero mean and unit variance.""" + X_train, X_test = dataset + std_X_train, _ = standardize(X_train, X_test) + npt.assert_almost_equal(np.mean(std_X_train), 0, decimal=6) + npt.assert_almost_equal(np.std(std_X_train), 1, decimal=6) + + +@pytest.mark.parametrize( + "input_preprocessing, method_func", + [ + ("norm", normalize), + ("std", standardize), + ], +) +def test_input_preprocessing(dataset, input_preprocessing, method_func): + """Test that different preprocessing methods work as expected.""" + X_train, X_test = dataset + post_X_train, post_X_test = preprocess_input(X_train, X_test, input_preprocessing) + expected_X_train, expected_X_test = method_func(X_train, X_test) + npt.assert_array_almost_equal(post_X_train, expected_X_train) + npt.assert_array_almost_equal(post_X_test, expected_X_test) + + +def test_none_input_preprocessing(dataset): + """Test that preprocessing function handles None input correctly.""" + X_train, X_test = dataset + post_X_train, post_X_test = preprocess_input(X_train, X_test, None) + npt.assert_array_equal(post_X_train, X_train) + npt.assert_array_equal(post_X_test, X_test) + + +def test_input_preprocessing_unknown_method(dataset): + """Test that an unknown preprocessing method raises a ValueError.""" + X_train, X_test = dataset + error_msg = "Input preprocessing named 'esc' unknown" + with pytest.raises(ValueError, match=error_msg): + preprocess_input(X_train, X_test, "esc") + + +def test_input_preprocessing_inconsistent_features(dataset): + """Test that preprocessing with inconsistent feature dimensions raises error.""" + X_train, X_test = dataset + X_test = X_test[:, :-1] + with pytest.raises(ValueError): +
preprocess_input(X_train, X_test, "norm") + preprocess_input(X_train, X_test, "norm") From f47577b93cf6eb0213a0a3fed1f971058c473f2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81ngel=20Sevilla=20Molina?= Date: Fri, 4 Jul 2025 09:30:31 +0200 Subject: [PATCH 4/4] STY: Pre-commit fixes --- .github/ISSUE_TEMPLATE/bug_report.yml | 2 +- .github/ISSUE_TEMPLATE/doc_improvement.yml | 2 +- .github/ISSUE_TEMPLATE/feature_request.yml | 2 +- .github/ISSUE_TEMPLATE/other_issue.yml | 2 +- .github/PULL_REQUEST_TEMPLATE.md | 2 +- .github/workflows/pr_pytest.yml | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml index c669748..899afa4 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yml +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -48,4 +48,4 @@ body: Place traceback error here if applicable. If your issue has no traceback, please describe the observed output without formatting. ``` validations: - required: true \ No newline at end of file + required: true diff --git a/.github/ISSUE_TEMPLATE/doc_improvement.yml b/.github/ISSUE_TEMPLATE/doc_improvement.yml index cd1121d..002692d 100644 --- a/.github/ISSUE_TEMPLATE/doc_improvement.yml +++ b/.github/ISSUE_TEMPLATE/doc_improvement.yml @@ -15,4 +15,4 @@ body: attributes: label: Suggest a potential alternative/fix description: > - Tell us how you think the documentation could be improved. \ No newline at end of file + Tell us how you think the documentation could be improved. diff --git a/.github/ISSUE_TEMPLATE/feature_request.yml b/.github/ISSUE_TEMPLATE/feature_request.yml index c2111a5..2210df1 100644 --- a/.github/ISSUE_TEMPLATE/feature_request.yml +++ b/.github/ISSUE_TEMPLATE/feature_request.yml @@ -27,4 +27,4 @@ body: attributes: label: Additional context description: > - Add any other context about the problem here. \ No newline at end of file + Add any other context about the problem here. 
diff --git a/.github/ISSUE_TEMPLATE/other_issue.yml b/.github/ISSUE_TEMPLATE/other_issue.yml index ad364e9..95ff6d1 100644 --- a/.github/ISSUE_TEMPLATE/other_issue.yml +++ b/.github/ISSUE_TEMPLATE/other_issue.yml @@ -18,4 +18,4 @@ body: label: Suggest a potential alternative/fix - type: textarea attributes: - label: Additional context \ No newline at end of file + label: Additional context diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 3947512..0a8b9a4 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -19,4 +19,4 @@ A clear and concise description of what you have implemented. \ No newline at end of file +--> diff --git a/.github/workflows/pr_pytest.yml b/.github/workflows/pr_pytest.yml index a4ab8f2..85fca86 100644 --- a/.github/workflows/pr_pytest.yml +++ b/.github/workflows/pr_pytest.yml @@ -2,7 +2,7 @@ name: Run Tests on: push: - branches: + branches: - main paths: - "orca_python/**"