diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..7d31ffb --- /dev/null +++ b/.dockerignore @@ -0,0 +1,12 @@ +__pycache__/ +*.py[cod] +.pytest_cache/ +.env +venv/ +.venv/ +.git/ +.github/ +output/ +*.db +.DS_Store + diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 0000000..2e22f28 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,34 @@ +--- +name: Bug report +about: Report something that is not working as expected +title: '[BUG] ' +labels: bug +--- + +## Description + +A clear and concise description of the bug. + +## Steps to Reproduce + +1. +2. +3. + +## Expected Behavior + +What you expected to happen. + +## Actual Behavior + +What actually happened. Include the full error message or stack trace if applicable. + +## Environment + +- OS: +- Python version: +- Project commit / version: + +## Additional Context + +Any other context, screenshots, or sample inputs. diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 0000000..657dc6e --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,22 @@ +--- +name: Feature request +about: Suggest an idea or improvement +title: '[FEATURE] ' +labels: enhancement +--- + +## Problem + +What problem does this solve? Who would benefit? + +## Proposed Solution + +How could it work? + +## Alternatives Considered + +Any other approaches you considered. + +## Additional Context + +Mockups, references, or related issues. 
diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..a211623 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,17 @@ +version: 2 +updates: + - package-ecosystem: "pip" + directory: "/" + schedule: + interval: "weekly" + open-pull-requests-limit: 5 + labels: + - "dependencies" + + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "monthly" + labels: + - "dependencies" + - "ci" diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md new file mode 100644 index 0000000..5c7b86b --- /dev/null +++ b/.github/pull_request_template.md @@ -0,0 +1,23 @@ +## Summary + + + +## Changes + + +- +- + +## Testing + +- [ ] All existing tests pass (`pytest -v`) +- [ ] Added or updated tests for new behavior +- [ ] Manually verified end-to-end pipeline + +## Screenshots (if UI / report changes) + + + +## Related Issues + + diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 0000000..3b29154 --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,30 @@ +name: Tests + +on: + push: + branches: [main] + pull_request: + branches: [main] + +jobs: + test: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.10", "3.11", "3.12"] + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v6 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + + - name: Run tests + run: pytest -v diff --git a/.gitignore b/.gitignore index 8435aa8..1c81f82 100644 --- a/.gitignore +++ b/.gitignore @@ -21,3 +21,10 @@ Thumbs.db # Output output/ + +# ML artifacts +*.pkl +*.joblib +*.h5 +*.parquet +*.npy diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..948872b --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,42 @@ +# Contributing + +Thanks 
for your interest! This is primarily a personal portfolio project, but contributions are welcome. + +## Getting Started + +1. Fork the repository and clone your fork. +2. Create and activate a virtual environment: + ```bash + python3 -m venv venv + source venv/bin/activate + ``` +3. Install dependencies: + ```bash + pip install -r requirements.txt + ``` +4. Run the test suite to confirm your environment is set up: + ```bash + pytest -v + ``` +5. Try a demo run: + ```bash + python main.py + ``` + +## Submitting Changes + +1. Create a feature branch from `main`: + ```bash + git checkout -b feature/your-feature + ``` +2. Make focused, well-described commits. +3. Make sure the test suite passes locally before pushing. +4. Open a pull request against `main` with a clear description of what you changed and why. Reference any related issues. + +## Code Style + +- Follow PEP 8 for Python code. +- Add tests for any new behavior — especially in the model training and evaluation pipeline. +- Do not commit pickled models, large datasets, or generated reports. +- Update the README if user-facing behavior changes. +- Keep changes focused — one PR, one concern. diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..5880e4c --- /dev/null +++ b/Dockerfile @@ -0,0 +1,12 @@ +FROM python:3.11-slim + +WORKDIR /app + +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +COPY . . 
+ +EXPOSE 8501 + +CMD ["streamlit", "run", "app.py", "--server.address", "0.0.0.0", "--server.port", "8501"] diff --git a/LICENSE b/LICENSE index 14fac91..fc7bc76 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2026 +Copyright (c) 2026 Eugen Goebel Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index a392aa4..7c08b0a 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,24 @@ An automated machine learning pipeline that profiles datasets, preprocesses data, selects features, trains and compares multiple models, and generates a professional evaluation report — all without requiring an API key. +![CI](https://github.com/eugen-goebel/predictive-analytics-agent/actions/workflows/tests.yml/badge.svg) +![Python](https://img.shields.io/badge/Python-3.10+-blue) +![Tests](https://img.shields.io/badge/Tests-35_passed-brightgreen) +![scikit--learn](https://img.shields.io/badge/scikit--learn-1.5+-f7931e) +![Streamlit](https://img.shields.io/badge/Streamlit-1.40+-red) +![License](https://img.shields.io/badge/License-MIT-green) + +## Screenshots + +**Data Analysis** — auto-detected target column, task type, and preprocessing pipeline +![Data Analysis](docs/screenshots/01-data-analysis.png) + +**Model Comparison** — cross-validated accuracy, standard deviation, and training time across 4 models +![Model Comparison](docs/screenshots/02-model-comparison.png) + +**Evaluation Results** — test/train scores, overfitting detection, and accuracy comparison chart +![Evaluation](docs/screenshots/03-evaluation.png) + ## Features - **Auto-Detection**: Automatically identifies the target column and task type (classification or regression) diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 0000000..eb4ec2e --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,15 @@ +# Security Policy + +## Reporting a Vulnerability + +If you 
discover a security vulnerability in this project, please report it privately by emailing **eugen-goebel@hotmail.de**. + +Please do not file public GitHub issues for security vulnerabilities, as this could expose users to risk before a fix is available. + +## Response Time + +I aim to acknowledge reports within 7 days and provide an initial assessment within 14 days. + +## Supported Versions + +This is a portfolio project; only the latest commit on `main` is supported. diff --git a/agents/model_trainer.py b/agents/model_trainer.py index f6bde60..2d2667f 100644 --- a/agents/model_trainer.py +++ b/agents/model_trainer.py @@ -20,13 +20,14 @@ import numpy as np from pydantic import BaseModel, Field from typing import Literal -from sklearn.model_selection import train_test_split, cross_val_score +from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV from sklearn.linear_model import LogisticRegression, LinearRegression from sklearn.ensemble import ( RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, GradientBoostingRegressor, ) from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor +from sklearn.preprocessing import StandardScaler # --------------------------------------------------------------------------- @@ -48,6 +49,52 @@ class TrainingResult(BaseModel): task_type: Literal["classification", "regression"] model_scores: list[ModelScore] training_time_seconds: float = Field(description="Total time for all models") + tuned: bool = Field(default=False, description="Whether hyperparameter tuning was applied") + best_params: dict | None = Field(default=None, description="Best hyperparameters found by tuning") + + +# --------------------------------------------------------------------------- +# Hyperparameter grids for GridSearchCV +# --------------------------------------------------------------------------- + +CLASSIFICATION_PARAM_GRIDS = { + "Logistic Regression": { + "C": [0.01, 0.1, 1, 10], + 
"max_iter": [1000], + }, + "Random Forest": { + "n_estimators": [50, 100, 200], + "max_depth": [None, 10, 20], + "min_samples_split": [2, 5], + }, + "Gradient Boosting": { + "n_estimators": [50, 100, 200], + "learning_rate": [0.01, 0.1, 0.2], + "max_depth": [3, 5], + }, + "K-Nearest Neighbors": { + "n_neighbors": [3, 5, 7, 11], + "weights": ["uniform", "distance"], + }, +} + +REGRESSION_PARAM_GRIDS = { + "Linear Regression": {}, + "Random Forest": { + "n_estimators": [50, 100, 200], + "max_depth": [None, 10, 20], + "min_samples_split": [2, 5], + }, + "Gradient Boosting": { + "n_estimators": [50, 100, 200], + "learning_rate": [0.01, 0.1, 0.2], + "max_depth": [3, 5], + }, + "K-Nearest Neighbors": { + "n_neighbors": [3, 5, 7, 11], + "weights": ["uniform", "distance"], + }, +} # --------------------------------------------------------------------------- @@ -75,12 +122,13 @@ class ModelTrainerAgent: def __init__(self): self.best_model = None + self.scaler = None self.X_train = None self.y_train = None self.X_test = None self.y_test = None - def train(self, X: np.ndarray, y: np.ndarray, task_type: str) -> TrainingResult: + def train(self, X: np.ndarray, y: np.ndarray, task_type: str, tune: bool = False) -> TrainingResult: """ Train multiple models and select the best one. @@ -90,10 +138,14 @@ def train(self, X: np.ndarray, y: np.ndarray, task_type: str) -> TrainingResult: 3. Pick the best model by mean CV score 4. Refit the best model on full training set + If tune=True, each model is first optimized via GridSearchCV + before the comparison step. 
def _tune_candidates(
    self,
    candidates: dict,
    param_grids: dict,
    scoring: str,
) -> tuple[dict, dict]:
    """Run GridSearchCV on each candidate and return tuned models with best params.

    Candidates whose grid is missing or empty are passed through untouched
    and reported with an empty parameter dict.

    Args:
        candidates: Mapping of display name -> unfitted estimator.
        param_grids: Mapping of display name -> GridSearchCV parameter grid.
        scoring: Scoring string forwarded to GridSearchCV.

    Returns:
        A pair ``(tuned, best_params)``: ``tuned`` maps each name to the
        estimator to use in the comparison step, ``best_params`` maps each
        name to the winning parameter combination (``{}`` when not tuned).

    Raises:
        ValueError: If a candidate needs tuning but the training set has
            fewer than 2 samples, so no cross-validation split is possible.
    """
    n_samples = len(self.X_train)
    # Never ask for more folds than there are training samples.
    cv_folds = min(5, n_samples)

    tuned = {}
    best_params = {}

    for name, model in candidates.items():
        grid = param_grids.get(name)
        if not grid:
            # Nothing to search over (e.g. plain linear regression).
            tuned[name] = model
            best_params[name] = {}
            continue

        # Checked lazily so that an all-empty grid set still works on tiny data.
        if cv_folds < 2:
            raise ValueError(
                f"Cannot tune '{name}': cross-validation needs at least 2 "
                f"training samples, got {n_samples}."
            )

        search = GridSearchCV(
            model, grid, scoring=scoring,
            cv=cv_folds, n_jobs=-1, error_score="raise",
        )
        search.fit(self.X_train, self.y_train)
        tuned[name] = search.best_estimator_
        best_params[name] = search.best_params_

    return tuned, best_params
def run_timeseries(
    self,
    filepath: str,
    target_column: str,
    n_lags: int = 12,
    horizon: int = 6,
) -> ForecastResult:
    """
    Run time series forecasting on a specific column.

    The file is loaded with pandas (``read_csv`` for ``.csv``, ``read_excel``
    for every other extension), the target column is coerced to numeric with
    unparseable entries dropped, and the remaining values are handed to
    TimeSeriesTrainerAgent.

    Args:
        filepath: Path to CSV or Excel file
        target_column: Name of the numeric column to forecast
        n_lags: Number of lag features
        horizon: Steps to forecast ahead

    Returns:
        ForecastResult with model comparison and forecast values

    Raises:
        ValueError: If the column does not exist or holds no numeric values.
    """
    ext = os.path.splitext(filepath)[1].lower()
    if ext == ".csv":
        df = pd.read_csv(filepath)
    else:
        df = pd.read_excel(filepath)

    if target_column not in df.columns:
        # Column labels are not guaranteed to be strings (e.g. numeric
        # spreadsheet headers), so stringify them before joining.
        available = ", ".join(map(str, df.columns))
        raise ValueError(
            f"Column '{target_column}' not found. "
            f"Available: {available}"
        )

    series = pd.to_numeric(df[target_column], errors="coerce").dropna().values

    if len(series) == 0:
        raise ValueError(f"Column '{target_column}' has no valid numeric values")

    print(f"\n[1/2] Preparing time series ({len(series)} data points) ...")
    print(f"      Column: {target_column}")
    print(f"      Lags: {n_lags}, Horizon: {horizon}")

    print("\n[2/2] Training forecasting models ...")
    trainer = TimeSeriesTrainerAgent()
    result = trainer.train(series, n_lags=n_lags, horizon=horizon)

    print(f"      Best: {result.best_model_name} (RMSE: {result.best_rmse})")
    for score in result.model_scores:
        print(f"        {score.name:25s} RMSE={score.rmse:.4f}  MAE={score.mae:.4f}")

    return result
class TimeSeriesTrainerAgent:
    """
    Trains time series forecasting models using lag-based feature engineering.

    Converts a single time series into a supervised learning problem by
    creating lag features, then compares multiple regressors against a
    moving-average baseline and forecasts with the winner.

    Attributes set by ``train``:
        best_model: The winning estimator refit on all lag rows, or None
            when the moving-average baseline wins.
        n_lags: Number of lag features used in the last ``train`` call.

    Usage:
        trainer = TimeSeriesTrainerAgent()
        result = trainer.train(series, n_lags=12, horizon=6)
    """

    def __init__(self):
        self.best_model = None
        self.n_lags = None

    def train(
        self,
        series: np.ndarray,
        n_lags: int = 12,
        horizon: int = 6,
    ) -> ForecastResult:
        """
        Train forecasting models on a time series.

        Args:
            series: 1D array of time-ordered values
            n_lags: Number of past observations to use as features (>= 1)
            horizon: Number of future steps to forecast (>= 0)

        Returns:
            ForecastResult with model comparison and forecast

        Raises:
            ValueError: If n_lags or horizon is out of range, the series is
                not one-dimensional, or it has fewer than
                ``n_lags + horizon + 10`` points.
        """
        total_start = time.time()

        # Non-positive lags would silently break the negative-index slicing
        # used below (``series[-0:]`` is the whole series), so reject early.
        if n_lags < 1:
            raise ValueError(f"n_lags must be at least 1, got {n_lags}")
        if horizon < 0:
            raise ValueError(f"horizon must not be negative, got {horizon}")

        series = np.asarray(series, dtype=float)
        if series.ndim != 1:
            raise ValueError(
                f"series must be one-dimensional, got {series.ndim} dimensions"
            )

        self.n_lags = n_lags
        # Drop any estimator left over from a previous call so a stale model
        # is never exposed when the moving-average baseline wins this time.
        self.best_model = None

        if len(series) < n_lags + horizon + 10:
            raise ValueError(
                f"Series too short ({len(series)} points) for "
                f"n_lags={n_lags} and horizon={horizon}. "
                f"Need at least {n_lags + horizon + 10} data points."
            )

        # Build supervised dataset from lag features
        X, y = self._create_lag_features(series, n_lags)

        # Time-aware train/test split (last 20% for testing, no shuffling)
        split_idx = int(len(X) * 0.8)
        X_train, X_test = X[:split_idx], X[split_idx:]
        y_train, y_test = y[:split_idx], y[split_idx:]

        # Define candidate models
        candidates = {
            "Linear Regression": LinearRegression(),
            "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
            "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, random_state=42),
            "Moving Average": None,  # baseline, has no estimator object
        }

        model_scores: list[TimeSeriesModelScore] = []
        best_name = ""
        best_rmse = np.inf
        best_estimator = None

        for name, model in candidates.items():
            start = time.time()

            if name == "Moving Average":
                y_pred = self._moving_average_predict(series, split_idx, n_lags, len(y_test))
            else:
                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)

            errors = y_test - y_pred
            rmse = float(np.sqrt(np.mean(errors ** 2)))
            mae = float(np.mean(np.abs(errors)))
            elapsed = time.time() - start

            model_scores.append(TimeSeriesModelScore(
                name=name,
                rmse=round(rmse, 4),
                mae=round(mae, 4),
                training_time=round(elapsed, 3),
            ))

            # Strict comparison: on ties the earlier candidate is kept.
            if rmse < best_rmse:
                best_rmse = rmse
                best_name = name
                best_estimator = model

        # Refit the winner on every lag row; the baseline has nothing to fit.
        if best_estimator is not None:
            best_estimator.fit(X, y)
        self.best_model = best_estimator

        # Generate forecast
        forecast = self._forecast(series, n_lags, horizon, best_name)

        total_time = time.time() - total_start

        return ForecastResult(
            best_model_name=best_name,
            best_rmse=round(best_rmse, 4),
            n_lags=n_lags,
            horizon=horizon,
            model_scores=model_scores,
            training_time_seconds=round(total_time, 3),
            forecast_values=[round(v, 4) for v in forecast],
        )

    def _create_lag_features(self, series: np.ndarray, n_lags: int) -> tuple[np.ndarray, np.ndarray]:
        """Convert a time series into a supervised learning dataset using lag features.

        Row ``k`` of the returned matrix holds ``series[k:k + n_lags]`` and
        its target is ``series[k + n_lags]``.
        """
        targets = range(n_lags, len(series))
        X = [series[i - n_lags:i] for i in targets]
        y = [series[i] for i in targets]
        return np.array(X), np.array(y)

    def _moving_average_predict(
        self, series: np.ndarray, split_idx: int, window: int, n_predictions: int
    ) -> np.ndarray:
        """Generate moving average predictions for the test period.

        Prediction ``i`` is the mean of the ``window`` values preceding
        ``series[split_idx + window + i]``, which is the observation aligned
        with test target ``i`` of the lag dataset.
        """
        predictions = []
        data = list(series[:split_idx + window])
        for i in range(n_predictions):
            idx = split_idx + window + i
            avg = np.mean(data[idx - window:idx])
            predictions.append(avg)
            # Extend the history with the real observation when one exists,
            # otherwise with the prediction itself.
            data.append(series[idx] if idx < len(series) else avg)
        return np.array(predictions)

    def _forecast(
        self, series: np.ndarray, n_lags: int, horizon: int, model_name: str
    ) -> list[float]:
        """Generate future forecast values.

        Forecasts are produced one step at a time; every prediction is
        appended to the history and feeds the following step.
        """
        history = list(series[-n_lags:])
        forecast: list[float] = []

        for _ in range(horizon):
            recent = history[-n_lags:]
            if model_name == "Moving Average":
                pred = float(np.mean(recent))
            else:
                features = np.array(recent).reshape(1, -1)
                pred = float(self.best_model.predict(features)[0])
            history.append(pred)
            forecast.append(pred)

        return forecast
{report_path}") - print("=" * 60) + if args.timeseries: + result = orch.run_timeseries( + args.filepath, + target_column=args.timeseries, + n_lags=args.lags, + horizon=args.horizon, + ) + print("\n" + "=" * 60) + print(f" Best model: {result.best_model_name} (RMSE: {result.best_rmse})") + print(f" Forecast ({result.horizon} steps): {result.forecast_values}") + print("=" * 60) + else: + report_path = orch.run(args.filepath, tune=args.tune) + print("\n" + "=" * 60) + print(f" Report ready: {report_path}") + print("=" * 60) if __name__ == "__main__": diff --git a/tests/test_model_trainer.py b/tests/test_model_trainer.py index b66580d..0460203 100644 --- a/tests/test_model_trainer.py +++ b/tests/test_model_trainer.py @@ -2,7 +2,12 @@ import numpy as np import pytest -from agents.model_trainer import ModelTrainerAgent, TrainingResult +from agents.model_trainer import ( + ModelTrainerAgent, + TrainingResult, + CLASSIFICATION_PARAM_GRIDS, + REGRESSION_PARAM_GRIDS, +) class TestModelTrainer: @@ -51,3 +56,56 @@ def test_training_time_positive(self): trainer = ModelTrainerAgent() result = trainer.train(X, y, "classification") assert result.training_time_seconds > 0 + + def test_default_not_tuned(self): + X = np.random.randn(50, 3) + y = np.random.randint(0, 2, 50) + trainer = ModelTrainerAgent() + result = trainer.train(X, y, "classification") + assert result.tuned is False + assert result.best_params is None + + +class TestHyperparameterTuning: + def test_tune_classification(self): + X = np.random.randn(80, 4) + y = np.random.randint(0, 2, 80) + trainer = ModelTrainerAgent() + result = trainer.train(X, y, "classification", tune=True) + assert result.tuned is True + assert isinstance(result, TrainingResult) + + def test_tune_regression(self): + X = np.random.randn(80, 4) + y = np.random.randn(80) + trainer = ModelTrainerAgent() + result = trainer.train(X, y, "regression", tune=True) + assert result.tuned is True + + def test_tune_returns_best_params(self): + X = 
np.random.randn(80, 4) + y = np.random.randint(0, 2, 80) + trainer = ModelTrainerAgent() + result = trainer.train(X, y, "classification", tune=True) + assert result.best_params is not None + assert isinstance(result.best_params, dict) + + def test_tune_four_models_compared(self): + X = np.random.randn(80, 4) + y = np.random.randint(0, 2, 80) + trainer = ModelTrainerAgent() + result = trainer.train(X, y, "classification", tune=True) + assert len(result.model_scores) == 4 + + def test_tune_best_score_matches(self): + X = np.random.randn(80, 4) + y = np.random.randint(0, 2, 80) + trainer = ModelTrainerAgent() + result = trainer.train(X, y, "classification", tune=True) + scores = [m.score for m in result.model_scores] + assert result.best_score == max(scores) + + def test_param_grids_exist(self): + assert "Random Forest" in CLASSIFICATION_PARAM_GRIDS + assert "Gradient Boosting" in REGRESSION_PARAM_GRIDS + assert "n_estimators" in CLASSIFICATION_PARAM_GRIDS["Random Forest"] diff --git a/tests/test_timeseries.py b/tests/test_timeseries.py new file mode 100644 index 0000000..5103a3c --- /dev/null +++ b/tests/test_timeseries.py @@ -0,0 +1,89 @@ +"""Tests for the TimeSeriesTrainerAgent.""" + +import numpy as np +import pytest +from agents.timeseries_trainer import TimeSeriesTrainerAgent, ForecastResult + + +@pytest.fixture +def sample_series(): + """Sine wave with trend — realistic enough for testing.""" + np.random.seed(42) + t = np.arange(100) + return np.sin(t * 0.1) * 10 + t * 0.5 + np.random.randn(100) * 0.5 + + +@pytest.fixture +def short_series(): + """Series too short for default parameters.""" + return np.array([1.0, 2.0, 3.0, 4.0, 5.0]) + + +class TestTimeSeriesTrainer: + def test_returns_forecast_result(self, sample_series): + trainer = TimeSeriesTrainerAgent() + result = trainer.train(sample_series, n_lags=10, horizon=5) + assert isinstance(result, ForecastResult) + + def test_correct_horizon(self, sample_series): + trainer = TimeSeriesTrainerAgent() + result = 
trainer.train(sample_series, n_lags=10, horizon=5) + assert len(result.forecast_values) == 5 + + def test_correct_lags(self, sample_series): + trainer = TimeSeriesTrainerAgent() + result = trainer.train(sample_series, n_lags=8, horizon=4) + assert result.n_lags == 8 + assert result.horizon == 4 + + def test_four_models_compared(self, sample_series): + trainer = TimeSeriesTrainerAgent() + result = trainer.train(sample_series, n_lags=10, horizon=5) + assert len(result.model_scores) == 4 + names = {s.name for s in result.model_scores} + assert "Linear Regression" in names + assert "Random Forest" in names + assert "Gradient Boosting" in names + assert "Moving Average" in names + + def test_best_has_lowest_rmse(self, sample_series): + trainer = TimeSeriesTrainerAgent() + result = trainer.train(sample_series, n_lags=10, horizon=5) + min_rmse = min(s.rmse for s in result.model_scores) + assert result.best_rmse == min_rmse + + def test_best_model_stored(self, sample_series): + trainer = TimeSeriesTrainerAgent() + trainer.train(sample_series, n_lags=10, horizon=5) + # best_model can be None for Moving Average, or an estimator + assert trainer.n_lags == 10 + + def test_training_time_positive(self, sample_series): + trainer = TimeSeriesTrainerAgent() + result = trainer.train(sample_series, n_lags=10, horizon=5) + assert result.training_time_seconds > 0 + + def test_forecast_values_are_finite(self, sample_series): + trainer = TimeSeriesTrainerAgent() + result = trainer.train(sample_series, n_lags=10, horizon=5) + for v in result.forecast_values: + assert np.isfinite(v) + + def test_rmse_and_mae_positive(self, sample_series): + trainer = TimeSeriesTrainerAgent() + result = trainer.train(sample_series, n_lags=10, horizon=5) + for score in result.model_scores: + assert score.rmse >= 0 + assert score.mae >= 0 + + def test_short_series_raises(self, short_series): + trainer = TimeSeriesTrainerAgent() + with pytest.raises(ValueError, match="too short"): + trainer.train(short_series, 
n_lags=12, horizon=6) + + def test_custom_lags_and_horizon(self, sample_series): + trainer = TimeSeriesTrainerAgent() + result = trainer.train(sample_series, n_lags=5, horizon=3) + assert result.n_lags == 5 + assert result.horizon == 3 + assert len(result.forecast_values) == 3