From d6966ae58aa95bcff86bf24db0d44d06f48b40cf Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 7 Feb 2026 19:35:03 +0000 Subject: [PATCH 1/5] Implement HousingHand development pipeline intelligence platform Build the complete HousingHand system for tracking affordable housing projects from concept to certificate of occupancy. This creates the intelligence layer that connects regulatory friction to real production outcomes within the HousingMind ecosystem. Key components: - SQLAlchemy models for projects, funding sources, barriers, peer groups, portfolios, and policy reforms with comprehensive schema coverage - Analytics engine with weighted health assessment, bottleneck detection, timeline prediction, reform impact measurement, and portfolio intelligence - ML module with Random Forest timeline prediction model, feature engineering, training pipeline, and evaluation metrics - FastAPI application with full CRUD endpoints, analytics APIs, webhook integration, and Pydantic request/response schemas - Integration clients for HousingLens (friction scores), HousingEar (policy monitoring), and HousingMind (query metadata webhooks) - Data collection layer with developer portal service, permit scrapers, LIHTC data scraper, and data quality validation - Celery task queue with periodic health checks, prediction updates, benchmark recalculation, and alert generation - Comprehensive test suite with SQLite in-memory fixtures, factory helpers, and tests across analytics, API, integrations, and ML - Docker Compose configurations for production, development, and testing - GitHub Actions CI/CD for tests, linting, deployment, and model training - Database seed scripts, model training scripts, and report generators https://claude.ai/code/session_01K1NfMrvDoARx7PAB1cvHbZ --- .env.example | 31 + .github/workflows/deploy.yml | 45 + .github/workflows/model_training.yml | 50 ++ .github/workflows/test.yml | 84 ++ .gitignore | 58 ++ LICENSE | 21 + README.md | 78 +- alembic/alembic.ini | 36 + 
alembic/env.py | 64 ++ alembic/versions/.gitkeep | 0 config/__init__.py | 3 + config/database.yaml | 28 + config/national_benchmarks.yaml | 61 ++ config/pipeline_stages.yaml | 67 ++ config/settings.py | 57 ++ docker/Dockerfile | 27 + docker/docker-compose.dev.yml | 63 ++ docker/docker-compose.test.yml | 36 + docker/docker-compose.yml | 72 ++ docs/ANALYTICS.md | 59 ++ docs/API.md | 102 +++ docs/DATA_MODEL.md | 59 ++ docs/DEPLOYMENT.md | 91 ++ docs/DEVELOPER_PORTAL.md | 71 ++ docs/INTEGRATION_GUIDE.md | 57 ++ notebooks/exploratory/.gitkeep | 0 notebooks/model_development/.gitkeep | 0 notebooks/validation/.gitkeep | 0 pyproject.toml | 90 ++ requirements/base.txt | 15 + requirements/dev.txt | 7 + requirements/ml.txt | 8 + requirements/test.txt | 6 + scripts/generate_reports.py | 68 ++ scripts/migrate_data.py | 72 ++ scripts/seed_database.py | 260 ++++++ scripts/train_model.py | 60 ++ src/__init__.py | 0 src/analytics/__init__.py | 93 ++ src/analytics/bottleneck_detection.py | 537 ++++++++++++ src/analytics/health_assessment.py | 715 +++++++++++++++ src/analytics/peer_benchmarking.py | 626 ++++++++++++++ src/analytics/portfolio_intelligence.py | 817 ++++++++++++++++++ src/analytics/reform_impact.py | 683 +++++++++++++++ src/analytics/statistical_tests.py | 529 ++++++++++++ src/analytics/timeline_prediction.py | 602 +++++++++++++ src/api/__init__.py | 0 src/api/app.py | 145 ++++ src/api/dependencies.py | 77 ++ src/api/endpoints/__init__.py | 0 src/api/endpoints/analytics.py | 455 ++++++++++ src/api/endpoints/health.py | 562 ++++++++++++ src/api/endpoints/portfolio.py | 567 ++++++++++++ src/api/endpoints/predictions.py | 413 +++++++++ src/api/endpoints/projects.py | 806 +++++++++++++++++ src/api/endpoints/reforms.py | 566 ++++++++++++ src/api/webhooks.py | 390 +++++++++ src/data_collection/__init__.py | 4 + src/data_collection/developer_portal.py | 164 ++++ src/data_collection/lihtc_scraper.py | 63 ++ src/data_collection/permit_scraper.py | 80 ++ 
src/data_collection/validation.py | 189 ++++ src/database/__init__.py | 3 + src/database/connection.py | 42 + src/database/migrations/.gitkeep | 0 src/database/queries.py | 226 +++++ src/integrations/__init__.py | 5 + src/integrations/housing_ear.py | 112 +++ src/integrations/housing_lens.py | 124 +++ src/integrations/housing_mind.py | 82 ++ src/integrations/public_records.py | 97 +++ src/ml/__init__.py | 87 ++ src/ml/feature_engineering.py | 384 ++++++++ src/ml/model_evaluation.py | 384 ++++++++ src/ml/model_training.py | 490 +++++++++++ src/ml/timeline_model.py | 445 ++++++++++ src/models/__init__.py | 41 + src/models/barrier.py | 60 ++ src/models/enums.py | 128 +++ src/models/funding_source.py | 68 ++ src/models/peer_group.py | 52 ++ src/models/portfolio.py | 54 ++ src/models/project.py | 237 +++++ src/models/reform.py | 61 ++ src/tasks/__init__.py | 3 + src/tasks/alert_generation.py | 121 +++ src/tasks/calculate_benchmarks.py | 120 +++ src/tasks/celery_app.py | 48 + src/tasks/health_checks.py | 120 +++ src/tasks/update_predictions.py | 128 +++ src/utils/__init__.py | 32 + src/utils/date_helpers.py | 78 ++ src/utils/formatting.py | 49 ++ src/utils/geography.py | 76 ++ src/utils/statistical_helpers.py | 82 ++ tests/__init__.py | 0 tests/conftest.py | 365 ++++++++ tests/fixtures/__init__.py | 0 tests/fixtures/sample_data.py | 358 ++++++++ tests/test_analytics/__init__.py | 0 .../test_bottleneck_detection.py | 275 ++++++ .../test_analytics/test_health_assessment.py | 421 +++++++++ .../test_portfolio_intelligence.py | 150 ++++ tests/test_analytics/test_reform_impact.py | 110 +++ .../test_timeline_prediction.py | 298 +++++++ tests/test_api/__init__.py | 0 tests/test_api/test_projects.py | 104 +++ tests/test_integrations/__init__.py | 0 tests/test_integrations/test_housing_lens.py | 81 ++ tests/test_ml/__init__.py | 0 tests/test_ml/test_feature_engineering.py | 84 ++ 111 files changed, 17273 insertions(+), 1 deletion(-) create mode 100644 .env.example create mode 
100644 .github/workflows/deploy.yml create mode 100644 .github/workflows/model_training.yml create mode 100644 .github/workflows/test.yml create mode 100644 .gitignore create mode 100644 LICENSE create mode 100644 alembic/alembic.ini create mode 100644 alembic/env.py create mode 100644 alembic/versions/.gitkeep create mode 100644 config/__init__.py create mode 100644 config/database.yaml create mode 100644 config/national_benchmarks.yaml create mode 100644 config/pipeline_stages.yaml create mode 100644 config/settings.py create mode 100644 docker/Dockerfile create mode 100644 docker/docker-compose.dev.yml create mode 100644 docker/docker-compose.test.yml create mode 100644 docker/docker-compose.yml create mode 100644 docs/ANALYTICS.md create mode 100644 docs/API.md create mode 100644 docs/DATA_MODEL.md create mode 100644 docs/DEPLOYMENT.md create mode 100644 docs/DEVELOPER_PORTAL.md create mode 100644 docs/INTEGRATION_GUIDE.md create mode 100644 notebooks/exploratory/.gitkeep create mode 100644 notebooks/model_development/.gitkeep create mode 100644 notebooks/validation/.gitkeep create mode 100644 pyproject.toml create mode 100644 requirements/base.txt create mode 100644 requirements/dev.txt create mode 100644 requirements/ml.txt create mode 100644 requirements/test.txt create mode 100644 scripts/generate_reports.py create mode 100644 scripts/migrate_data.py create mode 100644 scripts/seed_database.py create mode 100644 scripts/train_model.py create mode 100644 src/__init__.py create mode 100644 src/analytics/__init__.py create mode 100644 src/analytics/bottleneck_detection.py create mode 100644 src/analytics/health_assessment.py create mode 100644 src/analytics/peer_benchmarking.py create mode 100644 src/analytics/portfolio_intelligence.py create mode 100644 src/analytics/reform_impact.py create mode 100644 src/analytics/statistical_tests.py create mode 100644 src/analytics/timeline_prediction.py create mode 100644 src/api/__init__.py create mode 100644 
src/api/app.py create mode 100644 src/api/dependencies.py create mode 100644 src/api/endpoints/__init__.py create mode 100644 src/api/endpoints/analytics.py create mode 100644 src/api/endpoints/health.py create mode 100644 src/api/endpoints/portfolio.py create mode 100644 src/api/endpoints/predictions.py create mode 100644 src/api/endpoints/projects.py create mode 100644 src/api/endpoints/reforms.py create mode 100644 src/api/webhooks.py create mode 100644 src/data_collection/__init__.py create mode 100644 src/data_collection/developer_portal.py create mode 100644 src/data_collection/lihtc_scraper.py create mode 100644 src/data_collection/permit_scraper.py create mode 100644 src/data_collection/validation.py create mode 100644 src/database/__init__.py create mode 100644 src/database/connection.py create mode 100644 src/database/migrations/.gitkeep create mode 100644 src/database/queries.py create mode 100644 src/integrations/__init__.py create mode 100644 src/integrations/housing_ear.py create mode 100644 src/integrations/housing_lens.py create mode 100644 src/integrations/housing_mind.py create mode 100644 src/integrations/public_records.py create mode 100644 src/ml/__init__.py create mode 100644 src/ml/feature_engineering.py create mode 100644 src/ml/model_evaluation.py create mode 100644 src/ml/model_training.py create mode 100644 src/ml/timeline_model.py create mode 100644 src/models/__init__.py create mode 100644 src/models/barrier.py create mode 100644 src/models/enums.py create mode 100644 src/models/funding_source.py create mode 100644 src/models/peer_group.py create mode 100644 src/models/portfolio.py create mode 100644 src/models/project.py create mode 100644 src/models/reform.py create mode 100644 src/tasks/__init__.py create mode 100644 src/tasks/alert_generation.py create mode 100644 src/tasks/calculate_benchmarks.py create mode 100644 src/tasks/celery_app.py create mode 100644 src/tasks/health_checks.py create mode 100644 
src/tasks/update_predictions.py create mode 100644 src/utils/__init__.py create mode 100644 src/utils/date_helpers.py create mode 100644 src/utils/formatting.py create mode 100644 src/utils/geography.py create mode 100644 src/utils/statistical_helpers.py create mode 100644 tests/__init__.py create mode 100644 tests/conftest.py create mode 100644 tests/fixtures/__init__.py create mode 100644 tests/fixtures/sample_data.py create mode 100644 tests/test_analytics/__init__.py create mode 100644 tests/test_analytics/test_bottleneck_detection.py create mode 100644 tests/test_analytics/test_health_assessment.py create mode 100644 tests/test_analytics/test_portfolio_intelligence.py create mode 100644 tests/test_analytics/test_reform_impact.py create mode 100644 tests/test_analytics/test_timeline_prediction.py create mode 100644 tests/test_api/__init__.py create mode 100644 tests/test_api/test_projects.py create mode 100644 tests/test_integrations/__init__.py create mode 100644 tests/test_integrations/test_housing_lens.py create mode 100644 tests/test_ml/__init__.py create mode 100644 tests/test_ml/test_feature_engineering.py diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..79a0946 --- /dev/null +++ b/.env.example @@ -0,0 +1,31 @@ +# Database +DATABASE_URL=postgresql://housinghand:password@localhost:5432/housinghand +DATABASE_POOL_SIZE=10 +DATABASE_MAX_OVERFLOW=20 + +# Redis +REDIS_URL=redis://localhost:6379/0 +CELERY_BROKER_URL=redis://localhost:6379/1 +CELERY_RESULT_BACKEND=redis://localhost:6379/2 + +# API +API_HOST=0.0.0.0 +API_PORT=8000 +API_DEBUG=true +API_SECRET_KEY=change-me-in-production +CORS_ORIGINS=http://localhost:3000,http://localhost:8000 + +# HousingMind Ecosystem Integrations +HOUSING_LENS_API_URL=http://localhost:8001/api/v1 +HOUSING_LENS_API_KEY=your-housing-lens-api-key +HOUSING_EAR_API_URL=http://localhost:8002/api/v1 +HOUSING_EAR_API_KEY=your-housing-ear-api-key +HOUSING_MIND_WEBHOOK_SECRET=your-webhook-secret + +# ML Model 
+ML_MODEL_PATH=models/timeline_prediction.joblib +ML_MODEL_VERSION=0.1.0 + +# Logging +LOG_LEVEL=INFO +LOG_FORMAT=json diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml new file mode 100644 index 0000000..11a45d4 --- /dev/null +++ b/.github/workflows/deploy.yml @@ -0,0 +1,45 @@ +name: Deploy + +on: + push: + branches: [main] + tags: ["v*"] + +jobs: + deploy: + runs-on: ubuntu-latest + if: github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/tags/v') + + steps: + - uses: actions/checkout@v4 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Log in to container registry + uses: docker/login-action@v3 + with: + registry: ${{ secrets.REGISTRY_URL }} + username: ${{ secrets.REGISTRY_USERNAME }} + password: ${{ secrets.REGISTRY_PASSWORD }} + + - name: Build and push Docker image + uses: docker/build-push-action@v5 + with: + context: . + file: docker/Dockerfile + push: true + tags: | + ${{ secrets.REGISTRY_URL }}/housinghand:latest + ${{ secrets.REGISTRY_URL }}/housinghand:${{ github.sha }} + cache-from: type=gha + cache-to: type=gha,mode=max + + - name: Run database migrations + run: | + echo "Run: alembic upgrade head" + echo "Migration step placeholder - configure with actual deployment target" + + - name: Deploy notification + run: | + echo "Deployed HousingHand commit ${{ github.sha }}" diff --git a/.github/workflows/model_training.yml b/.github/workflows/model_training.yml new file mode 100644 index 0000000..901c845 --- /dev/null +++ b/.github/workflows/model_training.yml @@ -0,0 +1,50 @@ +name: Model Training + +on: + schedule: + - cron: "0 4 1 * *" # First day of each month at 4:00 AM UTC + workflow_dispatch: + +jobs: + train: + runs-on: ubuntu-latest + + services: + postgres: + image: postgis/postgis:14-3.4 + env: + POSTGRES_USER: housinghand + POSTGRES_PASSWORD: password + POSTGRES_DB: housinghand + ports: + - 5432:5432 + options: >- + --health-cmd "pg_isready -U housinghand" + --health-interval 
10s + --health-timeout 5s + --health-retries 5 + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.11" + + - name: Install dependencies + run: | + pip install -r requirements/ml.txt + + - name: Train timeline prediction model + env: + DATABASE_URL: postgresql://housinghand:password@localhost:5432/housinghand + run: | + python scripts/train_model.py + + - name: Upload model artifact + uses: actions/upload-artifact@v4 + with: + name: timeline-model-${{ github.run_number }} + path: models/ + retention-days: 90 diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..971d4d3 --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,84 @@ +name: Tests + +on: + push: + branches: [main] + pull_request: + branches: [main] + +jobs: + test: + runs-on: ubuntu-latest + + services: + postgres: + image: postgis/postgis:14-3.4 + env: + POSTGRES_USER: housinghand + POSTGRES_PASSWORD: password + POSTGRES_DB: housinghand_test + ports: + - 5432:5432 + options: >- + --health-cmd "pg_isready -U housinghand" + --health-interval 10s + --health-timeout 5s + --health-retries 5 + + redis: + image: redis:7-alpine + ports: + - 6379:6379 + options: >- + --health-cmd "redis-cli ping" + --health-interval 10s + --health-timeout 5s + --health-retries 5 + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.11" + cache: pip + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements/test.txt + pip install -r requirements/ml.txt + + - name: Run tests + env: + DATABASE_URL: postgresql://housinghand:password@localhost:5432/housinghand_test + REDIS_URL: redis://localhost:6379/0 + run: | + pytest tests/ -v --cov=src --cov-report=xml --cov-report=term-missing + + - name: Upload coverage + uses: codecov/codecov-action@v3 + with: + file: coverage.xml + 
fail_ci_if_error: false + + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.11" + + - name: Install linters + run: | + pip install ruff black mypy + + - name: Run ruff + run: ruff check src/ tests/ + + - name: Check formatting + run: black --check src/ tests/ diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..2199423 --- /dev/null +++ b/.gitignore @@ -0,0 +1,58 @@ +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +*.egg-info/ +*.egg +dist/ +build/ +.eggs/ + +# Virtual environments +venv/ +.venv/ +env/ + +# IDE +.vscode/ +.idea/ +*.swp +*.swo +*~ + +# Environment +.env +.env.local +.env.production + +# Database +*.db +*.sqlite3 + +# Testing +.coverage +htmlcov/ +.pytest_cache/ + +# ML models +models/*.pkl +models/*.joblib +*.model + +# Jupyter +.ipynb_checkpoints/ + +# OS +.DS_Store +Thumbs.db + +# Docker +docker-compose.override.yml + +# Logs +*.log +logs/ + +# Redis dump +dump.rdb diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..85328c2 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2026 HousingMind Team + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md index 09c73a6..2f3bad3 100644 --- a/README.md +++ b/README.md @@ -1 +1,77 @@ -# HousingHand \ No newline at end of file +# HousingHand + +Development Pipeline Intelligence Platform for the HousingMind ecosystem. + +HousingHand tracks every affordable housing project from concept to certificate of occupancy, quantifying where development pipelines break down and connecting regulatory friction to real production outcomes. It creates the first comprehensive affordable housing development pipeline database, enabling stakeholders to predict timelines, identify bottlenecks, measure policy reform impact, and optimize portfolio performance. + +## Core Capabilities + +- **Pipeline Tracking** - Monitor affordable housing projects through all seven development stages +- **Health Assessment** - Weighted scoring system evaluating timeline, budget, funding, risk, and team stability +- **Bottleneck Intelligence** - Identify systematic barriers that delay or kill projects across jurisdictions +- **Predictive Analytics** - Forecast project timelines using ML models trained on historical data and jurisdiction friction scores +- **Policy Impact Measurement** - Quantify actual outcomes of regulatory reforms with statistical significance testing +- **Portfolio Intelligence** - Aggregate views for PHAs, funders, cities, and policymakers + +## Tech Stack + +- **API**: Python 3.10+, FastAPI +- **Database**: PostgreSQL 14+ with PostGIS +- **ML/Analytics**: scikit-learn, pandas, numpy, scipy +- **Task Queue**: Celery + Redis +- **Testing**: pytest + +## Quick Start + +```bash +# Clone and set up +cp .env.example .env + +# Docker +cd docker && docker compose up -d + +# Or local development +python 
-m venv venv && source venv/bin/activate +pip install -r requirements/dev.txt +pip install -r requirements/ml.txt +uvicorn src.api.app:app --reload +``` + +## Project Structure + +``` +src/ + api/ # FastAPI application and endpoints + models/ # SQLAlchemy models (Project, FundingSource, Barrier, etc.) + analytics/ # Core analytics engine + ml/ # Timeline prediction ML model + integrations/ # HousingLens, HousingEar, HousingMind clients + data_collection/ # Developer portal, permit scrapers + database/ # Connection, queries, migrations + tasks/ # Celery async tasks + utils/ # Helper utilities +tests/ # Test suite +config/ # Settings and YAML configuration +scripts/ # Database seeding, model training, reports +docker/ # Docker and compose files +docs/ # Documentation +``` + +## HousingMind Ecosystem Integration + +- **HousingLens** - Regulatory friction scores predict entitlement timelines +- **HousingEar** - Policy monitoring feeds reform impact measurement +- **HousingMind** - Query metadata tracks stakeholder engagement patterns + +## Documentation + +- [API Reference](docs/API.md) +- [Data Model](docs/DATA_MODEL.md) +- [Analytics Methodology](docs/ANALYTICS.md) +- [Integration Guide](docs/INTEGRATION_GUIDE.md) +- [Developer Portal](docs/DEVELOPER_PORTAL.md) +- [Deployment Guide](docs/DEPLOYMENT.md) + +## License + +MIT diff --git a/alembic/alembic.ini b/alembic/alembic.ini new file mode 100644 index 0000000..e40e2f3 --- /dev/null +++ b/alembic/alembic.ini @@ -0,0 +1,36 @@ +[alembic] +script_location = alembic +sqlalchemy.url = postgresql://housinghand:password@localhost:5432/housinghand + +[loggers] +keys = root,sqlalchemy,alembic + +[handlers] +keys = console + +[formatters] +keys = generic + +[logger_root] +level = WARN +handlers = console + +[logger_sqlalchemy] +level = WARN +handlers = +qualname = sqlalchemy.engine + +[logger_alembic] +level = INFO +handlers = +qualname = alembic + +[handler_console] +class = StreamHandler +args = (sys.stderr,) +level = 
"""Alembic environment configuration for database migrations.

Wires Alembic to the application's SQLAlchemy metadata and to the database
URL held in the application settings, then runs migrations in either
offline (SQL script) or online (live connection) mode.
"""

from logging.config import fileConfig

from alembic import context
from sqlalchemy import engine_from_config, pool

from config.settings import get_settings
from src.database.connection import Base

# Import all models so Alembic can detect them
from src.models import (  # noqa: F401
    ProjectBarrier,
    FundingSource,
    PeerGroup,
    PortfolioDashboard,
    Project,
    PolicyReform,
)

config = context.config

# Configure Python logging from the ini file, when one was supplied.
if config.config_file_name is not None:
    fileConfig(config.config_file_name)

target_metadata = Base.metadata

settings = get_settings()
# The application settings override whatever URL is written in alembic.ini.
# set_main_option() stores the value through ConfigParser, which treats "%"
# as the start of an interpolation token, so a URL containing percent-encoded
# characters (e.g. a password such as "p%40ss") must have "%" doubled here.
# Reading the option back yields the original single "%".
config.set_main_option(
    "sqlalchemy.url", settings.database_url.replace("%", "%%")
)


def run_migrations_offline() -> None:
    """Run migrations in 'offline' mode.

    Configures the migration context with only the database URL (no engine
    or connection is created here) and renders bound parameters inline via
    ``literal_binds=True``.
    """
    url = config.get_main_option("sqlalchemy.url")
    context.configure(
        url=url,
        target_metadata=target_metadata,
        literal_binds=True,
        dialect_opts={"paramstyle": "named"},
    )

    with context.begin_transaction():
        context.run_migrations()


def run_migrations_online() -> None:
    """Run migrations in 'online' mode.

    Builds an engine from the ``sqlalchemy.``-prefixed options of the active
    ini section, using ``NullPool`` so no connections are pooled, and runs
    the migrations over a connection that is closed when the block exits.
    """
    connectable = engine_from_config(
        config.get_section(config.config_ini_section, {}),
        prefix="sqlalchemy.",
        poolclass=pool.NullPool,
    )

    with connectable.connect() as connection:
        context.configure(connection=connection, target_metadata=target_metadata)

        with context.begin_transaction():
            context.run_migrations()


if context.is_offline_mode():
    run_migrations_offline()
else:
    run_migrations_online()
0000000..a3d1595 --- /dev/null +++ b/config/__init__.py @@ -0,0 +1,3 @@ +from config.settings import Settings, get_settings + +__all__ = ["Settings", "get_settings"] diff --git a/config/database.yaml b/config/database.yaml new file mode 100644 index 0000000..4e2b151 --- /dev/null +++ b/config/database.yaml @@ -0,0 +1,28 @@ +database: + development: + host: localhost + port: 5432 + name: housinghand + user: housinghand + password: password + pool_size: 10 + max_overflow: 20 + + test: + host: localhost + port: 5432 + name: housinghand_test + user: housinghand + password: password + pool_size: 5 + max_overflow: 10 + + production: + pool_size: 20 + max_overflow: 40 + pool_timeout: 30 + pool_recycle: 3600 + + extensions: + - postgis + - timescaledb diff --git a/config/national_benchmarks.yaml b/config/national_benchmarks.yaml new file mode 100644 index 0000000..094805d --- /dev/null +++ b/config/national_benchmarks.yaml @@ -0,0 +1,61 @@ +national_benchmarks: + # Median durations in days by stage (based on national data) + stage_durations: + concept: + median: 60 + p25: 30 + p75: 90 + p90: 120 + pre_development: + median: 180 + p25: 120 + p75: 270 + p90: 365 + entitlement: + median: 240 + p25: 150 + p75: 365 + p90: 540 + financing: + median: 180 + p25: 120 + p75: 270 + p90: 365 + construction: + median: 540 + p25: 365 + p75: 720 + p90: 900 + lease_up: + median: 120 + p25: 60 + p75: 180 + p90: 270 + + # Total concept-to-CO benchmarks + total_timeline: + median_days: 1320 + median_months: 44 + p25_days: 900 + p75_days: 1800 + p90_days: 2160 + + # Cost benchmarks (per unit) + costs: + national_median_per_unit: 350000 + national_p25_per_unit: 250000 + national_p75_per_unit: 475000 + national_p90_per_unit: 600000 + + # Holding cost estimates + holding_costs: + daily_per_unit_during_entitlement: 15 + daily_per_unit_during_financing: 20 + daily_per_unit_during_construction: 35 + + # Health score thresholds + health_thresholds: + on_track: 80 + at_risk: 60 + delayed: 40 + 
stalled: 0 diff --git a/config/pipeline_stages.yaml b/config/pipeline_stages.yaml new file mode 100644 index 0000000..f86d1a5 --- /dev/null +++ b/config/pipeline_stages.yaml @@ -0,0 +1,67 @@ +pipeline_stages: + concept: + description: "Site identified, preliminary feasibility" + typical_duration_months: 2 + exit_criteria: "Financial pro forma, initial architect sketches" + key_activities: + - site_selection + - initial_feasibility + - team_formation + + pre_development: + description: "Due diligence, community engagement, design development" + typical_duration_months: 6 + exit_criteria: "Schematic design, zoning analysis complete" + key_activities: + - environmental_review + - market_study + - preliminary_design + - community_engagement + + entitlement: + description: "Zoning approvals, design review, land use permits" + typical_duration_months: 8 + exit_criteria: "All land use approvals obtained" + key_activities: + - zoning_application + - design_review + - public_hearings + - variance_requests + friction_source: "HousingLens friction scores predict duration" + + financing: + description: "Tax credit applications, loan underwriting, equity closing" + typical_duration_months: 6 + exit_criteria: "Construction financing closed" + key_activities: + - LIHTC_application + - loan_application + - equity_syndication + - closing + + construction: + description: "Groundbreaking to certificate of occupancy" + typical_duration_months: 18 + exit_criteria: "CO received, units ready for occupancy" + key_activities: + - site_prep + - foundation + - framing + - systems + - finishes + - inspection + + lease_up: + description: "Marketing and tenant placement" + typical_duration_months: 4 + exit_criteria: "95% occupancy achieved" + key_activities: + - marketing + - applications + - screening + - move_ins + + operations: + description: "Long-term affordability compliance" + typical_duration_months: 360 + monitoring: "Annual compliance reporting" diff --git a/config/settings.py 
"""Application settings loaded from environment variables."""

from functools import lru_cache
from pathlib import Path

from pydantic_settings import BaseSettings


class Settings(BaseSettings):
    """HousingHand application configuration.

    Every field below has a development-friendly default. ``model_config``
    additionally points the settings loader at a ``.env`` file (UTF-8).
    """

    # Database
    database_url: str = "postgresql://housinghand:password@localhost:5432/housinghand"
    database_pool_size: int = 10
    database_max_overflow: int = 20

    # Redis
    redis_url: str = "redis://localhost:6379/0"
    celery_broker_url: str = "redis://localhost:6379/1"
    celery_result_backend: str = "redis://localhost:6379/2"

    # API
    api_host: str = "0.0.0.0"
    api_port: int = 8000
    api_debug: bool = False
    # Placeholder value only; must be overridden outside local development.
    api_secret_key: str = "change-me-in-production"
    # Comma-separated list; use ``cors_origin_list`` for the parsed form.
    cors_origins: str = "http://localhost:3000,http://localhost:8000"

    # HousingMind Ecosystem
    housing_lens_api_url: str = "http://localhost:8001/api/v1"
    housing_lens_api_key: str = ""
    housing_ear_api_url: str = "http://localhost:8002/api/v1"
    housing_ear_api_key: str = ""
    housing_mind_webhook_secret: str = ""

    # ML
    ml_model_path: str = "models/timeline_prediction.joblib"
    ml_model_version: str = "0.1.0"

    # Logging
    log_level: str = "INFO"
    log_format: str = "json"

    # Paths
    # Two levels up from this file, i.e. the directory containing ``config/``.
    project_root: Path = Path(__file__).parent.parent

    model_config = {"env_file": ".env", "env_file_encoding": "utf-8"}

    @property
    def cors_origin_list(self) -> list[str]:
        """Return ``cors_origins`` split on commas into a list of origins.

        Each entry is stripped of surrounding whitespace. Blank entries are
        dropped, so an empty ``cors_origins`` value or a trailing/doubled
        comma does not produce an empty-string origin.
        """
        stripped = (origin.strip() for origin in self.cors_origins.split(","))
        return [origin for origin in stripped if origin]


@lru_cache
def get_settings() -> Settings:
    """Return cached application settings.

    The ``Settings`` instance is built on the first call and reused on
    every later call for the life of the process.
    """
    return Settings()
build-essential \ + libpq-dev \ + && rm -rf /var/lib/apt/lists/* + +# Python dependencies +COPY requirements/base.txt requirements/base.txt +COPY requirements/ml.txt requirements/ml.txt +RUN pip install --no-cache-dir -r requirements/ml.txt + +# Application code +COPY . . + +# Create non-root user +RUN adduser --disabled-password --gecos "" appuser && \ + chown -R appuser:appuser /app +USER appuser + +EXPOSE 8000 + +CMD ["uvicorn", "src.api.app:app", "--host", "0.0.0.0", "--port", "8000"] diff --git a/docker/docker-compose.dev.yml b/docker/docker-compose.dev.yml new file mode 100644 index 0000000..97bfa8b --- /dev/null +++ b/docker/docker-compose.dev.yml @@ -0,0 +1,63 @@ +version: "3.9" + +services: + api: + build: + context: .. + dockerfile: docker/Dockerfile + command: uvicorn src.api.app:app --host 0.0.0.0 --port 8000 --reload + ports: + - "8000:8000" + env_file: + - ../.env + volumes: + - ..:/app + depends_on: + db: + condition: service_healthy + redis: + condition: service_healthy + + db: + image: postgis/postgis:14-3.4 + environment: + POSTGRES_USER: housinghand + POSTGRES_PASSWORD: password + POSTGRES_DB: housinghand + ports: + - "5432:5432" + volumes: + - pgdata-dev:/var/lib/postgresql/data + healthcheck: + test: ["CMD-SHELL", "pg_isready -U housinghand"] + interval: 5s + timeout: 5s + retries: 5 + + redis: + image: redis:7-alpine + ports: + - "6379:6379" + healthcheck: + test: ["CMD", "redis-cli", "ping"] + interval: 5s + timeout: 5s + retries: 5 + + celery-worker: + build: + context: .. 
+ dockerfile: docker/Dockerfile + command: celery -A src.tasks.celery_app worker --loglevel=debug + env_file: + - ../.env + volumes: + - ..:/app + depends_on: + db: + condition: service_healthy + redis: + condition: service_healthy + +volumes: + pgdata-dev: diff --git a/docker/docker-compose.test.yml b/docker/docker-compose.test.yml new file mode 100644 index 0000000..2c64791 --- /dev/null +++ b/docker/docker-compose.test.yml @@ -0,0 +1,36 @@ +version: "3.9" + +services: + test: + build: + context: .. + dockerfile: docker/Dockerfile + command: pytest tests/ -v --cov=src --cov-report=term-missing + environment: + DATABASE_URL: postgresql://housinghand:password@db:5432/housinghand_test + REDIS_URL: redis://redis:6379/0 + depends_on: + db: + condition: service_healthy + redis: + condition: service_healthy + + db: + image: postgis/postgis:14-3.4 + environment: + POSTGRES_USER: housinghand + POSTGRES_PASSWORD: password + POSTGRES_DB: housinghand_test + healthcheck: + test: ["CMD-SHELL", "pg_isready -U housinghand"] + interval: 5s + timeout: 5s + retries: 5 + + redis: + image: redis:7-alpine + healthcheck: + test: ["CMD", "redis-cli", "ping"] + interval: 5s + timeout: 5s + retries: 5 diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml new file mode 100644 index 0000000..c529354 --- /dev/null +++ b/docker/docker-compose.yml @@ -0,0 +1,72 @@ +version: "3.9" + +services: + api: + build: + context: .. 
+ dockerfile: docker/Dockerfile + ports: + - "8000:8000" + env_file: + - ../.env + depends_on: + db: + condition: service_healthy + redis: + condition: service_healthy + restart: unless-stopped + + db: + image: postgis/postgis:14-3.4 + environment: + POSTGRES_USER: housinghand + POSTGRES_PASSWORD: password + POSTGRES_DB: housinghand + ports: + - "5432:5432" + volumes: + - pgdata:/var/lib/postgresql/data + healthcheck: + test: ["CMD-SHELL", "pg_isready -U housinghand"] + interval: 5s + timeout: 5s + retries: 5 + + redis: + image: redis:7-alpine + ports: + - "6379:6379" + healthcheck: + test: ["CMD", "redis-cli", "ping"] + interval: 5s + timeout: 5s + retries: 5 + + celery-worker: + build: + context: .. + dockerfile: docker/Dockerfile + command: celery -A src.tasks.celery_app worker --loglevel=info + env_file: + - ../.env + depends_on: + db: + condition: service_healthy + redis: + condition: service_healthy + restart: unless-stopped + + celery-beat: + build: + context: .. + dockerfile: docker/Dockerfile + command: celery -A src.tasks.celery_app beat --loglevel=info + env_file: + - ../.env + depends_on: + redis: + condition: service_healthy + restart: unless-stopped + +volumes: + pgdata: diff --git a/docs/ANALYTICS.md b/docs/ANALYTICS.md new file mode 100644 index 0000000..8de57c7 --- /dev/null +++ b/docs/ANALYTICS.md @@ -0,0 +1,59 @@ +# HousingHand Analytics Methodology + +## Pipeline Health Assessment + +Each project receives a weighted health score (0-100): + +| Component | Weight | Description | +|-----------|--------|-------------| +| Timeline | 30% | Adherence to peer benchmark duration | +| Budget | 25% | Variance from original budget | +| Funding | 20% | Funding gap as % of total cost | +| Risk | 15% | Accumulated risk factor count | +| Team | 10% | Team stability indicators | + +**Health statuses:** +- On Track (80-100) +- At Risk (60-79) +- Delayed (40-59) +- Stalled (0-39) + +## Bottleneck Detection + +Identifies systematic barriers across jurisdictions by: 
+ +1. **Stage analysis** - Which pipeline stage has the longest delays vs national benchmarks +2. **Topic analysis** - Which regulatory requirements cause the most friction (cross-referenced with HousingLens) +3. **Temporal analysis** - Whether timelines are improving, worsening, or stable +4. **Peer comparison** - How jurisdiction compares to similar jurisdictions + +## Timeline Prediction + +Uses a Random Forest Regression model trained on completed projects. + +**Features:** +- Project characteristics (units, AMI mix, building type, stories) +- Jurisdiction friction scores (from HousingLens) +- Market conditions (construction cost index, interest rates) +- Historical peer project timelines +- Seasonal factors + +**Output:** Predicted months for each remaining stage with 80% confidence intervals. + +## Policy Reform Impact + +Uses difference-in-differences methodology: +1. Compare pre-reform and post-reform project timelines +2. Control for market conditions and project characteristics +3. Statistical significance via Welch's t-test (p < 0.05) +4. Effectiveness categories: highly effective (>30% improvement), effective (>15%), marginal (>5%), ineffective + +## Portfolio Intelligence + +Aggregates project data across dimensions: +- Pipeline snapshot (units by stage) +- Delivery forecast (units by quarter) +- Health distribution +- Velocity metrics (annualized production rates) +- Funding gap analysis +- Geographic distribution diff --git a/docs/API.md b/docs/API.md new file mode 100644 index 0000000..a0be97d --- /dev/null +++ b/docs/API.md @@ -0,0 +1,102 @@ +# HousingHand API Documentation + +## Base URL + +``` +http://localhost:8000/api/v1 +``` + +## Authentication + +API key authentication via `Authorization: Bearer {api_key}` header.
+ +## Endpoints + +### Projects + +| Method | Path | Description | +|--------|------|-------------| +| GET | `/projects` | List projects with filtering | +| POST | `/projects` | Create a new project | +| GET | `/projects/{id}` | Get project details | +| PUT | `/projects/{id}` | Update a project | +| DELETE | `/projects/{id}` | Delete a project | + +#### Query Parameters (GET /projects) + +- `jurisdiction` - Filter by jurisdiction +- `city` - Filter by city +- `state` - Filter by state (2-letter code) +- `current_stage` - Filter by pipeline stage +- `overall_health` - Filter by health status +- `limit` - Results per page (default: 50, max: 200) +- `offset` - Pagination offset + +### Health Assessments + +| Method | Path | Description | +|--------|------|-------------| +| GET | `/health/{project_id}` | Get health assessment | +| POST | `/health/batch` | Batch health assessment | + +### Analytics + +| Method | Path | Description | +|--------|------|-------------| +| GET | `/analytics/bottlenecks` | Jurisdiction bottleneck analysis | + +#### Query Parameters + +- `jurisdiction` (required) +- `timeframe` - Analysis window (default: `last_24_months`) + +### Predictions + +| Method | Path | Description | +|--------|------|-------------| +| POST | `/predictions/timeline` | Predict project timeline | + +### Portfolio + +| Method | Path | Description | +|--------|------|-------------| +| POST | `/portfolio/intelligence` | Generate portfolio dashboard | + +### Reforms + +| Method | Path | Description | +|--------|------|-------------| +| POST | `/reforms` | Create policy reform record | +| GET | `/reforms/{id}/impact` | Measure reform impact | + +## Response Format + +All responses follow this structure: + +```json +{ + "data": { ... 
}, + "meta": { + "total": 100, + "limit": 50, + "offset": 0 + } +} +``` + +Error responses: + +```json +{ + "detail": "Error message" +} +``` + +## Status Codes + +- `200` - Success +- `201` - Created +- `400` - Bad request +- `404` - Not found +- `422` - Validation error +- `500` - Internal server error diff --git a/docs/DATA_MODEL.md b/docs/DATA_MODEL.md new file mode 100644 index 0000000..e21f1be --- /dev/null +++ b/docs/DATA_MODEL.md @@ -0,0 +1,59 @@ +# HousingHand Data Model + +## Overview + +HousingHand tracks affordable housing projects through seven pipeline stages from concept to long-term operations. + +## Core Tables + +### projects + +The central table tracking every affordable housing project. + +**Key fields:** +- `project_id` (UUID, PK) - Unique identifier +- `project_slug` (string, unique) - URL-friendly identifier +- `current_stage` - Pipeline stage enum +- `overall_health` - Health status enum +- `health_score` - Numeric health score (0-100) +- `jurisdiction` - Links to HousingLens friction data + +**Timeline fields:** Each stage has `{stage}_start`, `{stage}_complete`, and `{stage}_duration_days` columns (for example, `entitlement_start`, `entitlement_complete`, `entitlement_duration_days`). + +**Cost fields:** Total development cost, component breakdown, and friction-induced costs. + +### funding_sources + +Many-to-one relationship with projects. Tracks each funding source with type, amount, status, and terms. + +### project_barriers + +Many-to-one relationship with projects. Links specific regulatory friction points to project delays with cost and time impact. + +### peer_groups + +Defines comparable project cohorts for benchmarking. Stores calculated statistics (medians, percentiles). + +### portfolio_dashboards + +Saved portfolio configurations with cached aggregate metrics. + +### policy_reforms + +Tracks regulatory changes with pre/post impact measurements and statistical significance. + +## Pipeline Stages + +1. **Concept** - Site identified, preliminary feasibility +2. **Pre-Development** - Due diligence, community engagement +3.
**Entitlement** - Zoning approvals, design review +4. **Financing** - Tax credit applications, loan underwriting +5. **Construction** - Groundbreaking to certificate of occupancy +6. **Lease-Up** - Marketing and tenant placement +7. **Operations** - Long-term affordability compliance + +Additional statuses: `stalled`, `abandoned` + +## Enumerations + +See `src/models/enums.py` for all enum definitions including BuildingType, StructureType, FundingSourceType, OverallHealth, etc. diff --git a/docs/DEPLOYMENT.md b/docs/DEPLOYMENT.md new file mode 100644 index 0000000..bc1dda0 --- /dev/null +++ b/docs/DEPLOYMENT.md @@ -0,0 +1,91 @@ +# HousingHand Deployment Guide + +## Prerequisites + +- Docker and Docker Compose +- PostgreSQL 14+ with PostGIS extension +- Redis 7+ +- Python 3.10+ (for local development) + +## Quick Start (Docker) + +```bash +# Copy environment config +cp .env.example .env + +# Start all services +cd docker +docker compose up -d + +# Run migrations +docker compose exec api alembic upgrade head + +# Seed sample data +docker compose exec api python scripts/seed_database.py +``` + +## Local Development + +```bash +# Create virtual environment +python -m venv venv +source venv/bin/activate + +# Install dependencies +pip install -r requirements/dev.txt +pip install -r requirements/ml.txt + +# Set up environment +cp .env.example .env +# Edit .env with your local database credentials + +# Run migrations +alembic upgrade head + +# Start API server +uvicorn src.api.app:app --reload --port 8000 + +# Start Celery worker (separate terminal) +celery -A src.tasks.celery_app worker --loglevel=info + +# Start Celery beat (separate terminal) +celery -A src.tasks.celery_app beat --loglevel=info +``` + +## Running Tests + +```bash +# Local +pytest tests/ -v --cov=src + +# Docker +cd docker +docker compose -f docker-compose.test.yml up --abort-on-container-exit +``` + +## Database Migrations + +```bash +# Create a new migration +alembic revision --autogenerate -m 
"description" + +# Apply migrations +alembic upgrade head + +# Rollback one migration +alembic downgrade -1 +``` + +## Environment Variables + +See `.env.example` for all available configuration options. + +## Production Considerations + +- Set `API_DEBUG=false` +- Use a strong `API_SECRET_KEY` +- Configure proper database credentials +- Set up SSL/TLS termination +- Configure log aggregation +- Set up monitoring and alerting +- Schedule regular database backups diff --git a/docs/DEVELOPER_PORTAL.md b/docs/DEVELOPER_PORTAL.md new file mode 100644 index 0000000..1dc2c53 --- /dev/null +++ b/docs/DEVELOPER_PORTAL.md @@ -0,0 +1,71 @@ +# Developer Portal Guide + +## Overview + +The Developer Portal allows housing developers to submit and track their affordable housing projects through the HousingHand pipeline. + +## Submitting a Project + +### Required Information + +- **Project name** - Official project name +- **Total units** - Total number of housing units +- **Location** - Address, city, state, jurisdiction + +### Recommended Information + +- Affordable unit count and AMI breakdown +- Building type and structure type +- Developer organization +- Site acreage and stories +- Estimated total development cost + +### API Endpoint + +``` +POST /api/v1/projects +Content-Type: application/json + +{ + "project_name": "Example Apartments", + "address": "123 Main Street", + "city": "Sacramento", + "state": "CA", + "jurisdiction": "Sacramento, CA", + "total_units": 80, + "affordable_units": 72, + "building_type": "new_construction", + "developer_org": "Example Housing Corp" +} +``` + +## Updating Pipeline Stage + +When a project advances to the next stage: + +``` +PUT /api/v1/projects/{project_id} +{ + "current_stage": "entitlement", + "stage_entry_date": "2025-06-15" +} +``` + +## Reporting Barriers + +When a project encounters regulatory friction: + +``` +POST /api/v1/projects/{project_id}/barriers +{ + "barrier_type": "parking_requirements", + "barrier_description": 
"Required to provide 2 spaces per unit", + "stage_encountered": "entitlement", + "date_encountered": "2025-03-01", + "variance_required": true +} +``` + +## Data Quality + +Submitted data is validated automatically. The system calculates a completeness score and flags inconsistencies. More complete data leads to better analytics and predictions. diff --git a/docs/INTEGRATION_GUIDE.md b/docs/INTEGRATION_GUIDE.md new file mode 100644 index 0000000..8971e04 --- /dev/null +++ b/docs/INTEGRATION_GUIDE.md @@ -0,0 +1,57 @@ +# HousingHand Integration Guide + +## HousingMind Ecosystem + +HousingHand integrates with three other HousingMind components: + +### HousingLens (Friction Scores) + +HousingLens provides jurisdiction-level regulatory friction scores that HousingHand uses for: +- Predicting project timelines +- Identifying bottleneck root causes +- Validating observed delays against predicted friction + +**Configuration:** +``` +HOUSING_LENS_API_URL=http://housing-lens-api:8001/api/v1 +HOUSING_LENS_API_KEY=your-api-key +``` + +**Client:** `src/integrations/housing_lens.py` + +### HousingEar (Policy Monitoring) + +HousingEar monitors policy changes and funding opportunities. HousingHand uses this to: +- Alert projects about new funding programs +- Track policy reforms for impact measurement +- Connect regulatory changes to pipeline outcomes + +**Configuration:** +``` +HOUSING_EAR_API_URL=http://housing-ear-api:8002/api/v1 +HOUSING_EAR_API_KEY=your-api-key +``` + +**Client:** `src/integrations/housing_ear.py` + +### HousingMind (Query Metadata) + +HousingMind sends webhook events when users query about specific projects or jurisdictions. HousingHand tracks these to understand stakeholder engagement patterns. + +**Configuration:** +``` +HOUSING_MIND_WEBHOOK_SECRET=your-webhook-secret +``` + +**Webhook endpoint:** `POST /api/v1/webhooks/housing-mind` + +## Data Sources + +### Developer Portal +Developers submit and update project data through the API. 
+ +### Public Records +Permit databases and LIHTC allocation data are scraped periodically. + +### Funder Reports +Funding organizations can submit portfolio data via API or batch import. diff --git a/notebooks/exploratory/.gitkeep b/notebooks/exploratory/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/notebooks/model_development/.gitkeep b/notebooks/model_development/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/notebooks/validation/.gitkeep b/notebooks/validation/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..a62038b --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,90 @@ +[build-system] +requires = ["setuptools>=68.0", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "housinghand" +version = "0.1.0" +description = "Development Pipeline Intelligence Platform for the HousingMind ecosystem" +readme = "README.md" +license = {text = "MIT"} +requires-python = ">=3.10" +authors = [ + {name = "HousingMind Team"}, +] +classifiers = [ + "Development Status :: 3 - Alpha", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Framework :: FastAPI", +] +dependencies = [ + "fastapi>=0.104.0", + "uvicorn[standard]>=0.24.0", + "sqlalchemy>=2.0.0", + "alembic>=1.12.0", + "psycopg2-binary>=2.9.0", + "geoalchemy2>=0.14.0", + "pydantic>=2.5.0", + "pydantic-settings>=2.1.0", + "celery[redis]>=5.3.0", + "redis>=5.0.0", + "httpx>=0.25.0", + "pandas>=2.1.0", + "numpy>=1.26.0", + "scikit-learn>=1.3.0", + "scipy>=1.11.0", + "statsmodels>=0.14.0", + "plotly>=5.18.0", + "python-dateutil>=2.8.0", + "pyyaml>=6.0", +] + +[project.optional-dependencies] +dev = [ + "pytest>=7.4.0", + "pytest-cov>=4.1.0", + "pytest-asyncio>=0.21.0", + "httpx>=0.25.0", + "factory-boy>=3.3.0", + "faker>=20.0.0", + "black>=23.0.0", + "ruff>=0.1.0", + 
"mypy>=1.7.0", + "pre-commit>=3.5.0", +] +ml = [ + "joblib>=1.3.0", + "optuna>=3.4.0", + "shap>=0.43.0", +] + +[tool.setuptools.packages.find] +where = ["."] +include = ["src*", "config*"] + +[tool.pytest.ini_options] +testpaths = ["tests"] +python_files = ["test_*.py"] +python_classes = ["Test*"] +python_functions = ["test_*"] +addopts = "-v --tb=short" + +[tool.black] +line-length = 100 +target-version = ["py310"] + +[tool.ruff] +line-length = 100 +target-version = "py310" + +[tool.ruff.lint] +select = ["E", "F", "I", "N", "W", "UP"] + +[tool.mypy] +python_version = "3.10" +warn_return_any = true +warn_unused_configs = true +disallow_untyped_defs = true diff --git a/requirements/base.txt b/requirements/base.txt new file mode 100644 index 0000000..b10ea1e --- /dev/null +++ b/requirements/base.txt @@ -0,0 +1,15 @@ +fastapi>=0.104.0 +uvicorn[standard]>=0.24.0 +sqlalchemy>=2.0.0 +alembic>=1.12.0 +psycopg2-binary>=2.9.0 +geoalchemy2>=0.14.0 +pydantic>=2.5.0 +pydantic-settings>=2.1.0 +celery[redis]>=5.3.0 +redis>=5.0.0 +httpx>=0.25.0 +pandas>=2.1.0 +numpy>=1.26.0 +python-dateutil>=2.8.0 +pyyaml>=6.0 diff --git a/requirements/dev.txt b/requirements/dev.txt new file mode 100644 index 0000000..8a52245 --- /dev/null +++ b/requirements/dev.txt @@ -0,0 +1,7 @@ +-r base.txt +-r test.txt +black>=23.0.0 +ruff>=0.1.0 +mypy>=1.7.0 +pre-commit>=3.5.0 +ipython>=8.0.0 diff --git a/requirements/ml.txt b/requirements/ml.txt new file mode 100644 index 0000000..7a2ab8d --- /dev/null +++ b/requirements/ml.txt @@ -0,0 +1,8 @@ +-r base.txt +scikit-learn>=1.3.0 +scipy>=1.11.0 +statsmodels>=0.14.0 +plotly>=5.18.0 +joblib>=1.3.0 +optuna>=3.4.0 +shap>=0.43.0 diff --git a/requirements/test.txt b/requirements/test.txt new file mode 100644 index 0000000..24b6a93 --- /dev/null +++ b/requirements/test.txt @@ -0,0 +1,6 @@ +-r base.txt +pytest>=7.4.0 +pytest-cov>=4.1.0 +pytest-asyncio>=0.21.0 +factory-boy>=3.3.0 +faker>=20.0.0 diff --git a/scripts/generate_reports.py b/scripts/generate_reports.py new 
file mode 100644 index 0000000..281735c --- /dev/null +++ b/scripts/generate_reports.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python3 +"""Generate ad-hoc reports from HousingHand data.""" + +import json +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent.parent)) + +import logging + +from src.analytics.bottleneck_detection import identify_systemic_bottlenecks +from src.analytics.portfolio_intelligence import generate_portfolio_intelligence +from src.database.connection import get_session_factory + +logging.basicConfig(level=logging.INFO, format="%(asctime)s %(name)s %(levelname)s %(message)s") +logger = logging.getLogger(__name__) + + +def portfolio_report(jurisdiction: str) -> None: + """Generate a portfolio intelligence report for a jurisdiction.""" + SessionLocal = get_session_factory() + db = SessionLocal() + + try: + result = generate_portfolio_intelligence( + db=db, + geography_filter={"jurisdiction": jurisdiction}, + stakeholder_type="city", + ) + print(json.dumps(result, indent=2, default=str)) + finally: + db.close() + + +def bottleneck_report(jurisdiction: str, timeframe: str = "last_24_months") -> None: + """Generate a bottleneck analysis report for a jurisdiction.""" + SessionLocal = get_session_factory() + db = SessionLocal() + + try: + result = identify_systemic_bottlenecks( + db=db, + jurisdiction=jurisdiction, + timeframe=timeframe, + ) + print(json.dumps(result, indent=2, default=str)) + finally: + db.close() + + +if __name__ == "__main__": + if len(sys.argv) < 3: + print("Usage: python generate_reports.py ") + print("Report types: portfolio, bottleneck") + sys.exit(1) + + report_type = sys.argv[1] + jurisdiction = sys.argv[2] + + if report_type == "portfolio": + portfolio_report(jurisdiction) + elif report_type == "bottleneck": + timeframe = sys.argv[3] if len(sys.argv) > 3 else "last_24_months" + bottleneck_report(jurisdiction, timeframe) + else: + print(f"Unknown report type: {report_type}") + sys.exit(1) diff --git 
a/scripts/migrate_data.py b/scripts/migrate_data.py new file mode 100644 index 0000000..279be0d --- /dev/null +++ b/scripts/migrate_data.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python3 +"""Data migration utilities for importing projects from external sources.""" + +import csv +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent.parent)) + +import logging + +from src.data_collection.developer_portal import DeveloperPortalService +from src.data_collection.validation import DataValidator +from src.database.connection import get_session_factory + +logging.basicConfig(level=logging.INFO, format="%(asctime)s %(name)s %(levelname)s %(message)s") +logger = logging.getLogger(__name__) + + +def import_from_csv(csv_path: str) -> dict: + """Import project data from a CSV file. + + Expected columns: project_name, address, city, state, jurisdiction, + developer_org, total_units, affordable_units, building_type, current_stage + """ + SessionLocal = get_session_factory() + db = SessionLocal() + service = DeveloperPortalService(db) + validator = DataValidator() + + results = {"imported": 0, "skipped": 0, "errors": 0} + + try: + with open(csv_path, newline="", encoding="utf-8") as f: + reader = csv.DictReader(f) + for row in reader: + try: + # Clean up the row data + data = {k.strip(): v.strip() for k, v in row.items() if v and v.strip()} + + # Convert numeric fields + for field in ["total_units", "affordable_units"]: + if field in data: + data[field] = int(data[field]) + + project = service.create_project(data) + validation = validator.validate_project(project) + + if not validation.is_valid: + logger.warning( + f"Project '{data.get('project_name')}' has validation errors: " + f"{validation.errors}" + ) + + results["imported"] += 1 + + except Exception as e: + logger.error(f"Error importing row: {e}") + results["errors"] += 1 + + logger.info(f"Import complete: {results}") + return results + + finally: + db.close() + + +if __name__ == "__main__": + if 
len(sys.argv) < 2: + print("Usage: python migrate_data.py ") + sys.exit(1) + import_from_csv(sys.argv[1]) diff --git a/scripts/seed_database.py b/scripts/seed_database.py new file mode 100644 index 0000000..ec2bcd0 --- /dev/null +++ b/scripts/seed_database.py @@ -0,0 +1,260 @@ +#!/usr/bin/env python3 +"""Seed the database with sample project data for development and testing.""" + +import sys +import uuid +from datetime import date, timedelta +from pathlib import Path + +# Add project root to path +sys.path.insert(0, str(Path(__file__).parent.parent)) + +from src.database.connection import Base, get_engine, get_session_factory +from src.models.barrier import ProjectBarrier +from src.models.enums import ( + BarrierStage, + BuildingType, + DataSource, + FundingSourceStatus, + FundingSourceType, + NeighborOpposition, + OverallHealth, + PipelineStage, + StructureType, +) +from src.models.funding_source import FundingSource +from src.models.project import Project + +SAMPLE_PROJECTS = [ + { + "project_name": "Sunrise Terrace Apartments", + "project_slug": "sunrise-terrace-apartments", + "address": "1200 Main Street", + "city": "Sacramento", + "county": "Sacramento", + "state": "CA", + "zip": "95814", + "jurisdiction": "Sacramento, CA", + "developer_org": "Community Housing Partners", + "total_units": 80, + "affordable_units": 72, + "market_units": 8, + "ami_30_units": 20, + "ami_50_units": 32, + "ami_60_units": 20, + "building_type": BuildingType.NEW_CONSTRUCTION, + "structure_type": StructureType.WOOD_FRAME, + "stories": 4, + "parking_spaces": 60, + "current_stage": PipelineStage.CONSTRUCTION, + "overall_health": OverallHealth.ON_TRACK, + "health_score": 82.5, + "concept_start": date.today() - timedelta(days=720), + "entitlement_start": date.today() - timedelta(days=540), + "entitlement_complete": date.today() - timedelta(days=300), + "entitlement_duration_days": 240, + "financing_start": date.today() - timedelta(days=300), + "financing_complete": date.today() - 
timedelta(days=180), + "financing_duration_days": 120, + "construction_start": date.today() - timedelta(days=180), + "total_development_cost": 32_000_000, + "cost_per_unit": 400_000, + "hard_costs": 22_000_000, + "soft_costs": 5_500_000, + "land_acquisition_cost": 3_000_000, + "data_source": DataSource.DEVELOPER_PORTAL, + "data_quality_score": 0.85, + }, + { + "project_name": "Oak Park Senior Village", + "project_slug": "oak-park-senior-village", + "address": "450 Broadway", + "city": "Oakland", + "county": "Alameda", + "state": "CA", + "zip": "94607", + "jurisdiction": "Oakland, CA", + "developer_org": "Affordable Seniors Inc", + "total_units": 120, + "affordable_units": 120, + "senior_units": 120, + "ami_30_units": 30, + "ami_50_units": 50, + "ami_60_units": 40, + "building_type": BuildingType.NEW_CONSTRUCTION, + "structure_type": StructureType.CONCRETE, + "stories": 6, + "current_stage": PipelineStage.ENTITLEMENT, + "overall_health": OverallHealth.AT_RISK, + "health_score": 55.0, + "concept_start": date.today() - timedelta(days=400), + "entitlement_start": date.today() - timedelta(days=250), + "total_development_cost": 55_000_000, + "cost_per_unit": 458_333, + "jurisdiction_friction_score": 720, + "neighbor_opposition_level": NeighborOpposition.MODERATE, + "data_source": DataSource.PUBLIC_RECORDS, + "data_quality_score": 0.7, + }, + { + "project_name": "Riverdale Family Homes", + "project_slug": "riverdale-family-homes", + "address": "789 Elm Street", + "city": "Portland", + "county": "Multnomah", + "state": "OR", + "zip": "97201", + "jurisdiction": "Portland, OR", + "developer_org": "Northwest Housing Alliance", + "total_units": 45, + "affordable_units": 45, + "family_units": 45, + "ami_30_units": 10, + "ami_50_units": 20, + "ami_60_units": 15, + "building_type": BuildingType.NEW_CONSTRUCTION, + "structure_type": StructureType.WOOD_FRAME, + "stories": 3, + "current_stage": PipelineStage.FINANCING, + "overall_health": OverallHealth.ON_TRACK, + "health_score": 
75.0, + "concept_start": date.today() - timedelta(days=500), + "entitlement_complete": date.today() - timedelta(days=90), + "financing_start": date.today() - timedelta(days=90), + "total_development_cost": 18_000_000, + "cost_per_unit": 400_000, + "funding_gap": 2_500_000, + "data_source": DataSource.DEVELOPER_PORTAL, + "data_quality_score": 0.8, + }, + { + "project_name": "Metro Heights Mixed-Use", + "project_slug": "metro-heights-mixed-use", + "address": "200 Central Avenue", + "city": "Denver", + "county": "Denver", + "state": "CO", + "zip": "80202", + "jurisdiction": "Denver, CO", + "developer_org": "Mountain West Development", + "total_units": 200, + "affordable_units": 100, + "market_units": 100, + "ami_50_units": 40, + "ami_60_units": 40, + "ami_80_units": 20, + "building_type": BuildingType.NEW_CONSTRUCTION, + "structure_type": StructureType.STEEL, + "stories": 12, + "current_stage": PipelineStage.PRE_DEVELOPMENT, + "overall_health": OverallHealth.ON_TRACK, + "health_score": 90.0, + "concept_start": date.today() - timedelta(days=120), + "pre_development_start": date.today() - timedelta(days=60), + "total_development_cost": 85_000_000, + "cost_per_unit": 425_000, + "data_source": DataSource.DEVELOPER_PORTAL, + "data_quality_score": 0.6, + }, + { + "project_name": "Heritage Court Rehabilitation", + "project_slug": "heritage-court-rehab", + "address": "55 Heritage Lane", + "city": "Austin", + "county": "Travis", + "state": "TX", + "zip": "78701", + "jurisdiction": "Austin, TX", + "developer_org": "Lone Star Affordable Housing", + "total_units": 60, + "affordable_units": 60, + "ami_30_units": 15, + "ami_50_units": 25, + "ami_60_units": 20, + "building_type": BuildingType.SUBSTANTIAL_REHAB, + "structure_type": StructureType.WOOD_FRAME, + "stories": 2, + "current_stage": PipelineStage.STALLED, + "overall_health": OverallHealth.STALLED, + "health_score": 25.0, + "concept_start": date.today() - timedelta(days=900), + "entitlement_start": date.today() - 
timedelta(days=600), + "total_development_cost": 15_000_000, + "cost_per_unit": 250_000, + "funding_gap": 5_000_000, + "data_source": DataSource.FUNDER_REPORT, + "data_quality_score": 0.65, + }, +] + + +def seed() -> None: + """Create tables and insert sample data.""" + engine = get_engine() + Base.metadata.create_all(bind=engine) + + SessionLocal = get_session_factory() + db = SessionLocal() + + try: + # Check if data already exists + existing = db.query(Project).count() + if existing > 0: + print(f"Database already has {existing} projects. Skipping seed.") + return + + for data in SAMPLE_PROJECTS: + project = Project(**data) + db.add(project) + + db.flush() + + # Add sample funding sources for first project + projects = db.query(Project).all() + if projects: + p = projects[0] + db.add(FundingSource( + project_id=p.project_id, + source_type=FundingSourceType.LIHTC_9PCT, + source_name="9% LIHTC - California", + provider_organization="California Tax Credit Allocation Committee", + amount=12_000_000, + status=FundingSourceStatus.CLOSED, + )) + db.add(FundingSource( + project_id=p.project_id, + source_type=FundingSourceType.CONSTRUCTION_LOAN, + source_name="Construction Loan", + provider_organization="Wells Fargo", + amount=18_000_000, + status=FundingSourceStatus.CLOSED, + interest_rate=5.5, + term_years=2, + )) + + # Add sample barrier + db.add(ProjectBarrier( + project_id=p.project_id, + barrier_type="parking_requirements", + barrier_description="Required 1.5 spaces per unit, requested reduction to 0.75", + jurisdiction="Sacramento, CA", + friction_score=650, + stage_encountered=BarrierStage.ENTITLEMENT, + date_encountered=date.today() - timedelta(days=500), + date_resolved=date.today() - timedelta(days=400), + days_delayed=45, + cost_impact=150_000, + variance_required=True, + variance_granted=True, + resolution_strategy="Submitted parking demand study showing lower actual usage", + )) + + db.commit() + print(f"Seeded {len(SAMPLE_PROJECTS)} projects with funding 
sources and barriers.") + + finally: + db.close() + + +if __name__ == "__main__": + seed() diff --git a/scripts/train_model.py b/scripts/train_model.py new file mode 100644 index 0000000..c97c32a --- /dev/null +++ b/scripts/train_model.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 +"""Train the timeline prediction ML model.""" + +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent.parent)) + +import logging + +from config.settings import get_settings +from src.database.connection import get_session_factory +from src.ml.model_training import TrainingPipeline +from src.models.enums import PipelineStage +from src.models.project import Project + +logging.basicConfig(level=logging.INFO, format="%(asctime)s %(name)s %(levelname)s %(message)s") +logger = logging.getLogger(__name__) + + +def main() -> None: + """Run the model training pipeline.""" + settings = get_settings() + SessionLocal = get_session_factory() + db = SessionLocal() + + try: + # Fetch completed projects for training + completed_stages = [PipelineStage.OPERATIONS, PipelineStage.LEASE_UP] + projects = ( + db.query(Project) + .filter(Project.current_stage.in_(completed_stages)) + .all() + ) + + logger.info(f"Found {len(projects)} completed projects for training") + + if len(projects) < 10: + logger.warning( + "Fewer than 10 completed projects available. " + "Model quality may be limited. Proceeding anyway." 
+ ) + + pipeline = TrainingPipeline() + result = pipeline.train(projects) + + # Save model + model_path = Path(settings.ml_model_path) + model_path.parent.mkdir(parents=True, exist_ok=True) + pipeline.save_model(str(model_path)) + + logger.info(f"Model saved to {model_path}") + logger.info(f"Training results: {result}") + + finally: + db.close() + + +if __name__ == "__main__": + main() diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/analytics/__init__.py b/src/analytics/__init__.py new file mode 100644 index 0000000..092d8bb --- /dev/null +++ b/src/analytics/__init__.py @@ -0,0 +1,93 @@ +"""HousingHand Analytics Engine. + +Provides pipeline health assessment, bottleneck detection, timeline +prediction, policy reform impact measurement, portfolio intelligence, +peer benchmarking, and statistical testing utilities. +""" + +from src.analytics.bottleneck_detection import ( + detect_jurisdiction_bottlenecks, + detect_systemic_bottlenecks, + identify_stage_chokepoints, +) +from src.analytics.health_assessment import ( + assess_batch_health, + assess_project_health, + assess_project_health_and_persist, +) +from src.analytics.peer_benchmarking import ( + compare_project_to_peers, + compute_peer_benchmarks, + find_peer_group, + refresh_peer_group_stats, +) +from src.analytics.portfolio_intelligence import ( + generate_and_persist_dashboard, + generate_funder_view, + generate_pha_view, + generate_portfolio_dashboard, + generate_state_view, +) +from src.analytics.reform_impact import ( + build_reform_time_series, + compare_reforms_in_jurisdiction, + measure_reform_impact, + measure_reform_impact_and_persist, +) +from src.analytics.statistical_tests import ( + cohens_d, + confidence_interval, + independent_ttest, + mann_whitney_test, + paired_ttest, + percentile_rank, + select_and_run_test, + test_normality, + z_score, +) +from src.analytics.timeline_prediction import ( + predict_batch_timelines, + 
predict_from_friction_score, + predict_project_timeline, +) + +__all__ = [ + # Health Assessment + "assess_project_health", + "assess_project_health_and_persist", + "assess_batch_health", + # Bottleneck Detection + "detect_jurisdiction_bottlenecks", + "detect_systemic_bottlenecks", + "identify_stage_chokepoints", + # Timeline Prediction + "predict_project_timeline", + "predict_batch_timelines", + "predict_from_friction_score", + # Reform Impact + "measure_reform_impact", + "measure_reform_impact_and_persist", + "compare_reforms_in_jurisdiction", + "build_reform_time_series", + # Portfolio Intelligence + "generate_portfolio_dashboard", + "generate_and_persist_dashboard", + "generate_funder_view", + "generate_pha_view", + "generate_state_view", + # Peer Benchmarking + "find_peer_group", + "compute_peer_benchmarks", + "compare_project_to_peers", + "refresh_peer_group_stats", + # Statistical Tests + "independent_ttest", + "paired_ttest", + "mann_whitney_test", + "cohens_d", + "confidence_interval", + "test_normality", + "select_and_run_test", + "percentile_rank", + "z_score", +] diff --git a/src/analytics/bottleneck_detection.py b/src/analytics/bottleneck_detection.py new file mode 100644 index 0000000..14a0db6 --- /dev/null +++ b/src/analytics/bottleneck_detection.py @@ -0,0 +1,537 @@ +"""Systemic bottleneck identification across jurisdictions. + +Detects recurring friction patterns by aggregating project barriers, +stage durations, and stall rates across jurisdictions. Produces ranked +bottleneck reports that surface the highest-impact regulatory friction +topics and the pipeline stages where projects most commonly get stuck. 
+""" + +from __future__ import annotations + +import logging +from collections import defaultdict +from datetime import date, datetime +from typing import TypedDict + +import numpy as np +from sqlalchemy import func, select +from sqlalchemy.orm import Session + +from src.analytics.peer_benchmarking import ( + _ACTIVE_STAGES, + _extract_stage_durations, + _load_national_benchmarks, +) +from src.database.queries import ( + get_stalled_projects, + query_projects, +) +from src.models.barrier import ProjectBarrier +from src.models.enums import BarrierStage, PipelineStage +from src.models.project import Project + +logger = logging.getLogger(__name__) + + +# --------------------------------------------------------------------------- +# Typed results +# --------------------------------------------------------------------------- + + +class StageBottleneck(TypedDict): + """Bottleneck summary for a single pipeline stage.""" + + stage: str + project_count: int + median_days: float + national_median_days: float + excess_days: float + excess_ratio: float # actual / national median + stalled_count: int + stall_rate: float # fraction of projects stalled in this stage + top_friction_topics: list[FrictionTopicSummary] + + +class FrictionTopicSummary(TypedDict): + """Summary of a friction topic's impact.""" + + topic: str + occurrence_count: int + total_days_delayed: int + total_cost_impact: float + median_days_delayed: float + affected_project_ids: list[str] + jurisdictions: list[str] + + +class JurisdictionBottleneck(TypedDict): + """Bottleneck analysis for a specific jurisdiction.""" + + jurisdiction: str + total_projects: int + overall_friction_score: float + stage_bottlenecks: list[StageBottleneck] + top_friction_topics: list[FrictionTopicSummary] + worst_stage: str | None + estimated_excess_days_per_project: float + estimated_excess_cost_per_project: float + + +class SystemicBottleneckReport(TypedDict): + """Cross-jurisdiction bottleneck analysis.""" + + 
def detect_jurisdiction_bottlenecks(
    db: Session,
    jurisdiction: str,
    *,
    stall_threshold_days: int = 180,
) -> JurisdictionBottleneck:
    """Analyze bottlenecks for a single jurisdiction.

    Aggregates project barriers by stage and friction topic, compares
    stage durations to national medians, and identifies where projects
    are most commonly stalled or delayed.

    Args:
        db: SQLAlchemy session.
        jurisdiction: The jurisdiction to analyze.
        stall_threshold_days: Number of days beyond which a project in
            a stage is considered stalled.

    Returns:
        JurisdictionBottleneck with stage bottlenecks sorted by excess
        ratio (worst first) and friction topic summaries. An empty
        result is returned when the jurisdiction has no projects.
    """
    projects = query_projects(db, jurisdiction=jurisdiction, limit=1000)
    if not projects:
        return _empty_jurisdiction_bottleneck(jurisdiction)

    national = _load_national_benchmarks()
    national_stages = national.get("stage_durations", {})
    holding_costs = national.get("holding_costs", {})

    # Barriers for this jurisdiction
    barriers = _get_jurisdiction_barriers(db, jurisdiction)
    friction_by_stage = _group_barriers_by_stage(barriers)

    stalled = get_stalled_projects(
        db, days_threshold=stall_threshold_days, jurisdiction=jurisdiction
    )
    stalled_by_stage = _count_by_stage(stalled)
    # Tally current-stage occupancy once rather than rescanning every
    # project for every stage inside the loop below.
    current_by_stage = _count_by_stage(projects)

    stage_bottlenecks: list[StageBottleneck] = []
    total_excess_days = 0.0

    for stage in _ACTIVE_STAGES:
        durations = _extract_stage_durations(projects, stage)
        nat_bench = national_stages.get(stage, {})
        nat_median = float(nat_bench.get("median", 0))

        if durations:
            median_days = float(np.median(np.array(durations, dtype=float)))
        else:
            median_days = 0.0

        # Without a national median there is nothing to compare against,
        # so the stage is treated as neutral (no excess, ratio 1.0).
        excess = max(0.0, median_days - nat_median) if nat_median > 0 else 0.0
        ratio = median_days / nat_median if nat_median > 0 else 1.0
        total_excess_days += excess

        stage_stalled = stalled_by_stage.get(stage, 0)
        in_stage = current_by_stage.get(stage, 0)
        stall_rate = stage_stalled / in_stage if in_stage > 0 else 0.0

        # Top friction topics for this stage
        stage_friction = friction_by_stage.get(stage, [])
        top_topics = _summarize_friction_topics(stage_friction, top_n=5)

        stage_bottlenecks.append(
            StageBottleneck(
                stage=stage,
                project_count=len(durations),
                median_days=median_days,
                national_median_days=nat_median,
                excess_days=excess,
                excess_ratio=round(ratio, 2),
                stalled_count=stage_stalled,
                stall_rate=round(stall_rate, 3),
                top_friction_topics=top_topics,
            )
        )

    # Sort stages by excess ratio descending (worst first)
    stage_bottlenecks.sort(key=lambda s: s["excess_ratio"], reverse=True)
    worst_stage = stage_bottlenecks[0]["stage"] if stage_bottlenecks else None

    # Friction topics across the whole jurisdiction
    top_friction = _summarize_friction_topics(barriers, top_n=10)

    # Overall friction score: mean excess ratio of stages that have data,
    # scaled so a ratio of 1.0 maps to 50; capped at 100 on return.
    ratios = [
        s["excess_ratio"] for s in stage_bottlenecks if s["project_count"] > 0
    ]
    overall_friction = float(np.mean(ratios)) * 50.0 if ratios else 50.0

    daily_holding = float(
        holding_costs.get("daily_per_unit_during_entitlement", 15)
    )
    # np.mean([]) is NaN, so fall back to 50 units when no project
    # reports a unit count (the project list itself is non-empty here).
    unit_counts = [p.total_units for p in projects if p.total_units]
    avg_units = float(np.mean(unit_counts)) if unit_counts else 50.0

    # total_excess_days sums each stage's *median* excess, so it already
    # describes a typical single project; it must not be divided by the
    # number of projects again.
    excess_cost = total_excess_days * daily_holding * avg_units

    return JurisdictionBottleneck(
        jurisdiction=jurisdiction,
        total_projects=len(projects),
        overall_friction_score=round(min(100.0, overall_friction), 1),
        stage_bottlenecks=stage_bottlenecks,
        top_friction_topics=top_friction,
        worst_stage=worst_stage,
        estimated_excess_days_per_project=round(total_excess_days, 1),
        estimated_excess_cost_per_project=round(excess_cost, 2),
    )
def identify_stage_chokepoints(
    db: Session,
    *,
    jurisdiction: str | None = None,
    state: str | None = None,
    min_projects: int = 3,
) -> list[StageBottleneck]:
    """Return the pipeline stages that qualify as chokepoints.

    A stage qualifies when at least ``min_projects`` projects have
    duration data for it, a non-zero national median exists, and the
    observed median exceeds that national median by more than 25%.

    Args:
        db: SQLAlchemy session.
        jurisdiction: Optional jurisdiction filter. Friction topics are
            only attached when this is provided.
        state: Optional state filter.
        min_projects: Minimum projects with data to consider a stage.

    Returns:
        StageBottleneck entries for qualifying stages, worst excess
        ratio first. ``stalled_count`` and ``stall_rate`` are always
        zero in this view.
    """
    projects = query_projects(
        db, jurisdiction=jurisdiction, state=state, limit=1000
    )
    stage_benchmarks = _load_national_benchmarks().get("stage_durations", {})

    # Barrier detail is only fetched when a single jurisdiction is targeted.
    scoped_barriers = (
        _get_jurisdiction_barriers(db, jurisdiction) if jurisdiction else []
    )
    barriers_by_stage = _group_barriers_by_stage(scoped_barriers)

    flagged: list[StageBottleneck] = []
    for stage_name in _ACTIVE_STAGES:
        samples = _extract_stage_durations(projects, stage_name)
        if len(samples) < min_projects:
            continue

        benchmark_median = float(
            stage_benchmarks.get(stage_name, {}).get("median", 0)
        )
        if benchmark_median == 0:
            continue

        observed_median = float(np.median(np.array(samples, dtype=float)))
        ratio = observed_median / benchmark_median
        # Written as a negated ">" so a NaN ratio is skipped, not flagged.
        if not ratio > 1.25:
            continue

        flagged.append(
            StageBottleneck(
                stage=stage_name,
                project_count=len(samples),
                median_days=observed_median,
                national_median_days=benchmark_median,
                excess_days=observed_median - benchmark_median,
                excess_ratio=round(ratio, 2),
                stalled_count=0,
                stall_rate=0.0,
                top_friction_topics=_summarize_friction_topics(
                    barriers_by_stage.get(stage_name, []), top_n=3
                ),
            )
        )

    return sorted(flagged, key=lambda item: item["excess_ratio"], reverse=True)
def _group_barriers_by_stage(
    barriers: list[ProjectBarrier],
) -> dict[str, list[ProjectBarrier]]:
    """Group barriers by the stage in which they were encountered.

    Barriers whose ``stage_encountered`` is None are collected under the
    key ``"unknown"``; all others are keyed by the enum's ``.value``.
    """
    grouped: dict[str, list[ProjectBarrier]] = defaultdict(list)
    for b in barriers:
        if b.stage_encountered is not None:
            grouped[b.stage_encountered.value].append(b)
        else:
            grouped["unknown"].append(b)
    return grouped


def _group_barriers_by_topic(
    barriers: list[ProjectBarrier],
) -> dict[str, list[ProjectBarrier]]:
    """Group barriers by their ``barrier_type`` (friction topic)."""
    grouped: dict[str, list[ProjectBarrier]] = defaultdict(list)
    for b in barriers:
        grouped[b.barrier_type].append(b)
    return grouped


def _summarize_friction_topics(
    barriers: list[ProjectBarrier],
    top_n: int = 10,
) -> list[FrictionTopicSummary]:
    """Aggregate barriers by topic and produce ranked summaries.

    Args:
        barriers: Barriers to summarize; may be empty.
        top_n: Maximum number of topic summaries to return.

    Returns:
        Up to ``top_n`` summaries ordered by total days delayed, largest
        first. Barriers with a falsy ``days_delayed`` / ``cost_impact``
        are left out of the respective totals and median. Project IDs and
        jurisdictions are de-duplicated and sorted so that repeated runs
        over the same data give identical output.
    """
    by_topic = _group_barriers_by_topic(barriers)
    summaries: list[FrictionTopicSummary] = []

    for topic, topic_barriers in by_topic.items():
        days_list = [b.days_delayed for b in topic_barriers if b.days_delayed]
        cost_list = [
            float(b.cost_impact) for b in topic_barriers if b.cost_impact
        ]
        # Sets remove duplicates; sorting removes the run-to-run ordering
        # differences that raw set iteration would introduce.
        project_ids = sorted({str(b.project_id) for b in topic_barriers})
        jurisdictions = sorted(
            {b.jurisdiction for b in topic_barriers if b.jurisdiction}
        )

        summaries.append(
            FrictionTopicSummary(
                topic=topic,
                occurrence_count=len(topic_barriers),
                total_days_delayed=sum(days_list),
                total_cost_impact=sum(cost_list),
                median_days_delayed=(
                    float(np.median(days_list)) if days_list else 0.0
                ),
                affected_project_ids=project_ids,
                jurisdictions=jurisdictions,
            )
        )

    # Rank by total days delayed descending
    summaries.sort(key=lambda s: s["total_days_delayed"], reverse=True)
    return summaries[:top_n]
def _aggregate_stage_bottlenecks(
    jurisdiction_results: list[JurisdictionBottleneck],
) -> list[StageBottleneck]:
    """Aggregate stage bottleneck data across jurisdictions.

    For each active stage, computes the average excess ratio across all
    analyzed jurisdictions, weighted by each jurisdiction's project count
    for that stage. Jurisdiction entries with no projects for a stage are
    ignored, and stages with no remaining entries are left out.

    Args:
        jurisdiction_results: Per-jurisdiction bottleneck results.

    Returns:
        One StageBottleneck per stage that has data, worst excess ratio
        first. ``median_days`` is an approximation reconstructed as
        weighted ratio x national median; stall counts and friction
        topics are not aggregated and are returned empty.
    """
    stage_data: dict[str, list[tuple[float, int]]] = defaultdict(list)
    for jur in jurisdiction_results:
        for sb in jur["stage_bottlenecks"]:
            if sb["project_count"] > 0:
                stage_data[sb["stage"]].append(
                    (sb["excess_ratio"], sb["project_count"])
                )

    stages_with_data = [s for s in _ACTIVE_STAGES if stage_data.get(s)]
    if not stages_with_data:
        return []

    # Loaded once up front: the benchmarks do not change between stages.
    national = _load_national_benchmarks().get("stage_durations", {})

    global_stages: list[StageBottleneck] = []
    for stage in stages_with_data:
        entries = stage_data[stage]
        ratios = [ratio for ratio, _ in entries]
        counts = [count for _, count in entries]

        # All counts are > 0 (filtered above), so the weights never sum
        # to zero.
        weighted_ratio = float(np.average(ratios, weights=counts))
        nat_median = float(national.get(stage, {}).get("median", 0))

        global_stages.append(
            StageBottleneck(
                stage=stage,
                project_count=sum(counts),
                median_days=weighted_ratio * nat_median if nat_median else 0.0,
                national_median_days=nat_median,
                excess_days=max(0.0, (weighted_ratio - 1.0) * nat_median),
                excess_ratio=round(weighted_ratio, 2),
                stalled_count=0,
                stall_rate=0.0,
                top_friction_topics=[],
            )
        )

    global_stages.sort(key=lambda s: s["excess_ratio"], reverse=True)
    return global_stages
+ +Computes a composite health score for each project based on five +dimensions: timeline adherence (30%), budget variance (25%), +funding completeness (20%), risk exposure (15%), and team stability +(10%). Scores map to OverallHealth categories (on_track, at_risk, +delayed, stalled) using configurable thresholds from national +benchmarks. +""" + +from __future__ import annotations + +import logging +from datetime import date, datetime +from typing import TypedDict +from uuid import UUID + +import numpy as np +import yaml +from sqlalchemy import select +from sqlalchemy.orm import Session + +from src.analytics.peer_benchmarking import ( + PeerBenchmarkResult, + compute_peer_benchmarks, + _load_national_benchmarks, +) +from src.database.queries import get_project, query_projects +from src.models.enums import ( + FundingSourceStatus, + OverallHealth, + PipelineStage, +) +from src.models.funding_source import FundingSource +from src.models.project import Project + +logger = logging.getLogger(__name__) + + +# --------------------------------------------------------------------------- +# Weight configuration +# --------------------------------------------------------------------------- + +HEALTH_WEIGHTS: dict[str, float] = { + "timeline": 0.30, + "budget": 0.25, + "funding": 0.20, + "risk": 0.15, + "team": 0.10, +} + +# Stages considered terminal (not scored) +_TERMINAL_STAGES = {PipelineStage.OPERATIONS, PipelineStage.ABANDONED} + +# Stages that can be assessed +_ACTIVE_STAGES_LIST = [ + "concept", + "pre_development", + "entitlement", + "financing", + "construction", + "lease_up", +] + + +# --------------------------------------------------------------------------- +# Typed results +# --------------------------------------------------------------------------- + + +class DimensionScore(TypedDict): + """Score for a single health dimension.""" + + dimension: str + raw_score: float # 0-100 + weight: float + weighted_score: float + detail: str + + +class 
def assess_project_health(
    db: Session,
    project_id: UUID,
    peer_benchmark: PeerBenchmarkResult | None = None,
) -> HealthAssessmentResult:
    """Compute a weighted health score for a single project.

    Evaluates five dimensions (timeline, budget, funding, risk, team)
    and sums their weighted scores into a composite clamped to 0-100.
    Projects in a terminal stage get a fixed assessment instead.

    Args:
        db: SQLAlchemy session.
        project_id: UUID of the project to assess.
        peer_benchmark: Optional pre-computed peer benchmarks. If None,
            they are computed from the project's jurisdiction, state and
            building type.

    Returns:
        HealthAssessmentResult with composite score, per-dimension
        breakdown, trend versus the stored score, and recommendations.

    Raises:
        ValueError: If the project is not found.
    """
    project = get_project(db, project_id)
    if project is None:
        raise ValueError(f"Project {project_id} not found.")

    if project.current_stage in _TERMINAL_STAGES:
        return _terminal_assessment(project)

    if peer_benchmark is None:
        peer_benchmark = compute_peer_benchmarks(
            db,
            jurisdiction=project.jurisdiction,
            state=project.state,
            building_type=(
                project.building_type.value if project.building_type else None
            ),
        )

    # Compute each dimension
    dimensions = {
        "timeline": _score_timeline(project, peer_benchmark),
        "budget": _score_budget(project),
        "funding": _score_funding(db, project),
        "risk": _score_risk(project),
        "team": _score_team(project),
    }

    # Weighted composite
    composite = sum(d["weighted_score"] for d in dimensions.values())
    composite = max(0.0, min(100.0, composite))

    # Map to health category
    overall = _score_to_health(composite)

    # Score trend (compare to stored score). The stored value is coerced
    # to float once so the subtraction and the reported previous_score
    # use the same number regardless of the column's numeric type.
    stored = project.health_score
    previous = float(stored) if stored is not None else None
    trend = "stable"
    if previous is not None:
        delta = composite - previous
        # Changes within +/-3 points are treated as noise.
        if delta > 3.0:
            trend = "improving"
        elif delta < -3.0:
            trend = "declining"

    # Generate recommendations
    recommendations = _generate_recommendations(project, dimensions, overall)

    return HealthAssessmentResult(
        project_id=str(project.project_id),
        project_name=project.project_name,
        current_stage=project.current_stage.value,
        composite_score=round(composite, 1),
        overall_health=overall.value,
        dimensions=dimensions,
        previous_score=previous,
        score_trend=trend,
        recommendations=recommendations,
        assessed_at=datetime.utcnow().isoformat(),
    )
def assess_batch_health(
    db: Session,
    *,
    jurisdiction: str | None = None,
    state: str | None = None,
    stages: list[PipelineStage] | None = None,
    limit: int = 500,
) -> BatchHealthResult:
    """Run the health assessment over every project matching the filters.

    Args:
        db: SQLAlchemy session.
        jurisdiction: Optional jurisdiction filter.
        state: Optional state filter.
        stages: Stages to include; defaults to every non-terminal stage.
        limit: Maximum projects to assess.

    Returns:
        BatchHealthResult holding the individual assessments plus the
        health distribution, mean/median composite score and trend
        counts. Projects whose assessment raises are logged and left
        out; with no successful assessments the mean and median are 0.0.
    """
    stage_filter = stages or [
        stage for stage in PipelineStage if stage not in _TERMINAL_STAGES
    ]
    candidates = query_projects(
        db,
        jurisdiction=jurisdiction,
        state=state,
        stages=stage_filter,
        limit=limit,
    )

    # With a geographic filter every project shares one benchmark, so it
    # is computed once here; otherwise each assessment computes its own.
    shared_benchmark = (
        compute_peer_benchmarks(db, jurisdiction=jurisdiction, state=state)
        if (jurisdiction or state)
        else None
    )

    results: list[HealthAssessmentResult] = []
    distribution: dict[str, int] = dict.fromkeys(
        (health.value for health in OverallHealth), 0
    )
    trend_totals = {"improving": 0, "declining": 0}

    for candidate in candidates:
        try:
            outcome = assess_project_health(
                db, candidate.project_id, shared_benchmark
            )
            results.append(outcome)

            category = outcome["overall_health"]
            distribution[category] = distribution.get(category, 0) + 1

            direction = outcome["score_trend"]
            if direction in trend_totals:
                trend_totals[direction] += 1
        except Exception:
            logger.exception(
                "Failed to assess project %s.", candidate.project_id
            )

    composite_scores = [entry["composite_score"] for entry in results]
    score_arr = (
        np.array(composite_scores) if composite_scores else np.array([0.0])
    )

    return BatchHealthResult(
        total_assessed=len(results),
        health_distribution=distribution,
        average_score=float(np.mean(score_arr)),
        median_score=float(np.median(score_arr)),
        projects_declining=trend_totals["declining"],
        projects_improving=trend_totals["improving"],
        assessments=results,
        assessed_at=datetime.utcnow().isoformat(),
    )
def _score_budget(project: Project) -> DimensionScore:
    """Score budget health based on variance percentage.

    100 = variance <= 0% (on or under budget).
    Linearly deducted, reaching 0 at >= 30% over budget.
    When ``budget_variance_percent`` is missing, the variance is derived
    from ``original_budget`` and ``current_budget`` if both are set and
    the original is positive; otherwise a default score of 70 is used.

    Args:
        project: Project whose budget fields are read.

    Returns:
        DimensionScore for the "budget" dimension.
    """
    variance_pct = project.budget_variance_percent

    if (
        variance_pct is None
        and project.original_budget
        and project.current_budget
    ):
        original = float(project.original_budget)
        if original > 0:
            variance_pct = (
                (float(project.current_budget) - original) / original * 100.0
            )

    if variance_pct is not None:
        # Coerce once, as is already done for the budget amounts, so a
        # Decimal-typed stored value cannot break the float math below.
        variance_pct = float(variance_pct)

    if variance_pct is None:
        raw = 70.0
        detail = "No budget data available; default score."
    elif variance_pct <= 0:
        raw = 100.0
        detail = f"Budget variance {variance_pct:.1f}% (on/under budget)."
    elif variance_pct >= 30:
        raw = 0.0
        detail = f"Budget variance {variance_pct:.1f}% (severely over budget)."
    else:
        raw = max(0.0, 100.0 * (1.0 - variance_pct / 30.0))
        detail = f"Budget variance {variance_pct:.1f}%."

    weight = HEALTH_WEIGHTS["budget"]
    return DimensionScore(
        dimension="budget",
        raw_score=round(raw, 1),
        weight=weight,
        weighted_score=round(raw * weight, 2),
        detail=detail,
    )
def _score_risk(project: Project) -> DimensionScore:
    """Score risk exposure from risk_score, friction, and opposition.

    Each available factor contributes a penalty: the project's
    ``risk_score``, the jurisdiction friction score (capped at 100), a
    fixed value per neighbor-opposition level, and 25 points per appeal
    filed (capped at 100). The raw score is 100 minus the mean penalty,
    clamped to 0-100. With no factors available the score defaults to 75.

    Args:
        project: Project whose risk-related fields are read.

    Returns:
        DimensionScore for the "risk" dimension.
    """
    penalties: list[float] = []

    # Every penalty is coerced to float so values of different numeric
    # types can be averaged together.
    if project.risk_score is not None:
        penalties.append(float(project.risk_score))

    if project.jurisdiction_friction_score is not None:
        penalties.append(
            min(100.0, float(project.jurisdiction_friction_score))
        )

    # Neighbor opposition; unrecognized levels contribute no penalty.
    opposition_map = {
        "none": 0,
        "low": 15,
        "moderate": 35,
        "high": 60,
        "severe": 85,
    }
    if project.neighbor_opposition_level is not None:
        opp_val = opposition_map.get(project.neighbor_opposition_level.value, 0)
        penalties.append(float(opp_val))

    # Appeals filed
    if project.appeals_filed and project.appeals_filed > 0:
        penalties.append(min(100.0, project.appeals_filed * 25.0))

    if penalties:
        avg_penalty = float(np.mean(penalties))
        # Clamp both ends: a negative input must not push the score
        # above 100, and a large one must not push it below 0.
        raw = min(100.0, max(0.0, 100.0 - avg_penalty))
        detail = (
            f"Average risk penalty {avg_penalty:.0f} from "
            f"{len(penalties)} risk factors."
        )
    else:
        raw = 75.0
        detail = "No explicit risk data; default score."

    weight = HEALTH_WEIGHTS["risk"]
    return DimensionScore(
        dimension="risk",
        raw_score=round(raw, 1),
        weight=weight,
        weighted_score=round(raw * weight, 2),
        detail=detail,
    )
def _score_to_health(score: float) -> OverallHealth:
    """Translate a composite score into an OverallHealth category.

    Cut-offs are read from the ``health_thresholds`` section of the national
    benchmarks (falling back to 80 / 60 / 40). The first tier whose floor
    the score reaches wins; anything below the lowest floor is STALLED.

    Args:
        score: Composite health score.

    Returns:
        The matching OverallHealth member.
    """
    cutoffs = _load_national_benchmarks().get("health_thresholds", {})
    tiers = (
        (cutoffs.get("on_track", 80), OverallHealth.ON_TRACK),
        (cutoffs.get("at_risk", 60), OverallHealth.AT_RISK),
        (cutoffs.get("delayed", 40), OverallHealth.DELAYED),
    )
    for floor, category in tiers:
        if score >= floor:
            return category
    return OverallHealth.STALLED
project_id=str(project.project_id), + project_name=project.project_name, + current_stage=stage, + composite_score=score, + overall_health=health.value, + dimensions={ + "timeline": empty_dim, + "budget": empty_dim, + "funding": empty_dim, + "risk": empty_dim, + "team": empty_dim, + }, + previous_score=float(project.health_score) if project.health_score else None, + score_trend="stable", + recommendations=[], + assessed_at=datetime.utcnow().isoformat(), + ) + + +# --------------------------------------------------------------------------- +# Recommendations +# --------------------------------------------------------------------------- + + +def _generate_recommendations( + project: Project, + dimensions: dict[str, DimensionScore], + overall: OverallHealth, +) -> list[str]: + """Generate prioritized recommendations based on lowest-scoring dimensions.""" + recs: list[str] = [] + + # Sort dimensions by raw_score ascending (worst first) + sorted_dims = sorted( + dimensions.values(), + key=lambda d: d["raw_score"], + ) + + for dim in sorted_dims: + if dim["raw_score"] >= 80: + continue + + name = dim["dimension"] + score = dim["raw_score"] + + if name == "timeline" and score < 60: + recs.append( + "Timeline is significantly behind peer benchmarks. " + "Consider escalating pending approvals or re-sequencing " + "activities to recover schedule." + ) + elif name == "timeline" and score < 80: + recs.append( + "Timeline is slightly behind peers. Monitor closely and " + "identify critical-path blockers." + ) + + if name == "budget" and score < 60: + recs.append( + "Budget variance exceeds acceptable thresholds. " + "Conduct a cost reconciliation and identify value-engineering " + "opportunities." + ) + elif name == "budget" and score < 80: + recs.append( + "Budget is trending over original estimates. " + "Review change orders and soft cost assumptions." + ) + + if name == "funding" and score < 50: + recs.append( + "Significant funding gap remains. 
Explore additional subsidy " + "sources, request increased allocations, or adjust project " + "scope to close the gap." + ) + elif name == "funding" and score < 80: + recs.append( + "Not all funding sources are committed. Follow up on " + "outstanding applications and closing timelines." + ) + + if name == "risk" and score < 50: + recs.append( + "High risk exposure detected. Address neighbor opposition, " + "pending appeals, and jurisdiction friction points." + ) + elif name == "risk" and score < 80: + recs.append( + "Moderate risk factors present. Proactive community " + "engagement and political liaison recommended." + ) + + if name == "team" and score < 60: + recs.append( + "Key team roles are unfilled. Engage an architect, GC, " + "or property manager to strengthen the development team." + ) + + # Cap recommendations + return recs[:5] diff --git a/src/analytics/peer_benchmarking.py b/src/analytics/peer_benchmarking.py new file mode 100644 index 0000000..f84136f --- /dev/null +++ b/src/analytics/peer_benchmarking.py @@ -0,0 +1,626 @@ +"""Peer group benchmarking for affordable housing projects. + +Identifies comparable project cohorts based on jurisdiction, unit count, +building type, and AMI mix, then computes benchmark statistics used by +the timeline prediction and health assessment modules. 
+""" + +from __future__ import annotations + +import logging +from datetime import date, datetime +from typing import TypedDict +from uuid import UUID + +import numpy as np +import yaml +from sqlalchemy import func, select +from sqlalchemy.orm import Session + +from src.database.queries import query_similar_projects +from src.models.enums import AMIMixCategory, BuildingType, PipelineStage +from src.models.peer_group import PeerGroup +from src.models.project import Project + +logger = logging.getLogger(__name__) + + +# --------------------------------------------------------------------------- +# Typed results +# --------------------------------------------------------------------------- + +# Ordered pipeline stages used for duration calculations +_ACTIVE_STAGES: list[str] = [ + "concept", + "pre_development", + "entitlement", + "financing", + "construction", + "lease_up", +] + + +class StageBenchmark(TypedDict): + """Duration benchmarks for a single pipeline stage.""" + + stage: str + median_days: float + mean_days: float + p25_days: float + p75_days: float + p90_days: float + std_days: float + sample_size: int + + +class PeerBenchmarkResult(TypedDict): + """Complete peer benchmark result for a project or jurisdiction.""" + + peer_group_id: UUID | None + peer_group_name: str + project_count: int + stage_benchmarks: dict[str, StageBenchmark] + median_total_duration_days: float | None + median_cost_per_unit: float | None + p25_cost_per_unit: float | None + p75_cost_per_unit: float | None + computed_at: str + + +class PeerComparisonResult(TypedDict): + """How a single project compares to its peer group.""" + + project_id: str + project_name: str + peer_group_name: str + stage_comparisons: dict[str, StageComparisonDetail] + overall_percentile: float | None + cost_percentile: float | None + faster_than_peers: bool + cheaper_than_peers: bool + + +class StageComparisonDetail(TypedDict): + """Per-stage comparison metrics.""" + + actual_days: int | None + peer_median_days: 
float + delta_days: float | None + percentile: float | None + status: str # "faster", "on_pace", "slower" + + +# --------------------------------------------------------------------------- +# National benchmarks loader +# --------------------------------------------------------------------------- + +_national_benchmarks_cache: dict | None = None + + +def _load_national_benchmarks() -> dict: + """Load national benchmark data from YAML config, with caching.""" + global _national_benchmarks_cache + if _national_benchmarks_cache is not None: + return _national_benchmarks_cache + + try: + from config.settings import get_settings + + settings = get_settings() + benchmarks_path = settings.project_root / "config" / "national_benchmarks.yaml" + with open(benchmarks_path, "r") as f: + data = yaml.safe_load(f) + _national_benchmarks_cache = data.get("national_benchmarks", data) + return _national_benchmarks_cache + except Exception: + logger.warning("Could not load national_benchmarks.yaml; using defaults.") + _national_benchmarks_cache = _default_benchmarks() + return _national_benchmarks_cache + + +def _default_benchmarks() -> dict: + """Hardcoded fallback matching the YAML defaults.""" + return { + "stage_durations": { + "concept": {"median": 60, "p25": 30, "p75": 90, "p90": 120}, + "pre_development": {"median": 180, "p25": 120, "p75": 270, "p90": 365}, + "entitlement": {"median": 240, "p25": 150, "p75": 365, "p90": 540}, + "financing": {"median": 180, "p25": 120, "p75": 270, "p90": 365}, + "construction": {"median": 540, "p25": 365, "p75": 720, "p90": 900}, + "lease_up": {"median": 120, "p25": 60, "p75": 180, "p90": 270}, + }, + "total_timeline": { + "median_days": 1320, + "p25_days": 900, + "p75_days": 1800, + }, + "costs": { + "national_median_per_unit": 350000, + "national_p25_per_unit": 250000, + "national_p75_per_unit": 475000, + }, + "health_thresholds": { + "on_track": 80, + "at_risk": 60, + "delayed": 40, + "stalled": 0, + }, + } + + +# 
--------------------------------------------------------------------------- +# Peer group identification +# --------------------------------------------------------------------------- + + +def find_peer_group( + db: Session, + project: Project, +) -> PeerGroup | None: + """Find the best matching PeerGroup for a given project. + + Matches on jurisdiction first, then falls back to building type and + unit count range. + + Args: + db: SQLAlchemy session. + project: The project to find peers for. + + Returns: + Best-matching PeerGroup, or None if no group matches. + """ + stmt = select(PeerGroup) + + # Prefer jurisdiction-specific groups + if project.jurisdiction: + stmt_j = stmt.where(PeerGroup.jurisdiction == project.jurisdiction) + groups = list(db.scalars(stmt_j).all()) + if groups: + return _best_match(groups, project) + + # Fall back to building-type + unit-range groups + all_groups = list(db.scalars(stmt).all()) + return _best_match(all_groups, project) if all_groups else None + + +def _best_match(groups: list[PeerGroup], project: Project) -> PeerGroup | None: + """Score peer groups against project attributes and return best match.""" + best_group = None + best_score = -1 + + for pg in groups: + score = 0 + + # Jurisdiction match + if pg.jurisdiction and project.jurisdiction and pg.jurisdiction == project.jurisdiction: + score += 3 + + # Building type match + if pg.building_type and project.building_type and pg.building_type == project.building_type: + score += 2 + + # Unit count range match + if pg.unit_count_min is not None and pg.unit_count_max is not None: + if pg.unit_count_min <= (project.total_units or 0) <= pg.unit_count_max: + score += 2 + + # AMI mix match + if pg.ami_mix_category and project.total_units: + project_ami_cat = _classify_ami_mix(project) + if project_ami_cat and pg.ami_mix_category == project_ami_cat: + score += 1 + + if score > best_score: + best_score = score + best_group = pg + + return best_group + + +def _classify_ami_mix(project: 
def _national_stage_benchmark(stage: str, sample_size: int = 0) -> StageBenchmark:
    """Build a StageBenchmark for ``stage`` from the national benchmark config.

    The national config provides only median and percentiles, so the median
    stands in for the mean and ``std_days`` is reported as 0.0.
    """
    cfg = _load_national_benchmarks().get("stage_durations", {}).get(stage, {})
    median = float(cfg.get("median", 0))
    return StageBenchmark(
        stage=stage,
        median_days=median,
        mean_days=median,
        p25_days=float(cfg.get("p25", 0)),
        p75_days=float(cfg.get("p75", 0)),
        p90_days=float(cfg.get("p90", 0)),
        std_days=0.0,
        sample_size=sample_size,
    )


def compute_peer_benchmarks(
    db: Session,
    *,
    jurisdiction: str | None = None,
    state: str | None = None,
    building_type: str | None = None,
    unit_count_range: tuple[float, float] | None = None,
    min_sample_size: int = 5,
) -> PeerBenchmarkResult:
    """Compute duration and cost benchmarks from peer projects.

    Queries completed projects matching the criteria. If the jurisdiction
    yields fewer than ``min_sample_size`` peers and a state is given, the
    search is widened to the whole state. If there are still too few peers,
    the national benchmarks are returned instead (``project_count`` 0).
    Otherwise each stage gets percentile statistics from the peers, with the
    national figures substituted for any stage that has fewer than two
    recorded durations.

    Args:
        db: SQLAlchemy session.
        jurisdiction: Filter by jurisdiction.
        state: Filter by state (used if jurisdiction yields too few).
        building_type: Filter by building type.
        unit_count_range: (min_units, max_units) filter.
        min_sample_size: Minimum peers required; falls back to national
            benchmarks if not met.

    Returns:
        PeerBenchmarkResult with per-stage and aggregate benchmarks.
    """
    peers = query_similar_projects(
        db,
        jurisdiction=jurisdiction,
        state=state,
        unit_count_range=unit_count_range,
        building_type=building_type,
        completed_only=True,
        limit=200,
    )

    # Fall back to state level if jurisdiction yields too few
    expanded_to_state = False
    if len(peers) < min_sample_size and jurisdiction and state:
        logger.info(
            "Only %d peers in jurisdiction %s; expanding to state %s.",
            len(peers),
            jurisdiction,
            state,
        )
        peers = query_similar_projects(
            db,
            state=state,
            unit_count_range=unit_count_range,
            building_type=building_type,
            completed_only=True,
            limit=200,
        )
        expanded_to_state = True

    if len(peers) < min_sample_size:
        logger.info(
            "Only %d peers found; supplementing with national benchmarks.",
            len(peers),
        )
        national = _load_national_benchmarks()
        total_timeline = national.get("total_timeline", {})
        costs_cfg = national.get("costs", {})

        return PeerBenchmarkResult(
            peer_group_id=None,
            peer_group_name="national_benchmarks",
            project_count=0,
            stage_benchmarks={
                stage: _national_stage_benchmark(stage) for stage in _ACTIVE_STAGES
            },
            median_total_duration_days=float(
                total_timeline.get("median_days", 1320)
            ),
            median_cost_per_unit=float(
                costs_cfg.get("national_median_per_unit", 350000)
            ),
            p25_cost_per_unit=float(costs_cfg.get("national_p25_per_unit", 250000)),
            p75_cost_per_unit=float(costs_cfg.get("national_p75_per_unit", 475000)),
            computed_at=datetime.utcnow().isoformat(),
        )

    # Compute from actual peer data
    stage_benchmarks: dict[str, StageBenchmark] = {}
    for stage in _ACTIVE_STAGES:
        durations = _extract_stage_durations(peers, stage)
        if len(durations) >= 2:
            arr = np.array(durations, dtype=float)
            stage_benchmarks[stage] = StageBenchmark(
                stage=stage,
                median_days=float(np.median(arr)),
                mean_days=float(np.mean(arr)),
                p25_days=float(np.percentile(arr, 25)),
                p75_days=float(np.percentile(arr, 75)),
                p90_days=float(np.percentile(arr, 90)),
                # Sample standard deviation needs at least two observations.
                std_days=float(np.std(arr, ddof=1)),
                sample_size=len(durations),
            )
        else:
            # Use national for stages with insufficient data
            stage_benchmarks[stage] = _national_stage_benchmark(
                stage, sample_size=len(durations)
            )

    total_arr = np.array(
        [
            float(p.total_elapsed_days)
            for p in peers
            if p.total_elapsed_days is not None
        ],
        dtype=float,
    )
    cost_arr = np.array(
        [float(p.cost_per_unit) for p in peers if p.cost_per_unit is not None],
        dtype=float,
    )
    has_totals = total_arr.size > 0
    has_costs = cost_arr.size > 0

    # Once the cohort has been widened to the state, labelling it with the
    # jurisdiction would misdescribe where the peers came from.
    group_name = _build_group_name(
        None if expanded_to_state else jurisdiction,
        state,
        building_type,
    )

    return PeerBenchmarkResult(
        peer_group_id=None,
        peer_group_name=group_name,
        project_count=len(peers),
        stage_benchmarks=stage_benchmarks,
        median_total_duration_days=(
            float(np.median(total_arr)) if has_totals else None
        ),
        median_cost_per_unit=float(np.median(cost_arr)) if has_costs else None,
        p25_cost_per_unit=(
            float(np.percentile(cost_arr, 25)) if has_costs else None
        ),
        p75_cost_per_unit=(
            float(np.percentile(cost_arr, 75)) if has_costs else None
        ),
        computed_at=datetime.utcnow().isoformat(),
    )
def refresh_peer_group_stats(
    db: Session,
    peer_group_id: UUID,
) -> PeerGroup:
    """Recompute and persist the benchmark statistics of a saved PeerGroup.

    Completed projects matching the group's jurisdiction, unit-count range
    and building type are queried (up to 500). Their stage-duration medians,
    total-duration median and cost-per-unit median / quartiles are written
    onto the PeerGroup, which is then committed and refreshed. Statistics
    with no data are stored as None.

    Args:
        db: SQLAlchemy session.
        peer_group_id: ID of the PeerGroup to refresh.

    Returns:
        Updated PeerGroup instance.

    Raises:
        ValueError: If the PeerGroup does not exist.
    """
    pg = db.get(PeerGroup, peer_group_id)
    if pg is None:
        raise ValueError(f"PeerGroup {peer_group_id} not found.")

    # Only filter on unit count when both bounds are configured.
    if pg.unit_count_min is not None and pg.unit_count_max is not None:
        unit_bounds = (float(pg.unit_count_min), float(pg.unit_count_max))
    else:
        unit_bounds = None
    building_type = pg.building_type.value if pg.building_type else None

    peers = query_similar_projects(
        db,
        jurisdiction=pg.jurisdiction,
        unit_count_range=unit_bounds,
        building_type=building_type,
        completed_only=True,
        limit=500,
    )

    pg.project_count = len(peers)

    # Pipeline stage -> PeerGroup column holding that stage's median.
    stage_columns = {
        "concept": "median_concept_duration",
        "pre_development": "median_pre_dev_duration",
        "entitlement": "median_entitlement_duration",
        "financing": "median_financing_duration",
        "construction": "median_construction_duration",
    }
    for stage, column in stage_columns.items():
        samples = _extract_stage_durations(peers, stage)
        setattr(pg, column, int(np.median(samples)) if samples else None)

    elapsed = [
        p.total_elapsed_days for p in peers if p.total_elapsed_days is not None
    ]
    pg.median_total_duration = int(np.median(elapsed)) if elapsed else None

    unit_costs = [
        float(p.cost_per_unit) for p in peers if p.cost_per_unit is not None
    ]
    if not unit_costs:
        pg.median_cost_per_unit = None
        pg.p25_cost_per_unit = None
        pg.p75_cost_per_unit = None
    else:
        cost_arr = np.array(unit_costs)
        pg.median_cost_per_unit = float(np.median(cost_arr))
        pg.p25_cost_per_unit = float(np.percentile(cost_arr, 25))
        pg.p75_cost_per_unit = float(np.percentile(cost_arr, 75))

    pg.last_calculated = datetime.utcnow()
    db.commit()
    db.refresh(pg)

    logger.info(
        "Refreshed PeerGroup %s (%s) with %d projects.",
        pg.peer_group_id,
        pg.group_name,
        pg.project_count,
    )
    return pg
-> list[float]: + """Extract non-null durations for a pipeline stage from project list.""" + attr = f"{stage}_duration_days" + durations = [] + for p in projects: + val = getattr(p, attr, None) + if val is not None: + durations.append(float(val)) + return durations + + +def _unit_range(total_units: int) -> tuple[float, float]: + """Generate a +/- 50% unit count range for peer matching.""" + low = max(1, total_units * 0.5) + high = total_units * 1.5 + return (low, high) + + +def _build_group_name( + jurisdiction: str | None, + state: str | None, + building_type: str | None, +) -> str: + """Construct a human-readable peer group label.""" + parts = [] + if jurisdiction: + parts.append(jurisdiction) + elif state: + parts.append(f"state:{state}") + if building_type: + parts.append(building_type) + return "_".join(parts) if parts else "all_projects" diff --git a/src/analytics/portfolio_intelligence.py b/src/analytics/portfolio_intelligence.py new file mode 100644 index 0000000..cbff3c1 --- /dev/null +++ b/src/analytics/portfolio_intelligence.py @@ -0,0 +1,817 @@ +"""Portfolio dashboard generation for stakeholders. + +Aggregates project data across multiple dimensions (stage, geography, +funding status, health, timeline) to produce comprehensive portfolio +dashboards for PHAs, funders, cities, states, and researchers. 
+""" + +from __future__ import annotations + +import logging +from collections import defaultdict +from datetime import date, datetime +from typing import TypedDict +from uuid import UUID + +import numpy as np +from sqlalchemy import func, select +from sqlalchemy.orm import Session + +from src.analytics.health_assessment import ( + HealthAssessmentResult, + assess_project_health, +) +from src.analytics.peer_benchmarking import _load_national_benchmarks +from src.analytics.timeline_prediction import ( + TimelinePredictionResult, + predict_project_timeline, +) +from src.database.queries import ( + get_portfolio_summary_stats, + get_stage_distribution, + get_stalled_projects, + query_projects, +) +from src.models.enums import ( + FundingSourceStatus, + FundingSourceType, + OverallHealth, + PipelineStage, + PortfolioType, + StakeholderType, +) +from src.models.funding_source import FundingSource +from src.models.portfolio import PortfolioDashboard +from src.models.project import Project + +logger = logging.getLogger(__name__) + + +# --------------------------------------------------------------------------- +# Typed results +# --------------------------------------------------------------------------- + + +class StageDistribution(TypedDict): + """Project and unit counts by pipeline stage.""" + + stage: str + project_count: int + total_units: int + affordable_units: int + median_days_in_stage: float | None + + +class HealthDistribution(TypedDict): + """Project counts by health status.""" + + health_status: str + project_count: int + total_units: int + percentage: float + + +class FundingBreakdown(TypedDict): + """Funding aggregation by source type.""" + + source_type: str + total_amount: float + source_count: int + committed_amount: float + gap_amount: float + + +class VelocityMetrics(TypedDict): + """Pipeline velocity measurements.""" + + projects_entering_pipeline_last_90d: int + projects_completing_last_90d: int + median_concept_to_construction_days: float | None + 
def generate_portfolio_dashboard(
    db: Session,
    *,
    portfolio_id: UUID | None = None,
    jurisdiction: str | None = None,
    city: str | None = None,
    state: str | None = None,
    stakeholder_type: StakeholderType = StakeholderType.CITY,
    funding_organization: str | None = None,
    limit: int = 1000,
) -> PortfolioDashboardResult:
    """Generate a comprehensive portfolio intelligence dashboard.

    Aggregates project data across stage, health, funding, geography,
    velocity and timeline outlook for the projects matching the filters.
    Explicit geography arguments take precedence over the filters stored on
    a saved portfolio. An empty dashboard is returned when no project
    matches.

    Args:
        db: SQLAlchemy session.
        portfolio_id: Optional saved PortfolioDashboard to use for filters.
        jurisdiction: Optional jurisdiction filter.
        city: Optional city filter.
        state: Optional state filter.
        stakeholder_type: Type of stakeholder viewing the dashboard.
        funding_organization: Filter to projects funded by this org.
        limit: Maximum projects to include.

    Returns:
        PortfolioDashboardResult with full dashboard data.
    """
    # Load portfolio filters if portfolio_id provided
    portfolio_name = "Ad-hoc Dashboard"
    if portfolio_id:
        portfolio = db.get(PortfolioDashboard, portfolio_id)
        if portfolio:
            portfolio_name = portfolio.portfolio_name
            stored = portfolio.geography_filter
            if stored:
                jurisdiction = jurisdiction or stored.get("jurisdiction")
                city = city or stored.get("city")
                state = state or stored.get("state")
        else:
            # Keep the ad-hoc fallback, but do not hide the bad ID.
            logger.warning(
                "PortfolioDashboard %s not found; generating ad-hoc dashboard.",
                portfolio_id,
            )

    # Fetch projects
    projects = query_projects(
        db,
        jurisdiction=jurisdiction,
        city=city,
        state=state,
        funding_source_organization=funding_organization,
        limit=limit,
    )

    if not projects:
        return _empty_dashboard(portfolio_id, portfolio_name, stakeholder_type)

    # NOTE(review): the summary totals are queried by geography only, so
    # they are not narrowed by funding_organization or limit the way
    # `projects` is -- confirm whether that is intended for funder views.
    summary = get_portfolio_summary_stats(
        db, jurisdiction=jurisdiction, city=city, state=state
    )

    stage_dist = _compute_stage_distribution(projects)
    health_dist = _compute_health_distribution(projects)
    funding_bkdn = _compute_funding_breakdown(db, projects)
    geo_bkdn = _compute_geographic_breakdown(projects)
    velocity = _compute_velocity_metrics(db, projects, jurisdiction)

    # At-risk & stalled
    at_risk = _identify_at_risk_projects(projects)
    stalled = _identify_stalled_projects(db, jurisdiction, state)

    # Timeline outlook
    co_12m, units_12m = _projects_expected_co(projects, months=12)
    gb_6m = _projects_expected_groundbreaking(projects, months=6)

    # Aggregate scores
    health_scores = [
        p.health_score for p in projects if p.health_score is not None
    ]
    avg_health = float(np.mean(health_scores)) if health_scores else 0.0

    cpus = [
        float(p.cost_per_unit)
        for p in projects
        if p.cost_per_unit is not None
    ]
    median_cpu = float(np.median(cpus)) if cpus else None

    friction_scores = [
        p.jurisdiction_friction_score
        for p in projects
        if p.jurisdiction_friction_score is not None
    ]
    avg_friction = float(np.mean(friction_scores)) if friction_scores else None

    return PortfolioDashboardResult(
        portfolio_id=str(portfolio_id) if portfolio_id else None,
        portfolio_name=portfolio_name,
        stakeholder_type=stakeholder_type.value,
        generated_at=datetime.utcnow().isoformat(),
        total_projects=summary["total_projects"],
        total_units=summary["total_units"],
        total_affordable_units=summary["total_affordable_units"],
        total_development_cost=summary["total_cost"],
        total_funding_gap=summary["total_funding_gap"],
        stage_distribution=stage_dist,
        health_distribution=health_dist,
        funding_breakdown=funding_bkdn,
        geographic_breakdown=geo_bkdn,
        velocity=velocity,
        at_risk_projects=at_risk,
        stalled_projects=stalled,
        projects_expected_co_next_12m=co_12m,
        units_expected_co_next_12m=units_12m,
        projects_expected_groundbreaking_next_6m=gb_6m,
        average_health_score=round(avg_health, 1),
        # Compare against None: a computed value of 0.0 is real data and
        # must not be reported as "no data".
        median_cost_per_unit=(
            round(median_cpu, 2) if median_cpu is not None else None
        ),
        average_friction_score=(
            round(avg_friction, 1) if avg_friction is not None else None
        ),
    )
"""Generate dashboard and cache results to the PortfolioDashboard record. + + Args: + db: SQLAlchemy session. + portfolio_id: UUID of the PortfolioDashboard to update. + **kwargs: Additional arguments for generate_portfolio_dashboard. + + Returns: + PortfolioDashboardResult. + + Raises: + ValueError: If the portfolio is not found. + """ + dashboard = generate_portfolio_dashboard( + db, portfolio_id=portfolio_id, **kwargs + ) + + portfolio = db.get(PortfolioDashboard, portfolio_id) + if portfolio is None: + raise ValueError(f"PortfolioDashboard {portfolio_id} not found.") + + portfolio.total_projects = dashboard["total_projects"] + portfolio.total_units = dashboard["total_units"] + portfolio.units_by_stage = { + sd["stage"]: sd["total_units"] for sd in dashboard["stage_distribution"] + } + portfolio.funding_gap_aggregate = dashboard["total_funding_gap"] + portfolio.at_risk_count = len(dashboard["at_risk_projects"]) + portfolio.velocity_metrics = { + "throughput_units_per_month": dashboard["velocity"]["throughput_units_per_month"], + "projects_entering_90d": dashboard["velocity"]["projects_entering_pipeline_last_90d"], + "projects_completing_90d": dashboard["velocity"]["projects_completing_last_90d"], + } + portfolio.last_calculated = datetime.utcnow() + + db.commit() + logger.info( + "Persisted dashboard metrics for portfolio %s.", + portfolio_id, + ) + + return dashboard + + +# --------------------------------------------------------------------------- +# Stakeholder-specific views +# --------------------------------------------------------------------------- + + +def generate_funder_view( + db: Session, + funding_organization: str, + *, + state: str | None = None, +) -> PortfolioDashboardResult: + """Generate a funder-specific portfolio view. + + Filters to projects where the organization is a funding source. + + Args: + db: SQLAlchemy session. + funding_organization: Name of the funding organization. + state: Optional state filter. 
def generate_pha_view(
    db: Session,
    jurisdiction: str,
) -> PortfolioDashboardResult:
    """Build the dashboard for a PHA (Public Housing Authority).

    Thin wrapper that delegates to generate_portfolio_dashboard with the
    jurisdiction filter and the PHA stakeholder type.

    Args:
        db: SQLAlchemy session.
        jurisdiction: PHA service area jurisdiction.

    Returns:
        PortfolioDashboardResult scoped to the PHA's jurisdiction.
    """
    view_filters = {
        "jurisdiction": jurisdiction,
        "stakeholder_type": StakeholderType.PHA,
    }
    return generate_portfolio_dashboard(db, **view_filters)
def _compute_health_distribution(
    projects: list[Project],
) -> list[HealthDistribution]:
    """Summarise how many projects and units fall under each health status.

    Projects without an ``overall_health`` are grouped under ``"unknown"``.
    Rows are emitted in ``OverallHealth`` declaration order with ``"unknown"``
    last; statuses with no projects are omitted.
    """
    project_counts: dict[str, int] = defaultdict(int)
    unit_counts: dict[str, int] = defaultdict(int)

    for project in projects:
        if project.overall_health:
            label = project.overall_health.value
        else:
            label = "unknown"
        project_counts[label] += 1
        unit_counts[label] += project.total_units or 0

    # Guard the percentage division when the portfolio is empty.
    denominator = len(projects) or 1
    ordered_labels = [member.value for member in OverallHealth] + ["unknown"]

    return [
        HealthDistribution(
            health_status=label,
            project_count=project_counts[label],
            total_units=unit_counts[label],
            percentage=round(project_counts[label] / denominator * 100, 1),
        )
        for label in ordered_labels
        if project_counts.get(label, 0) > 0
    ]
def _compute_geographic_breakdown(
    projects: list[Project],
) -> list[GeographicBreakdown]:
    """Aggregate projects by the most specific area name available.

    The area name is the first non-empty value of ``jurisdiction``, ``city``
    and ``county``, falling back to ``"unknown"``. ``area_type`` records which
    of those fields supplied the name. Previously every non-jurisdiction
    fallback, including county and unknown, was labelled ``"city"``.

    Returns:
        One GeographicBreakdown per area, ordered by project count
        (largest first). ``average_health_score`` is 0.0 for areas in which
        no project has a health score.
    """
    at_risk_statuses = (
        OverallHealth.AT_RISK,
        OverallHealth.DELAYED,
        OverallHealth.STALLED,
    )

    by_area: dict[str, dict] = defaultdict(
        lambda: {
            "type": "jurisdiction",
            "count": 0,
            "units": 0,
            "at_risk": 0,
            "scores": [],
        }
    )

    for p in projects:
        # Resolve name and type together so the label always matches the
        # field that actually provided the name.
        if p.jurisdiction:
            area, area_type = p.jurisdiction, "jurisdiction"
        elif p.city:
            area, area_type = p.city, "city"
        elif p.county:
            area, area_type = p.county, "county"
        else:
            area, area_type = "unknown", "unknown"

        bucket = by_area[area]
        bucket["type"] = area_type
        bucket["count"] += 1
        bucket["units"] += p.total_units or 0
        if p.overall_health in at_risk_statuses:
            bucket["at_risk"] += 1
        if p.health_score is not None:
            bucket["scores"].append(float(p.health_score))

    result: list[GeographicBreakdown] = []
    for area, data in sorted(
        by_area.items(), key=lambda item: item[1]["count"], reverse=True
    ):
        scores = data["scores"]
        avg_score = float(np.mean(scores)) if scores else 0.0
        result.append(
            GeographicBreakdown(
                area=area,
                area_type=data["type"],
                project_count=data["count"],
                total_units=data["units"],
                at_risk_count=data["at_risk"],
                average_health_score=round(avg_score, 1),
            )
        )

    return result
def _identify_at_risk_projects(
    projects: list[Project],
) -> list[AtRiskSummary]:
    """Summarise projects whose health is at_risk, delayed, or stalled.

    ``health_score`` and ``funding_gap`` are passed through whenever they are
    not ``None``. The earlier truthiness checks turned a legitimate value of
    0 (the worst possible health score, or a fully closed funding gap) into
    ``None``.

    Returns:
        AtRiskSummary entries sorted by health score ascending (worst
        first). Entries without a score sort as if their score were 0.
    """
    at_risk_statuses = {
        OverallHealth.AT_RISK,
        OverallHealth.DELAYED,
        OverallHealth.STALLED,
    }

    results: list[AtRiskSummary] = []
    for p in projects:
        if p.overall_health not in at_risk_statuses:
            continue

        # Primary risk = key of the highest numeric entry in the risk_factors
        # JSON; non-numeric entries are ranked as 0.
        primary_risk = None
        factors = p.risk_factors
        if factors and isinstance(factors, dict):
            try:
                primary_risk = max(
                    factors,
                    key=lambda name: factors[name]
                    if isinstance(factors[name], (int, float))
                    else 0,
                )
            except (ValueError, TypeError):
                primary_risk = None

        results.append(
            AtRiskSummary(
                project_id=str(p.project_id),
                project_name=p.project_name,
                current_stage=p.current_stage.value,
                days_in_stage=p.days_in_current_stage,
                health_score=(
                    float(p.health_score)
                    if p.health_score is not None
                    else None
                ),
                primary_risk=primary_risk,
                funding_gap=(
                    float(p.funding_gap)
                    if p.funding_gap is not None
                    else None
                ),
            )
        )

    # Worst first; unscored projects are ranked alongside a score of 0.
    results.sort(
        key=lambda r: r["health_score"] if r["health_score"] is not None else 0
    )
    return results
def _projects_expected_groundbreaking(
    projects: list[Project],
    months: int = 6,
) -> int:
    """Count projects whose predicted groundbreaking falls on or before the cutoff.

    The cutoff is today plus ``months`` approximated as 30-day months.
    Projects without a ``predicted_groundbreaking`` date are ignored.

    Args:
        projects: Projects to inspect.
        months: Look-ahead horizon in (30-day) months.

    Returns:
        Number of projects with a predicted groundbreaking date at or before
        the cutoff.
    """
    # Function-scope import replaces the former ``__import__("datetime")``
    # call without relying on the module's import block.
    from datetime import timedelta

    # NOTE(review): there is no lower bound, so predicted dates already in the
    # past are counted as well; confirm whether that is intended.
    cutoff = date.today() + timedelta(days=months * 30)
    return sum(
        1
        for p in projects
        if p.predicted_groundbreaking and p.predicted_groundbreaking <= cutoff
    )
100644 index 0000000..3ed1550 --- /dev/null +++ b/src/analytics/reform_impact.py @@ -0,0 +1,683 @@ +"""Policy reform impact measurement with statistical testing. + +Measures the effect of zoning changes, parking reforms, density bonuses, +streamlining, and fee reductions on housing development timelines and +costs. Uses pre/post comparison with t-tests (or Mann-Whitney when +normality is violated) to determine statistical significance. +""" + +from __future__ import annotations + +import logging +from datetime import date, datetime, timedelta +from typing import TypedDict +from uuid import UUID + +import numpy as np +from sqlalchemy import select +from sqlalchemy.orm import Session + +from src.analytics.peer_benchmarking import _load_national_benchmarks +from src.analytics.statistical_tests import ( + ConfidenceIntervalResult, + TTestResult, + confidence_interval, + independent_ttest, + mann_whitney_test, + select_and_run_test, +) +from src.database.queries import ( + get_abandoned_projects, + get_projects_by_entitlement_window, + query_projects, +) +from src.models.enums import ConfidenceLevel, PipelineStage, ReformType +from src.models.project import Project +from src.models.reform import PolicyReform + +logger = logging.getLogger(__name__) + + +# --------------------------------------------------------------------------- +# Typed results +# --------------------------------------------------------------------------- + + +class ReformImpactResult(TypedDict): + """Complete impact assessment for a single policy reform.""" + + reform_id: str + reform_name: str + jurisdiction: str + reform_type: str + effective_date: str | None + + # Sample sizes + pre_reform_n: int + post_reform_n: int + + # Duration metrics + pre_reform_median_days: float + post_reform_median_days: float + days_saved_per_project: float + percent_improvement: float + + # Statistical test + test_used: str # "ttest" or "mann_whitney" + p_value: float + is_significant: bool + effect_size: float + 
def measure_reform_impact(
    db: Session,
    reform_id: UUID,
    *,
    stage: str = "entitlement",
    buffer_days: int | None = None,
) -> ReformImpactResult:
    """Measure the impact of a single policy reform on project timelines.

    Compares stage durations for projects that completed the stage before
    the reform's effective date with projects that started the stage after
    the effective date plus a buffer (transition) period. The statistical
    test is chosen by ``select_and_run_test``.

    When either sample has no durations at all, no difference is computed:
    ``days_saved_per_project``, ``percent_improvement`` and
    ``total_cost_savings`` are reported as 0 rather than being derived from a
    placeholder median of 0.0.

    Args:
        db: SQLAlchemy session.
        reform_id: UUID of the PolicyReform to evaluate.
        stage: Pipeline stage to measure (default "entitlement").
        buffer_days: Days after effective_date to exclude (transition
            period). If None, uses the reform's implementation_buffer_days
            (treated as 0 when that is unset).

    Returns:
        ReformImpactResult with statistical test results and cost
        estimates.

    Raises:
        ValueError: If the reform is not found or has no effective date.
    """
    reform = db.get(PolicyReform, reform_id)
    if reform is None:
        raise ValueError(f"PolicyReform {reform_id} not found.")
    if reform.effective_date is None:
        raise ValueError(f"Reform {reform.reform_name} has no effective_date.")

    # An explicit argument wins; a missing per-reform buffer means "no
    # transition period" instead of crashing timedelta with None.
    if buffer_days is not None:
        buffer = buffer_days
    else:
        buffer = reform.implementation_buffer_days or 0
    cutoff_date = reform.effective_date + timedelta(days=buffer)

    # Collect pre- and post-reform durations.
    pre_projects = _get_pre_reform_projects(db, reform, stage)
    pre_durations = _extract_durations(pre_projects, stage)
    post_projects = _get_post_reform_projects(db, reform, stage, cutoff_date)
    post_durations = _extract_durations(post_projects, stage)

    caveats: list[str] = []

    # Check minimum sample sizes.
    if len(pre_durations) < 2:
        caveats.append(
            f"Only {len(pre_durations)} pre-reform project(s) with data; "
            f"results may not be reliable."
        )
    if len(post_durations) < 2:
        caveats.append(
            f"Only {len(post_durations)} post-reform project(s) with data; "
            f"results may not be reliable."
        )

    # Basic metrics. The 0.0 medians are placeholders for "no data" and must
    # not be subtracted from real medians.
    pre_median = float(np.median(pre_durations)) if pre_durations else 0.0
    post_median = float(np.median(post_durations)) if post_durations else 0.0
    if pre_durations and post_durations:
        days_saved = pre_median - post_median
    else:
        days_saved = 0.0
    pct_improvement = (
        (days_saved / pre_median * 100.0) if pre_median > 0 else 0.0
    )

    # Statistical testing.
    if len(pre_durations) >= 2 and len(post_durations) >= 2:
        test_result = select_and_run_test(
            pre_durations, post_durations, alpha=0.05
        )
        test_used = test_result["test_used"]
        result_data = test_result["result"]
        p_value = result_data["p_value"]
        is_sig = result_data["significant"]

        if test_used == "ttest":
            effect_size = result_data.get("cohens_d", 0.0)
            ci = result_data.get("confidence_interval_diff", (0.0, 0.0))
        else:
            effect_size = result_data.get("rank_biserial_r", 0.0)
            # The non-parametric result carries no interval; bootstrap one
            # for the difference in medians.
            ci = _bootstrap_ci_diff(pre_durations, post_durations)

        # Interpret effect size with the 0.2 / 0.5 / 0.8 thresholds.
        abs_es = abs(effect_size)
        if abs_es < 0.2:
            effect_interp = "negligible"
        elif abs_es < 0.5:
            effect_interp = "small"
        elif abs_es < 0.8:
            effect_interp = "medium"
        else:
            effect_interp = "large"
    else:
        test_used = "insufficient_data"
        p_value = 1.0
        is_sig = False
        effect_size = 0.0
        effect_interp = "insufficient_data"
        ci = (0.0, 0.0)
        caveats.append("Insufficient data for statistical testing.")

    # Cost savings estimate: days saved x daily holding cost per unit x
    # average project size, scaled by the number of post-reform projects.
    national = _load_national_benchmarks()
    holding_costs = national.get("holding_costs", {})
    daily_cost = float(
        holding_costs.get(f"daily_per_unit_during_{stage}", 20)
    )
    avg_units = _average_units(pre_projects + post_projects)
    cost_per_project = days_saved * daily_cost * avg_units
    total_cost_savings = cost_per_project * len(post_durations)

    # Units enabled: units of projects abandoned before the reform that were
    # last active in the measured stage.
    abandoned_pre = get_abandoned_projects(
        db,
        reform.jurisdiction,
        abandoned_before=reform.effective_date,
    )
    units_enabled = sum(
        p.total_units or 0
        for p in abandoned_pre
        if _project_in_stage(p, stage)
    )

    # Stall rate change (negative = improvement).
    pre_stall_rate = _stall_rate(pre_projects, stage)
    post_stall_rate = _stall_rate(post_projects, stage)
    stall_change = post_stall_rate - pre_stall_rate

    # Projects unblocked: post-reform durations at least 15% faster than the
    # pre-reform median.
    projects_unblocked = sum(
        1
        for d in post_durations
        if d < pre_median * 0.85
    )

    conf_level = _determine_confidence(
        len(pre_durations), len(post_durations), is_sig, effect_interp
    )

    return ReformImpactResult(
        reform_id=str(reform.reform_id),
        reform_name=reform.reform_name,
        jurisdiction=reform.jurisdiction,
        reform_type=reform.reform_type.value,
        # effective_date was validated as non-None above.
        effective_date=reform.effective_date.isoformat(),
        pre_reform_n=len(pre_durations),
        post_reform_n=len(post_durations),
        pre_reform_median_days=round(pre_median, 1),
        post_reform_median_days=round(post_median, 1),
        days_saved_per_project=round(days_saved, 1),
        percent_improvement=round(pct_improvement, 1),
        test_used=test_used,
        p_value=round(p_value, 6),
        is_significant=is_sig,
        effect_size=round(effect_size, 3),
        effect_interpretation=effect_interp,
        confidence_interval_days_saved=(round(ci[0], 1), round(ci[1], 1)),
        total_cost_savings=round(total_cost_savings, 2),
        units_enabled=units_enabled,
        projects_unblocked=projects_unblocked,
        stall_rate_change=round(stall_change, 3),
        confidence_level=conf_level.value,
        caveats=caveats,
        measured_at=datetime.utcnow().isoformat(),
    )
def compare_reforms_in_jurisdiction(
    db: Session,
    jurisdiction: str,
    *,
    stage: str = "entitlement",
) -> MultiReformComparisonResult:
    """Measure every dated reform in a jurisdiction and rank the outcomes.

    Reforms without an effective date are not considered. A reform whose
    measurement raises is logged and left out of the comparison, so one bad
    record does not abort the whole report.

    Args:
        db: SQLAlchemy session.
        jurisdiction: Jurisdiction to analyze.
        stage: Pipeline stage to measure.

    Returns:
        MultiReformComparisonResult whose ``reform_results`` are ordered by
        days saved per project, largest first.
    """
    dated_reforms_query = select(PolicyReform).where(
        PolicyReform.jurisdiction == jurisdiction,
        PolicyReform.effective_date.isnot(None),
    )

    measured: list[ReformImpactResult] = []
    for candidate in db.scalars(dated_reforms_query).all():
        try:
            impact = measure_reform_impact(
                db, candidate.reform_id, stage=stage
            )
        except Exception:
            logger.exception(
                "Failed to measure reform %s.", candidate.reform_name
            )
        else:
            measured.append(impact)

    # Largest time saving first.
    ranked = sorted(
        measured,
        key=lambda impact: impact["days_saved_per_project"],
        reverse=True,
    )

    top_reform_name = ranked[0]["reform_name"] if ranked else None
    days_saved_total = sum(impact["days_saved_per_project"] for impact in ranked)
    cost_savings_total = sum(impact["total_cost_savings"] for impact in ranked)
    units_enabled_total = sum(impact["units_enabled"] for impact in ranked)

    return MultiReformComparisonResult(
        jurisdiction=jurisdiction,
        reforms_analyzed=len(ranked),
        reform_results=ranked,
        most_impactful_reform=top_reform_name,
        total_days_saved_all_reforms=round(days_saved_total, 1),
        total_cost_savings_all_reforms=round(cost_savings_total, 2),
        total_units_enabled=units_enabled_total,
        measured_at=datetime.utcnow().isoformat(),
    )
+ lookahead_periods: Number of periods after the reform. + + Returns: + ReformTimeSeriesResult with time series data points and trend + estimates. + """ + reform = db.get(PolicyReform, reform_id) + if reform is None: + raise ValueError(f"PolicyReform {reform_id} not found.") + if reform.effective_date is None: + raise ValueError(f"Reform {reform.reform_name} has no effective_date.") + + effective = reform.effective_date + period_days = period_months * 30 # Approximate + + time_points: list[ReformTimeSeriesPoint] = [] + pre_medians: list[float] = [] + post_medians: list[float] = [] + + # Build periods + for i in range(-lookback_periods, lookahead_periods + 1): + if i < 0: + period_start = effective + timedelta(days=i * period_days) + period_end = effective + timedelta(days=(i + 1) * period_days) + reform_active = False + elif i == 0: + period_start = effective + period_end = effective + timedelta(days=period_days) + reform_active = True + else: + period_start = effective + timedelta(days=i * period_days) + period_end = effective + timedelta(days=(i + 1) * period_days) + reform_active = True + + # Get projects that completed the stage during this period + projects = _get_projects_completing_stage_in_window( + db, reform.jurisdiction, stage, period_start, period_end + ) + durations = _extract_durations(projects, stage) + + if durations: + med = float(np.median(durations)) + mean = float(np.mean(durations)) + else: + med = 0.0 + mean = 0.0 + + time_points.append( + ReformTimeSeriesPoint( + period_start=period_start.isoformat(), + period_end=period_end.isoformat(), + project_count=len(durations), + median_duration_days=round(med, 1), + mean_duration_days=round(mean, 1), + reform_in_effect=reform_active, + ) + ) + + if not reform_active and med > 0: + pre_medians.append(med) + elif reform_active and med > 0: + post_medians.append(med) + + # Compute trends (simple slope via least squares) + trend_before = _compute_slope(pre_medians) + trend_after = 
def _get_projects_completing_stage_in_window(
    db: Session,
    jurisdiction: str,
    stage: str,
    window_start: date,
    window_end: date,
) -> list[Project]:
    """Get projects that completed a stage within a half-open date window.

    The window is ``[window_start, window_end)``. build_reform_time_series
    passes consecutive windows in which one period's end equals the next
    period's start; with an inclusive upper bound a project completing on
    that shared date was counted in both periods.

    Args:
        db: SQLAlchemy session.
        jurisdiction: Jurisdiction the projects must belong to.
        stage: Stage name; the ``{stage}_complete`` column of Project is
            used for the comparison.
        window_start: First date included in the window.
        window_end: First date *after* the window (excluded).

    Returns:
        Projects in the jurisdiction whose stage completion date lies in the
        window.

    Raises:
        AttributeError: If Project has no ``{stage}_complete`` attribute.
    """
    completed_on = getattr(Project, f"{stage}_complete")

    stmt = select(Project).where(
        Project.jurisdiction == jurisdiction,
        completed_on.isnot(None),
        completed_on >= window_start,
        completed_on < window_end,
    )
    return list(db.scalars(stmt).all())
def _compute_slope(values: list[float]) -> float:
    """Return the least-squares slope of ``values`` against their positions.

    The x-coordinates are the indices 0, 1, 2, ... of the list, so the result
    is the average change per step. Fewer than two values give 0.0.
    """
    n_points = len(values)
    if n_points < 2:
        return 0.0

    positions = np.arange(n_points, dtype=float)
    observations = np.array(values, dtype=float)
    # Degree-1 fit; the leading coefficient is the slope.
    coefficients = np.polyfit(positions, observations, 1)
    return float(coefficients[0])
class EffectSizeResult(TypedDict):
    """Cohen's d effect size with interpretation.

    Returned by ``cohens_d`` in this module.
    """

    # Standardized mean difference, (mean_a - mean_b) / pooled_std.
    # Reported as 0.0 when the pooled standard deviation is zero or when
    # either sample has fewer than 2 observations.
    cohens_d: float
    # "negligible", "small", "medium", "large" -- or "insufficient_data"
    # when either sample has fewer than 2 observations.
    interpretation: str
    # Pooled sample standard deviation (ddof=1) of the two groups; 0.0 in
    # the insufficient-data case.
    pooled_std: float
--------------------------------------------------------------------------- +# Core statistical functions +# --------------------------------------------------------------------------- + + +def independent_ttest( + sample_a: list[float] | np.ndarray, + sample_b: list[float] | np.ndarray, + alpha: float = 0.05, + equal_var: bool = False, +) -> TTestResult: + """Perform an independent two-sample t-test (Welch's by default). + + Compares means of two independent groups and determines whether the + observed difference is statistically significant. + + Args: + sample_a: Observations from group A (e.g. pre-reform durations). + sample_b: Observations from group B (e.g. post-reform durations). + alpha: Significance threshold (default 0.05). + equal_var: If True, use Student's t-test (assumes equal variance). + If False, use Welch's t-test (default, more robust). + + Returns: + TTestResult dict with test statistics, p-value, significance flag, + Cohen's d, and a confidence interval for the mean difference. + + Raises: + ValueError: If either sample has fewer than 2 observations. + """ + a = np.asarray(sample_a, dtype=float) + b = np.asarray(sample_b, dtype=float) + + # Drop NaN values + a = a[~np.isnan(a)] + b = b[~np.isnan(b)] + + if len(a) < 2 or len(b) < 2: + raise ValueError( + f"Each sample must have at least 2 observations. " + f"Got n_a={len(a)}, n_b={len(b)}." 
def paired_ttest(
    before: list[float] | np.ndarray,
    after: list[float] | np.ndarray,
    alpha: float = 0.05,
) -> PairedTTestResult:
    """Perform a paired (dependent) t-test on matched observations.

    Use this when the same projects are measured before and after an
    intervention. Pairs in which either value is NaN are dropped before
    the test is run.

    Args:
        before: Pre-intervention measurements.
        after: Post-intervention measurements, aligned element-wise with
            ``before``.
        alpha: Significance threshold; also sets the ``1 - alpha``
            confidence level of the interval on the mean difference.

    Returns:
        PairedTTestResult dict. Differences are computed as
        ``before - after``, so a positive ``mean_diff`` means values were
        lower after the intervention.

    Raises:
        ValueError: If the inputs do not have the same length, or if fewer
            than 2 complete (non-NaN) pairs remain.
    """
    b = np.asarray(before, dtype=float)
    a = np.asarray(after, dtype=float)

    # Validate the pairing BEFORE building the NaN mask. Combining arrays
    # of different lengths either fails inside NumPy with an unrelated
    # broadcast error or, when one side has length 1, silently broadcasts
    # and then fails on the boolean index -- neither of which is the
    # ValueError callers are told to expect.
    if b.shape != a.shape:
        raise ValueError("Before and after samples must be the same length.")

    # Drop pairs where either value is NaN
    mask = ~(np.isnan(b) | np.isnan(a))
    b = b[mask]
    a = a[mask]

    if len(b) < 2:
        raise ValueError(f"Need at least 2 paired observations. Got {len(b)}.")

    diffs = b - a
    t_stat, p_value = stats.ttest_rel(b, a)
    df = len(diffs) - 1

    # t-based confidence interval for the mean of the paired differences.
    mean_diff = float(np.mean(diffs))
    std_diff = float(np.std(diffs, ddof=1))
    se_diff = std_diff / np.sqrt(len(diffs))
    t_crit = stats.t.ppf(1 - alpha / 2, df)
    ci_lower = mean_diff - t_crit * se_diff
    ci_upper = mean_diff + t_crit * se_diff

    return PairedTTestResult(
        t_statistic=float(t_stat),
        p_value=float(p_value),
        degrees_of_freedom=df,
        mean_diff=mean_diff,
        std_diff=std_diff,
        n=len(diffs),
        significant=bool(p_value < alpha),
        alpha=alpha,
        confidence_interval_diff=(float(ci_lower), float(ci_upper)),
    )
def cohens_d(
    sample_a: list[float] | np.ndarray,
    sample_b: list[float] | np.ndarray,
) -> EffectSizeResult:
    """Compute Cohen's d for two independent samples.

    The difference in means (A minus B) is divided by the pooled sample
    standard deviation. NaN observations are dropped first. The magnitude
    is labelled with the conventional cut-offs: |d| < 0.2 negligible,
    < 0.5 small, < 0.8 medium, otherwise large.

    Args:
        sample_a: Observations from group A.
        sample_b: Observations from group B.

    Returns:
        EffectSizeResult with the effect size, its label, and the pooled
        standard deviation. When either group has fewer than 2 usable
        observations the result is d = 0.0, label "insufficient_data",
        pooled_std = 0.0. When the pooled standard deviation is exactly
        zero, d is reported as 0.0.
    """
    group_a = np.asarray(sample_a, dtype=float)
    group_a = group_a[~np.isnan(group_a)]
    group_b = np.asarray(sample_b, dtype=float)
    group_b = group_b[~np.isnan(group_b)]

    size_a, size_b = len(group_a), len(group_b)
    if min(size_a, size_b) < 2:
        # A sample variance needs at least two points per group.
        return EffectSizeResult(
            cohens_d=0.0,
            interpretation="insufficient_data",
            pooled_std=0.0,
        )

    weighted_variance = (
        (size_a - 1) * np.var(group_a, ddof=1)
        + (size_b - 1) * np.var(group_b, ddof=1)
    )
    pooled_std = float(np.sqrt(weighted_variance / (size_a + size_b - 2)))

    # Avoid dividing by zero when both groups have no spread at all.
    effect = (
        0.0
        if pooled_std == 0.0
        else float((np.mean(group_a) - np.mean(group_b)) / pooled_std)
    )

    magnitude = abs(effect)
    label = "large"
    for upper_bound, name in ((0.2, "negligible"), (0.5, "small"), (0.8, "medium")):
        if magnitude < upper_bound:
            label = name
            break

    return EffectSizeResult(
        cohens_d=effect,
        interpretation=label,
        pooled_std=pooled_std,
    )
def test_normality(
    sample: list[float] | np.ndarray,
    alpha: float = 0.05,
) -> NormalityTestResult:
    """Test whether a sample follows a normal distribution (Shapiro-Wilk).

    Housing timeline durations are often right-skewed, so this test helps
    decide whether to use parametric (t-test) or non-parametric
    (Mann-Whitney) methods. NaN observations are dropped first.

    Args:
        sample: Array of observations.
        alpha: Significance threshold for rejecting normality.

    Returns:
        NormalityTestResult with test statistic, p-value, and boolean flag
        (``is_normal`` is True when ``p_value >= alpha``). ``n`` is the
        number of observations actually tested, i.e. at most 5000.

    Raises:
        ValueError: If fewer than 3 non-NaN observations are supplied.
    """
    data = np.asarray(sample, dtype=float)
    data = data[~np.isnan(data)]

    if len(data) < 3:
        raise ValueError(f"Shapiro-Wilk requires at least 3 observations. Got {len(data)}.")

    # Shapiro-Wilk p-values are not reliable above 5000 observations, so
    # test a fixed-seed random subsample to keep results reproducible.
    if len(data) > 5000:
        logger.warning(
            "Sample size %d exceeds Shapiro-Wilk limit; sub-sampling to 5000.", len(data)
        )
        rng = np.random.default_rng(42)
        data = rng.choice(data, size=5000, replace=False)

    stat, p_value = stats.shapiro(data)

    return NormalityTestResult(
        statistic=float(stat),
        p_value=float(p_value),
        is_normal=bool(p_value >= alpha),
        alpha=alpha,
        n=len(data),
    )


# This is library code, not a test case. Without this marker, any pytest
# module that imports the function by name collects it because of the
# ``test_`` prefix and then errors on the missing ``sample`` fixture.
test_normality.__test__ = False
def percentile_rank(value: float, distribution: list[float] | np.ndarray) -> float:
    """Locate ``value`` within a reference distribution as a percentile.

    NaN entries in the distribution are ignored. The rank is computed with
    ``scipy.stats.percentileofscore(..., kind="rank")``.

    Args:
        value: The observation to rank.
        distribution: Reference distribution to rank against.

    Returns:
        Percentile between 0.0 and 100.0. When the distribution has no
        usable (non-NaN) entries, 50.0 is returned.
    """
    reference = np.asarray(distribution, dtype=float)
    usable = reference[np.logical_not(np.isnan(reference))]

    if usable.size == 0:
        # Nothing to compare against: report the midpoint.
        return 50.0

    rank = stats.percentileofscore(usable, value, kind="rank")
    return float(rank)
def _welch_df(a: np.ndarray, b: np.ndarray) -> float:
    """Approximate the degrees of freedom for Welch's unequal-variance t-test.

    Uses the Welch-Satterthwaite formula on the per-group squared standard
    errors ``s^2 / n`` (sample variance with ddof=1).

    Args:
        a: Observations from the first group.
        b: Observations from the second group.

    Returns:
        The approximate degrees of freedom. When the formula's denominator
        is zero (both groups have zero variance) the pooled value
        ``n_a + n_b - 2`` is returned instead.
    """
    n_a, n_b = len(a), len(b)
    sq_se_a = np.var(a, ddof=1) / n_a
    sq_se_b = np.var(b, ddof=1) / n_b

    denominator = sq_se_a ** 2 / (n_a - 1) + sq_se_b ** 2 / (n_b - 1)
    if denominator == 0:
        # No spread in either group: the approximation is undefined.
        return float(n_a + n_b - 2)

    numerator = (sq_se_a + sq_se_b) ** 2
    return float(numerator / denominator)
class TimelinePredictionResult(TypedDict):
    """Full timeline prediction for a project.

    Built by ``predict_project_timeline``, ``predict_from_friction_score``
    and ``_terminal_prediction`` in this module.
    """

    # str(UUID) of the project, or the literal "hypothetical" for
    # friction-score what-if projections.
    project_id: str
    project_name: str
    # Pipeline stage value, e.g. "entitlement".
    current_stage: str
    days_in_current_stage: int | None
    # Sum of the per-stage predicted days, rounded to one decimal.
    predicted_remaining_days: float
    # Elapsed days so far plus predicted_remaining_days.
    predicted_total_days: float
    predicted_groundbreaking: str | None  # ISO date
    predicted_co: str | None  # ISO date (certificate of occupancy)
    confidence: float  # 0.0 - 1.0
    # (low, high): sums of the per-stage confidence bounds, in days.
    confidence_interval_days: tuple[float, float]
    # One entry per remaining stage, starting with the current stage.
    stage_predictions: list[StagePrediction]
    friction_score_used: int | None
    peer_group_name: str
    # Values produced in this module: "peer_adjusted", "national_adjusted",
    # "friction_projection" (hypothetical projects), "terminal".
    # NOTE(review): "extrapolation" was listed here originally but no
    # producer is visible in this module -- confirm whether it is still used.
    method: str
    # Naive UTC timestamp in ISO format (datetime.utcnow().isoformat()).
    predicted_at: str
def predict_project_timeline(
    db: Session,
    project_id: UUID,
    peer_benchmark: PeerBenchmarkResult | None = None,
) -> TimelinePredictionResult:
    """Predict remaining timeline for a single project.

    The prediction combines three inputs:
    1. Peer benchmark median durations for each remaining stage.
    2. Jurisdiction friction score adjustment (higher friction -> longer).
    3. Project-level risk factors (opposition, appeals, etc.).

    Confidence is reduced when fewer peer data points are available or
    when friction/risk adjustments are large.

    Projects whose current stage is terminal (operations, abandoned,
    stalled) are not projected; they short-circuit to
    ``_terminal_prediction``.

    Args:
        db: SQLAlchemy session.
        project_id: UUID of the project to predict.
        peer_benchmark: Optional pre-computed peer benchmarks. When omitted,
            benchmarks are computed from the project's jurisdiction, state
            and building type.

    Returns:
        TimelinePredictionResult with stage-by-stage predictions and
        projected milestone dates. ``predicted_groundbreaking`` is the
        projected date of entering the "construction" stage and
        ``predicted_co`` the projected date of entering "lease_up"; each is
        None when the project is already at or past that stage.

    Raises:
        ValueError: If project is not found.
    """
    project = get_project(db, project_id)
    if project is None:
        raise ValueError(f"Project {project_id} not found.")

    current_stage_val = project.current_stage.value

    # Nothing left to predict for terminal projects.
    if current_stage_val in _TERMINAL_STAGES:
        return _terminal_prediction(project)

    # Compute peer benchmarks if not provided
    if peer_benchmark is None:
        peer_benchmark = compute_peer_benchmarks(
            db,
            jurisdiction=project.jurisdiction,
            state=project.state,
            building_type=(
                project.building_type.value if project.building_type else None
            ),
        )

    # The label reflects whether any peer projects backed the benchmark.
    # Individual stages may still fall back to national figures below.
    method = (
        "peer_adjusted"
        if peer_benchmark["project_count"] > 0
        else "national_adjusted"
    )

    # Compute friction and risk adjustments
    friction_score = project.jurisdiction_friction_score
    friction_multiplier = _friction_to_multiplier(friction_score)
    risk_multiplier = _risk_to_multiplier(project)

    # Determine remaining stages (the current stage is included)
    remaining_stages = _get_remaining_stages(current_stage_val)

    # Build per-stage predictions
    stage_predictions: list[StagePrediction] = []
    total_remaining = 0.0
    total_low = 0.0
    total_high = 0.0
    confidence_factors: list[float] = []

    for stage in remaining_stages:
        bench = peer_benchmark["stage_benchmarks"].get(stage)

        if bench and bench["median_days"] > 0:
            base_days = bench["median_days"]
            sample_size = bench["sample_size"]
        else:
            # Fall back to national benchmarks; 180 days is the default when
            # the national data has no median for this stage. sample_size 0
            # puts the stage in the lowest confidence tier below.
            national = _load_national_benchmarks()
            nat_stage = national.get("stage_durations", {}).get(stage, {})
            base_days = float(nat_stage.get("median", 180))
            sample_size = 0

        # Apply friction adjustment: full effect on entitlement/pre-dev,
        # 30% of the effect on every other stage.
        if stage in ("entitlement", "pre_development"):
            friction_adj = base_days * (friction_multiplier - 1.0)
        else:
            friction_adj = base_days * (friction_multiplier - 1.0) * 0.3

        # Apply risk adjustment
        risk_adj = base_days * (risk_multiplier - 1.0)

        predicted = base_days + friction_adj + risk_adj

        # For the current stage, subtract days already spent (never below 0).
        # The reported adjustments are also floored at zero for this stage.
        # NOTE(review): the reason for flooring the adjustments only here is
        # not evident from this function -- confirm intent.
        if stage == current_stage_val and project.days_in_current_stage:
            predicted = max(0.0, predicted - project.days_in_current_stage)
            friction_adj = max(0.0, friction_adj)
            risk_adj = max(0.0, risk_adj)

        # Confidence interval: use peer p25/p75 if available, otherwise an
        # asymmetric band of 0.7x to 1.5x around the point estimate.
        if bench and bench["p25_days"] > 0:
            ci_low = bench["p25_days"]
            ci_high = bench["p75_days"]
            # Adjust CI for friction/risk (both multipliers applied in full,
            # regardless of stage)
            ci_low = ci_low * friction_multiplier * risk_multiplier
            ci_high = ci_high * friction_multiplier * risk_multiplier
            if stage == current_stage_val and project.days_in_current_stage:
                ci_low = max(0.0, ci_low - project.days_in_current_stage)
                ci_high = max(0.0, ci_high - project.days_in_current_stage)
        else:
            ci_low = predicted * 0.7
            ci_high = predicted * 1.5

        # Per-stage confidence based on sample size
        if sample_size >= 20:
            stage_conf = 0.85
        elif sample_size >= 10:
            stage_conf = 0.70
        elif sample_size >= 5:
            stage_conf = 0.55
        else:
            stage_conf = 0.35
        confidence_factors.append(stage_conf)

        stage_predictions.append(
            StagePrediction(
                stage=stage,
                predicted_days=round(predicted, 1),
                peer_median_days=base_days,
                friction_adjustment_days=round(friction_adj, 1),
                risk_adjustment_days=round(risk_adj, 1),
                confidence_low_days=round(ci_low, 1),
                confidence_high_days=round(ci_high, 1),
            )
        )

        # Totals accumulate the unrounded values.
        total_remaining += predicted
        total_low += ci_low
        total_high += ci_high

    # Overall confidence: mean of the per-stage tiers
    if confidence_factors:
        base_confidence = float(np.mean(confidence_factors))
    else:
        base_confidence = 0.3

    # Penalize confidence for high friction / risk. Multipliers below 1.0
    # make the penalty negative, which raises confidence. The result is
    # clamped to [0.1, 0.95].
    adj_penalty = (friction_multiplier - 1.0) * 0.1 + (risk_multiplier - 1.0) * 0.1
    confidence = max(0.1, min(0.95, base_confidence - adj_penalty))

    # Predicted total: days elapsed so far plus the predicted remainder
    elapsed = project.total_elapsed_days or 0
    predicted_total = elapsed + total_remaining

    # Projected dates, counted forward from today
    today = date.today()
    predicted_groundbreaking = _project_milestone_date(
        project, stage_predictions, "construction", today
    )
    predicted_co = _project_milestone_date(
        project, stage_predictions, "lease_up", today
    )

    return TimelinePredictionResult(
        project_id=str(project.project_id),
        project_name=project.project_name,
        current_stage=current_stage_val,
        days_in_current_stage=project.days_in_current_stage,
        predicted_remaining_days=round(total_remaining, 1),
        predicted_total_days=round(predicted_total, 1),
        predicted_groundbreaking=(
            predicted_groundbreaking.isoformat()
            if predicted_groundbreaking
            else None
        ),
        predicted_co=(
            predicted_co.isoformat() if predicted_co else None
        ),
        confidence=round(confidence, 3),
        confidence_interval_days=(round(total_low, 1), round(total_high, 1)),
        stage_predictions=stage_predictions,
        friction_score_used=friction_score,
        peer_group_name=peer_benchmark["peer_group_name"],
        method=method,
        predicted_at=datetime.utcnow().isoformat(),
    )
+ """ + from src.database.queries import query_projects + + active_stages = stages or [ + s for s in PipelineStage if s.value not in _TERMINAL_STAGES + ] + + projects = query_projects( + db, + jurisdiction=jurisdiction, + state=state, + stages=active_stages, + limit=limit, + ) + + # Pre-compute peer benchmark once + peer_benchmark = compute_peer_benchmarks( + db, + jurisdiction=jurisdiction, + state=state, + ) + + predictions: list[TimelinePredictionResult] = [] + + for project in projects: + try: + pred = predict_project_timeline( + db, project.project_id, peer_benchmark + ) + predictions.append(pred) + except Exception: + logger.exception( + "Failed timeline prediction for project %s.", + project.project_id, + ) + + remaining_days = [p["predicted_remaining_days"] for p in predictions] + confidences = [p["confidence"] for p in predictions] + + arr = np.array(remaining_days) if remaining_days else np.array([0.0]) + conf_arr = np.array(confidences) if confidences else np.array([0.0]) + + return BatchTimelinePredictionResult( + total_predicted=len(predictions), + average_remaining_days=float(np.mean(arr)), + median_remaining_days=float(np.median(arr)), + average_confidence=float(np.mean(conf_arr)), + predictions=predictions, + predicted_at=datetime.utcnow().isoformat(), + ) + + +# --------------------------------------------------------------------------- +# Friction-based prediction for new projects +# --------------------------------------------------------------------------- + + +def predict_from_friction_score( + db: Session, + *, + jurisdiction: str, + friction_score: int, + total_units: int = 50, + building_type: str | None = None, + state: str | None = None, +) -> TimelinePredictionResult: + """Predict timeline for a hypothetical project using friction score. + + Useful for "what-if" analysis: given a jurisdiction's friction score, + how long should a typical project expect to take? + + Args: + db: SQLAlchemy session. + jurisdiction: Target jurisdiction. 
+ friction_score: Jurisdiction friction score (1-100). + total_units: Assumed project size. + building_type: Optional building type. + state: State code for peer matching. + + Returns: + TimelinePredictionResult for the hypothetical project. + """ + peer_benchmark = compute_peer_benchmarks( + db, + jurisdiction=jurisdiction, + state=state, + building_type=building_type, + ) + + friction_multiplier = _friction_to_multiplier(friction_score) + stage_predictions: list[StagePrediction] = [] + total_days = 0.0 + total_low = 0.0 + total_high = 0.0 + + for stage in _STAGE_ORDER: + bench = peer_benchmark["stage_benchmarks"].get(stage) + if bench and bench["median_days"] > 0: + base = bench["median_days"] + else: + national = _load_national_benchmarks() + base = float( + national.get("stage_durations", {}) + .get(stage, {}) + .get("median", 180) + ) + + if stage in ("entitlement", "pre_development"): + friction_adj = base * (friction_multiplier - 1.0) + else: + friction_adj = base * (friction_multiplier - 1.0) * 0.3 + + predicted = base + friction_adj + ci_low = predicted * 0.7 + ci_high = predicted * 1.5 + + stage_predictions.append( + StagePrediction( + stage=stage, + predicted_days=round(predicted, 1), + peer_median_days=base, + friction_adjustment_days=round(friction_adj, 1), + risk_adjustment_days=0.0, + confidence_low_days=round(ci_low, 1), + confidence_high_days=round(ci_high, 1), + ) + ) + + total_days += predicted + total_low += ci_low + total_high += ci_high + + today = date.today() + groundbreaking_offset = sum( + sp["predicted_days"] + for sp in stage_predictions + if sp["stage"] in ("concept", "pre_development", "entitlement", "financing") + ) + co_offset = total_days + + return TimelinePredictionResult( + project_id="hypothetical", + project_name=f"Hypothetical ({jurisdiction}, friction={friction_score})", + current_stage="concept", + days_in_current_stage=0, + predicted_remaining_days=round(total_days, 1), + predicted_total_days=round(total_days, 1), + 
def _friction_to_multiplier(friction_score: int | None) -> float:
    """Map a jurisdiction friction score to a stage-duration multiplier.

    The score is clamped to 1-100 and mapped with a piecewise-linear curve
    that is neutral at 50:

    * 50 -> 1.0x (no adjustment)
    * 100 -> 1.8x (each point above 50 adds 0.016)
    * 1 -> ~0.7x (each point below 50 removes 0.006)

    The upward slope is steeper than the downward one, so high friction
    lengthens timelines more than low friction shortens them.

    Args:
        friction_score: Friction score, or None when unknown.

    Returns:
        The multiplier rounded to three decimals; 1.0 when the score is
        None.
    """
    if friction_score is None:
        return 1.0

    clamped = max(1, min(100, friction_score))
    # Signed distance from the neutral score, scaled to roughly -1 .. +1.
    offset = (clamped - 50) / 50.0

    # Asymmetric: up to +0.8 above neutral, down to about -0.3 below it.
    slope = 0.8 if offset >= 0 else 0.3
    return round(1.0 + offset * slope, 3)
def _project_milestone_date(
    project: Project,
    stage_predictions: list[StagePrediction],
    target_stage: str,
    reference_date: date,
) -> date | None:
    """Project the date on which a project will enter ``target_stage``.

    Adds up the predicted days of every stage from the project's current
    stage up to, but not including, the target stage, and offsets
    ``reference_date`` by that many whole days (fraction truncated).

    Args:
        project: Project whose current stage anchors the calculation.
        stage_predictions: Per-stage predictions to draw durations from.
        target_stage: Stage whose start date is wanted.
        reference_date: Date to count forward from.

    Returns:
        The projected date, or None when the project is in a terminal
        stage, either stage is not part of the sequential pipeline, or the
        project has already reached the target stage.
    """
    stage_now = project.current_stage.value
    if stage_now in _TERMINAL_STAGES:
        return None
    if stage_now not in _STAGE_ORDER or target_stage not in _STAGE_ORDER:
        return None

    start = _STAGE_ORDER.index(stage_now)
    stop = _STAGE_ORDER.index(target_stage)
    if start >= stop:
        return None  # Already at or past this stage

    # Stages that still have to finish before the target stage begins.
    stages_before_target = _STAGE_ORDER[start:stop]
    days_to_target = 0
    for prediction in stage_predictions:
        if prediction["stage"] in stages_before_target:
            days_to_target += prediction["predicted_days"]

    return reference_date + timedelta(days=int(days_to_target))
@asynccontextmanager
async def lifespan(app: FastAPI) -> AsyncIterator[None]:
    """Application lifespan context manager.

    * **Startup**: logs a configuration summary and runs a ``SELECT 1``
      probe against the database.  The probe is best-effort: a failure is
      logged (with its traceback) but never prevents the API from starting.
    * **Shutdown**: logs that the API is stopping.

    Args:
        app: The FastAPI application this lifespan is attached to.

    Yields:
        ``None`` -- control is handed to the running application.
    """
    settings = get_settings()
    logger.info(
        "HousingHand API starting (debug=%s, host=%s, port=%d)",
        settings.api_debug,
        settings.api_host,
        settings.api_port,
    )

    # Verify database is reachable.  Imports are local so that a missing or
    # misconfigured database layer degrades to a warning, not an import error.
    try:
        from sqlalchemy import text

        from src.database.connection import get_engine

        with get_engine().connect() as conn:
            conn.execute(text("SELECT 1"))
    except Exception:
        # Deliberately non-fatal, but keep the traceback so operators can
        # see *why* the probe failed instead of just that it failed.
        logger.warning(
            "Database connection could not be verified at startup -- "
            "requests that require the database will fail until connectivity is restored",
            exc_info=True,
        )
    else:
        logger.info("Database connection verified")

    yield  # Application runs here.

    logger.info("HousingHand API shutting down")
def get_db() -> Generator[Session, None, None]:
    """Provide a request-scoped SQLAlchemy session.

    A session is created from the factory returned by
    ``get_session_factory()``, handed to the caller, and closed once the
    caller is done with it -- whether or not an exception was raised.

    Living in the API layer, this dependency can be overridden in tests
    without touching the database module.

    Yields:
        An open ``Session``.
    """
    session = get_session_factory()()
    try:
        yield session
    finally:
        # Always release the session once the consumer is finished.
        session.close()
class StageBottleneck(BaseModel):
    """Per-stage aggregate figures reported by the bottleneck analysis."""

    model_config = {"from_attributes": True}

    # Pipeline stage these figures describe.
    stage: PipelineStage
    project_count: int = Field(
        default=...,
        description="Number of projects currently in this stage",
    )
    median_days: float | None = Field(
        default=None,
        description="Median number of days projects spend in this stage",
    )
    avg_days: float | None = Field(
        default=None,
        description="Average number of days projects spend in this stage",
    )
    max_days: int | None = Field(
        default=None,
        description="Longest duration (days) any single project spent in this stage",
    )
    stalled_count: int = Field(
        default=0,
        description="Projects in this stage that exceed the stall threshold",
    )
    pct_of_pipeline: float = Field(
        default=0.0,
        description="Percentage of total active pipeline represented by this stage",
    )
def _compute_stage_bottlenecks(
    db: Session,
    stall_threshold_days: int,
    jurisdiction: str | None,
    state: str | None,
) -> tuple[list[StageBottleneck], int]:
    """Compute per-stage aggregation and stall counts.

    All figures are gathered with a single ``GROUP BY current_stage`` query
    rather than several queries per stage.

    Args:
        db: Open database session.
        stall_threshold_days: A project counts as stalled when its
            ``stage_entry_date`` is at least this many days in the past.
        jurisdiction: Optional exact-match jurisdiction filter.
        state: Optional state filter; upper-cased before comparison.

    Returns:
        A ``(stage_bottlenecks, total_active)`` tuple.  ``stage_bottlenecks``
        holds one entry per stage in ``_ACTIVE_STAGES`` (in that order, with
        zero counts for stages that have no projects) and ``total_active`` is
        the number of matching projects across those stages.
    """
    cutoff = date.today() - timedelta(days=stall_threshold_days)

    stmt = (
        select(
            Project.current_stage.label("stage"),
            func.count(Project.project_id).label("project_count"),
            func.avg(Project.days_in_current_stage).label("avg_days"),
            func.max(Project.days_in_current_stage).label("max_days"),
            # Rows with a NULL stage_entry_date fall through to else_=0,
            # matching a WHERE-clause filter on the same comparison.
            func.sum(
                case((Project.stage_entry_date <= cutoff, 1), else_=0)
            ).label("stalled_count"),
        )
        .where(Project.current_stage.in_(_ACTIVE_STAGES))
        .group_by(Project.current_stage)
    )
    if jurisdiction:
        stmt = stmt.where(Project.jurisdiction == jurisdiction)
    if state:
        stmt = stmt.where(Project.state == state.upper())

    # PipelineStage(...) accepts either an enum member or its raw value, so
    # the lookup works regardless of how the driver hands the column back.
    rows_by_stage = {PipelineStage(row.stage): row for row in db.execute(stmt).all()}
    total_active = sum(int(row.project_count or 0) for row in rows_by_stage.values())

    results: list[StageBottleneck] = []
    for stage in _ACTIVE_STAGES:
        row = rows_by_stage.get(stage)
        count = int(row.project_count or 0) if row is not None else 0
        avg_days = row.avg_days if row is not None else None
        max_days = row.max_days if row is not None else None
        stalled = int(row.stalled_count or 0) if row is not None else 0

        pct = (count / total_active * 100) if total_active > 0 else 0.0

        results.append(
            StageBottleneck(
                stage=stage,
                project_count=count,
                median_days=None,  # Median requires window functions; avg is shown instead.
                avg_days=round(float(avg_days), 1) if avg_days is not None else None,
                max_days=int(max_days) if max_days is not None else None,
                stalled_count=stalled,
                pct_of_pipeline=round(pct, 1),
            )
        )

    return results, total_active
PipelineStage.STALLED, 1), + else_=0, + ) + ).label("stalled_count"), + func.sum( + case( + (Project.overall_health == OverallHealth.AT_RISK, 1), + else_=0, + ) + ).label("at_risk_count"), + func.avg(Project.entitlement_duration_days).label("avg_entitlement_days"), + func.sum(Project.friction_induced_costs).label("total_friction_induced_costs"), + ) + .where(Project.jurisdiction.isnot(None)) + .group_by(Project.jurisdiction) + .order_by(func.avg(Project.jurisdiction_friction_score).desc().nullslast()) + ) + + if state: + stmt = stmt.where(Project.state == state.upper()) + + stmt = stmt.limit(top_n) + rows = db.execute(stmt).all() + + return [ + JurisdictionFriction( + jurisdiction=r.jurisdiction, + project_count=r.project_count, + avg_friction_score=round(float(r.avg_friction_score), 1) if r.avg_friction_score else None, + stalled_count=int(r.stalled_count or 0), + at_risk_count=int(r.at_risk_count or 0), + avg_entitlement_days=round(float(r.avg_entitlement_days), 1) if r.avg_entitlement_days else None, + total_friction_induced_costs=float(r.total_friction_induced_costs or 0), + ) + for r in rows + ] + + +# --------------------------------------------------------------------------- +# Endpoints +# --------------------------------------------------------------------------- + +@router.get( + "/bottlenecks", + response_model=BottleneckAnalysisResponse, + summary="Analyze pipeline bottlenecks", +) +def analyze_bottlenecks( + db: DbSession, + jurisdiction: str | None = Query(None, description="Limit analysis to a jurisdiction"), + state: str | None = Query(None, max_length=2, description="Limit analysis to a state"), + stall_threshold_days: int = Query( + 180, ge=30, le=730, description="Days before a project is considered stalled" + ), + top_barriers: int = Query(10, ge=1, le=50, description="Number of top barriers to return"), + top_jurisdictions: int = Query(15, ge=1, le=100, description="Number of top jurisdictions to return"), +) -> BottleneckAnalysisResponse: + 
@router.get(
    "/bottlenecks/systemic",
    response_model=SystemicBottleneckResponse,
    summary="Identify systemic bottlenecks",
)
def identify_systemic_bottlenecks(
    db: DbSession,
    jurisdiction: str | None = Query(None, description="Jurisdiction to analyze"),
    state: str | None = Query(None, max_length=2, description="State to analyze"),
    period_months: int = Query(
        24, ge=6, le=120, description="Look-back period in months"
    ),
) -> SystemicBottleneckResponse:
    """Identify systemic bottlenecks that transcend individual projects.

    Looks at patterns across the portfolio within the look-back window to
    surface recurring friction.  Delegates to
    ``src.analytics.identify_systemic_bottlenecks`` when it can be imported
    and called, and otherwise falls back to a direct database aggregation.

    Args:
        db: Open database session.
        jurisdiction: Optional exact-match jurisdiction filter.
        state: Optional state filter; upper-cased before comparison.  It is
            applied to both the stage and the barrier aggregation.
        period_months: Look-back window, approximated as 30 days per month.

    Returns:
        The analysis window, one bottleneck entry per active stage that has
        at least one project, and up to five barrier-driven recommendations.
    """
    # Evaluate "today" once so both ends of the window agree.
    period_end = date.today()
    period_start = period_end - timedelta(days=period_months * 30)

    # Try the analytics module first.
    try:
        from src.analytics import identify_systemic_bottlenecks as _identify
        result = _identify(
            db,
            jurisdiction=jurisdiction,
            state=state,
            period_start=period_start,
            period_end=period_end,
        )
        return SystemicBottleneckResponse(**result)
    except (ImportError, AttributeError):
        # Best-effort delegation: record why we fell back instead of
        # discarding the reason silently.
        logger.debug(
            "Analytics module unavailable for systemic bottlenecks; "
            "using direct database aggregation",
            exc_info=True,
        )

    # Fallback: direct DB aggregation.
    bottlenecks: list[dict[str, Any]] = []
    recommendations: list[str] = []

    # Filters shared by every per-stage query.
    project_filters = [Project.created_at >= period_start]
    if jurisdiction:
        project_filters.append(Project.jurisdiction == jurisdiction)
    if state:
        project_filters.append(Project.state == state.upper())

    # Find stages where projects are disproportionately accumulating.
    for stage in _ACTIVE_STAGES:
        stage_stmt = select(
            func.count(Project.project_id),
            func.avg(Project.days_in_current_stage),
        ).where(Project.current_stage == stage, *project_filters)
        count, avg_days = db.execute(stage_stmt).one()
        count = count or 0

        if count > 0:
            # Compare against None so a genuine average of 0 is reported as 0.0.
            avg_value = float(avg_days) if avg_days is not None else None
            bottlenecks.append({
                "stage": stage.value,
                "project_count": count,
                "avg_days_in_stage": round(avg_value, 1) if avg_value is not None else None,
                "severity": "high" if (avg_value is not None and avg_value > 365) else "moderate",
            })

    # Barrier-driven recommendations.
    barrier_stmt = (
        select(
            ProjectBarrier.barrier_type,
            func.count(ProjectBarrier.barrier_id).label("cnt"),
            func.sum(ProjectBarrier.days_delayed).label("total_delay"),
        )
        .where(ProjectBarrier.created_at >= period_start)
        .group_by(ProjectBarrier.barrier_type)
        .order_by(func.sum(ProjectBarrier.days_delayed).desc())
    )
    if jurisdiction:
        barrier_stmt = barrier_stmt.where(ProjectBarrier.jurisdiction == jurisdiction)
    if state:
        # Barriers carry no state column here; filter through the owning project.
        barrier_stmt = barrier_stmt.join(
            Project, Project.project_id == ProjectBarrier.project_id
        ).where(Project.state == state.upper())
    barrier_stmt = barrier_stmt.limit(5)

    for row in db.execute(barrier_stmt).all():
        recommendations.append(
            f"Address '{row.barrier_type}' -- caused {int(row.total_delay or 0)} "
            f"total days of delay across {row.cnt} occurrences."
        )

    if not recommendations:
        recommendations.append(
            "Insufficient barrier data to generate targeted recommendations."
        )

    return SystemicBottleneckResponse(
        jurisdiction=jurisdiction,
        state=state,
        analysis_period_start=period_start,
        analysis_period_end=period_end,
        bottlenecks=bottlenecks,
        recommendations=recommendations,
    )
class HealthDimension(BaseModel):
    """One scored facet of a health assessment."""

    model_config = {"from_attributes": True}

    dimension: str = Field(default=..., description="Name of the health dimension")
    # Validation rejects scores outside the 0-100 range.
    score: float = Field(
        default=...,
        ge=0,
        le=100,
        description="Score from 0 (worst) to 100 (best)",
    )
    status: str = Field(default=..., description="Qualitative status label")
    detail: str = Field(default="", description="Human-readable explanation")
def _score_timeline(project: Project) -> HealthDimension:
    """Rate how long the project has sat in its current stage.

    Starts from a baseline of 80 and deducts 30 when the time in stage
    exceeds the stage's expected duration, or 60 when it exceeds it by more
    than half.  Stages without a listed expectation use 365 days.  A project
    whose stage is STALLED or ABANDONED always scores 0.

    Returns:
        A ``HealthDimension`` named ``"timeline"`` with status ``"good"``
        (>= 70), ``"concern"`` (>= 40) or ``"critical"``.
    """
    expected_days_by_stage = {
        PipelineStage.CONCEPT: 180,
        PipelineStage.PRE_DEVELOPMENT: 270,
        PipelineStage.ENTITLEMENT: 365,
        PipelineStage.FINANCING: 270,
        PipelineStage.CONSTRUCTION: 730,
        PipelineStage.LEASE_UP: 180,
    }

    points = 80.0  # default assumption
    notes: list[str] = []

    elapsed = project.days_in_current_stage
    if elapsed is not None:
        # Compare against loose thresholds by stage.
        expected = expected_days_by_stage.get(project.current_stage, 365)
        overrun = elapsed / expected
        if overrun > 1.5:
            points = max(points - 60, 0)
            notes.append(f"Significantly over expected duration ({elapsed} days vs ~{expected})")
        elif overrun > 1.0:
            points = max(points - 30, 0)
            notes.append(f"Over expected duration ({elapsed} days vs ~{expected})")
        else:
            notes.append(f"Within expected duration ({elapsed} of ~{expected} days)")

    # Stalled / abandoned projects override whatever was computed above.
    if project.current_stage in (PipelineStage.STALLED, PipelineStage.ABANDONED):
        points = 0
        notes.append(f"Project is {project.current_stage.value}")

    if points >= 70:
        label = "good"
    elif points >= 40:
        label = "concern"
    else:
        label = "critical"

    explanation = "; ".join(notes) if notes else "Insufficient timeline data"
    return HealthDimension(
        dimension="timeline",
        score=round(points, 1),
        status=label,
        detail=explanation,
    )
def _score_regulatory(project: Project) -> HealthDimension:
    """Rate the regulatory headwinds a project faces.

    Starts from a baseline of 80 and deducts for a jurisdiction friction
    score above 50 (25 points) or above 75 (50 points), 15 points per appeal
    filed, and 20 points for "high" or "severe" neighbor opposition.  The
    score never drops below 0.

    Returns:
        A ``HealthDimension`` named ``"regulatory"`` with status ``"good"``
        (>= 70), ``"concern"`` (>= 40) or ``"critical"``.
    """
    points = 80.0
    notes: list[str] = []

    friction_score = project.jurisdiction_friction_score
    if friction_score is not None:
        if friction_score > 75:
            points = max(points - 50, 0)
            notes.append(f"High friction jurisdiction (score: {friction_score})")
        elif friction_score > 50:
            points = max(points - 25, 0)
            notes.append(f"Moderate friction jurisdiction (score: {friction_score})")
        else:
            notes.append(f"Low friction jurisdiction (score: {friction_score})")

    # Each filed appeal costs a fixed number of points.
    if project.appeals_filed and project.appeals_filed > 0:
        points = max(points - 15 * project.appeals_filed, 0)
        notes.append(f"{project.appeals_filed} appeal(s) filed")

    opposition_level = project.neighbor_opposition_level
    if opposition_level and opposition_level.value in ("high", "severe"):
        points = max(points - 20, 0)
        notes.append(f"Neighbor opposition: {opposition_level.value}")

    if points >= 70:
        label = "good"
    elif points >= 40:
        label = "concern"
    else:
        label = "critical"

    explanation = "; ".join(notes) if notes else "No regulatory friction data"
    return HealthDimension(
        dimension="regulatory",
        score=round(points, 1),
        status=label,
        detail=explanation,
    )
def _derive_risks_and_actions(
    project: Project, dimensions: list[HealthDimension]
) -> tuple[list[str], list[str]]:
    """Turn scored dimensions into risk statements and suggested actions.

    Every dimension whose status is ``"critical"`` or ``"concern"`` yields
    one risk line, in the order given.  Actions are suggested for the
    funding, timeline, regulatory and data-quality dimensions (checked in
    that order) when the first dimension of that name scores below its
    cut-off; if none applies, a single "continue monitoring" action is
    returned.

    Returns:
        A ``(risks, actions)`` tuple of string lists.
    """
    risks: list[str] = []
    for entry in dimensions:
        if entry.status == "critical":
            risks.append(f"Critical {entry.dimension}: {entry.detail}")
        elif entry.status == "concern":
            risks.append(f"{entry.dimension.capitalize()} concern: {entry.detail}")

    # Index the first dimension seen under each name.
    first_by_name: dict[str, HealthDimension] = {}
    for entry in dimensions:
        first_by_name.setdefault(entry.dimension, entry)

    # (dimension name, score cut-off, action text) -- order defines output order.
    playbook = (
        ("funding", 50, "Identify additional funding sources to close the gap"),
        ("timeline", 40, "Escalate timeline review with development team"),
        ("regulatory", 50, "Engage with jurisdiction to resolve regulatory barriers"),
        ("data_quality", 50, "Update and verify project data"),
    )
    actions: list[str] = []
    for name, cutoff, suggestion in playbook:
        candidate = first_by_name.get(name)
        if candidate and candidate.score < cutoff:
            actions.append(suggestion)

    if not actions:
        actions.append("Continue monitoring -- project appears healthy")

    return risks, actions
+ """ + project = db.get(Project, project_id) + if project is None: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail=f"Project {project_id} not found", + ) + + # Try the analytics module. + try: + from src.analytics import assess_pipeline_health + result = assess_pipeline_health(db, project_id=project_id) + return ProjectHealthResponse( + project_id=project.project_id, + project_name=project.project_name, + current_stage=project.current_stage, + overall_health=result.get("overall_health"), + health_score=result.get("health_score"), + assessed_at=datetime.utcnow().isoformat(), + dimensions=[HealthDimension(**d) for d in result.get("dimensions", [])], + top_risks=result.get("top_risks", []), + recommended_actions=result.get("recommended_actions", []), + ) + except (ImportError, AttributeError): + pass + + dimensions, aggregate_score = _compute_project_health(project) + health_label = _derive_health_label(aggregate_score) + risks, actions = _derive_risks_and_actions(project, dimensions) + + return ProjectHealthResponse( + project_id=project.project_id, + project_name=project.project_name, + current_stage=project.current_stage, + overall_health=health_label, + health_score=aggregate_score, + assessed_at=datetime.utcnow().isoformat(), + dimensions=dimensions, + top_risks=risks, + recommended_actions=actions, + ) + + +@router.get( + "/at-risk", + response_model=AtRiskListResponse, + summary="List at-risk projects", +) +def list_at_risk_projects( + db: DbSession, + pagination: PaginationDep, + jurisdiction: str | None = Query(None, description="Filter by jurisdiction"), + state: str | None = Query(None, max_length=2, description="Filter by state"), + min_days_stalled: int | None = Query( + None, ge=1, description="Minimum days in current stage" + ), + include_stalled: bool = Query( + True, description="Include projects with STALLED stage" + ), +) -> AtRiskListResponse: + """Return projects that are at risk, delayed, or stalled. 
@router.get(
    "/summary",
    response_model=PipelineHealthSummary,
    summary="Pipeline health summary",
)
def get_pipeline_health_summary(
    db: DbSession,
    jurisdiction: str | None = Query(None, description="Filter by jurisdiction"),
    state: str | None = Query(None, max_length=2, description="Filter by state"),
) -> PipelineHealthSummary:
    """Get aggregate pipeline health metrics across the portfolio.

    Returns counts and percentages for each health category, average health
    score, total funding gap, total friction costs, and the most frequently
    occurring risk factors.
    """
    from sqlalchemy import and_

    # Seed with an always-true predicate so and_() never receives zero
    # arguments when no filter is supplied.
    conditions = [Project.project_id.isnot(None)]
    if jurisdiction:
        conditions.append(Project.jurisdiction == jurisdiction)
    if state:
        conditions.append(Project.state == state.upper())
    base_filter = and_(*conditions)

    # Total projects matching the filter.
    total = db.scalar(select(func.count()).where(base_filter)) or 0

    # Health distribution: one row per overall_health value.
    health_stmt = (
        select(Project.overall_health, func.count(Project.project_id).label("cnt"))
        .where(base_filter)
        .group_by(Project.overall_health)
    )
    health_rows = {
        row.overall_health: row.cnt for row in db.execute(health_stmt).all()
    }

    on_track = health_rows.get(OverallHealth.ON_TRACK, 0)
    at_risk = health_rows.get(OverallHealth.AT_RISK, 0)
    delayed = health_rows.get(OverallHealth.DELAYED, 0)
    stalled = health_rows.get(OverallHealth.STALLED, 0)

    def _pct(n: int) -> float:
        # Guard against division by zero for an empty portfolio.
        return round(n / total * 100, 1) if total > 0 else 0.0

    # Portfolio-wide aggregates.
    agg_stmt = select(
        func.avg(Project.health_score).label("avg_hs"),
        func.coalesce(func.sum(Project.funding_gap), 0).label("total_gap"),
        func.coalesce(func.sum(Project.friction_induced_costs), 0).label("total_friction"),
    ).where(base_filter)
    agg = db.execute(agg_stmt).first()

    # AVG() is NULL only when no project has a score.  Compare against None
    # explicitly so a genuine average of 0.0 is reported rather than dropped.
    avg_health_score = (
        round(float(agg.avg_hs), 1)
        if agg is not None and agg.avg_hs is not None
        else None
    )

    # "Most common risk factors" are approximated by the five barrier types
    # with the highest number of recorded barriers.  Filters are applied
    # before grouping/ordering/limiting so the statement reads in SQL order.
    barrier_stmt = select(
        ProjectBarrier.barrier_type,
        func.count(ProjectBarrier.barrier_id).label("cnt"),
    )
    if jurisdiction:
        barrier_stmt = barrier_stmt.where(ProjectBarrier.jurisdiction == jurisdiction)
    if state:
        barrier_stmt = barrier_stmt.join(
            Project, Project.project_id == ProjectBarrier.project_id
        ).where(Project.state == state.upper())
    barrier_stmt = (
        barrier_stmt.group_by(ProjectBarrier.barrier_type)
        .order_by(func.count(ProjectBarrier.barrier_id).desc())
        .limit(5)
    )
    common_risks = [row.barrier_type for row in db.execute(barrier_stmt).all()]

    return PipelineHealthSummary(
        generated_at=datetime.utcnow().isoformat(),
        jurisdiction=jurisdiction,
        state=state,
        total_projects=total,
        on_track_count=on_track,
        on_track_pct=_pct(on_track),
        at_risk_count=at_risk,
        at_risk_pct=_pct(at_risk),
        delayed_count=delayed,
        delayed_pct=_pct(delayed),
        stalled_count=stalled,
        stalled_pct=_pct(stalled),
        avg_health_score=avg_health_score,
        total_funding_gap=float(agg.total_gap) if agg else 0.0,
        total_friction_cost=float(agg.total_friction) if agg else 0.0,
        most_common_risk_factors=common_risks,
    )
class VelocityMetrics(BaseModel):
    """Pipeline throughput metrics."""

    # Averages, in days, over the filtered project set as computed by
    # ``_build_velocity_metrics``; each stays None when no average is
    # available.  Note that the concept-to-construction figure is sourced
    # from ``Project.concept_to_groundbreaking_days``.
    avg_concept_to_construction_days: float | None = None
    avg_entitlement_days: float | None = None
    avg_financing_days: float | None = None
    avg_construction_days: float | None = None
    # Projects (and their total units) currently in the OPERATIONS stage
    # whose ``construction_complete`` date falls within the last year.
    projects_completed_last_12_months: int = 0
    units_completed_last_12_months: int = 0

    # Allow construction from attribute-bearing objects (e.g. ORM rows).
    model_config = {"from_attributes": True}
def _build_stage_distribution(db: Session, base_filter: Any) -> StageDistribution:
    """Count the projects matching *base_filter* in each pipeline stage.

    Stages with no matching project are reported as ``0``.
    """
    per_stage_query = (
        select(
            Project.current_stage,
            func.count(Project.project_id).label("cnt"),
        )
        .where(base_filter)
        .group_by(Project.current_stage)
    )

    counts: dict[Any, int] = {}
    for record in db.execute(per_stage_query).all():
        counts[record.current_stage] = record.cnt

    def tally(stage: PipelineStage) -> int:
        # Missing stages simply had no rows in the grouped result.
        return counts.get(stage, 0)

    return StageDistribution(
        concept=tally(PipelineStage.CONCEPT),
        pre_development=tally(PipelineStage.PRE_DEVELOPMENT),
        entitlement=tally(PipelineStage.ENTITLEMENT),
        financing=tally(PipelineStage.FINANCING),
        construction=tally(PipelineStage.CONSTRUCTION),
        lease_up=tally(PipelineStage.LEASE_UP),
        operations=tally(PipelineStage.OPERATIONS),
        stalled=tally(PipelineStage.STALLED),
        abandoned=tally(PipelineStage.ABANDONED),
    )
def _build_velocity_metrics(db: Session, base_filter: Any) -> VelocityMetrics:
    """Compute pipeline velocity metrics for the filtered portfolio.

    Args:
        db: Active database session.
        base_filter: SQLAlchemy boolean expression selecting the projects
            that belong to the portfolio.

    Returns:
        Average stage durations (``None`` when no project has a value) and
        the number of projects/units in the OPERATIONS stage whose
        construction completed within the last twelve months.
    """
    averages_stmt = select(
        func.avg(Project.concept_to_groundbreaking_days).label("avg_c2c"),
        func.avg(Project.entitlement_duration_days).label("avg_ent"),
        func.avg(Project.financing_duration_days).label("avg_fin"),
        func.avg(Project.construction_duration_days).label("avg_con"),
    ).where(base_filter)
    averages = db.execute(averages_stmt).first()

    # Read the clock once so year and day come from the same instant, and
    # fall back to 28 Feb when today is 29 Feb: date.replace() raises
    # ValueError if the target year has no such day.
    today = date.today()
    try:
        twelve_months_ago = today.replace(year=today.year - 1)
    except ValueError:
        twelve_months_ago = today.replace(year=today.year - 1, day=28)

    completed_stmt = select(
        func.count(Project.project_id).label("cnt"),
        func.coalesce(func.sum(Project.total_units), 0).label("units"),
    ).where(
        base_filter,
        Project.current_stage == PipelineStage.OPERATIONS,
        Project.construction_complete >= twelve_months_ago,
    )
    completed = db.execute(completed_stmt).first()

    def _rounded(value: Any) -> float | None:
        # AVG() yields NULL when there is nothing to average; an average of
        # exactly 0 is a real value and must not be collapsed to None.
        return round(float(value), 1) if value is not None else None

    return VelocityMetrics(
        avg_concept_to_construction_days=_rounded(averages.avg_c2c) if averages else None,
        avg_entitlement_days=_rounded(averages.avg_ent) if averages else None,
        avg_financing_days=_rounded(averages.avg_fin) if averages else None,
        avg_construction_days=_rounded(averages.avg_con) if averages else None,
        projects_completed_last_12_months=completed.cnt if completed else 0,
        units_completed_last_12_months=int(completed.units) if completed else 0,
    )
@router.get(
    "/dashboards",
    response_model=PortfolioDashboardListResponse,
    summary="List saved portfolio dashboards",
)
def list_dashboards(
    db: DbSession,
    pagination: PaginationDep,
    portfolio_type: PortfolioType | None = Query(None, description="Filter by portfolio type"),
    organization: str | None = Query(None, description="Filter by organization"),
    is_public: bool | None = Query(None, description="Filter by visibility"),
) -> PortfolioDashboardListResponse:
    """Return saved portfolio dashboard configurations.

    These are reusable, named filter sets that stakeholders create so they
    can quickly access their slice of the pipeline.
    """
    # Pair each optional query parameter with the column it constrains and
    # keep only the ones the caller actually supplied.
    requested_filters = (
        (portfolio_type, PortfolioDashboard.portfolio_type),
        (organization, PortfolioDashboard.organization),
        (is_public, PortfolioDashboard.is_public),
    )
    query = select(PortfolioDashboard)
    for wanted, column in requested_filters:
        if wanted is not None:
            query = query.where(column == wanted)

    # Count the full filtered set before pagination is applied.
    total = db.scalar(select(func.count()).select_from(query.subquery())) or 0

    page_query = (
        query.order_by(PortfolioDashboard.updated_at.desc())
        .limit(pagination.limit)
        .offset(pagination.offset)
    )
    items = [
        PortfolioDashboardResponse.model_validate(row)
        for row in db.scalars(page_query).all()
    ]

    return PortfolioDashboardListResponse(
        items=items,
        total=total,
        limit=pagination.limit,
        offset=pagination.offset,
    )
@router.post(
    "/dashboards/{portfolio_id}/refresh",
    response_model=PortfolioDashboardResponse,
    summary="Refresh cached portfolio metrics",
)
def refresh_dashboard(
    portfolio_id: uuid.UUID,
    db: DbSession,
) -> PortfolioDashboardResponse:
    """Recompute the cached aggregate metrics for a saved dashboard.

    Uses the stored filter configuration to re-query the project table and
    update the cached totals.  Delegates to
    ``src.analytics.generate_portfolio_intelligence`` when available.
    """
    dashboard = db.get(PortfolioDashboard, portfolio_id)
    if dashboard is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Portfolio dashboard {portfolio_id} not found",
        )

    # Try analytics module first.
    try:
        from src.analytics import generate_portfolio_intelligence

        result = generate_portfolio_intelligence(db, portfolio_id=portfolio_id)
        dashboard.total_projects = result.get("total_projects", dashboard.total_projects)
        dashboard.total_units = result.get("total_units", dashboard.total_units)
        dashboard.funding_gap_aggregate = result.get(
            "funding_gap_aggregate", dashboard.funding_gap_aggregate
        )
        dashboard.at_risk_count = result.get("at_risk_count", dashboard.at_risk_count)
        dashboard.units_by_stage = result.get("units_by_stage")
        dashboard.velocity_metrics = result.get("velocity_metrics")
    except (ImportError, AttributeError):
        # Best-effort fallback, but leave a trace so a broken analytics
        # module is not hidden behind silently recomputed totals.
        logger.debug(
            "Analytics refresh unavailable for portfolio %s; recomputing from projects",
            portfolio_id,
            exc_info=True,
        )

        from sqlalchemy import and_

        geo = dashboard.geography_filter or {}
        conditions = [Project.project_id.isnot(None)]  # always-true seed
        if geo.get("jurisdiction"):
            conditions.append(Project.jurisdiction == geo["jurisdiction"])
        if geo.get("state"):
            # Other endpoints in this module compare Project.state against
            # the upper-cased code; normalise the stored filter the same way.
            conditions.append(Project.state == str(geo["state"]).upper())
        if geo.get("city"):
            conditions.append(Project.city == geo["city"])
        base_filter = and_(*conditions)

        totals_stmt = select(
            func.count(Project.project_id).label("total_projects"),
            func.coalesce(func.sum(Project.total_units), 0).label("total_units"),
            func.coalesce(func.sum(Project.funding_gap), 0).label("funding_gap"),
        ).where(base_filter)
        totals = db.execute(totals_stmt).first()

        at_risk_stmt = select(func.count()).where(
            base_filter, Project.overall_health == OverallHealth.AT_RISK,
        )

        dashboard.total_projects = totals.total_projects if totals else 0
        dashboard.total_units = int(totals.total_units) if totals else 0
        dashboard.funding_gap_aggregate = float(totals.funding_gap) if totals else 0
        dashboard.at_risk_count = db.scalar(at_risk_stmt) or 0

    # One timestamp for both columns so they cannot differ by microseconds.
    refreshed_at = datetime.utcnow()
    dashboard.last_calculated = refreshed_at
    dashboard.updated_at = refreshed_at

    try:
        db.commit()
    except Exception as exc:
        db.rollback()
        logger.exception("Failed to refresh portfolio %s", portfolio_id)
        # Chain the cause so the original database error stays attached.
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail="Failed to refresh portfolio metrics",
        ) from exc

    db.refresh(dashboard)
    return PortfolioDashboardResponse.model_validate(dashboard)
class PeerBenchmark(BaseModel):
    """Duration statistics from comparable completed projects."""

    # Pipeline stage the figures refer to; the keys of
    # ``_STAGE_DURATION_FIELDS`` (e.g. "entitlement") are used as names.
    stage: str
    peer_count: int = Field(..., description="Number of comparable projects")
    # Stage duration figures in days.  All three stay None when no
    # comparable project has a recorded duration for the stage.
    median_days: float | None = None
    p25_days: float | None = None
    p75_days: float | None = None

    # Allow construction from attribute-bearing objects (e.g. ORM rows).
    model_config = {"from_attributes": True}
def _compute_peer_benchmarks(
    db: Session,
    project: Project,
) -> list[PeerBenchmark]:
    """Compute peer duration benchmarks from comparable projects.

    Comparable projects share the same state, similar unit count (+/-30 %),
    and the same building type (when known).  The project being assessed is
    never counted as its own peer.

    Args:
        db: Active database session.
        project: The project whose peers are being summarised.

    Returns:
        One ``PeerBenchmark`` per entry in ``_STAGE_DURATION_FIELDS``, in
        that order.  Stages without any peer data get ``peer_count=0`` and
        no duration figures.
    """
    # Function-scope import, matching the local-import style used elsewhere
    # in this module.
    import statistics

    benchmarks: list[PeerBenchmark] = []

    # Without a unit count on the project, accept any size up to 10,000.
    unit_low = int(project.total_units * 0.7) if project.total_units else 0
    unit_high = int(project.total_units * 1.3) if project.total_units else 10_000

    for stage_name, duration_col in _STAGE_DURATION_FIELDS.items():
        stmt = select(duration_col).where(
            duration_col.isnot(None),
            Project.total_units >= unit_low,
            Project.total_units <= unit_high,
            Project.project_id != project.project_id,
        )
        if project.state:
            stmt = stmt.where(Project.state == project.state)
        if project.building_type:
            stmt = stmt.where(Project.building_type == project.building_type)

        durations = sorted(float(value) for value in db.scalars(stmt).all())
        if not durations:
            benchmarks.append(PeerBenchmark(stage=stage_name, peer_count=0))
            continue

        # Report true order statistics.  AVG/MIN/MAX would mislabel the
        # mean as a median and the extremes as quartiles.
        if len(durations) >= 2:
            p25, _, p75 = statistics.quantiles(durations, n=4, method="inclusive")
        else:
            # quantiles() needs at least two points on older Pythons; with
            # a single peer every percentile is that peer's duration.
            p25 = p75 = durations[0]

        benchmarks.append(
            PeerBenchmark(
                stage=stage_name,
                peer_count=len(durations),
                median_days=round(statistics.median(durations), 1),
                p25_days=round(p25, 1),
                p75_days=round(p75, 1),
            )
        )

    return benchmarks
+ bm = next((b for b in benchmarks if b.stage == stage_name), None) + if bm and bm.median_days: + days = int(bm.median_days) + else: + # Default estimates when no peers exist. + defaults = { + "concept": 120, + "pre_development": 180, + "entitlement": 270, + "financing": 180, + "construction": 540, + "lease_up": 120, + } + days = defaults.get(stage_name, 180) + + predicted = cursor + timedelta(days=days) + optimistic = cursor + timedelta(days=int(days * 0.7)) + pessimistic = cursor + timedelta(days=int(days * 1.4)) + + milestones.append( + MilestonePrediction( + milestone=f"{stage_name}_complete", + predicted_date=predicted, + optimistic_date=optimistic, + pessimistic_date=pessimistic, + confidence=0.4 if not bm or bm.peer_count < 3 else 0.65, + days_from_now=(predicted - today).days, + ) + ) + cursor = predicted + + return milestones + + +# --------------------------------------------------------------------------- +# Endpoints +# --------------------------------------------------------------------------- + +@router.get( + "/{project_id}/timeline", + response_model=TimelinePredictionResponse, + summary="Predict project timeline", +) +def predict_project_timeline( + project_id: uuid.UUID, + db: DbSession, +) -> TimelinePredictionResponse: + """Generate a timeline prediction for a single project. + + When the ``src.analytics.predict_project_timeline`` function (backed + by a trained ML model) is available it is used. Otherwise a + peer-benchmark-based heuristic is applied. + + The response includes: + + * Per-milestone predicted, optimistic, and pessimistic dates. + * Peer benchmarks from comparable completed projects. + * Risk factors that could shift the predictions. + """ + project = db.get(Project, project_id) + if project is None: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail=f"Project {project_id} not found", + ) + + # Peer benchmarks are always useful. 
+ benchmarks = _compute_peer_benchmarks(db, project) + + # Try the analytics ML predictor. + try: + from src.analytics import predict_project_timeline as _predict + result = _predict(db, project_id=project_id) + milestones = [MilestonePrediction(**m) for m in result.get("milestones", [])] + overall_confidence = result.get("overall_confidence") + remaining_days = result.get("estimated_total_remaining_days") + risk_factors = result.get("risk_factors_affecting_timeline", []) + model_version = result.get("model_version") + except (ImportError, AttributeError): + milestones = _build_fallback_prediction(project, benchmarks) + overall_confidence = ( + sum(m.confidence for m in milestones if m.confidence) / len(milestones) + if milestones + else None + ) + remaining_days = ( + max((m.days_from_now for m in milestones if m.days_from_now), default=None) + ) + risk_factors = [] + model_version = "peer-benchmark-heuristic" + + # Derive risk factors from project attributes. + if project.jurisdiction_friction_score and project.jurisdiction_friction_score > 60: + risk_factors.append( + f"High jurisdiction friction score ({project.jurisdiction_friction_score})" + ) + if project.neighbor_opposition_level and project.neighbor_opposition_level.value in ( + "high", "severe" + ): + risk_factors.append( + f"Neighbor opposition level: {project.neighbor_opposition_level.value}" + ) + if project.funding_gap and float(project.funding_gap) > 0: + risk_factors.append( + f"Outstanding funding gap: ${float(project.funding_gap):,.0f}" + ) + if project.appeals_filed and project.appeals_filed > 0: + risk_factors.append( + f"{project.appeals_filed} appeal(s) filed" + ) + + return TimelinePredictionResponse( + project_id=project.project_id, + project_name=project.project_name, + current_stage=project.current_stage, + prediction_generated_at=datetime.utcnow().isoformat(), + model_version=model_version, + milestones=milestones, + peer_benchmarks=benchmarks, + overall_confidence=overall_confidence, + 
@router.get(
    "/bulk",
    response_model=BulkPredictionResponse,
    summary="Bulk timeline predictions",
)
def bulk_predict(
    db: DbSession,
    jurisdiction: str | None = Query(None, description="Filter by jurisdiction"),
    state: str | None = Query(None, max_length=2, description="Filter by state"),
    current_stage: PipelineStage | None = Query(None, description="Filter by stage"),
    limit: int = Query(50, ge=1, le=200, description="Max projects to predict"),
) -> BulkPredictionResponse:
    """Generate lightweight timeline predictions for multiple projects.

    Only projects in one of the six pipeline stages from concept through
    lease-up are considered, most recently updated first.  For each project
    the stored prediction is returned when one exists; otherwise a heuristic
    built from peer benchmarks is used.

    Args:
        db: Active SQLAlchemy session.
        jurisdiction: Exact-match jurisdiction filter.
        state: Two-letter state filter (upper-cased before comparison).
        current_stage: Restrict to a single pipeline stage.
        limit: Maximum number of projects to return (1-200).

    Returns:
        The predicted groundbreaking and certificate-of-occupancy dates with
        a confidence score for every matching project.
    """
    stmt = select(Project).where(
        Project.current_stage.in_([
            PipelineStage.CONCEPT,
            PipelineStage.PRE_DEVELOPMENT,
            PipelineStage.ENTITLEMENT,
            PipelineStage.FINANCING,
            PipelineStage.CONSTRUCTION,
            PipelineStage.LEASE_UP,
        ])
    )
    if jurisdiction:
        stmt = stmt.where(Project.jurisdiction == jurisdiction)
    if state:
        stmt = stmt.where(Project.state == state.upper())
    if current_stage:
        stmt = stmt.where(Project.current_stage == current_stage)

    stmt = stmt.order_by(Project.updated_at.desc()).limit(limit)
    projects = list(db.scalars(stmt).all())

    items: list[BulkPredictionItem] = []
    for p in projects:
        # Use stored predictions when available.
        if p.predicted_groundbreaking or p.predicted_co:
            items.append(
                BulkPredictionItem(
                    project_id=p.project_id,
                    project_name=p.project_name,
                    current_stage=p.current_stage,
                    predicted_groundbreaking=p.predicted_groundbreaking,
                    predicted_co=p.predicted_co,
                    confidence=p.prediction_confidence,
                )
            )
            continue

        # Quick heuristic: estimate from peer benchmarks.
        benchmarks = _compute_peer_benchmarks(db, p)
        milestones = _build_fallback_prediction(p, benchmarks)
        predicted_by_milestone = {m.milestone: m.predicted_date for m in milestones}

        # Groundbreaking is the start of construction, i.e. the moment
        # financing completes.  The certificate of occupancy is issued when
        # construction completes, before lease-up.  A milestone that is
        # absent has already passed for this project, so it stays ``None``.
        groundbreaking = predicted_by_milestone.get("financing_complete")
        co = predicted_by_milestone.get("construction_complete")

        avg_conf = (
            sum(m.confidence for m in milestones if m.confidence) / len(milestones)
            if milestones
            else None
        )
        items.append(
            BulkPredictionItem(
                project_id=p.project_id,
                project_name=p.project_name,
                current_stage=p.current_stage,
                predicted_groundbreaking=groundbreaking,
                predicted_co=co,
                confidence=round(avg_conf, 2) if avg_conf else None,
            )
        )

    return BulkPredictionResponse(
        generated_at=datetime.utcnow().isoformat(),
        predictions=items,
        total=len(items),
    )
"""Location fields shared by create / update / response schemas.""" + + address: str | None = None + city: str | None = None + county: str | None = None + state: str | None = Field(None, max_length=2) + zip: str | None = Field(None, max_length=10) + latitude: float | None = None + longitude: float | None = None + jurisdiction: str | None = None + neighborhood: str | None = None + census_tract: str | None = None + + model_config = {"from_attributes": True} + + +class DevelopmentTeamSchema(BaseModel): + """Development team contacts.""" + + developer_org: str | None = None + developer_contact: str | None = None + architect: str | None = None + general_contractor: str | None = None + property_manager: str | None = None + + model_config = {"from_attributes": True} + + +class UnitMixSchema(BaseModel): + """Unit count breakdown.""" + + total_units: int = Field(0, ge=0) + affordable_units: int = Field(0, ge=0) + market_units: int = Field(0, ge=0) + studio_units: int = Field(0, ge=0) + one_br_units: int = Field(0, ge=0) + two_br_units: int = Field(0, ge=0) + three_br_units: int = Field(0, ge=0) + four_plus_br_units: int = Field(0, ge=0) + + model_config = {"from_attributes": True} + + +class AMITargetingSchema(BaseModel): + """Area Median Income unit targeting.""" + + ami_30_units: int = Field(0, ge=0) + ami_40_units: int = Field(0, ge=0) + ami_50_units: int = Field(0, ge=0) + ami_60_units: int = Field(0, ge=0) + ami_80_units: int = Field(0, ge=0) + market_rate_units: int = Field(0, ge=0) + + model_config = {"from_attributes": True} + + +class SpecialPopulationsSchema(BaseModel): + """Special population unit allocations.""" + + senior_units: int = Field(0, ge=0) + family_units: int = Field(0, ge=0) + psf_units: int = Field(0, ge=0) + veteran_units: int = Field(0, ge=0) + homeless_set_aside: int = Field(0, ge=0) + + model_config = {"from_attributes": True} + + +class CostSchema(BaseModel): + """Project cost information.""" + + total_development_cost: float | None = None + 
cost_per_unit: float | None = None + cost_per_square_foot: float | None = None + land_acquisition_cost: float | None = None + hard_costs: float | None = None + soft_costs: float | None = None + financing_costs: float | None = None + developer_fee: float | None = None + reserves: float | None = None + + model_config = {"from_attributes": True} + + +class FundingStackSchema(BaseModel): + """Aggregate funding stack information.""" + + funding_stack: dict[str, Any] | None = None + total_funding_committed: float | None = None + funding_gap: float | None = None + debt_amount: float | None = None + equity_amount: float | None = None + subsidy_amount: float | None = None + + model_config = {"from_attributes": True} + + +class TimelineSchema(BaseModel): + """Actual timeline milestones.""" + + concept_start: date | None = None + concept_complete: date | None = None + pre_development_start: date | None = None + pre_development_complete: date | None = None + entitlement_start: date | None = None + entitlement_complete: date | None = None + financing_start: date | None = None + financing_complete: date | None = None + construction_start: date | None = None + construction_complete: date | None = None + lease_up_start: date | None = None + lease_up_complete: date | None = None + + model_config = {"from_attributes": True} + + +# -- Request schemas -------------------------------------------------------- + +class ProjectCreate(BaseModel): + """Schema for creating a new project. + + Only ``project_name`` and ``total_units`` are truly required -- every + other field is optional so that early-stage projects can be entered with + minimal data. + """ + + project_name: str = Field(..., min_length=1, max_length=500) + project_slug: str | None = Field( + None, + max_length=500, + description="URL-friendly slug. 
Auto-generated from project_name if omitted.", + ) + + # Location + address: str | None = None + city: str | None = None + county: str | None = None + state: str | None = Field(None, max_length=2) + zip: str | None = Field(None, max_length=10) + latitude: float | None = None + longitude: float | None = None + jurisdiction: str | None = None + neighborhood: str | None = None + census_tract: str | None = None + + # Development team + developer_org: str | None = None + developer_contact: str | None = None + architect: str | None = None + general_contractor: str | None = None + property_manager: str | None = None + + # Characteristics + site_acres: float | None = None + building_type: BuildingType | None = None + structure_type: StructureType | None = None + stories: int | None = Field(None, ge=1) + parking_spaces: int | None = Field(None, ge=0) + + # Units + total_units: int = Field(0, ge=0) + affordable_units: int = Field(0, ge=0) + market_units: int = Field(0, ge=0) + studio_units: int = Field(0, ge=0) + one_br_units: int = Field(0, ge=0) + two_br_units: int = Field(0, ge=0) + three_br_units: int = Field(0, ge=0) + four_plus_br_units: int = Field(0, ge=0) + + # AMI + ami_30_units: int = Field(0, ge=0) + ami_40_units: int = Field(0, ge=0) + ami_50_units: int = Field(0, ge=0) + ami_60_units: int = Field(0, ge=0) + ami_80_units: int = Field(0, ge=0) + market_rate_units: int = Field(0, ge=0) + + # Special populations + senior_units: int = Field(0, ge=0) + family_units: int = Field(0, ge=0) + psf_units: int = Field(0, ge=0) + veteran_units: int = Field(0, ge=0) + homeless_set_aside: int = Field(0, ge=0) + + # Pipeline + current_stage: PipelineStage = PipelineStage.CONCEPT + + # Costs + total_development_cost: float | None = None + cost_per_unit: float | None = None + land_acquisition_cost: float | None = None + hard_costs: float | None = None + soft_costs: float | None = None + + # Timeline + concept_start: date | None = None + concept_complete: date | None = None + 
class ProjectUpdate(BaseModel):
    """Schema for partial project updates (PATCH semantics).

    All fields are optional.  Only fields present in the request body
    overwrite existing values.  Constraints mirror ``ProjectCreate`` so an
    update cannot store a value that creation would have rejected.
    """

    project_name: str | None = Field(None, min_length=1, max_length=500)
    project_slug: str | None = Field(None, max_length=500)

    # Location
    address: str | None = None
    city: str | None = None
    county: str | None = None
    state: str | None = Field(None, max_length=2)
    # Same length limit as ProjectCreate.zip / LocationSchema.zip.
    zip: str | None = Field(None, max_length=10)
    latitude: float | None = None
    longitude: float | None = None
    jurisdiction: str | None = None
    neighborhood: str | None = None
    census_tract: str | None = None

    # Development team
    developer_org: str | None = None
    developer_contact: str | None = None
    architect: str | None = None
    general_contractor: str | None = None
    property_manager: str | None = None

    # Characteristics
    site_acres: float | None = None
    building_type: BuildingType | None = None
    structure_type: StructureType | None = None
    stories: int | None = Field(None, ge=1)
    parking_spaces: int | None = Field(None, ge=0)

    # Units
    total_units: int | None = Field(None, ge=0)
    affordable_units: int | None = Field(None, ge=0)
    market_units: int | None = Field(None, ge=0)
    studio_units: int | None = Field(None, ge=0)
    one_br_units: int | None = Field(None, ge=0)
    two_br_units: int | None = Field(None, ge=0)
    three_br_units: int | None = Field(None, ge=0)
    four_plus_br_units: int | None = Field(None, ge=0)

    # AMI
    ami_30_units: int | None = Field(None, ge=0)
    ami_40_units: int | None = Field(None, ge=0)
    ami_50_units: int | None = Field(None, ge=0)
    ami_60_units: int | None = Field(None, ge=0)
    ami_80_units: int | None = Field(None, ge=0)
    market_rate_units: int | None = Field(None, ge=0)

    # Special populations
    senior_units: int | None = Field(None, ge=0)
    family_units: int | None = Field(None, ge=0)
    psf_units: int | None = Field(None, ge=0)
    veteran_units: int | None = Field(None, ge=0)
    homeless_set_aside: int | None = Field(None, ge=0)

    # Pipeline
    current_stage: PipelineStage | None = None
    overall_health: OverallHealth | None = None
    health_score: float | None = None
    stage_entry_date: date | None = None

    # Costs
    total_development_cost: float | None = None
    cost_per_unit: float | None = None
    land_acquisition_cost: float | None = None
    hard_costs: float | None = None
    soft_costs: float | None = None

    # Timeline
    concept_start: date | None = None
    concept_complete: date | None = None
    pre_development_start: date | None = None
    pre_development_complete: date | None = None
    entitlement_start: date | None = None
    entitlement_complete: date | None = None
    financing_start: date | None = None
    financing_complete: date | None = None
    construction_start: date | None = None
    construction_complete: date | None = None
    lease_up_start: date | None = None
    lease_up_complete: date | None = None

    # Funding stack
    funding_gap: float | None = None
    total_funding_committed: float | None = None

    # Metadata
    data_source: DataSource | None = None
    is_public: bool | None = None
    notes: str | None = None

    model_config = {"json_schema_extra": {
        "examples": [
            {
                "current_stage": "financing",
                "total_development_cost": 42_500_000.00,
                "entitlement_complete": "2025-06-15",
            }
        ]
    }}

    @field_validator("state")
    @classmethod
    def validate_state(cls, v: str | None) -> str | None:
        """Normalise a provided state code to upper case; pass ``None`` through."""
        if v is not None:
            return v.upper()
        return v
model_config = {"from_attributes": True} + + +class ProjectDetailResponse(BaseModel): + """Full project detail response with all fields.""" + + project_id: uuid.UUID + project_name: str + project_slug: str + + # Location + address: str | None = None + city: str | None = None + county: str | None = None + state: str | None = None + zip: str | None = None + latitude: float | None = None + longitude: float | None = None + jurisdiction: str | None = None + neighborhood: str | None = None + census_tract: str | None = None + + # Development team + developer_org: str | None = None + developer_contact: str | None = None + architect: str | None = None + general_contractor: str | None = None + property_manager: str | None = None + + # Characteristics + site_acres: float | None = None + building_type: BuildingType | None = None + structure_type: StructureType | None = None + stories: int | None = None + parking_spaces: int | None = None + + # Unit mix + total_units: int + affordable_units: int + market_units: int + studio_units: int + one_br_units: int + two_br_units: int + three_br_units: int + four_plus_br_units: int + + # AMI targeting + ami_30_units: int + ami_40_units: int + ami_50_units: int + ami_60_units: int + ami_80_units: int + market_rate_units: int + + # Special populations + senior_units: int + family_units: int + psf_units: int + veteran_units: int + homeless_set_aside: int + + # Pipeline + current_stage: PipelineStage + stage_entry_date: date | None = None + days_in_current_stage: int | None = None + overall_health: OverallHealth | None = None + health_score: float | None = None + last_milestone_date: date | None = None + next_milestone_date: date | None = None + next_milestone_type: str | None = None + + # Timeline - actual + concept_start: date | None = None + concept_complete: date | None = None + concept_duration_days: int | None = None + pre_development_start: date | None = None + pre_development_complete: date | None = None + 
pre_development_duration_days: int | None = None + entitlement_start: date | None = None + entitlement_complete: date | None = None + entitlement_duration_days: int | None = None + financing_start: date | None = None + financing_complete: date | None = None + financing_duration_days: int | None = None + construction_start: date | None = None + construction_complete: date | None = None + construction_duration_days: int | None = None + lease_up_start: date | None = None + lease_up_complete: date | None = None + lease_up_duration_days: int | None = None + total_elapsed_days: int | None = None + + # Timeline - predicted + predicted_entitlement_complete: date | None = None + predicted_financing_complete: date | None = None + predicted_groundbreaking: date | None = None + predicted_co: date | None = None + prediction_confidence: float | None = None + prediction_last_updated: datetime | None = None + + # Costs + total_development_cost: float | None = None + cost_per_unit: float | None = None + cost_per_square_foot: float | None = None + land_acquisition_cost: float | None = None + hard_costs: float | None = None + soft_costs: float | None = None + financing_costs: float | None = None + developer_fee: float | None = None + reserves: float | None = None + + # Friction + friction_induced_costs: float | None = None + regulatory_delay_costs: float | None = None + jurisdiction_friction_score: int | None = None + primary_friction_points: dict[str, Any] | None = None + + # Funding + funding_stack: dict[str, Any] | None = None + total_funding_committed: float | None = None + funding_gap: float | None = None + debt_amount: float | None = None + equity_amount: float | None = None + subsidy_amount: float | None = None + + # Risk + risk_factors: dict[str, Any] | None = None + risk_score: float | None = None + + # Stakeholder + housing_mind_queries: int + neighbor_opposition_level: NeighborOpposition | None = None + public_meetings_attended: int + variance_hearings: int + + # Data 
def _generate_slug(name: str, max_length: int = 500) -> str:
    """Produce a URL-friendly, collision-resistant slug from a project name.

    The name is lower-cased, stripped of punctuation, and has runs of
    whitespace, underscores and hyphens collapsed to single hyphens.  An
    8-character hex fragment of a random UUID is appended to avoid
    collisions between projects that share a name.

    Args:
        name: Human-readable project name.
        max_length: Upper bound on the slug length.  Defaults to 500, the
            limit the request schemas place on ``project_slug``.  Expected
            to be comfortably larger than the 9-character ``-xxxxxxxx``
            suffix.

    Returns:
        A slug of the form ``<base>-<8 hex chars>``.  ``<base>`` falls back
        to ``"project"`` when the name contains no usable characters, so the
        slug never starts with a hyphen.
    """
    base = name.lower().strip()
    base = re.sub(r"[^\w\s-]", "", base)
    base = re.sub(r"[\s_]+", "-", base)
    base = re.sub(r"-+", "-", base).strip("-")

    suffix = uuid.uuid4().hex[:8]

    # Reserve room for "-" plus the suffix so a maximum-length project name
    # cannot produce a slug longer than the field allows.
    room = max(max_length - len(suffix) - 1, 0)
    base = base[:room].rstrip("-")
    if not base:
        base = "project"

    return f"{base}-{suffix}"
@router.post(
    "",
    response_model=ProjectDetailResponse,
    status_code=status.HTTP_201_CREATED,
    summary="Create a new project",
)
def create_project(
    payload: ProjectCreate,
    db: DbSession,
) -> ProjectDetailResponse:
    """Create a new affordable housing development project.

    If ``project_slug`` is omitted, null or empty it is auto-generated from
    the project name.

    Raises:
        HTTPException: 409 when the slug is already taken or the database
            rejects the row with an integrity error; 500 for any other
            failure while committing.
    """
    # Local import: the only new name this block needs.
    from sqlalchemy.exc import IntegrityError

    data = payload.model_dump(exclude_unset=True)

    # Auto-generate the slug when it is missing, null or blank.
    if not data.get("project_slug"):
        data["project_slug"] = _generate_slug(payload.project_name)
    slug = data["project_slug"]

    # Fast-path duplicate check so the common case gets a precise message.
    existing = db.scalars(
        select(Project).where(Project.project_slug == slug)
    ).first()
    if existing is not None:
        raise HTTPException(
            status_code=status.HTTP_409_CONFLICT,
            detail=f"A project with slug '{slug}' already exists",
        )

    project = Project(**data)
    db.add(project)

    try:
        db.commit()
    except IntegrityError as exc:
        # The check above is not atomic: a concurrent request can insert the
        # same slug between the SELECT and this commit.
        # NOTE(review): assumes integrity errors here are uniqueness
        # conflicts; confirm against the Project table constraints.
        db.rollback()
        logger.warning("Integrity error while creating project with slug %r", slug)
        raise HTTPException(
            status_code=status.HTTP_409_CONFLICT,
            detail=(
                "Project conflicts with an existing record "
                f"(slug '{slug}' may already exist)"
            ),
        ) from exc
    except Exception as exc:
        db.rollback()
        logger.exception("Failed to create project")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail="Failed to create project",
        ) from exc

    db.refresh(project)
    return ProjectDetailResponse.model_validate(project)
@router.delete(
    "/{project_id}",
    status_code=status.HTTP_204_NO_CONTENT,
    summary="Delete a project",
)
def delete_project(
    project_id: uuid.UUID,
    db: DbSession,
) -> None:
    """Permanently remove a project.

    Responds with 404 when the project does not exist and with 500 when the
    delete cannot be committed (the transaction is rolled back first).

    NOTE(review): related funding sources and barriers are presumably
    removed through cascade rules on the model -- confirm there.
    """
    target = _get_project_or_404(db, project_id)

    try:
        db.delete(target)
        db.commit()
    except Exception:
        db.rollback()
        logger.exception("Failed to delete project %s", project_id)
        raise HTTPException(
            detail="Failed to delete project",
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
        )
    else:
        # Reached only when the commit succeeded.
        logger.info("Deleted project %s (%s)", project_id, target.project_name)
+""" + +from __future__ import annotations + +import logging +import uuid +from datetime import date, datetime +from typing import Any + +from fastapi import APIRouter, HTTPException, Query, status +from pydantic import BaseModel, Field +from sqlalchemy import func, select +from sqlalchemy.orm import Session + +from src.api.dependencies import DbSession, PaginationDep +from src.models.enums import ConfidenceLevel, PipelineStage, ReformType +from src.models.project import Project +from src.models.reform import PolicyReform + +logger = logging.getLogger(__name__) + +router = APIRouter(prefix="/reforms", tags=["reforms"]) + + +# --------------------------------------------------------------------------- +# Pydantic schemas +# --------------------------------------------------------------------------- + +class ReformCreate(BaseModel): + """Schema for registering a new policy reform.""" + + jurisdiction: str = Field(..., min_length=1, max_length=300) + reform_name: str = Field(..., min_length=1, max_length=500) + reform_description: str | None = None + reform_type: ReformType + related_friction_topic: str | None = None + + announcement_date: date | None = None + effective_date: date | None = None + implementation_buffer_days: int = Field(30, ge=0) + + source: str | None = None + source_url: str | None = None + ordinance_number: str | None = None + notes: str | None = None + + model_config = {"json_schema_extra": { + "examples": [ + { + "jurisdiction": "Minneapolis, MN", + "reform_name": "2040 Comprehensive Plan - Parking Reform", + "reform_type": "parking_reform", + "effective_date": "2020-01-01", + "related_friction_topic": "parking_minimum", + "source": "City of Minneapolis", + } + ] + }} + + +class ReformUpdate(BaseModel): + """Partial update schema for a policy reform.""" + + reform_name: str | None = Field(None, min_length=1, max_length=500) + reform_description: str | None = None + reform_type: ReformType | None = None + related_friction_topic: str | None = None + 
class ReformResponse(BaseModel):
    """Full policy reform detail returned by the reform endpoints.

    Built from ORM rows via ``model_validate`` (``from_attributes`` is on).
    Fields without a default must be non-null on the source object.
    """

    reform_id: uuid.UUID
    jurisdiction: str
    reform_name: str
    reform_description: str | None = None
    reform_type: ReformType
    related_friction_topic: str | None = None

    announcement_date: date | None = None
    effective_date: date | None = None
    implementation_buffer_days: int

    # Impact metrics
    projects_pre_reform: int
    projects_post_reform: int
    pre_reform_median_days: int | None = None
    post_reform_median_days: int | None = None
    days_saved_per_project: int | None = None
    percent_improvement: float | None = None

    total_cost_savings: float | None = None
    units_enabled: int
    projects_no_longer_delayed: int

    # Statistical quality of the measurement, when one has been run.
    statistical_significance_p_value: float | None = None
    confidence_level: ConfidenceLevel | None = None

    # Provenance of the reform record.
    source: str | None = None
    source_url: str | None = None
    ordinance_number: str | None = None

    impact_last_measured: date | None = None
    notes: str | None = None
    created_at: datetime
    updated_at: datetime

    model_config = {"from_attributes": True}


class ReformListResponse(BaseModel):
    """Paginated list of policy reforms.

    ``total`` is the number of rows matching the filters; ``limit`` and
    ``offset`` echo the pagination that produced ``items``.
    """

    items: list[ReformResponse]
    total: int
    limit: int
    offset: int

    model_config = {"from_attributes": True}
class ReformImpactResult(BaseModel):
    """Quantified before/after impact analysis for a single reform."""

    reform_id: uuid.UUID
    reform_name: str
    jurisdiction: str
    reform_type: ReformType
    effective_date: date | None = None

    # Before / after comparison
    projects_pre_reform: int
    projects_post_reform: int
    pre_reform_median_days: int | None = None
    post_reform_median_days: int | None = None
    days_saved_per_project: int | None = None
    percent_improvement: float | None = None

    # Cost impact
    avg_cost_per_delay_day: float | None = Field(
        None, description="Estimated carrying cost per day of delay"
    )
    total_cost_savings: float | None = None
    units_enabled: int = 0

    # Statistical quality
    statistical_significance_p_value: float | None = None
    confidence_level: ConfidenceLevel | None = None

    # Plain string rather than a datetime; the measure endpoint fills it
    # with ``datetime.utcnow().isoformat()``.
    analysis_generated_at: str

    model_config = {"from_attributes": True}


class JurisdictionReformSummary(BaseModel):
    """Aggregate reform impact for one jurisdiction."""

    jurisdiction: str
    total_reforms: int
    # Number of reforms whose days_saved_per_project is greater than zero.
    reforms_with_measurable_impact: int
    total_days_saved: int
    total_cost_savings: float
    total_units_enabled: int
    most_impactful_reform: str | None = None
    avg_percent_improvement: float | None = None

    model_config = {"from_attributes": True}
@router.get(
    "",
    response_model=ReformListResponse,
    summary="List policy reforms",
)
def list_reforms(
    db: DbSession,
    pagination: PaginationDep,
    jurisdiction: str | None = Query(None, description="Filter by jurisdiction"),
    reform_type: ReformType | None = Query(None, description="Filter by reform type"),
    state: str | None = Query(
        None, max_length=2, description="Filter by state (matches jurisdiction's state)"
    ),
    has_impact: bool | None = Query(
        None, description="Only reforms with measured impact (days_saved > 0)"
    ),
) -> ReformListResponse:
    """Return a paginated list of tracked policy reforms.

    Filters are combined with AND:

    * ``jurisdiction`` -- exact match.
    * ``reform_type`` -- exact match.
    * ``state`` -- matches a jurisdiction that ends in ``", <STATE>"`` or is
      exactly the state code (case-insensitive).
    * ``has_impact`` -- ``True`` keeps reforms with ``days_saved_per_project
      > 0``; ``False`` keeps reforms where it is NULL or 0.

    Results are ordered by ``effective_date`` descending with undated
    reforms last. ``total`` counts all matching rows before pagination.
    """
    stmt = select(PolicyReform)

    if jurisdiction:
        stmt = stmt.where(PolicyReform.jurisdiction == jurisdiction)
    if reform_type:
        stmt = stmt.where(PolicyReform.reform_type == reform_type)

    state_code = state.strip().upper() if state else ""
    if state_code:
        # A bare "%CA%" substring match also hits "Chicago, IL". Anchor the
        # code as the trailing ", ST" component instead, and let SQLAlchemy
        # escape any LIKE wildcards in the user-supplied value.
        # NOTE(review): assumes the "City, ST" convention seen in the schema
        # examples -- confirm against real jurisdiction data.
        jurisdiction_upper = func.upper(PolicyReform.jurisdiction)
        stmt = stmt.where(
            jurisdiction_upper.endswith(f", {state_code}", autoescape=True)
            | (jurisdiction_upper == state_code)
        )

    if has_impact is True:
        stmt = stmt.where(PolicyReform.days_saved_per_project > 0)
    elif has_impact is False:
        stmt = stmt.where(
            PolicyReform.days_saved_per_project.is_(None)
            | (PolicyReform.days_saved_per_project == 0)
        )

    # Count before ordering/pagination so ``total`` reflects every match.
    total = db.scalar(select(func.count()).select_from(stmt.subquery())) or 0

    page_stmt = (
        stmt.order_by(PolicyReform.effective_date.desc().nullslast())
        .limit(pagination.limit)
        .offset(pagination.offset)
    )
    reforms = db.scalars(page_stmt).all()

    return ReformListResponse(
        items=[ReformResponse.model_validate(r) for r in reforms],
        total=total,
        limit=pagination.limit,
        offset=pagination.offset,
    )


@router.get(
    "/{reform_id}",
    response_model=ReformResponse,
    summary="Get reform details",
)
def get_reform(
    reform_id: uuid.UUID,
    db: DbSession,
) -> ReformResponse:
    """Retrieve full details for a single policy reform.

    Raises:
        HTTPException: 404 when no reform has the given ID.
    """
    reform = db.get(PolicyReform, reform_id)
    if reform is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Policy reform {reform_id} not found",
        )
    return ReformResponse.model_validate(reform)
# Fields that ReformResponse declares as non-optional. ReformUpdate types
# them as ``| None`` (meaning "not supplied"), so an explicit JSON null must
# be rejected before it reaches the ORM object.
_NON_NULLABLE_REFORM_FIELDS = (
    "reform_name",
    "reform_type",
    "implementation_buffer_days",
)


@router.post(
    "",
    response_model=ReformResponse,
    status_code=status.HTTP_201_CREATED,
    summary="Register a new policy reform",
)
def create_reform(
    payload: ReformCreate,
    db: DbSession,
) -> ReformResponse:
    """Register a new regulatory reform for future impact measurement.

    Once registered, use ``POST /api/v1/reforms/{reform_id}/measure`` to
    compute the before/after impact analysis.

    Only fields the caller actually sent are passed to the ORM constructor
    (``exclude_unset=True``), so schema defaults are not forwarded.

    Raises:
        HTTPException: 500 when the commit fails (the session is rolled back).
    """
    reform = PolicyReform(**payload.model_dump(exclude_unset=True))
    db.add(reform)

    try:
        db.commit()
    except Exception as exc:
        db.rollback()
        logger.exception("Failed to create policy reform")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail="Failed to create policy reform",
        ) from exc

    db.refresh(reform)
    return ReformResponse.model_validate(reform)


@router.patch(
    "/{reform_id}",
    response_model=ReformResponse,
    summary="Update a policy reform",
)
def update_reform(
    reform_id: uuid.UUID,
    payload: ReformUpdate,
    db: DbSession,
) -> ReformResponse:
    """Partially update a policy reform's metadata.

    Only fields present in the request body are applied.

    Raises:
        HTTPException: 404 when the reform does not exist; 422 when the body
            carries no fields or sets a required field to null; 500 when the
            commit fails (the session is rolled back).
    """
    reform = db.get(PolicyReform, reform_id)
    if reform is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Policy reform {reform_id} not found",
        )

    update_data = payload.model_dump(exclude_unset=True)
    if not update_data:
        raise HTTPException(
            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
            detail="No fields provided for update",
        )

    # An explicit null on a required field would otherwise surface later as
    # a 500 (failed commit or response validation) instead of a client error.
    nulled = [
        name
        for name in _NON_NULLABLE_REFORM_FIELDS
        if name in update_data and update_data[name] is None
    ]
    if nulled:
        raise HTTPException(
            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
            detail=f"Field(s) cannot be null: {', '.join(nulled)}",
        )

    for field, value in update_data.items():
        setattr(reform, field, value)
    reform.updated_at = datetime.utcnow()

    try:
        db.commit()
    except Exception as exc:
        db.rollback()
        logger.exception("Failed to update reform %s", reform_id)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail="Failed to update policy reform",
        ) from exc

    db.refresh(reform)
    return ReformResponse.model_validate(reform)
def _entitlement_cohort_stats(
    db: Session,
    jurisdiction: str,
    *date_filters: Any,
) -> tuple[int, float | None]:
    """Return ``(project count, mean entitlement days)`` for one cohort.

    Only projects in *jurisdiction* with a recorded
    ``entitlement_duration_days`` are considered; *date_filters* are extra
    WHERE conditions selecting the pre- or post-reform cohort.
    """
    row = db.execute(
        select(
            func.count(Project.project_id).label("cnt"),
            func.avg(Project.entitlement_duration_days).label("avg_days"),
        ).where(
            Project.jurisdiction == jurisdiction,
            Project.entitlement_duration_days.isnot(None),
            *date_filters,
        )
    ).first()
    if row is None:
        return 0, None
    # ``is not None`` so a genuine 0-day average is not mistaken for "no data".
    mean_days = float(row.avg_days) if row.avg_days is not None else None
    return row.cnt, mean_days


@router.post(
    "/{reform_id}/measure",
    response_model=ReformImpactResult,
    summary="Measure reform impact",
)
def measure_reform_impact(
    reform_id: uuid.UUID,
    db: DbSession,
    avg_cost_per_delay_day: float = Query(
        3500.0,
        ge=0,
        description="Estimated carrying/delay cost per day (used for cost-savings calculation)",
    ),
) -> ReformImpactResult:
    """Compute a before/after impact analysis for a registered reform.

    The analysis compares entitlement durations of projects that started
    entitlement before the reform's effective date with those that started
    on or after the effective date plus the implementation buffer. When
    ``src.analytics.measure_policy_reform_impact`` is importable it is used;
    otherwise a simplified DB-driven comparison runs and its results are
    written back to the reform record.

    The ``avg_cost_per_delay_day`` parameter lets callers customize the
    carrying cost assumption used for the total savings calculation.

    Raises:
        HTTPException: 404 when the reform does not exist; 422 when it has
            no ``effective_date``.
    """
    # Local import: the module-level imports only bring in date and datetime,
    # so the fallback path would otherwise fail with NameError.
    from datetime import timedelta

    reform = db.get(PolicyReform, reform_id)
    if reform is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Policy reform {reform_id} not found",
        )

    if not reform.effective_date:
        raise HTTPException(
            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
            detail="Cannot measure impact without an effective_date on the reform",
        )

    # Try the analytics module.
    try:
        from src.analytics import measure_policy_reform_impact as _measure

        result = _measure(
            db,
            reform_id=reform_id,
            avg_cost_per_delay_day=avg_cost_per_delay_day,
        )
        # Identity fields always come from the reform row, never from the
        # analytics payload.
        reserved = (
            "reform_id", "reform_name", "jurisdiction", "reform_type",
            "effective_date", "avg_cost_per_delay_day", "analysis_generated_at",
        )
        return ReformImpactResult(
            reform_id=reform.reform_id,
            reform_name=reform.reform_name,
            jurisdiction=reform.jurisdiction,
            reform_type=reform.reform_type,
            effective_date=reform.effective_date,
            avg_cost_per_delay_day=avg_cost_per_delay_day,
            analysis_generated_at=datetime.utcnow().isoformat(),
            **{k: v for k, v in result.items() if k not in reserved},
        )
    except (ImportError, AttributeError):
        # Best-effort: fall through to the DB comparison, but leave a trace
        # so a broken analytics module does not go unnoticed.
        logger.warning(
            "Analytics impact measurement unavailable for reform %s; "
            "using DB fallback",
            reform_id,
            exc_info=True,
        )

    # Fallback: direct DB comparison.
    buffer = timedelta(days=reform.implementation_buffer_days or 0)
    cutoff = reform.effective_date + buffer

    # Pre-reform: projects whose entitlement started before the effective date.
    pre_count, pre_avg = _entitlement_cohort_stats(
        db,
        reform.jurisdiction,
        Project.entitlement_start < reform.effective_date,
    )
    # Post-reform: projects whose entitlement started on/after the cutoff.
    post_count, post_avg = _entitlement_cohort_stats(
        db,
        reform.jurisdiction,
        Project.entitlement_start >= cutoff,
    )

    # NOTE: the *_median_days fields are populated with cohort means here.
    pre_days = int(pre_avg) if pre_avg is not None else None
    post_days = int(post_avg) if post_avg is not None else None

    days_saved: int | None = None
    pct_improvement: float | None = None
    total_savings: float | None = None
    units_enabled = 0
    confidence = None

    if pre_avg is not None and post_avg is not None and pre_avg > 0:
        days_saved = int(pre_avg - post_avg)
        pct_improvement = (
            round((pre_avg - post_avg) / pre_avg * 100, 1) if days_saved > 0 else 0.0
        )
        total_savings = round(
            max(days_saved, 0) * avg_cost_per_delay_day * post_count, 2
        )

        # Rough confidence heuristic based on cohort sizes.
        if pre_count >= 10 and post_count >= 10:
            confidence = ConfidenceLevel.HIGH
        elif pre_count >= 5 and post_count >= 5:
            confidence = ConfidenceLevel.MODERATE
        else:
            confidence = ConfidenceLevel.LOW

    # Build the response before committing so a failed commit/rollback
    # cannot force a reload of expired ORM attributes.
    impact = ReformImpactResult(
        reform_id=reform.reform_id,
        reform_name=reform.reform_name,
        jurisdiction=reform.jurisdiction,
        reform_type=reform.reform_type,
        effective_date=reform.effective_date,
        projects_pre_reform=pre_count,
        projects_post_reform=post_count,
        pre_reform_median_days=pre_days,
        post_reform_median_days=post_days,
        days_saved_per_project=days_saved,
        percent_improvement=pct_improvement,
        avg_cost_per_delay_day=avg_cost_per_delay_day,
        total_cost_savings=total_savings,
        units_enabled=units_enabled,
        confidence_level=confidence,
        analysis_generated_at=datetime.utcnow().isoformat(),
    )

    # Persist the results back to the reform record.
    reform.projects_pre_reform = pre_count
    reform.projects_post_reform = post_count
    reform.pre_reform_median_days = pre_days
    reform.post_reform_median_days = post_days
    reform.days_saved_per_project = days_saved
    reform.percent_improvement = pct_improvement
    reform.total_cost_savings = total_savings
    reform.units_enabled = units_enabled
    reform.confidence_level = confidence
    reform.impact_last_measured = date.today()
    reform.updated_at = datetime.utcnow()

    try:
        db.commit()
    except Exception:
        # Persistence is best-effort; the computed analysis is still returned.
        db.rollback()
        logger.exception("Failed to persist impact results for reform %s", reform_id)

    return impact
@router.get(
    "/jurisdictions/summary",
    response_model=list[JurisdictionReformSummary],
    summary="Reform impact summary by jurisdiction",
)
def jurisdiction_reform_summary(
    db: DbSession,
    state: str | None = Query(None, max_length=2, description="Filter by state"),
    reform_type: ReformType | None = Query(None, description="Filter by reform type"),
    limit: int = Query(20, ge=1, le=100, description="Max jurisdictions to return"),
) -> list[JurisdictionReformSummary]:
    """Aggregate reform impact grouped by jurisdiction.

    Shows which jurisdictions have been most active in passing reforms
    and which reforms have had the largest measurable impact. Jurisdictions
    are ordered by reform count, descending, and capped at ``limit``.

    The ``reform_type`` filter applies to every figure in a row, including
    the "most impactful reform" and the measurable-impact count.
    """
    # reform_type must constrain the per-jurisdiction lookups as well as the
    # aggregate, otherwise a row mixes filtered and unfiltered figures.
    type_filters = (
        [PolicyReform.reform_type == reform_type] if reform_type else []
    )

    stmt = select(
        PolicyReform.jurisdiction,
        func.count(PolicyReform.reform_id).label("total_reforms"),
        func.sum(
            func.coalesce(PolicyReform.days_saved_per_project, 0)
            * func.coalesce(PolicyReform.projects_post_reform, 0)
        ).label("total_days_saved"),
        func.coalesce(func.sum(PolicyReform.total_cost_savings), 0).label(
            "total_cost_savings"
        ),
        func.coalesce(func.sum(PolicyReform.units_enabled), 0).label(
            "total_units_enabled"
        ),
        func.avg(PolicyReform.percent_improvement).label("avg_pct"),
    )

    state_code = state.strip().upper() if state else ""
    if state_code:
        # Match the trailing ", ST" component (or a bare state code) rather
        # than any substring, so "CA" no longer matches "Chicago, IL".
        # NOTE(review): assumes "City, ST" jurisdiction naming -- confirm.
        jurisdiction_upper = func.upper(PolicyReform.jurisdiction)
        stmt = stmt.where(
            jurisdiction_upper.endswith(f", {state_code}", autoescape=True)
            | (jurisdiction_upper == state_code)
        )
    if type_filters:
        stmt = stmt.where(*type_filters)

    stmt = (
        stmt.group_by(PolicyReform.jurisdiction)
        .order_by(func.count(PolicyReform.reform_id).desc())
        .limit(limit)
    )
    rows = db.execute(stmt).all()

    results: list[JurisdictionReformSummary] = []
    for row in rows:
        # Reform with the largest per-project day savings in this jurisdiction.
        best_name = db.scalar(
            select(PolicyReform.reform_name)
            .where(
                PolicyReform.jurisdiction == row.jurisdiction,
                PolicyReform.days_saved_per_project.isnot(None),
                *type_filters,
            )
            .order_by(PolicyReform.days_saved_per_project.desc())
            .limit(1)
        )

        # Count reforms with measurable positive impact.
        measurable_count = db.scalar(
            select(func.count(PolicyReform.reform_id)).where(
                PolicyReform.jurisdiction == row.jurisdiction,
                PolicyReform.days_saved_per_project > 0,
                *type_filters,
            )
        ) or 0

        results.append(
            JurisdictionReformSummary(
                jurisdiction=row.jurisdiction,
                total_reforms=row.total_reforms,
                reforms_with_measurable_impact=measurable_count,
                total_days_saved=int(row.total_days_saved or 0),
                total_cost_savings=float(row.total_cost_savings or 0),
                total_units_enabled=int(row.total_units_enabled or 0),
                most_impactful_reform=best_name,
                # ``is not None`` keeps a real 0.0 average from turning into None.
                avg_percent_improvement=(
                    round(float(row.avg_pct), 1) if row.avg_pct is not None else None
                ),
            )
        )

    return results
class WebhookEventType(str, Enum):
    """Known event types dispatched by the HousingMind ecosystem.

    Subclasses ``str`` so members compare equal to their wire values.
    """

    FRICTION_SCORE_UPDATED = "friction_score_updated"
    HEARING_DETECTED = "hearing_detected"
    POLICY_CHANGE_DETECTED = "policy_change_detected"
    PROJECT_STAGE_INFERRED = "project_stage_inferred"
    FUNDING_ALERT = "funding_alert"
    RISK_ASSESSMENT_UPDATED = "risk_assessment_updated"
    QUERY_VOLUME_SPIKE = "query_volume_spike"


class WebhookPayload(BaseModel):
    """Inbound webhook payload from any HousingMind service.

    ``event_type`` and ``source_service`` are required. ``data`` is a
    free-form dict whose keys depend on the event type.
    """

    event_type: WebhookEventType = Field(
        ..., description="The type of event being reported"
    )
    source_service: str = Field(
        ..., description="Originating service name (e.g. 'housing_lens')"
    )
    # Filled with datetime.utcnow() (a naive datetime) when the sender omits it.
    timestamp: datetime = Field(
        default_factory=datetime.utcnow,
        description="UTC timestamp of the event",
    )
    project_id: UUID | None = Field(
        None, description="Associated project ID, if applicable"
    )
    jurisdiction: str | None = Field(
        None, description="Jurisdiction the event pertains to"
    )
    data: dict[str, Any] = Field(
        default_factory=dict,
        description="Event-specific payload data",
    )

    # Sample payload attached to the generated JSON schema.
    model_config = {"json_schema_extra": {
        "examples": [
            {
                "event_type": "friction_score_updated",
                "source_service": "housing_lens",
                "project_id": "b1e4a7c0-1234-5678-abcd-ef0123456789",
                "jurisdiction": "San Francisco, CA",
                "data": {
                    "new_friction_score": 72,
                    "previous_friction_score": 65,
                    "contributing_factors": ["parking_minimum", "design_review"],
                },
            }
        ]
    }}


class WebhookResponse(BaseModel):
    """Acknowledgement returned to the calling service."""

    accepted: bool = True
    message: str = "Event received and queued for processing"
    event_type: str
    # Naive datetime from datetime.utcnow(), set when the response is built.
    timestamp: datetime = Field(default_factory=datetime.utcnow)
def _verify_signature(
    body: bytes,
    signature: str | None,
    secret: str,
) -> None:
    """Verify the HMAC-SHA256 webhook signature of *body*.

    Args:
        body: Raw request bytes the sender signed.
        signature: Hex digest from the ``X-Webhook-Signature`` header.
        secret: Shared secret; an empty value disables verification.

    Raises:
        HTTPException: 403 when the signature is missing or does not match.
    """
    if not secret:
        # Secret not configured -- skip verification (dev / test mode).
        return
    if not signature:
        raise HTTPException(
            status_code=status.HTTP_403_FORBIDDEN,
            detail="Missing X-Webhook-Signature header",
        )
    expected = hmac.new(secret.encode(), body, hashlib.sha256).hexdigest()
    # Compare as bytes: compare_digest() raises TypeError for str arguments
    # containing non-ASCII characters, which would turn a bad header into a
    # 500 instead of a 403.
    if not hmac.compare_digest(expected.encode(), signature.encode()):
        raise HTTPException(
            status_code=status.HTTP_403_FORBIDDEN,
            detail="Invalid webhook signature",
        )
def _handle_friction_score_updated(
    db: Session, payload: WebhookPayload
) -> None:
    """Update a project's jurisdiction friction score from HousingLens.

    Reads ``new_friction_score`` and ``contributing_factors`` from
    ``payload.data``; each is applied only when present. Events without a
    project ID, or for an unknown project, are logged and dropped.
    """
    if payload.project_id is None:
        logger.warning("friction_score_updated event missing project_id")
        return

    project = db.get(Project, payload.project_id)
    if project is None:
        logger.warning(
            "friction_score_updated: project %s not found", payload.project_id
        )
        return

    new_score = payload.data.get("new_friction_score")
    if new_score is not None:
        project.jurisdiction_friction_score = int(new_score)

    friction_points = payload.data.get("contributing_factors")
    if friction_points is not None:
        project.primary_friction_points = {"factors": friction_points}

    project.updated_at = datetime.utcnow()
    db.commit()
    logger.info(
        "Updated friction score for project %s to %s",
        payload.project_id,
        new_score,
    )


def _handle_hearing_detected(
    db: Session, payload: WebhookPayload
) -> None:
    """Record a newly detected public hearing from HousingEar.

    Always increments ``public_meetings_attended``. A ``hearing_type`` of
    ``"variance"`` or ``"design_review"`` also increments the matching
    counter. Events without a project ID, or for an unknown project, are
    logged and dropped.
    """
    if payload.project_id is None:
        logger.warning("hearing_detected event missing project_id")
        return

    project = db.get(Project, payload.project_id)
    if project is None:
        logger.warning(
            "hearing_detected: project %s not found", payload.project_id
        )
        return

    project.public_meetings_attended = (project.public_meetings_attended or 0) + 1

    hearing_type = payload.data.get("hearing_type", "")
    if hearing_type == "variance":
        project.variance_hearings = (project.variance_hearings or 0) + 1
    elif hearing_type == "design_review":
        project.design_review_iterations = (
            project.design_review_iterations or 0
        ) + 1

    project.updated_at = datetime.utcnow()
    db.commit()
    logger.info(
        "Recorded hearing for project %s (type=%s)",
        payload.project_id,
        hearing_type,
    )


def _handle_project_stage_inferred(
    db: Session, payload: WebhookPayload
) -> None:
    """Update project stage based on HousingMind ML inference.

    Nothing is written when ``inferred_stage`` is absent or is not a valid
    ``PipelineStage`` value. ``confidence``, when supplied, is stored as
    ``prediction_confidence``.
    """
    if payload.project_id is None:
        logger.warning("project_stage_inferred event missing project_id")
        return

    project = db.get(Project, payload.project_id)
    if project is None:
        logger.warning(
            "project_stage_inferred: project %s not found", payload.project_id
        )
        return

    inferred_stage = payload.data.get("inferred_stage")
    if inferred_stage is None:
        return

    try:
        new_stage = PipelineStage(inferred_stage)
    except ValueError:
        logger.warning("Invalid inferred stage: %s", inferred_stage)
        return

    # Parse once so the stored value and the log line use the same float
    # (a numeric string would otherwise break the %.2f log format).
    raw_confidence = payload.data.get("confidence")
    confidence = float(raw_confidence) if raw_confidence is not None else None

    project.current_stage = new_stage
    if confidence is not None:
        project.prediction_confidence = confidence
    now = datetime.utcnow()
    project.prediction_last_updated = now
    project.updated_at = now
    db.commit()
    logger.info(
        "Inferred stage for project %s -> %s (confidence=%.2f)",
        payload.project_id,
        new_stage.value,
        confidence or 0,
    )
def _handle_risk_assessment_updated(
    db: Session, payload: WebhookPayload
) -> None:
    """Update risk score and overall health from a fresh assessment.

    Applies whichever of ``risk_score``, ``risk_factors``,
    ``overall_health`` and ``health_score`` are present in ``payload.data``.
    An unrecognised ``overall_health`` value is logged and skipped; the
    other fields are still saved.
    """
    if payload.project_id is None:
        return

    project = db.get(Project, payload.project_id)
    if project is None:
        return

    data = payload.data

    # Parse once so the stored value and the log line agree.
    raw_risk = data.get("risk_score")
    risk_score = float(raw_risk) if raw_risk is not None else None
    if risk_score is not None:
        project.risk_score = risk_score

    risk_factors = data.get("risk_factors")
    if risk_factors is not None:
        project.risk_factors = risk_factors

    overall_health = data.get("overall_health")
    if overall_health is not None:
        try:
            project.overall_health = OverallHealth(overall_health)
        except ValueError:
            logger.warning("Invalid overall_health value: %s", overall_health)

    health_score = data.get("health_score")
    if health_score is not None:
        project.health_score = float(health_score)

    project.updated_at = datetime.utcnow()
    db.commit()
    logger.info(
        "Updated risk assessment for project %s (score=%.2f)",
        payload.project_id,
        risk_score or 0,
    )


def _handle_funding_alert(
    db: Session, payload: WebhookPayload
) -> None:
    """Process a funding-related alert (gap warning, new opportunity, etc.).

    Jurisdiction-level alerts (no project ID) are only logged. For a known
    project, ``funding_gap`` is stored when present.
    """
    if payload.project_id is None:
        logger.info(
            "Funding alert for jurisdiction %s: %s",
            payload.jurisdiction,
            payload.data.get("alert_message", ""),
        )
        return

    project = db.get(Project, payload.project_id)
    if project is None:
        return

    new_gap = payload.data.get("funding_gap")
    if new_gap is not None:
        project.funding_gap = float(new_gap)

    project.updated_at = datetime.utcnow()
    db.commit()
    logger.info("Processed funding alert for project %s", payload.project_id)


def _handle_query_volume_spike(
    db: Session, payload: WebhookPayload
) -> None:
    """Record an unusual spike in HousingMind queries about a project.

    Adds ``query_count`` to the project's running ``housing_mind_queries``
    total and, when ``top_categories`` is present, replaces
    ``top_query_categories``.
    """
    if payload.project_id is None:
        return

    project = db.get(Project, payload.project_id)
    if project is None:
        return

    # ``or 0`` also covers an explicit JSON null, which .get()'s default
    # does not; int(None) would raise and surface as a 500.
    additional_queries = int(payload.data.get("query_count") or 0)
    project.housing_mind_queries = (
        project.housing_mind_queries or 0
    ) + additional_queries

    categories = payload.data.get("top_categories")
    if categories is not None:
        project.top_query_categories = {"categories": categories}

    project.updated_at = datetime.utcnow()
    db.commit()
    logger.info(
        "Recorded query spike for project %s (+%d queries)",
        payload.project_id,
        additional_queries,
    )


# Dispatch table: event type -> handler(db, payload). Event types without an
# entry (currently POLICY_CHANGE_DETECTED) are acknowledged but not processed.
_EVENT_HANDLERS: dict[WebhookEventType, Any] = {
    WebhookEventType.FRICTION_SCORE_UPDATED: _handle_friction_score_updated,
    WebhookEventType.HEARING_DETECTED: _handle_hearing_detected,
    WebhookEventType.PROJECT_STAGE_INFERRED: _handle_project_stage_inferred,
    WebhookEventType.RISK_ASSESSMENT_UPDATED: _handle_risk_assessment_updated,
    WebhookEventType.FUNDING_ALERT: _handle_funding_alert,
    WebhookEventType.QUERY_VOLUME_SPIKE: _handle_query_volume_spike,
}
@router.post(
    "/housingmind",
    response_model=WebhookResponse,
    status_code=status.HTTP_202_ACCEPTED,
    summary="Receive HousingMind ecosystem events",
)
async def receive_housingmind_webhook(
    request: Request,
    payload: WebhookPayload,
    db: Session = Depends(get_db),
    settings: Settings = Depends(get_settings),
    x_webhook_signature: str | None = Header(None),
) -> WebhookResponse:
    """Receive and process webhook events from HousingMind ecosystem services.

    Supported event types:

    * **friction_score_updated** -- HousingLens recalculated a jurisdiction's
      friction score relevant to a tracked project.
    * **hearing_detected** -- HousingEar detected a public hearing that
      pertains to a tracked project.
    * **policy_change_detected** -- A policy or zoning reform was detected.
    * **project_stage_inferred** -- ML model inferred a project moved to a
      new pipeline stage.
    * **funding_alert** -- A funding gap warning or new opportunity alert.
    * **risk_assessment_updated** -- Fresh risk scoring for a project.
    * **query_volume_spike** -- Unusual spike in HousingMind queries about a
      project.

    The endpoint validates the ``X-Webhook-Signature`` header (HMAC-SHA256)
    when the ``housing_mind_webhook_secret`` setting is configured. The
    handler runs inline before the response is returned; event types with
    no registered handler are acknowledged without processing.

    Raises:
        HTTPException: 403 on a missing/invalid signature; 500 when the
            handler fails (the session is rolled back first).
    """
    # The signature covers the raw bytes, not the parsed model.
    body = await request.body()
    _verify_signature(body, x_webhook_signature, settings.housing_mind_webhook_secret)

    handler = _EVENT_HANDLERS.get(payload.event_type)
    if handler is None:
        logger.info(
            "No handler registered for event type %s -- acknowledged but not processed",
            payload.event_type,
        )
    else:
        try:
            handler(db, payload)
        except Exception as exc:
            # Discard any half-applied changes so the session is not left
            # in a failed transaction.
            db.rollback()
            logger.exception(
                "Error processing webhook event %s", payload.event_type
            )
            raise HTTPException(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail="Internal error processing webhook event",
            ) from exc

    return WebhookResponse(
        event_type=payload.event_type.value,
    )
def _slugify(name: str) -> str:
    """Convert a project name to a URL-friendly slug.

    The name is lower-cased; characters other than word characters,
    whitespace and hyphens are removed; runs of whitespace, underscores and
    hyphens collapse to a single hyphen. The result is capped at 500
    characters and never starts or ends with a hyphen. It may be empty when
    the name contains no word characters.
    """
    slug = name.lower().strip()
    slug = re.sub(r"[^\w\s-]", "", slug)
    slug = re.sub(r"[\s_-]+", "-", slug)
    # Strip after truncating so the cut cannot leave a dangling hyphen.
    return slug[:500].strip("-")
create_project(self, data: dict) -> Project: + """Create a new project from developer portal submission.""" + slug = _slugify(data["project_name"]) + + # Ensure unique slug + existing = self.db.query(Project).filter(Project.project_slug == slug).first() + if existing: + slug = f"{slug}-{uuid.uuid4().hex[:6]}" + + project = Project( + project_name=data["project_name"], + project_slug=slug, + address=data.get("address"), + city=data.get("city"), + county=data.get("county"), + state=data.get("state"), + zip=data.get("zip"), + jurisdiction=data.get("jurisdiction"), + developer_org=data.get("developer_org"), + developer_contact=data.get("developer_contact"), + total_units=data.get("total_units", 0), + affordable_units=data.get("affordable_units", 0), + building_type=data.get("building_type"), + current_stage=data.get("current_stage", PipelineStage.CONCEPT), + stage_entry_date=date.today(), + concept_start=date.today(), + data_source=DataSource.DEVELOPER_PORTAL, + data_quality_score=0.5, + data_completeness=self._calculate_completeness(data), + created_by=data.get("created_by", "developer_portal"), + ) + + self.db.add(project) + self.db.commit() + self.db.refresh(project) + logger.info(f"Created project: {project.project_slug} ({project.project_id})") + return project + + def update_project_stage( + self, + project_id: uuid.UUID, + new_stage: PipelineStage, + completion_date: date | None = None, + ) -> Project: + """Update a project's pipeline stage and record transition.""" + project = self.db.get(Project, project_id) + if project is None: + raise ValueError(f"Project not found: {project_id}") + + old_stage = project.current_stage + now = completion_date or date.today() + + # Record completion of current stage + stage_complete_attr = f"{old_stage.value}_complete" + if hasattr(project, stage_complete_attr): + setattr(project, stage_complete_attr, now) + + # Calculate duration of completed stage + stage_start_attr = f"{old_stage.value}_start" + start_date = 
getattr(project, stage_start_attr, None) + if start_date: + duration_attr = f"{old_stage.value}_duration_days" + if hasattr(project, duration_attr): + setattr(project, duration_attr, (now - start_date).days) + + # Set new stage + project.current_stage = new_stage + project.stage_entry_date = now + + # Record start of new stage + new_stage_start_attr = f"{new_stage.value}_start" + if hasattr(project, new_stage_start_attr): + setattr(project, new_stage_start_attr, now) + + # Update elapsed days + if project.concept_start: + project.total_elapsed_days = (now - project.concept_start).days + + # Update groundbreaking tracker + if new_stage == PipelineStage.CONSTRUCTION and project.concept_start: + project.concept_to_groundbreaking_days = (now - project.concept_start).days + + project.updated_at = datetime.utcnow() + self.db.commit() + self.db.refresh(project) + + logger.info( + f"Project {project.project_slug} transitioned " + f"{old_stage.value} -> {new_stage.value}" + ) + return project + + def update_project_costs(self, project_id: uuid.UUID, cost_data: dict) -> Project: + """Update cost information for a project.""" + project = self.db.get(Project, project_id) + if project is None: + raise ValueError(f"Project not found: {project_id}") + + cost_fields = [ + "total_development_cost", "land_acquisition_cost", "hard_costs", + "soft_costs", "financing_costs", "developer_fee", "reserves", + "architecture_engineering", "legal_fees", "original_budget", + "current_budget", + ] + + for field in cost_fields: + if field in cost_data: + setattr(project, field, cost_data[field]) + + # Recalculate derived fields + if project.total_development_cost and project.total_units: + project.cost_per_unit = project.total_development_cost / project.total_units + + if project.original_budget and project.current_budget: + project.budget_variance_dollars = project.current_budget - project.original_budget + project.budget_variance_percent = ( + (project.budget_variance_dollars / 
@staticmethod
def _calculate_completeness(data: dict) -> float:
    """Return the share (0-1) of important submission fields that are filled.

    A field counts as filled when its value is not ``None`` and, for strings,
    is not empty or whitespace-only. Portal forms commonly submit untouched
    inputs as ``""``, which previously inflated the score. Numeric zero still
    counts as a supplied value.

    Args:
        data: Raw submission payload keyed by field name.

    Returns:
        Fraction of important fields populated, rounded to two decimals.
    """
    important_fields = (
        "project_name", "address", "city", "state", "jurisdiction",
        "developer_org", "total_units", "affordable_units", "building_type",
        "site_acres", "stories",
    )

    def is_filled(value: object) -> bool:
        if value is None:
            return False
        if isinstance(value, str):
            return bool(value.strip())
        return True

    filled = sum(1 for name in important_fields if is_filled(data.get(name)))
    return round(filled / len(important_fields), 2)
async def search_by_city(
    self,
    city: str,
    state: str,
) -> list[LIHTCProject]:
    """Return the state's LIHTC projects whose city matches ``city``.

    Fetches every allocation for ``state`` via ``get_state_allocations`` and
    filters client-side. The comparison ignores case and surrounding
    whitespace, and a record whose city is ``None`` (a null in the upstream
    payload) is treated as an empty city instead of raising.

    Args:
        city: City name to match.
        state: State passed through to ``get_state_allocations``.

    Returns:
        Matching projects, in the order the state query returned them.
    """
    wanted = (city or "").strip().casefold()
    all_projects = await self.get_state_allocations(state)
    return [
        project
        for project in all_projects
        if (project.city or "").strip().casefold() == wanted
    ]
async def search_permits(
    self,
    address: str | None = None,
    date_from: date | None = None,
    date_to: date | None = None,
    permit_type: str | None = None,
) -> list[PermitRecord]:
    """Search permits using a Socrata-style open data API.

    Builds a SoQL ``$where`` clause from whichever filters are supplied and
    requests up to 100 records ordered by newest issue date.

    Args:
        address: Substring to match case-insensitively against the address.
        date_from: Earliest issue date (inclusive).
        date_to: Latest issue date (inclusive).
        permit_type: Exact permit type to match.

    Returns:
        Parsed permit records, or an empty list if the request or parsing
        fails (the failure is logged).
    """

    def soql_text(value: str) -> str:
        # SoQL string literals are single-quoted; an embedded quote is
        # escaped by doubling it. Without this, caller-supplied text could
        # terminate the literal and inject arbitrary filter clauses.
        return value.replace("'", "''")

    params: dict = {}
    filters = []

    if address:
        filters.append(f"upper(address) like upper('%{soql_text(address)}%')")
    if date_from:
        filters.append(f"issue_date >= '{date_from.isoformat()}'")
    if date_to:
        filters.append(f"issue_date <= '{date_to.isoformat()}'")
    if permit_type:
        filters.append(f"permit_type = '{soql_text(permit_type)}'")

    if filters:
        params["$where"] = " AND ".join(filters)

    params["$limit"] = 100
    params["$order"] = "issue_date DESC"

    try:
        response = await self._client.get(self.base_url, params=params)
        response.raise_for_status()
        records = response.json()

        return [
            PermitRecord(
                permit_number=r.get("permit_number", ""),
                address=r.get("address", ""),
                jurisdiction=self.jurisdiction,
                permit_type=r.get("permit_type", ""),
                status=r.get("status", ""),
                issue_date=r.get("issue_date", ""),
                description=r.get("description", ""),
                units=int(r.get("units", 0) or 0),
                valuation=float(r.get("valuation", 0) or 0),
            )
            for r in records
        ]
    except Exception:
        # Deliberate best-effort boundary: a scraper failure must not take
        # down the caller, so it is logged and reported as "no results".
        logger.exception(f"Error searching permits for {self.jurisdiction}")
        return []
def validate_project(self, project: Project) -> ValidationResult:
    """Run all validation checks on a project.

    Errors make the record invalid and each one lowers the quality score by
    0.2 (floored at 0). Warnings are informational and do not affect either
    score.

    Args:
        project: The project record to inspect.

    Returns:
        A ``ValidationResult`` with validity, quality and completeness
        scores, and the collected error and warning messages.
    """
    errors: list[str] = []
    warnings: list[str] = []

    # Required field checks
    for field_name in self.REQUIRED_FIELDS:
        value = getattr(project, field_name, None)
        if value is None or value == "":
            errors.append(f"Missing required field: {field_name}")

    # Unit counts consistency
    if project.total_units is not None and project.total_units < 0:
        errors.append("total_units cannot be negative")

    # A negative affordable count is as invalid as a negative total, and it
    # would otherwise slip past the "exceeds total_units" comparison below.
    if project.affordable_units is not None and project.affordable_units < 0:
        errors.append("affordable_units cannot be negative")

    if project.affordable_units is not None and project.total_units is not None:
        if project.affordable_units > project.total_units:
            errors.append("affordable_units exceeds total_units")

    # Unit mix should add up
    unit_mix_sum = sum([
        project.studio_units or 0,
        project.one_br_units or 0,
        project.two_br_units or 0,
        project.three_br_units or 0,
        project.four_plus_br_units or 0,
    ])
    if unit_mix_sum > 0 and project.total_units and unit_mix_sum != project.total_units:
        warnings.append(
            f"Unit mix sum ({unit_mix_sum}) does not match "
            f"total_units ({project.total_units})"
        )

    # AMI mix should not exceed affordable units
    ami_sum = sum([
        project.ami_30_units or 0,
        project.ami_40_units or 0,
        project.ami_50_units or 0,
        project.ami_60_units or 0,
        project.ami_80_units or 0,
    ])
    if ami_sum > 0 and project.affordable_units and ami_sum > project.affordable_units:
        warnings.append(
            f"AMI unit sum ({ami_sum}) exceeds "
            f"affordable_units ({project.affordable_units})"
        )

    # Timeline consistency
    self._validate_timeline(project, errors, warnings)

    # Cost consistency
    self._validate_costs(project, warnings)

    # Calculate scores
    completeness = self._calculate_completeness(project)
    quality = 1.0 if not errors else max(0.0, 1.0 - (len(errors) * 0.2))

    is_valid = len(errors) == 0

    return ValidationResult(
        is_valid=is_valid,
        quality_score=quality,
        completeness_score=completeness,
        errors=errors,
        warnings=warnings,
    )
def _validate_costs(self, project: Project, warnings: list[str]) -> None:
    """Append warnings about implausible or inconsistent cost figures.

    Two independent checks run, both requiring a non-zero total development
    cost: cost per unit outside the 50,000 - 1,000,000 band, and a cost
    component sum that differs from the total by more than 10%. The second
    check only runs when both hard and soft costs are present.

    Args:
        project: The project whose cost fields are inspected.
        warnings: List that warning messages are appended to in place.
    """
    total = project.total_development_cost
    if not total:
        # Both checks are relative to the total; nothing to compare without it.
        return

    if project.total_units:
        cpu = total / project.total_units
        if cpu < 50_000:
            warnings.append(f"Cost per unit ({cpu:,.0f}) seems unusually low")
        if cpu > 1_000_000:
            warnings.append(f"Cost per unit ({cpu:,.0f}) seems unusually high")

    if not (project.hard_costs and project.soft_costs):
        return

    component_sum = sum((
        project.hard_costs or 0,
        project.soft_costs or 0,
        project.land_acquisition_cost or 0,
        project.financing_costs or 0,
        project.developer_fee or 0,
        project.reserves or 0,
    ))
    if component_sum <= 0:
        return

    diff_pct = abs(component_sum - total) / total * 100
    if diff_pct > 10:
        warnings.append(
            f"Cost components sum ({component_sum:,.0f}) differs from "
            f"total ({total:,.0f}) by {diff_pct:.1f}%"
        )
class Base(DeclarativeBase):
    """SQLAlchemy declarative base for all models."""

    pass


def get_engine(database_url: str | None = None):
    """Create and return a new SQLAlchemy engine.

    Args:
        database_url: Connection URL; falls back to ``settings.database_url``.

    Returns:
        A freshly created engine with its own connection pool. Callers that
        want a shared pool should go through ``get_session_factory``.
    """
    settings = get_settings()
    url = database_url or settings.database_url
    return create_engine(
        url,
        pool_size=settings.database_pool_size,
        max_overflow=settings.database_max_overflow,
        pool_pre_ping=True,
    )


# Session factories keyed by the ``database_url`` argument (``None`` means the
# configured default). Each factory owns one engine and therefore one pool.
_session_factories: dict[str | None, sessionmaker] = {}


def get_session_factory(database_url: str | None = None) -> sessionmaker:
    """Return the session factory for ``database_url``, creating it once.

    The factory and its engine are cached per URL. Previously every call
    built a new engine, so ``get_db`` opened a fresh connection pool on each
    request and never disposed of it.

    Args:
        database_url: Connection URL; ``None`` selects the configured default.

    Returns:
        A ``sessionmaker`` bound to the cached engine for that URL.
    """
    factory = _session_factories.get(database_url)
    if factory is None:
        engine = get_engine(database_url)
        factory = sessionmaker(bind=engine, autocommit=False, autoflush=False)
        _session_factories[database_url] = factory
    return factory


def get_db() -> Generator[Session, None, None]:
    """Yield a database session for FastAPI dependency injection.

    The session is always closed when the request finishes, whether or not
    the handler raised.
    """
    SessionLocal = get_session_factory()
    db = SessionLocal()
    try:
        yield db
    finally:
        db.close()
def query_projects(
    db: Session,
    *,
    jurisdiction: str | None = None,
    city: str | None = None,
    state: str | None = None,
    current_stage: PipelineStage | None = None,
    stages: list[PipelineStage] | None = None,
    overall_health: OverallHealth | None = None,
    affordable_units_min: int | None = None,
    developer_org: str | None = None,
    date_range_start: date | None = None,
    date_range_end: date | None = None,
    funding_source_organization: str | None = None,
    limit: int = 1000,
    offset: int = 0,
) -> list[Project]:
    """Flexible project query with multiple filter options.

    Every supplied filter is AND-ed together; omitted (falsy) filters are
    ignored. Results are ordered by most recently updated, with the primary
    key as a tie-breaker so limit/offset pagination is deterministic.

    Args:
        db: Active database session.
        jurisdiction, city, state, developer_org: Exact-match filters.
        current_stage: Match a single pipeline stage.
        stages: Match any of several pipeline stages.
        overall_health: Match a health rating.
        affordable_units_min: Minimum affordable unit count (inclusive).
        date_range_start: Earliest creation date (inclusive).
        date_range_end: Latest creation date (inclusive of the whole day).
        funding_source_organization: Require a funding source from this
            provider organization.
        limit: Maximum number of rows returned.
        offset: Number of rows skipped.

    Returns:
        The matching projects.
    """
    stmt = select(Project)

    if jurisdiction:
        stmt = stmt.where(Project.jurisdiction == jurisdiction)
    if city:
        stmt = stmt.where(Project.city == city)
    if state:
        stmt = stmt.where(Project.state == state)
    if current_stage:
        stmt = stmt.where(Project.current_stage == current_stage)
    if stages:
        stmt = stmt.where(Project.current_stage.in_(stages))
    if overall_health:
        stmt = stmt.where(Project.overall_health == overall_health)
    if affordable_units_min is not None:
        stmt = stmt.where(Project.affordable_units >= affordable_units_min)
    if developer_org:
        stmt = stmt.where(Project.developer_org == developer_org)
    if date_range_start:
        stmt = stmt.where(Project.created_at >= date_range_start)
    if date_range_end:
        # Half-open upper bound: comparing a timestamp column with
        # "<= end date" stops at midnight and drops rows created later that
        # day. "< end + 1 day" is equivalent for a pure date column too.
        stmt = stmt.where(Project.created_at < date_range_end + timedelta(days=1))
    if funding_source_organization:
        # NOTE(review): a project with several funding sources from the same
        # organization will presumably appear once per matching source;
        # confirm whether callers need de-duplication.
        stmt = stmt.join(FundingSource).where(
            FundingSource.provider_organization == funding_source_organization
        )

    stmt = (
        stmt.order_by(Project.updated_at.desc(), Project.project_id)
        .limit(limit)
        .offset(offset)
    )
    return list(db.scalars(stmt).all())
def get_portfolio_summary_stats(
    db: Session,
    jurisdiction: str | None = None,
    city: str | None = None,
    state: str | None = None,
) -> dict:
    """Aggregate project, unit and cost totals for a portfolio view.

    Args:
        db: Active database session.
        jurisdiction: Optional exact-match jurisdiction filter.
        city: Optional exact-match city filter.
        state: Optional exact-match state filter.

    Returns:
        A dict with ``total_projects``, ``total_units``,
        ``total_affordable_units``, ``total_cost`` and ``total_funding_gap``.
        Missing aggregates are reported as zero; the two monetary totals are
        converted to ``float`` when a result row is present.
    """
    stmt = select(
        func.count(Project.project_id).label("total_projects"),
        func.sum(Project.total_units).label("total_units"),
        func.sum(Project.affordable_units).label("total_affordable_units"),
        func.sum(Project.total_development_cost).label("total_cost"),
        func.sum(Project.funding_gap).label("total_funding_gap"),
    )

    # Apply only the location filters the caller actually supplied.
    location_filters = (
        (Project.jurisdiction, jurisdiction),
        (Project.city, city),
        (Project.state, state),
    )
    for column, wanted in location_filters:
        if wanted:
            stmt = stmt.where(column == wanted)

    row = db.execute(stmt).first()
    if row is None:
        return dict.fromkeys(
            (
                "total_projects",
                "total_units",
                "total_affordable_units",
                "total_cost",
                "total_funding_gap",
            ),
            0,
        )

    # SUM over zero rows yields NULL, hence the "or 0" fallbacks.
    return {
        "total_projects": row.total_projects or 0,
        "total_units": row.total_units or 0,
        "total_affordable_units": row.total_affordable_units or 0,
        "total_cost": float(row.total_cost or 0),
        "total_funding_gap": float(row.total_funding_gap or 0),
    }
def get_stalled_projects(
    db: Session,
    days_threshold: int = 180,
    jurisdiction: str | None = None,
) -> list[Project]:
    """List projects that have sat in their current stage too long.

    A project qualifies when it entered its current stage at least
    ``days_threshold`` days ago and that stage is neither operations nor
    abandoned.

    Args:
        db: Active database session.
        days_threshold: Minimum number of days in the current stage.
        jurisdiction: Optional exact-match jurisdiction filter.

    Returns:
        Qualifying projects, longest-stalled first.
    """
    oldest_allowed_entry = date.today() - timedelta(days=days_threshold)
    finished_stages = [PipelineStage.OPERATIONS, PipelineStage.ABANDONED]

    conditions = [
        Project.stage_entry_date <= oldest_allowed_entry,
        Project.current_stage.notin_(finished_stages),
    ]
    if jurisdiction:
        conditions.append(Project.jurisdiction == jurisdiction)

    stmt = select(Project).where(*conditions).order_by(Project.stage_entry_date.asc())
    return list(db.scalars(stmt).all())
async def check_funding_opportunities(
    self,
    jurisdiction: str,
    date_after: date | None = None,
) -> list[FundingProgram]:
    """Check for new funding programs available in a jurisdiction.

    This is a best-effort lookup: transport errors, non-2xx responses,
    undecodable bodies and unexpectedly shaped payloads all yield an empty
    list rather than an exception. Individual entries that are not objects
    or lack ``program_name`` are skipped.

    Args:
        jurisdiction: Jurisdiction to query; also the fallback value for
            entries that omit their own jurisdiction.
        date_after: Only return programs after this date, when given.

    Returns:
        The funding programs parsed from the response.
    """
    params: dict = {"jurisdiction": jurisdiction}
    if date_after:
        params["date_after"] = date_after.isoformat()

    try:
        response = await self._client.get("/funding/opportunities", params=params)
        response.raise_for_status()
        data = response.json()
    except (httpx.HTTPError, ValueError):
        # ValueError covers a body that is not valid JSON.
        return []

    entries = data.get("programs") if isinstance(data, dict) else None
    if not isinstance(entries, list):
        return []

    return [
        FundingProgram(
            program_name=p["program_name"],
            jurisdiction=p.get("jurisdiction", jurisdiction),
            # A JSON null would otherwise leak None into a float field.
            amount_available=p.get("amount_available") or 0.0,
            application_deadline=p.get("application_deadline", ""),
            source_url=p.get("source_url", ""),
            description=p.get("description", ""),
        )
        for p in entries
        if isinstance(p, dict) and "program_name" in p
    ]
@dataclass
class JurisdictionFrictionData:
    """Friction scores for one jurisdiction, as reported by HousingLens."""

    jurisdiction: str
    overall_score: int = 0
    topics: list[FrictionTopic] = field(default_factory=list)
    last_updated: str = ""

    def get_topic(self, topic_name: str) -> FrictionTopic | None:
        """Return the first topic named ``topic_name``, or ``None`` if absent."""
        return next(
            (entry for entry in self.topics if entry.name == topic_name),
            None,
        )

    def get_topic_score(self, topic_name: str) -> int:
        """Return the friction score of the first topic named ``topic_name``.

        Falls back to 0 when no topic has that name.
        """
        return next(
            (entry.friction_score for entry in self.topics if entry.name == topic_name),
            0,
        )
async def get_related_friction_topics(
    self, jurisdiction: str, reform_description: str
) -> list[FrictionTopic]:
    """Find friction topics related to a policy reform description.

    This is a best-effort lookup: transport errors, non-2xx responses,
    undecodable bodies and unexpectedly shaped payloads all yield an empty
    list rather than an exception. Entries that are not objects or lack
    ``name`` or ``friction_score`` are skipped.

    Args:
        jurisdiction: Jurisdiction to query.
        reform_description: Free-text description sent as the ``query``
            parameter.

    Returns:
        The related topics parsed from the response.
    """
    try:
        response = await self._client.get(
            "/jurisdictions/related-topics",
            params={
                "jurisdiction": jurisdiction,
                "query": reform_description,
            },
        )
        response.raise_for_status()
        data = response.json()
    except (httpx.HTTPError, ValueError):
        # ValueError covers a body that is not valid JSON.
        return []

    entries = data.get("topics") if isinstance(data, dict) else None
    if not isinstance(entries, list):
        return []

    return [
        FrictionTopic(
            name=t["name"],
            friction_score=t["friction_score"],
            jurisdiction_rank=t.get("jurisdiction_rank", 0),
        )
        for t in entries
        if isinstance(t, dict) and "name" in t and "friction_score" in t
    ]
def close(self) -> None: + """Close the underlying HTTP client.""" + await self._client.aclose() diff --git a/src/integrations/housing_mind.py b/src/integrations/housing_mind.py new file mode 100644 index 0000000..ac8011e --- /dev/null +++ b/src/integrations/housing_mind.py @@ -0,0 +1,82 @@ +"""Webhook handler for HousingMind query metadata integration.""" + +import hashlib +import hmac +import logging +from dataclasses import dataclass + +from sqlalchemy.orm import Session + +from config.settings import get_settings +from src.models.project import Project + +logger = logging.getLogger(__name__) + + +@dataclass +class QueryEvent: + """A query event from HousingMind about a project or jurisdiction.""" + + project_id: str | None + jurisdiction: str + query_category: str + query_text: str + timestamp: str + user_type: str = "" + + +class HousingMindWebhookHandler: + """Handles incoming webhooks from HousingMind to track query patterns.""" + + def __init__(self, webhook_secret: str | None = None): + settings = get_settings() + self.webhook_secret = webhook_secret or settings.housing_mind_webhook_secret + + def verify_signature(self, payload: bytes, signature: str) -> bool: + """Verify the webhook signature using HMAC-SHA256.""" + if not self.webhook_secret: + logger.warning("No webhook secret configured; skipping verification") + return True + + expected = hmac.new( + self.webhook_secret.encode(), + payload, + hashlib.sha256, + ).hexdigest() + + return hmac.compare_digest(f"sha256={expected}", signature) + + def parse_event(self, payload: dict) -> QueryEvent: + """Parse a webhook payload into a QueryEvent.""" + return QueryEvent( + project_id=payload.get("project_id"), + jurisdiction=payload.get("jurisdiction", ""), + query_category=payload.get("query_category", "general"), + query_text=payload.get("query_text", ""), + timestamp=payload.get("timestamp", ""), + user_type=payload.get("user_type", ""), + ) + + def process_event(self, db: Session, event: QueryEvent) -> 
None: + """Process a query event by updating project query metrics.""" + if not event.project_id: + return + + project = db.get(Project, event.project_id) + if project is None: + logger.info(f"Query event for unknown project: {event.project_id}") + return + + # Increment query counter + project.housing_mind_queries = (project.housing_mind_queries or 0) + 1 + + # Update top query categories + categories = project.top_query_categories or {} + categories[event.query_category] = categories.get(event.query_category, 0) + 1 + project.top_query_categories = categories + + db.commit() + logger.info( + f"Updated query metrics for project {project.project_slug}: " + f"total={project.housing_mind_queries}" + ) diff --git a/src/integrations/public_records.py b/src/integrations/public_records.py new file mode 100644 index 0000000..44ad2be --- /dev/null +++ b/src/integrations/public_records.py @@ -0,0 +1,97 @@ +"""Public records data scrapers for permit and project data.""" + +import logging +from dataclasses import dataclass +from datetime import date + +import httpx + +logger = logging.getLogger(__name__) + + +@dataclass +class PermitRecord: + """A building permit record from public data.""" + + permit_number: str + address: str + jurisdiction: str + permit_type: str = "" + status: str = "" + issue_date: str = "" + description: str = "" + units: int = 0 + valuation: float = 0.0 + + +@dataclass +class LIHTCProject: + """A LIHTC project record from state HFA reports.""" + + project_name: str + address: str + city: str + state: str + credit_type: str = "" # 4% or 9% + total_units: int = 0 + low_income_units: int = 0 + allocation_year: int = 0 + placed_in_service_date: str = "" + total_credit: float = 0.0 + + +class PermitScraper: + """Base class for scraping city/county permit databases.""" + + def __init__(self, jurisdiction: str, base_url: str): + self.jurisdiction = jurisdiction + self.base_url = base_url + self._client = httpx.AsyncClient(timeout=60.0) + + async def 
search_permits( + self, + address: str | None = None, + date_from: date | None = None, + date_to: date | None = None, + permit_type: str | None = None, + ) -> list[PermitRecord]: + """Search for permits in the jurisdiction's database. + + This is a base implementation. Subclass for specific jurisdictions. + """ + logger.info( + f"Permit search for {self.jurisdiction}: " + f"address={address}, date_from={date_from}" + ) + # Each jurisdiction has a different API/format. + # Subclasses implement the actual scraping logic. + return [] + + async def close(self) -> None: + await self._client.aclose() + + +class LIHTCScraper: + """Scraper for state LIHTC allocation data from HFA annual reports.""" + + def __init__(self) -> None: + self._client = httpx.AsyncClient(timeout=60.0) + + async def get_state_allocations( + self, + state: str, + year: int | None = None, + ) -> list[LIHTCProject]: + """Fetch LIHTC allocation data for a state. + + In production, this scrapes state HFA websites or uses the + HUD LIHTC database API. + """ + logger.info(f"LIHTC lookup for state={state}, year={year}") + # The HUD LIHTC database provides national data. + # State HFAs publish annual allocation lists. + # This is a placeholder for the actual scraping/API logic. + return [] + + async def close(self) -> None: + await self._client.aclose() diff --git a/src/ml/__init__.py b/src/ml/__init__.py new file mode 100644 index 0000000..dd118df --- /dev/null +++ b/src/ml/__init__.py @@ -0,0 +1,87 @@ +"""HousingHand ML module -- timeline prediction for affordable housing projects. 
+ +Public API +---------- +Model layer: + TimelineModel -- Random Forest wrapper (train / predict / save / load) + StagePrediction -- Single-stage prediction with CI bounds + TimelinePrediction -- Combined 3-stage prediction result + +Feature engineering: + extract_project_features -- Single-project feature dict + extract_features_dataframe -- Batch feature matrix + extract_targets -- Target (y) matrix from projects + prepare_training_data -- One-call (X, y) builder + build_feature_schema -- Canonical column ordering + +Training pipeline: + run_training_pipeline -- Full train/CV/evaluate from ORM objects + train_from_dataframes -- Train from pre-built DataFrames + retrain_production_model -- Final artifact with no held-out split + TrainingConfig -- Hyperparameter configuration + TrainingResult -- Pipeline output container + +Evaluation: + evaluate_predictions -- RMSE / MAE / R2 / MAPE + CI coverage + compute_stage_metrics -- Metrics for a single stage + residual_dataframe -- Tidy residuals for plotting + identify_outlier_predictions -- Flag poorly predicted projects + EvaluationReport -- Full evaluation container + StageMetrics -- Per-stage metric container +""" + +from src.ml.feature_engineering import ( + TARGET_STAGES, + build_feature_schema, + extract_features_dataframe, + extract_project_features, + extract_targets, + prepare_training_data, +) +from src.ml.model_evaluation import ( + EvaluationReport, + StageMetrics, + compute_stage_metrics, + evaluate_predictions, + identify_outlier_predictions, + residual_dataframe, +) +from src.ml.model_training import ( + TrainingConfig, + TrainingResult, + retrain_production_model, + run_training_pipeline, + train_from_dataframes, +) +from src.ml.timeline_model import ( + StagePrediction, + TimelineModel, + TimelinePrediction, +) + +__all__ = [ + # Model + "TimelineModel", + "StagePrediction", + "TimelinePrediction", + # Features + "extract_project_features", + "extract_features_dataframe", + "extract_targets", + 
"prepare_training_data", + "build_feature_schema", + "TARGET_STAGES", + # Training + "run_training_pipeline", + "train_from_dataframes", + "retrain_production_model", + "TrainingConfig", + "TrainingResult", + # Evaluation + "evaluate_predictions", + "compute_stage_metrics", + "residual_dataframe", + "identify_outlier_predictions", + "EvaluationReport", + "StageMetrics", +] diff --git a/src/ml/feature_engineering.py b/src/ml/feature_engineering.py new file mode 100644 index 0000000..cdf7da0 --- /dev/null +++ b/src/ml/feature_engineering.py @@ -0,0 +1,384 @@ +"""Feature engineering for the HousingHand timeline prediction model. + +Extracts numeric and categorical features from Project objects, +producing a flat feature vector suitable for scikit-learn estimators. +""" + +from __future__ import annotations + +import logging +import math +from dataclasses import dataclass, field +from typing import Any, Sequence + +import numpy as np +import pandas as pd + +from src.models.enums import BuildingType, NeighborOpposition, PipelineStage, StructureType +from src.models.project import Project + +logger = logging.getLogger(__name__) + +# --------------------------------------------------------------------------- +# Constants +# --------------------------------------------------------------------------- + +TARGET_STAGES: list[str] = [ + "entitlement_days", + "financing_days", + "construction_days", +] + +# Ordered mapping for opposition level -> numeric severity score (0-4) +_OPPOSITION_SCORES: dict[str | None, int] = { + None: 0, + NeighborOpposition.NONE: 0, + NeighborOpposition.LOW: 1, + NeighborOpposition.MODERATE: 2, + NeighborOpposition.HIGH: 3, + NeighborOpposition.SEVERE: 4, +} + +# Building-type one-hot categories (sorted for deterministic ordering) +_BUILDING_TYPES: list[str] = sorted(bt.value for bt in BuildingType) + +# Structure-type one-hot categories +_STRUCTURE_TYPES: list[str] = sorted(st.value for st in StructureType) + +# Pipeline stage ordinal encoding 
@dataclass
class FeatureSchema:
    """Canonical description of the feature-matrix columns.

    Numeric columns come first, followed by the categorical (one-hot)
    columns; training and inference both rely on this ordering to keep
    their matrices aligned.
    """

    numeric_columns: list[str] = field(default_factory=list)
    categorical_columns: list[str] = field(default_factory=list)

    @property
    def all_columns(self) -> list[str]:
        """Every column name, numeric block first, as a new list."""
        return [*self.numeric_columns, *self.categorical_columns]

    @property
    def n_features(self) -> int:
        """Total number of feature columns."""
        return len(self.numeric_columns) + len(self.categorical_columns)
"peer_median_construction_days", + # Calendar + "start_month_sin", + "start_month_cos", + "start_year", + # Current stage ordinal + "current_stage_ordinal", + ] + + categorical = ( + [f"building_type_{bt}" for bt in _BUILDING_TYPES] + + [f"structure_type_{st}" for st in _STRUCTURE_TYPES] + ) + + return FeatureSchema(numeric_columns=numeric, categorical_columns=categorical) + + +# --------------------------------------------------------------------------- +# Safe value helpers +# --------------------------------------------------------------------------- + +def _safe_ratio(numerator: float | None, denominator: float | None, default: float = 0.0) -> float: + """Compute *numerator / denominator* without blowing up.""" + if numerator is None or denominator is None or denominator == 0: + return default + return float(numerator) / float(denominator) + + +def _safe_log(value: float | None, default: float = 0.0) -> float: + if value is None or value <= 0: + return default + return math.log(float(value)) + + +def _pct_of_total(part: int | None, total: int | None) -> float: + if part is None or total is None or total == 0: + return 0.0 + return float(part) / float(total) + + +# --------------------------------------------------------------------------- +# Single-project feature extraction +# --------------------------------------------------------------------------- + +def extract_project_features( + project: Project, + *, + construction_cost_index: float | None = None, + interest_rate: float | None = None, + peer_median_entitlement_days: float | None = None, + peer_median_financing_days: float | None = None, + peer_median_construction_days: float | None = None, +) -> dict[str, float]: + """Convert one :class:`Project` into a flat feature dictionary. + + External signals (cost index, interest rate, peer medians) are + accepted as keyword arguments because they are not stored directly + on the ``Project`` row -- they come from companion tables or APIs. 
+ + Returns a ``dict`` whose keys match :func:`build_feature_schema`. + """ + total = project.total_units or 0 + affordable = project.affordable_units or 0 + deep_units = (project.ami_30_units or 0) + (project.ami_40_units or 0) + + three_plus_br = (project.three_br_units or 0) + (project.four_plus_br_units or 0) + + parking_ratio = _safe_ratio(project.parking_spaces, total) + density = _safe_ratio(total, project.site_acres) if project.site_acres else 0.0 + + # Calendar features -- cyclical encoding of start month + start_date = ( + project.entitlement_start + or project.pre_development_start + or project.concept_start + ) + if start_date is not None: + month = start_date.month + start_month_sin = math.sin(2 * math.pi * month / 12) + start_month_cos = math.cos(2 * math.pi * month / 12) + start_year = float(start_date.year) + else: + start_month_sin = 0.0 + start_month_cos = 1.0 + start_year = 0.0 + + # One-hot encoding for building_type + bt_value = project.building_type.value if project.building_type else None + bt_features = { + f"building_type_{bt}": float(bt == bt_value) for bt in _BUILDING_TYPES + } + + # One-hot encoding for structure_type + st_value = project.structure_type.value if project.structure_type else None + st_features = { + f"structure_type_{st}": float(st == st_value) for st in _STRUCTURE_TYPES + } + + features: dict[str, float] = { + # Scale / size + "total_units": float(total), + "affordable_units_pct": _pct_of_total(affordable, total), + "deep_affordability_pct": _pct_of_total(deep_units, total), + "market_rate_pct": _pct_of_total(project.market_rate_units, total), + "stories": float(project.stories or 0), + "parking_ratio": parking_ratio, + "site_acres": float(project.site_acres or 0), + "density_units_per_acre": density, + # Unit-mix proportions + "studio_pct": _pct_of_total(project.studio_units, total), + "one_br_pct": _pct_of_total(project.one_br_units, total), + "two_br_pct": _pct_of_total(project.two_br_units, total), + 
"three_plus_br_pct": _pct_of_total(three_plus_br, total), + # Special populations + "senior_pct": _pct_of_total(project.senior_units, total), + "family_pct": _pct_of_total(project.family_units, total), + "psf_pct": _pct_of_total(project.psf_units, total), + "homeless_pct": _pct_of_total(project.homeless_set_aside, total), + # Regulatory complexity + "jurisdiction_friction_score": float(project.jurisdiction_friction_score or 0), + "opposition_score": float(_OPPOSITION_SCORES.get(project.neighbor_opposition_level, 0)), + "variance_hearings": float(project.variance_hearings or 0), + "design_review_iterations": float(project.design_review_iterations or 0), + "appeals_filed": float(project.appeals_filed or 0), + # Cost context + "cost_per_unit_log": _safe_log(project.cost_per_unit), + "tdc_log": _safe_log(project.total_development_cost), + # External signals (defaults to 0 when unavailable) + "construction_cost_index": float(construction_cost_index or 0), + "interest_rate": float(interest_rate or 0), + # Peer medians + "peer_median_entitlement_days": float(peer_median_entitlement_days or 0), + "peer_median_financing_days": float(peer_median_financing_days or 0), + "peer_median_construction_days": float(peer_median_construction_days or 0), + # Calendar + "start_month_sin": start_month_sin, + "start_month_cos": start_month_cos, + "start_year": start_year, + # Stage + "current_stage_ordinal": float(_STAGE_ORDINAL.get(project.current_stage, -1)), + } + + features.update(bt_features) + features.update(st_features) + + return features + + +# --------------------------------------------------------------------------- +# Batch extraction +# --------------------------------------------------------------------------- + +def extract_features_dataframe( + projects: Sequence[Project], + *, + external_signals: dict[Any, dict[str, float]] | None = None, +) -> pd.DataFrame: + """Build feature matrix for a list of projects. 
+ + Parameters + ---------- + projects: + Iterable of :class:`Project` ORM objects. + external_signals: + Optional mapping of ``project_id -> {"construction_cost_index": ..., ...}`` + providing per-project external signals (cost index, interest rate, + peer medians). Keys that do not match a project are silently ignored. + + Returns + ------- + pd.DataFrame + One row per project, columns ordered by :func:`build_feature_schema`. + """ + schema = build_feature_schema() + rows: list[dict[str, float]] = [] + + for proj in projects: + ext = {} + if external_signals and proj.project_id in external_signals: + ext = external_signals[proj.project_id] + + row = extract_project_features( + proj, + construction_cost_index=ext.get("construction_cost_index"), + interest_rate=ext.get("interest_rate"), + peer_median_entitlement_days=ext.get("peer_median_entitlement_days"), + peer_median_financing_days=ext.get("peer_median_financing_days"), + peer_median_construction_days=ext.get("peer_median_construction_days"), + ) + rows.append(row) + + if not rows: + return pd.DataFrame(columns=schema.all_columns) + + df = pd.DataFrame(rows) + + # Ensure column ordering matches schema; fill any missing cols with 0 + for col in schema.all_columns: + if col not in df.columns: + df[col] = 0.0 + + df = df[schema.all_columns] + return df + + +def extract_targets(projects: Sequence[Project]) -> pd.DataFrame: + """Extract the three target stage durations for supervised training. + + Returns a DataFrame with columns ``entitlement_days``, + ``financing_days``, and ``construction_days``. Rows where *any* + target is ``None`` are **not** dropped -- callers should handle + missing targets (e.g. by filtering or imputing). 
+ """ + records = [] + for proj in projects: + records.append({ + "entitlement_days": proj.entitlement_duration_days, + "financing_days": proj.financing_duration_days, + "construction_days": proj.construction_duration_days, + }) + + return pd.DataFrame(records, columns=TARGET_STAGES) + + +def prepare_training_data( + projects: Sequence[Project], + *, + external_signals: dict[Any, dict[str, float]] | None = None, + drop_incomplete_targets: bool = True, +) -> tuple[pd.DataFrame, pd.DataFrame]: + """One-call helper that produces aligned (X, y) matrices. + + Parameters + ---------- + projects: + Project ORM objects. + external_signals: + See :func:`extract_features_dataframe`. + drop_incomplete_targets: + If *True* (default), rows where **any** target column is NaN + are dropped from both X and y. + + Returns + ------- + (X, y) : tuple of pd.DataFrame + """ + X = extract_features_dataframe(projects, external_signals=external_signals) + y = extract_targets(projects) + + if drop_incomplete_targets: + valid_mask = y.notna().all(axis=1) + n_dropped = (~valid_mask).sum() + if n_dropped > 0: + logger.info( + "Dropped %d / %d projects with incomplete target durations.", + n_dropped, + len(y), + ) + X = X.loc[valid_mask].reset_index(drop=True) + y = y.loc[valid_mask].reset_index(drop=True) + + return X, y diff --git a/src/ml/model_evaluation.py b/src/ml/model_evaluation.py new file mode 100644 index 0000000..0ecd090 --- /dev/null +++ b/src/ml/model_evaluation.py @@ -0,0 +1,384 @@ +"""Evaluation metrics and diagnostic utilities for the timeline model. + +Computes RMSE, MAE, R-squared, and MAPE for each predicted stage as well +as aggregate (all-stages) metrics. Also provides calibration checks for +the confidence-interval coverage and residual analysis helpers. 
+""" + +from __future__ import annotations + +import logging +from dataclasses import dataclass, field +from typing import Any + +import numpy as np +import pandas as pd +from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score + +from src.ml.feature_engineering import TARGET_STAGES + +logger = logging.getLogger(__name__) + + +# --------------------------------------------------------------------------- +# Data classes +# --------------------------------------------------------------------------- + +@dataclass +class StageMetrics: + """Evaluation metrics for a single predicted stage.""" + + stage: str + n_samples: int + rmse: float + mae: float + r2: float + mape: float # mean absolute percentage error (0-100 scale) + median_absolute_error: float + max_error: float + mean_residual: float # bias indicator; ideally ~0 + + def to_dict(self) -> dict[str, Any]: + return { + "stage": self.stage, + "n_samples": self.n_samples, + "rmse": round(self.rmse, 2), + "mae": round(self.mae, 2), + "r2": round(self.r2, 4), + "mape": round(self.mape, 2), + "median_absolute_error": round(self.median_absolute_error, 2), + "max_error": round(self.max_error, 2), + "mean_residual": round(self.mean_residual, 2), + } + + +@dataclass +class EvaluationReport: + """Full evaluation report spanning all target stages.""" + + per_stage: dict[str, StageMetrics] + aggregate_rmse: float + aggregate_mae: float + aggregate_r2: float + aggregate_mape: float + ci_coverage: dict[str, float] | None = None # fraction of actuals inside CI + metadata: dict[str, Any] = field(default_factory=dict) + + def to_dict(self) -> dict[str, Any]: + return { + "per_stage": {k: v.to_dict() for k, v in self.per_stage.items()}, + "aggregate": { + "rmse": round(self.aggregate_rmse, 2), + "mae": round(self.aggregate_mae, 2), + "r2": round(self.aggregate_r2, 4), + "mape": round(self.aggregate_mape, 2), + }, + "ci_coverage": ( + {k: round(v, 4) for k, v in self.ci_coverage.items()} + if self.ci_coverage + 
else None + ), + "metadata": self.metadata, + } + + def summary_table(self) -> pd.DataFrame: + """Return a tidy DataFrame with one row per stage + an aggregate row.""" + rows = [] + for stage in TARGET_STAGES: + if stage in self.per_stage: + rows.append(self.per_stage[stage].to_dict()) + + rows.append({ + "stage": "AGGREGATE", + "n_samples": rows[0]["n_samples"] if rows else 0, + "rmse": round(self.aggregate_rmse, 2), + "mae": round(self.aggregate_mae, 2), + "r2": round(self.aggregate_r2, 4), + "mape": round(self.aggregate_mape, 2), + "median_absolute_error": None, + "max_error": None, + "mean_residual": None, + }) + return pd.DataFrame(rows) + + +# --------------------------------------------------------------------------- +# Core metric computation +# --------------------------------------------------------------------------- + +def _compute_mape(y_true: np.ndarray, y_pred: np.ndarray) -> float: + """Mean absolute percentage error (0-100 scale). + + Samples where ``y_true == 0`` are excluded to avoid division by zero. + Returns 0.0 if no valid samples remain. + """ + mask = y_true != 0 + if not np.any(mask): + return 0.0 + return float(np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])) * 100) + + +def compute_stage_metrics( + y_true: np.ndarray, + y_pred: np.ndarray, + stage: str, +) -> StageMetrics: + """Compute all metrics for a single target stage. + + Parameters + ---------- + y_true : 1-D array of actual durations. + y_pred : 1-D array of predicted durations. + stage : Human-readable stage name (e.g. ``"entitlement_days"``). 
+ """ + y_true = np.asarray(y_true, dtype=np.float64).ravel() + y_pred = np.asarray(y_pred, dtype=np.float64).ravel() + + if len(y_true) != len(y_pred): + raise ValueError( + f"Length mismatch: y_true has {len(y_true)}, y_pred has {len(y_pred)}" + ) + + n = len(y_true) + residuals = y_true - y_pred + + rmse = float(np.sqrt(mean_squared_error(y_true, y_pred))) + mae = float(mean_absolute_error(y_true, y_pred)) + r2 = float(r2_score(y_true, y_pred)) if n >= 2 else 0.0 + mape = _compute_mape(y_true, y_pred) + med_ae = float(np.median(np.abs(residuals))) + max_err = float(np.max(np.abs(residuals))) if n > 0 else 0.0 + mean_res = float(np.mean(residuals)) + + return StageMetrics( + stage=stage, + n_samples=n, + rmse=rmse, + mae=mae, + r2=r2, + mape=mape, + median_absolute_error=med_ae, + max_error=max_err, + mean_residual=mean_res, + ) + + +# --------------------------------------------------------------------------- +# Full evaluation +# --------------------------------------------------------------------------- + +def evaluate_predictions( + y_true: pd.DataFrame | np.ndarray, + y_pred: pd.DataFrame | np.ndarray, + *, + y_pred_lower: pd.DataFrame | np.ndarray | None = None, + y_pred_upper: pd.DataFrame | np.ndarray | None = None, + confidence_level: float | None = None, +) -> EvaluationReport: + """Evaluate multi-stage predictions and build a full report. + + Parameters + ---------- + y_true: + Actual stage durations, shape ``(n, 3)`` with columns ordered + per ``TARGET_STAGES``. + y_pred: + Predicted stage durations, same shape. + y_pred_lower, y_pred_upper: + Optional lower/upper bounds of confidence intervals. When + both are provided, CI coverage is computed. + confidence_level: + Nominal CI probability (recorded in metadata). 
+ + Returns + ------- + EvaluationReport + """ + y_true_arr = np.asarray(y_true, dtype=np.float64) + y_pred_arr = np.asarray(y_pred, dtype=np.float64) + + if y_true_arr.ndim == 1: + y_true_arr = y_true_arr.reshape(-1, 1) + if y_pred_arr.ndim == 1: + y_pred_arr = y_pred_arr.reshape(-1, 1) + + n_stages = y_true_arr.shape[1] + if n_stages != len(TARGET_STAGES): + raise ValueError( + f"Expected {len(TARGET_STAGES)} target columns, got {n_stages}" + ) + + # Per-stage metrics + per_stage: dict[str, StageMetrics] = {} + for idx, stage in enumerate(TARGET_STAGES): + per_stage[stage] = compute_stage_metrics( + y_true_arr[:, idx], y_pred_arr[:, idx], stage + ) + + # Aggregate metrics (flatten all stages into one vector) + all_true = y_true_arr.ravel() + all_pred = y_pred_arr.ravel() + + agg_rmse = float(np.sqrt(mean_squared_error(all_true, all_pred))) + agg_mae = float(mean_absolute_error(all_true, all_pred)) + agg_r2 = float(r2_score(all_true, all_pred)) if len(all_true) >= 2 else 0.0 + agg_mape = _compute_mape(all_true, all_pred) + + # CI coverage + ci_coverage: dict[str, float] | None = None + if y_pred_lower is not None and y_pred_upper is not None: + lower_arr = np.asarray(y_pred_lower, dtype=np.float64) + upper_arr = np.asarray(y_pred_upper, dtype=np.float64) + if lower_arr.ndim == 1: + lower_arr = lower_arr.reshape(-1, 1) + if upper_arr.ndim == 1: + upper_arr = upper_arr.reshape(-1, 1) + + ci_coverage = {} + for idx, stage in enumerate(TARGET_STAGES): + in_interval = ( + (y_true_arr[:, idx] >= lower_arr[:, idx]) + & (y_true_arr[:, idx] <= upper_arr[:, idx]) + ) + ci_coverage[stage] = float(np.mean(in_interval)) + + # Aggregate coverage + all_in = ( + (y_true_arr >= lower_arr) & (y_true_arr <= upper_arr) + ) + ci_coverage["aggregate"] = float(np.mean(all_in)) + + metadata: dict[str, Any] = { + "n_samples": int(y_true_arr.shape[0]), + "n_stages": n_stages, + } + if confidence_level is not None: + metadata["confidence_level"] = confidence_level + + report = 
def residual_dataframe(
    y_true: pd.DataFrame | np.ndarray,
    y_pred: pd.DataFrame | np.ndarray,
) -> pd.DataFrame:
    """Build a tidy DataFrame of residuals for further analysis / plotting.

    Both inputs must have shape ``(n, len(TARGET_STAGES))`` with columns
    ordered per ``TARGET_STAGES``.

    Returns one row per (stage, sample) pair, grouped by stage, with
    columns ``stage``, ``sample_idx``, ``actual``, ``predicted``,
    ``residual`` (actual - predicted), ``abs_error`` and ``pct_error``
    (0-100 scale; 0.0 where ``actual`` is 0). With zero samples the frame
    is empty but still carries these columns.

    Raises
    ------
    ValueError
        If the inputs are not 2-D with one column per target stage, or
        their shapes differ.
    """
    columns = [
        "stage",
        "sample_idx",
        "actual",
        "predicted",
        "residual",
        "abs_error",
        "pct_error",
    ]
    actual = np.asarray(y_true, dtype=np.float64)
    predicted = np.asarray(y_pred, dtype=np.float64)

    # Nothing to report: keep the schema so column lookups downstream work.
    if actual.shape[0] == 0:
        return pd.DataFrame(columns=columns)

    n_stages = len(TARGET_STAGES)
    if actual.ndim != 2 or actual.shape[1] != n_stages:
        raise ValueError(
            f"Expected y_true of shape (n, {n_stages}), got {actual.shape}"
        )
    if predicted.shape != actual.shape:
        raise ValueError(
            f"Shape mismatch: y_true is {actual.shape}, y_pred is {predicted.shape}"
        )

    n_samples = actual.shape[0]

    # Transpose before flattening so rows come out stage by stage
    # (all samples of stage 0, then all samples of stage 1, ...).
    actual_flat = actual.T.ravel()
    predicted_flat = predicted.T.ravel()

    residual = actual_flat - predicted_flat
    abs_error = np.abs(residual)

    # Percentage error is defined as 0 where the actual value is 0.
    pct_error = np.zeros_like(abs_error)
    np.divide(abs_error, actual_flat, out=pct_error, where=actual_flat != 0)
    pct_error *= 100

    return pd.DataFrame(
        {
            "stage": np.repeat(np.asarray(TARGET_STAGES, dtype=object), n_samples),
            "sample_idx": np.tile(np.arange(n_samples), n_stages),
            "actual": actual_flat,
            "predicted": predicted_flat,
            "residual": residual,
            "abs_error": abs_error,
            "pct_error": pct_error,
        },
        columns=columns,
    )
+ """ + df = residual_dataframe(y_true, y_pred) + outliers = df[df["pct_error"] > threshold_pct].sort_values( + "pct_error", ascending=False + ) + return outliers.reset_index(drop=True) + + +# --------------------------------------------------------------------------- +# Cross-validation evaluation helper +# --------------------------------------------------------------------------- + +def evaluate_cv_results( + cv_scores: dict[str, list[float]], +) -> dict[str, dict[str, float]]: + """Summarize per-fold cross-validation scores. + + Parameters + ---------- + cv_scores: + Mapping of ``metric_name -> [fold_1_score, fold_2_score, ...]``. + + Returns + ------- + dict mapping each metric to ``{"mean": ..., "std": ..., "min": ..., "max": ...}``. + """ + summary: dict[str, dict[str, float]] = {} + for metric, scores in cv_scores.items(): + arr = np.array(scores, dtype=np.float64) + summary[metric] = { + "mean": float(np.mean(arr)), + "std": float(np.std(arr)), + "min": float(np.min(arr)), + "max": float(np.max(arr)), + "n_folds": len(scores), + } + return summary + + +# --------------------------------------------------------------------------- +# Logging helper +# --------------------------------------------------------------------------- + +def _log_report(report: EvaluationReport) -> None: + """Emit a summary of the evaluation report to the logger.""" + logger.info("=== Evaluation Report ===") + for stage, m in report.per_stage.items(): + logger.info( + " %-25s RMSE=%7.1f MAE=%7.1f R2=%6.3f MAPE=%5.1f%%", + stage, + m.rmse, + m.mae, + m.r2, + m.mape, + ) + logger.info( + " %-25s RMSE=%7.1f MAE=%7.1f R2=%6.3f MAPE=%5.1f%%", + "AGGREGATE", + report.aggregate_rmse, + report.aggregate_mae, + report.aggregate_r2, + report.aggregate_mape, + ) + if report.ci_coverage: + for stage, cov in report.ci_coverage.items(): + logger.info(" CI coverage %-20s %.1f%%", stage, cov * 100) diff --git a/src/ml/model_training.py b/src/ml/model_training.py new file mode 100644 index 
0000000..987f45d --- /dev/null +++ b/src/ml/model_training.py @@ -0,0 +1,490 @@ +"""Training pipeline for the HousingHand timeline prediction model. + +Provides a structured pipeline that: + +1. Accepts raw Project ORM objects (or pre-built feature / target matrices). +2. Splits data into train / test sets. +3. Runs k-fold cross-validation on the training set. +4. Trains the final model on the full training set. +5. Evaluates on the held-out test set. +6. Persists the trained model artifact. + +Hyperparameter notes +-------------------- +The default Random Forest configuration was chosen for a balance of +accuracy and training speed on the typical HousingHand dataset size +(hundreds to low-thousands of projects): + +* ``n_estimators=300`` -- enough trees for stable quantile intervals + without excessive memory. +* ``max_depth=18`` -- deep enough to capture non-linear interactions + (e.g. friction * building_type) without severe overfitting on <2 k + samples. +* ``min_samples_split=8, min_samples_leaf=4`` -- regularization guards + against noisy duration outliers. +* ``max_features="sqrt"`` -- decorrelates trees and improves CI coverage. + +For larger datasets (>5 k rows) consider increasing ``n_estimators`` to +500 and relaxing ``min_samples_leaf`` to 2. A Bayesian optimisation +sweep over ``max_depth``, ``min_samples_leaf``, and ``n_estimators`` is +recommended before production deployment. 
@dataclass
class TrainingConfig:
    """Single home for every knob that shapes a training run."""

    # --- hold-out split ---
    test_size: float = 0.20
    split_random_state: int = 42

    # --- k-fold cross-validation ---
    cv_folds: int = 5
    cv_shuffle: bool = True
    cv_random_state: int = 42

    # --- Random Forest hyperparameters, handed on to TimelineModel ---
    rf_params: dict[str, Any] = field(default_factory=lambda: dict(DEFAULT_RF_PARAMS))

    # --- width of the confidence interval ---
    confidence_level: float = 0.90

    # --- artifact persistence: when set, the trained model is saved here ---
    model_output_path: str | None = None

    def describe(self) -> dict[str, Any]:
        """Return a JSON-serializable summary of this config.

        Only the fields listed below are reported; the random seeds and the
        shuffle flag are not part of the summary.
        """
        reported = (
            "test_size",
            "cv_folds",
            "confidence_level",
            "rf_params",
            "model_output_path",
        )
        return {name: getattr(self, name) for name in reported}
def _run_cross_validation(
    X_train: np.ndarray,
    y_train: np.ndarray,
    config: TrainingConfig,
) -> dict[str, dict[str, float]]:
    """Run K-fold cross-validation and summarize the per-stage fold scores.

    For each fold a fresh ``TimelineModel`` is trained on the fold's training
    rows and scored on the fold's validation rows.

    Parameters
    ----------
    X_train:
        Feature matrix, indexed by row for the fold splits.
    y_train:
        Target matrix with one column per entry of ``TARGET_STAGES``, in the
        same order.
    config:
        Supplies the fold count, shuffling options and model hyperparameters.

    Returns
    -------
    dict
        Flat mapping ``"<stage>_<metric>"`` (stage name without the ``_days``
        suffix; metric one of ``rmse``, ``mae``, ``r2``) to the summary that
        ``evaluate_cv_results`` computes from that metric's per-fold scores.
    """
    kf = KFold(
        n_splits=config.cv_folds,
        shuffle=config.cv_shuffle,
        # scikit-learn raises ValueError if a random_state is supplied while
        # shuffle is False, so only forward the seed when shuffling.
        random_state=config.cv_random_state if config.cv_shuffle else None,
    )

    metrics = ("rmse", "mae", "r2")
    stage_keys = [stage.replace("_days", "") for stage in TARGET_STAGES]

    # "<stage>_<metric>" -> [score of fold 1, score of fold 2, ...]
    fold_scores: dict[str, list[float]] = {
        f"{key}_{metric}": [] for key in stage_keys for metric in metrics
    }

    for fold_idx, (train_idx, val_idx) in enumerate(kf.split(X_train)):
        logger.info("CV fold %d / %d", fold_idx + 1, config.cv_folds)

        X_fold_val = X_train[val_idx]
        y_fold_val = y_train[val_idx]

        fold_model = TimelineModel(
            rf_params=config.rf_params,
            confidence_level=config.confidence_level,
        )
        # Fit strictly on the fold's training rows; validation rows are only
        # ever used for scoring below.
        fold_model.train(X_train[train_idx], y_train[train_idx])

        pred_df = fold_model.predict_dataframe(X_fold_val)

        for stage_idx, key in enumerate(stage_keys):
            y_actual = y_fold_val[:, stage_idx]
            residuals = y_actual - pred_df[f"{key}_pred"].values

            ss_res = float(np.sum(residuals ** 2))
            ss_tot = float(np.sum((y_actual - np.mean(y_actual)) ** 2))
            # A constant validation target gives ss_tot == 0; report R2 as 0.0
            # rather than dividing by zero.
            r2 = 1.0 - ss_res / ss_tot if ss_tot > 0 else 0.0

            fold_scores[f"{key}_rmse"].append(float(np.sqrt(np.mean(residuals ** 2))))
            fold_scores[f"{key}_mae"].append(float(np.mean(np.abs(residuals))))
            fold_scores[f"{key}_r2"].append(r2)

    return evaluate_cv_results(fold_scores)
Train / test split + X_train, X_test, y_train, y_test = train_test_split( + X, + y, + test_size=config.test_size, + random_state=config.split_random_state, + ) + + n_train = len(X_train) + n_test = len(X_test) + logger.info("Split: %d train, %d test (%.0f%% held out).", n_train, n_test, config.test_size * 100) + + # Convert to numpy for sklearn + X_train_arr = X_train.values.astype(np.float64) + y_train_arr = y_train.values.astype(np.float64) + X_test_arr = X_test.values.astype(np.float64) + y_test_arr = y_test.values.astype(np.float64) + + # 3. Cross-validation + logger.info("Running %d-fold cross-validation ...", config.cv_folds) + cv_scores = _run_cross_validation(X_train_arr, y_train_arr, config) + + # 4. Train final model on full training set + logger.info("Training final model on full training set ...") + model = TimelineModel( + rf_params=config.rf_params, + confidence_level=config.confidence_level, + ) + model.train(X_train, y_train) + + # 5. Evaluate on held-out test set + logger.info("Evaluating on held-out test set ...") + pred_df = model.predict_dataframe(X_test_arr, confidence_level=config.confidence_level) + + # Build y_pred array aligned with TARGET_STAGES + y_pred_arr = np.column_stack([ + pred_df[f"{stage.replace('_days', '')}_pred"].values + for stage in TARGET_STAGES + ]) + + y_lower_arr = np.column_stack([ + pred_df[f"{stage.replace('_days', '')}_lower"].values + for stage in TARGET_STAGES + ]) + + y_upper_arr = np.column_stack([ + pred_df[f"{stage.replace('_days', '')}_upper"].values + for stage in TARGET_STAGES + ]) + + test_eval = evaluate_predictions( + y_test_arr, + y_pred_arr, + y_pred_lower=y_lower_arr, + y_pred_upper=y_upper_arr, + confidence_level=config.confidence_level, + ) + + elapsed = time.perf_counter() - t0 + + # 6. 
def train_from_dataframes(
    X: pd.DataFrame,
    y: pd.DataFrame,
    *,
    config: TrainingConfig | None = None,
) -> TrainingResult:
    """Train from pre-built feature / target DataFrames (skip ORM extraction).

    Useful when features have already been computed or when working with
    CSV exports rather than a live database.

    Parameters
    ----------
    X:
        Feature matrix, one row per project.
    y:
        Target matrix with one column per entry of ``TARGET_STAGES``.
    config:
        Training hyperparameters.  A default ``TrainingConfig`` is used if
        omitted.

    Returns
    -------
    TrainingResult
        The fitted model plus CV scores, held-out evaluation, sample counts,
        timing and (if ``config.model_output_path`` is set) the saved path.

    Raises
    ------
    ValueError
        If there are too few samples overall, or if the training split is
        smaller than the requested number of CV folds.
    """
    if config is None:
        config = TrainingConfig()

    t0 = time.perf_counter()

    n_total = len(X)
    if n_total < config.cv_folds + 2:
        raise ValueError(
            f"Not enough samples ({n_total}) for {config.cv_folds}-fold CV."
        )

    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=config.test_size,
        random_state=config.split_random_state,
    )

    n_train = len(X_train)
    n_test = len(X_test)

    # The guard above only looks at the total; a large test_size can still
    # leave fewer training rows than folds, which K-fold cannot split.
    if n_train < config.cv_folds:
        raise ValueError(
            f"Training split has only {n_train} samples, fewer than the "
            f"{config.cv_folds} CV folds requested."
        )

    X_train_arr = X_train.values.astype(np.float64)
    y_train_arr = y_train.values.astype(np.float64)
    X_test_arr = X_test.values.astype(np.float64)
    y_test_arr = y_test.values.astype(np.float64)

    # Cross-validation on the training portion only
    cv_scores = _run_cross_validation(X_train_arr, y_train_arr, config)

    # Final model: fitted on the DataFrames so column names are retained
    model = TimelineModel(
        rf_params=config.rf_params,
        confidence_level=config.confidence_level,
    )
    model.train(X_train, y_train)

    # Test evaluation
    pred_df = model.predict_dataframe(X_test_arr, confidence_level=config.confidence_level)

    def _stack(suffix: str) -> np.ndarray:
        """Stack the ``<stage>_<suffix>`` columns in TARGET_STAGES order."""
        return np.column_stack([
            pred_df[f"{stage.replace('_days', '')}_{suffix}"].values
            for stage in TARGET_STAGES
        ])

    test_eval = evaluate_predictions(
        y_test_arr,
        _stack("pred"),
        y_pred_lower=_stack("lower"),
        y_pred_upper=_stack("upper"),
        confidence_level=config.confidence_level,
    )

    # Timing deliberately excludes the optional artifact save below.
    elapsed = time.perf_counter() - t0

    model_path: Path | None = None
    if config.model_output_path:
        model_path = model.save(config.model_output_path)

    return TrainingResult(
        model=model,
        config=config,
        n_total_samples=n_total,
        n_train_samples=n_train,
        n_test_samples=n_test,
        cv_scores=cv_scores,
        test_evaluation=test_eval,
        training_duration_seconds=elapsed,
        model_path=model_path,
    )
Sequence[Project], + *, + external_signals: dict[Any, dict[str, float]] | None = None, + config: TrainingConfig | None = None, + output_path: str = "models/timeline_model.joblib", +) -> TimelineModel: + """Train on **all** available data and save for production inference. + + No test split or CV is performed -- this is intended for producing + the final artifact after hyper-parameters have been validated via + :func:`run_training_pipeline`. + """ + if config is None: + config = TrainingConfig() + + X, y = prepare_training_data( + projects, + external_signals=external_signals, + drop_incomplete_targets=True, + ) + + logger.info("Retraining production model on %d samples.", len(X)) + + model = TimelineModel( + rf_params=config.rf_params, + confidence_level=config.confidence_level, + ) + model.train(X, y) + model.save(output_path) + + logger.info("Production model saved to %s.", output_path) + return model diff --git a/src/ml/timeline_model.py b/src/ml/timeline_model.py new file mode 100644 index 0000000..7b377d9 --- /dev/null +++ b/src/ml/timeline_model.py @@ -0,0 +1,445 @@ +"""Random Forest model for affordable-housing stage-duration prediction. + +Wraps three :class:`~sklearn.ensemble.RandomForestRegressor` estimators +(one per target stage) and exposes a unified ``train / predict / save / load`` +interface. Confidence intervals are derived from individual-tree predictions +(quantile estimation). 
@dataclass
class StagePrediction:
    """Duration estimate for one stage, with the bounds of its interval."""

    # Name of the stage this estimate belongs to.
    stage: str
    # Point estimate and the two ends of the interval around it.
    predicted_days: float
    lower_bound: float
    upper_bound: float
    # Probability attached to the interval, e.g. 0.90 for a 90 % interval.
    confidence_level: float
class TimelineModel:
    """Multi-output Random Forest model for stage-duration prediction.

    Internally maintains one ``RandomForestRegressor`` per target stage so
    that hyper-parameters can (optionally) be tuned per-stage, and so
    that confidence intervals can be extracted from each forest
    independently.

    Parameters
    ----------
    rf_params:
        Keyword arguments forwarded to each ``RandomForestRegressor``.
        Defaults to :data:`DEFAULT_RF_PARAMS` when omitted or empty.
    confidence_level:
        Width of the prediction interval expressed as a probability
        (e.g. ``0.90`` for a 90 % CI).  Quantiles are computed from
        the individual-tree predictions.
    """

    def __init__(
        self,
        rf_params: dict[str, Any] | None = None,
        confidence_level: float = 0.90,
    ) -> None:
        self.rf_params = rf_params or dict(DEFAULT_RF_PARAMS)
        self.confidence_level = confidence_level

        # One estimator per target stage
        self._models: dict[str, RandomForestRegressor] = {}
        self._is_fitted: bool = False
        self._feature_names: list[str] = []
        self._train_timestamp: datetime | None = None

        # Per-stage training metadata (populated after fit)
        self._train_stats: dict[str, dict[str, float]] = {}

    # ------------------------------------------------------------------
    # Training
    # ------------------------------------------------------------------

    def train(
        self,
        X: pd.DataFrame | np.ndarray,
        y: pd.DataFrame | np.ndarray,
    ) -> "TimelineModel":
        """Fit one Random Forest per target stage.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            When a DataFrame is given its column names are stored as the
            feature names; otherwise the names come from
            ``build_feature_schema()``.
        y : array-like of shape (n_samples, 3)
            Columns must be ordered as ``TARGET_STAGES``
            (entitlement_days, financing_days, construction_days).

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``y`` is not 2-D, does not have one column per target stage,
            or has a different number of rows than ``X``.
        """
        if isinstance(X, pd.DataFrame):
            feature_names = list(X.columns)
            X_arr = X.values.astype(np.float64)
        else:
            X_arr = np.asarray(X, dtype=np.float64)
            feature_names = build_feature_schema().all_columns
            if X_arr.ndim == 2 and len(feature_names) != X_arr.shape[1]:
                # Mismatched names would make feature_importances() fail
                # later, so fall back to positional names.
                logger.warning(
                    "Feature schema lists %d columns but X has %d; "
                    "using positional feature names.",
                    len(feature_names),
                    X_arr.shape[1],
                )
                feature_names = [f"feature_{i}" for i in range(X_arr.shape[1])]

        if isinstance(y, pd.DataFrame):
            y_arr = y.values.astype(np.float64)
        else:
            y_arr = np.asarray(y, dtype=np.float64)

        if y_arr.ndim != 2:
            raise ValueError(
                "y must be 2-D with one column per target stage, "
                f"got shape {y_arr.shape}"
            )
        if y_arr.shape[1] != len(TARGET_STAGES):
            raise ValueError(
                f"y must have {len(TARGET_STAGES)} columns matching TARGET_STAGES, "
                f"got {y_arr.shape[1]}"
            )
        if X_arr.shape[0] != y_arr.shape[0]:
            raise ValueError(
                f"X and y must have the same number of rows, "
                f"got {X_arr.shape[0]} and {y_arr.shape[0]}"
            )

        self._feature_names = feature_names

        # Replace any remaining NaNs in X with 0 (defensive)
        X_arr = np.nan_to_num(X_arr, nan=0.0)

        for idx, stage in enumerate(TARGET_STAGES):
            logger.info("Training RandomForest for %s ...", stage)
            stage_targets = y_arr[:, idx]
            rf = RandomForestRegressor(**self.rf_params)
            rf.fit(X_arr, stage_targets)
            self._models[stage] = rf

            # Capture basic training stats
            self._train_stats[stage] = {
                "n_samples": int(X_arr.shape[0]),
                "y_mean": float(np.mean(stage_targets)),
                "y_std": float(np.std(stage_targets)),
                "y_min": float(np.min(stage_targets)),
                "y_max": float(np.max(stage_targets)),
            }

        self._is_fitted = True
        self._train_timestamp = datetime.utcnow()
        # %s for the tree count: the "?" fallback is not a number.
        logger.info(
            "Training complete. %d samples, %d features, %s trees/stage.",
            X_arr.shape[0],
            X_arr.shape[1],
            self.rf_params.get("n_estimators", "?"),
        )
        return self

    # ------------------------------------------------------------------
    # Prediction
    # ------------------------------------------------------------------

    def predict(
        self,
        X: pd.DataFrame | np.ndarray,
        *,
        confidence_level: float | None = None,
    ) -> list[TimelinePrediction]:
        """Generate timeline predictions with confidence intervals.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        confidence_level:
            Override the instance-level confidence_level for this call.

        Returns
        -------
        list[TimelinePrediction]
            One prediction object per input row.  Point estimates and both
            bounds are clamped at zero.

        Raises
        ------
        RuntimeError
            If the model has not been trained or loaded.
        """
        self._check_fitted()

        if isinstance(X, pd.DataFrame):
            X_arr = X.values.astype(np.float64)
        else:
            X_arr = np.asarray(X, dtype=np.float64)

        X_arr = np.nan_to_num(X_arr, nan=0.0)

        cl = confidence_level if confidence_level is not None else self.confidence_level
        # Symmetric interval: split the excluded mass evenly between tails.
        lower_q = (1.0 - cl) / 2.0
        upper_q = 1.0 - lower_q

        # Collect per-tree predictions for quantile estimation
        stage_predictions: dict[str, dict[str, np.ndarray]] = {}
        for stage in TARGET_STAGES:
            rf = self._models[stage]
            # Each tree predicts independently
            tree_preds = np.array(
                [tree.predict(X_arr) for tree in rf.estimators_]
            )  # shape: (n_trees, n_samples)

            stage_predictions[stage] = {
                "mean": np.mean(tree_preds, axis=0),
                "lower": np.quantile(tree_preds, lower_q, axis=0),
                "upper": np.quantile(tree_preds, upper_q, axis=0),
            }

        results: list[TimelinePrediction] = []

        for i in range(X_arr.shape[0]):
            stage_preds: dict[str, StagePrediction] = {
                stage: StagePrediction(
                    stage=stage,
                    predicted_days=float(max(stage_predictions[stage]["mean"][i], 0)),
                    lower_bound=float(max(stage_predictions[stage]["lower"][i], 0)),
                    upper_bound=float(max(stage_predictions[stage]["upper"][i], 0)),
                    confidence_level=cl,
                )
                for stage in TARGET_STAGES
            }

            total = sum(sp.predicted_days for sp in stage_preds.values())

            results.append(
                TimelinePrediction(
                    entitlement=stage_preds["entitlement_days"],
                    financing=stage_preds["financing_days"],
                    construction=stage_preds["construction_days"],
                    total_predicted_days=total,
                )
            )

        return results

    def predict_dataframe(
        self,
        X: pd.DataFrame | np.ndarray,
        *,
        confidence_level: float | None = None,
    ) -> pd.DataFrame:
        """Return predictions as a flat DataFrame (convenient for analysis).

        Columns: ``<stage>_pred``, ``<stage>_lower``, ``<stage>_upper``
        for each target stage (stage name without the ``_days`` suffix),
        plus ``total_pred``.
        """
        preds = self.predict(X, confidence_level=confidence_level)
        rows = []
        for p in preds:
            row: dict[str, float] = {}
            for stage_pred in (p.entitlement, p.financing, p.construction):
                prefix = stage_pred.stage.replace("_days", "")
                row[f"{prefix}_pred"] = stage_pred.predicted_days
                row[f"{prefix}_lower"] = stage_pred.lower_bound
                row[f"{prefix}_upper"] = stage_pred.upper_bound
            row["total_pred"] = p.total_predicted_days
            rows.append(row)
        return pd.DataFrame(rows)

    # ------------------------------------------------------------------
    # Feature importance
    # ------------------------------------------------------------------

    def feature_importances(self) -> pd.DataFrame:
        """Return a DataFrame of feature importances across all stages.

        Columns: ``feature``, plus one ``<stage>_importance`` column per
        stage taken from that stage's forest ``feature_importances_``, plus
        ``mean_importance``.  Rows are sorted by ``mean_importance``,
        highest first.
        """
        self._check_fitted()
        data: dict[str, list[float]] = {"feature": self._feature_names}
        all_importances: list[np.ndarray] = []

        for stage in TARGET_STAGES:
            imp = self._models[stage].feature_importances_
            col_name = stage.replace("_days", "") + "_importance"
            data[col_name] = imp.tolist()
            all_importances.append(imp)

        data["mean_importance"] = np.mean(all_importances, axis=0).tolist()
        df = pd.DataFrame(data).sort_values("mean_importance", ascending=False)
        return df.reset_index(drop=True)

    # ------------------------------------------------------------------
    # Serialization
    # ------------------------------------------------------------------

    def save(self, path: str | Path) -> Path:
        """Persist the model to disk using joblib.

        The saved artifact includes the fitted estimators, feature
        names, training stats, and hyper-parameters so that inference
        is fully self-contained.  Missing parent directories are created.

        Parameters
        ----------
        path:
            File path for the output ``.joblib`` file.

        Returns
        -------
        pathlib.Path
            Resolved path of the saved file.

        Raises
        ------
        RuntimeError
            If the model has not been trained or loaded.
        """
        self._check_fitted()
        path = Path(path)
        path.parent.mkdir(parents=True, exist_ok=True)

        artifact = {
            "models": self._models,
            "rf_params": self.rf_params,
            "confidence_level": self.confidence_level,
            "feature_names": self._feature_names,
            "target_stages": TARGET_STAGES,
            "train_stats": self._train_stats,
            "train_timestamp": self._train_timestamp,
            "version": "1.0.0",
        }

        joblib.dump(artifact, path)
        logger.info("Model saved to %s", path)
        return path.resolve()

    @classmethod
    def load(cls, path: str | Path) -> "TimelineModel":
        """Reconstruct a :class:`TimelineModel` from a joblib artifact.

        Parameters
        ----------
        path:
            Path to the ``.joblib`` file produced by :meth:`save`.

        Returns
        -------
        TimelineModel
            An instance marked as fitted.
        """
        path = Path(path)
        # NOTE(security): joblib.load unpickles the file, which can execute
        # arbitrary code.  Only load artifacts from trusted locations.
        artifact: dict[str, Any] = joblib.load(path)

        instance = cls(
            rf_params=artifact["rf_params"],
            confidence_level=artifact.get("confidence_level", 0.90),
        )
        instance._models = artifact["models"]
        instance._feature_names = artifact["feature_names"]
        instance._train_stats = artifact.get("train_stats", {})
        instance._train_timestamp = artifact.get("train_timestamp")
        instance._is_fitted = True

        logger.info(
            "Model loaded from %s (trained %s, version %s).",
            path,
            instance._train_timestamp,
            artifact.get("version", "unknown"),
        )
        return instance

    # ------------------------------------------------------------------
    # Metadata / introspection
    # ------------------------------------------------------------------

    @property
    def is_fitted(self) -> bool:
        """Whether the model has been trained or loaded."""
        return self._is_fitted

    @property
    def train_stats(self) -> dict[str, dict[str, float]]:
        """Shallow copy of the per-stage training statistics."""
        return dict(self._train_stats)

    @property
    def train_timestamp(self) -> datetime | None:
        """When training finished, or ``None`` if never trained."""
        return self._train_timestamp

    def summary(self) -> dict[str, Any]:
        """Return a JSON-serializable summary of model metadata."""
        return {
            "is_fitted": self._is_fitted,
            "n_features": len(self._feature_names),
            "target_stages": TARGET_STAGES,
            "rf_params": self.rf_params,
            "confidence_level": self.confidence_level,
            "train_timestamp": (
                self._train_timestamp.isoformat() if self._train_timestamp else None
            ),
            "train_stats": self._train_stats,
        }

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _check_fitted(self) -> None:
        """Raise ``RuntimeError`` unless the model is fitted."""
        if not self._is_fitted:
            raise RuntimeError(
                "TimelineModel has not been trained yet. Call .train() first."
            )

    def __repr__(self) -> str:
        status = "fitted" if self._is_fitted else "unfitted"
        n_trees = self.rf_params.get("n_estimators", "?")
        return f"<TimelineModel {status} n_estimators={n_trees}>"
class ProjectBarrier(Base):
    """Links projects to specific regulatory friction points."""

    __tablename__ = "project_barriers"

    barrier_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True), primary_key=True, default=uuid.uuid4
    )
    project_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        # ForeignKey is not among this module's top-level imports, so it is
        # looked up on the sqlalchemy package when the class is defined.
        __import__("sqlalchemy").ForeignKey("projects.project_id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )

    barrier_type: Mapped[str] = mapped_column(String(300), nullable=False, index=True)
    barrier_description: Mapped[str | None] = mapped_column(Text)
    jurisdiction: Mapped[str | None] = mapped_column(String(300))

    # From HousingLens
    friction_score: Mapped[int | None] = mapped_column(Integer)
    jurisdiction_rank: Mapped[int | None] = mapped_column(Integer)

    # Project-specific impact
    stage_encountered: Mapped[BarrierStage | None] = mapped_column(Enum(BarrierStage))
    date_encountered: Mapped[date | None] = mapped_column(Date)
    date_resolved: Mapped[date | None] = mapped_column(Date)
    days_delayed: Mapped[int] = mapped_column(Integer, default=0)
    cost_impact: Mapped[float] = mapped_column(Numeric(12, 2), default=0)

    resolution_strategy: Mapped[str | None] = mapped_column(Text)
    variance_required: Mapped[bool] = mapped_column(Boolean, default=False)
    variance_granted: Mapped[bool | None] = mapped_column(Boolean)
    appeal_filed: Mapped[bool] = mapped_column(Boolean, default=False)

    lessons_learned: Mapped[str | None] = mapped_column(Text)
    notes: Mapped[str | None] = mapped_column(Text)
    created_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow)
    updated_at: Mapped[datetime] = mapped_column(
        DateTime, default=datetime.utcnow, onupdate=datetime.utcnow
    )

    # Relationship
    project = relationship("Project", back_populates="barriers")

    def __repr__(self) -> str:
        """Identify the row by id, barrier type and recorded delay."""
        return (
            f"<ProjectBarrier(barrier_id={self.barrier_id}, "
            f"barrier_type={self.barrier_type!r}, "
            f"days_delayed={self.days_delayed})>"
        )
class FundingSource(Base):
    """Funding source linked to a project (many-to-one).

    Each row is one layer of a project's capital stack: who provides it, how
    much, where it stands in the application-to-closing lifecycle, and the
    debt, tax-credit and compliance terms that apply to it.
    """

    __tablename__ = "funding_sources"

    funding_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True), primary_key=True, default=uuid.uuid4
    )
    # NOTE(review): ForeignKey is resolved through __import__ because it is
    # not part of this module's sqlalchemy import list; adding it there would
    # let this read as a plain ForeignKey(...) call.
    project_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        __import__("sqlalchemy").ForeignKey("projects.project_id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )

    source_type: Mapped[FundingSourceType] = mapped_column(
        Enum(FundingSourceType), nullable=False
    )
    source_name: Mapped[str] = mapped_column(String(500), nullable=False)
    provider_organization: Mapped[str | None] = mapped_column(String(500))
    amount: Mapped[float] = mapped_column(Numeric(14, 2), nullable=False, default=0)
    # New rows start as ANTICIPATED unless a status is given explicitly.
    status: Mapped[FundingSourceStatus] = mapped_column(
        Enum(FundingSourceStatus), nullable=False, default=FundingSourceStatus.ANTICIPATED
    )

    application_date: Mapped[date | None] = mapped_column(Date)
    award_date: Mapped[date | None] = mapped_column(Date)
    closing_date: Mapped[date | None] = mapped_column(Date)
    expected_closing_date: Mapped[date | None] = mapped_column(Date)

    # Debt terms
    interest_rate: Mapped[float | None] = mapped_column(Float)
    term_years: Mapped[int | None] = mapped_column(Integer)
    amortization_years: Mapped[int | None] = mapped_column(Integer)

    # Tax credit details
    credit_amount_annual: Mapped[float | None] = mapped_column(Numeric(14, 2))
    equity_raised: Mapped[float | None] = mapped_column(Numeric(14, 2))
    pricing_percent: Mapped[float | None] = mapped_column(Float)

    # Compliance
    compliance_period_years: Mapped[int | None] = mapped_column(Integer)
    affordability_period_years: Mapped[int | None] = mapped_column(Integer)

    notes: Mapped[str | None] = mapped_column(Text)
    created_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow)
    updated_at: Mapped[datetime] = mapped_column(
        DateTime, default=datetime.utcnow, onupdate=datetime.utcnow
    )

    # Relationship
    project = relationship("Project", back_populates="funding_sources")

    def __repr__(self) -> str:
        """Return a short identifying string for logs and debugging."""
        return (
            f"<FundingSource {self.funding_id} "
            f"name={self.source_name!r} project={self.project_id}>"
        )
None] = mapped_column(Numeric(12, 2)) + p75_cost_per_unit: Mapped[float | None] = mapped_column(Numeric(12, 2)) + + last_calculated: Mapped[datetime | None] = mapped_column(DateTime) + created_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow) + updated_at: Mapped[datetime] = mapped_column( + DateTime, default=datetime.utcnow, onupdate=datetime.utcnow + ) + + def __repr__(self) -> str: + return f"" diff --git a/src/models/portfolio.py b/src/models/portfolio.py new file mode 100644 index 0000000..3894a89 --- /dev/null +++ b/src/models/portfolio.py @@ -0,0 +1,54 @@ +"""SQLAlchemy model for portfolio dashboard configurations.""" + +import uuid +from datetime import date, datetime + +from sqlalchemy import Boolean, Date, DateTime, Enum, Integer, Numeric, String +from sqlalchemy.dialects.postgresql import JSON, UUID +from sqlalchemy.orm import Mapped, mapped_column + +from src.database.connection import Base +from src.models.enums import PortfolioType + + +class PortfolioDashboard(Base): + """Saved portfolio configurations for stakeholders.""" + + __tablename__ = "portfolio_dashboards" + + portfolio_id: Mapped[uuid.UUID] = mapped_column( + UUID(as_uuid=True), primary_key=True, default=uuid.uuid4 + ) + portfolio_name: Mapped[str] = mapped_column(String(300), nullable=False) + owner_user_id: Mapped[uuid.UUID | None] = mapped_column(UUID(as_uuid=True)) + organization: Mapped[str | None] = mapped_column(String(500)) + + portfolio_type: Mapped[PortfolioType] = mapped_column( + Enum(PortfolioType), nullable=False + ) + + # Filters + geography_filter: Mapped[dict | None] = mapped_column(JSON) + funding_filter: Mapped[dict | None] = mapped_column(JSON) + stage_filter: Mapped[dict | None] = mapped_column(JSON) + ami_filter: Mapped[dict | None] = mapped_column(JSON) + date_range_start: Mapped[date | None] = mapped_column(Date) + date_range_end: Mapped[date | None] = mapped_column(Date) + + # Calculated metrics (cached) + total_projects: Mapped[int] = 
class Project(Base):
    """Comprehensive affordable housing project tracking model.

    A single wide table holding a project's identity, location, team, unit
    mix, pipeline status, actual and predicted timelines, costs, funding,
    friction metrics, stakeholder interactions, risk data and provenance.
    Funding sources and barriers live in child tables and are deleted with
    the project (``cascade="all, delete-orphan"``).
    """

    __tablename__ = "projects"

    # Identity
    project_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True), primary_key=True, default=uuid.uuid4
    )
    project_name: Mapped[str] = mapped_column(String(500), nullable=False)
    project_slug: Mapped[str] = mapped_column(
        String(500), unique=True, nullable=False, index=True
    )

    # Location
    address: Mapped[str | None] = mapped_column(String(500))
    city: Mapped[str | None] = mapped_column(String(200), index=True)
    county: Mapped[str | None] = mapped_column(String(200))
    state: Mapped[str | None] = mapped_column(String(2), index=True)
    zip: Mapped[str | None] = mapped_column(String(10))
    latitude: Mapped[float | None] = mapped_column(Float)
    longitude: Mapped[float | None] = mapped_column(Float)
    jurisdiction: Mapped[str | None] = mapped_column(String(300), index=True)
    neighborhood: Mapped[str | None] = mapped_column(String(200))
    census_tract: Mapped[str | None] = mapped_column(String(20))

    # Development Team
    developer_org: Mapped[str | None] = mapped_column(String(500))
    developer_contact: Mapped[str | None] = mapped_column(String(300))
    architect: Mapped[str | None] = mapped_column(String(500))
    general_contractor: Mapped[str | None] = mapped_column(String(500))
    property_manager: Mapped[str | None] = mapped_column(String(500))

    # Project Characteristics
    site_acres: Mapped[float | None] = mapped_column(Float)
    building_type: Mapped[BuildingType | None] = mapped_column(Enum(BuildingType))
    structure_type: Mapped[StructureType | None] = mapped_column(Enum(StructureType))
    stories: Mapped[int | None] = mapped_column(Integer)
    parking_spaces: Mapped[int | None] = mapped_column(Integer)
    parking_type: Mapped[ParkingType | None] = mapped_column(Enum(ParkingType))

    # Unit Mix
    total_units: Mapped[int] = mapped_column(Integer, nullable=False, default=0)
    affordable_units: Mapped[int] = mapped_column(Integer, default=0)
    market_units: Mapped[int] = mapped_column(Integer, default=0)
    studio_units: Mapped[int] = mapped_column(Integer, default=0)
    one_br_units: Mapped[int] = mapped_column(Integer, default=0)
    two_br_units: Mapped[int] = mapped_column(Integer, default=0)
    three_br_units: Mapped[int] = mapped_column(Integer, default=0)
    four_plus_br_units: Mapped[int] = mapped_column(Integer, default=0)

    # AMI Targeting
    ami_30_units: Mapped[int] = mapped_column(Integer, default=0)
    ami_40_units: Mapped[int] = mapped_column(Integer, default=0)
    ami_50_units: Mapped[int] = mapped_column(Integer, default=0)
    ami_60_units: Mapped[int] = mapped_column(Integer, default=0)
    ami_80_units: Mapped[int] = mapped_column(Integer, default=0)
    market_rate_units: Mapped[int] = mapped_column(Integer, default=0)

    # Special Populations
    senior_units: Mapped[int] = mapped_column(Integer, default=0)
    family_units: Mapped[int] = mapped_column(Integer, default=0)
    psf_units: Mapped[int] = mapped_column(Integer, default=0)
    veteran_units: Mapped[int] = mapped_column(Integer, default=0)
    homeless_set_aside: Mapped[int] = mapped_column(Integer, default=0)

    # Pipeline Status
    current_stage: Mapped[PipelineStage] = mapped_column(
        Enum(PipelineStage), nullable=False, default=PipelineStage.CONCEPT, index=True
    )
    stage_entry_date: Mapped[date | None] = mapped_column(Date)
    days_in_current_stage: Mapped[int | None] = mapped_column(Integer)
    overall_health: Mapped[OverallHealth | None] = mapped_column(Enum(OverallHealth))
    health_score: Mapped[float | None] = mapped_column(Float)
    last_milestone_date: Mapped[date | None] = mapped_column(Date)
    next_milestone_date: Mapped[date | None] = mapped_column(Date)
    next_milestone_type: Mapped[str | None] = mapped_column(String(200))

    # Timeline - Actual
    # The "<stage>_duration_days" naming is relied on by get_stage_duration().
    concept_start: Mapped[date | None] = mapped_column(Date)
    concept_complete: Mapped[date | None] = mapped_column(Date)
    concept_duration_days: Mapped[int | None] = mapped_column(Integer)

    pre_development_start: Mapped[date | None] = mapped_column(Date)
    pre_development_complete: Mapped[date | None] = mapped_column(Date)
    pre_development_duration_days: Mapped[int | None] = mapped_column(Integer)

    entitlement_start: Mapped[date | None] = mapped_column(Date)
    entitlement_complete: Mapped[date | None] = mapped_column(Date)
    entitlement_duration_days: Mapped[int | None] = mapped_column(Integer)

    financing_start: Mapped[date | None] = mapped_column(Date)
    financing_complete: Mapped[date | None] = mapped_column(Date)
    financing_duration_days: Mapped[int | None] = mapped_column(Integer)

    construction_start: Mapped[date | None] = mapped_column(Date)
    construction_complete: Mapped[date | None] = mapped_column(Date)
    construction_duration_days: Mapped[int | None] = mapped_column(Integer)

    lease_up_start: Mapped[date | None] = mapped_column(Date)
    lease_up_complete: Mapped[date | None] = mapped_column(Date)
    lease_up_duration_days: Mapped[int | None] = mapped_column(Integer)

    total_elapsed_days: Mapped[int | None] = mapped_column(Integer)
    concept_to_groundbreaking_days: Mapped[int | None] = mapped_column(Integer)
    concept_to_co_days: Mapped[int | None] = mapped_column(Integer)

    # Timeline - Predicted
    predicted_entitlement_complete: Mapped[date | None] = mapped_column(Date)
    predicted_financing_complete: Mapped[date | None] = mapped_column(Date)
    predicted_groundbreaking: Mapped[date | None] = mapped_column(Date)
    predicted_co: Mapped[date | None] = mapped_column(Date)
    prediction_confidence: Mapped[float | None] = mapped_column(Float)
    prediction_last_updated: Mapped[datetime | None] = mapped_column(DateTime)

    # Costs
    total_development_cost: Mapped[float | None] = mapped_column(Numeric(14, 2))
    cost_per_unit: Mapped[float | None] = mapped_column(Numeric(12, 2))
    cost_per_square_foot: Mapped[float | None] = mapped_column(Numeric(10, 2))

    land_acquisition_cost: Mapped[float | None] = mapped_column(Numeric(14, 2))
    hard_costs: Mapped[float | None] = mapped_column(Numeric(14, 2))
    soft_costs: Mapped[float | None] = mapped_column(Numeric(14, 2))
    financing_costs: Mapped[float | None] = mapped_column(Numeric(14, 2))
    developer_fee: Mapped[float | None] = mapped_column(Numeric(14, 2))
    reserves: Mapped[float | None] = mapped_column(Numeric(14, 2))

    # Cost Breakdown Detail
    architecture_engineering: Mapped[float | None] = mapped_column(Numeric(12, 2))
    legal_fees: Mapped[float | None] = mapped_column(Numeric(12, 2))
    environmental_review: Mapped[float | None] = mapped_column(Numeric(12, 2))
    market_study: Mapped[float | None] = mapped_column(Numeric(12, 2))
    appraisal: Mapped[float | None] = mapped_column(Numeric(12, 2))
    title_insurance: Mapped[float | None] = mapped_column(Numeric(12, 2))
    construction_loan_interest: Mapped[float | None] = mapped_column(Numeric(12, 2))
    permanent_loan_fees: Mapped[float | None] = mapped_column(Numeric(12, 2))

    # Friction-Induced Costs
    friction_induced_costs: Mapped[float | None] = mapped_column(Numeric(14, 2))
    regulatory_delay_costs: Mapped[float | None] = mapped_column(Numeric(14, 2))
    redesign_costs: Mapped[float | None] = mapped_column(Numeric(12, 2))
    carrying_costs_from_delays: Mapped[float | None] = mapped_column(Numeric(14, 2))

    # Cost Variance
    original_budget: Mapped[float | None] = mapped_column(Numeric(14, 2))
    current_budget: Mapped[float | None] = mapped_column(Numeric(14, 2))
    budget_variance_dollars: Mapped[float | None] = mapped_column(Numeric(14, 2))
    budget_variance_percent: Mapped[float | None] = mapped_column(Float)
    budget_variance_reasons: Mapped[dict | None] = mapped_column(JSON)

    # Funding Stack
    funding_stack: Mapped[dict | None] = mapped_column(JSON)
    total_funding_committed: Mapped[float | None] = mapped_column(Numeric(14, 2))
    funding_gap: Mapped[float | None] = mapped_column(Numeric(14, 2))
    debt_amount: Mapped[float | None] = mapped_column(Numeric(14, 2))
    equity_amount: Mapped[float | None] = mapped_column(Numeric(14, 2))
    subsidy_amount: Mapped[float | None] = mapped_column(Numeric(14, 2))

    # Regulatory Friction Analysis
    jurisdiction_friction_score: Mapped[int | None] = mapped_column(Integer)
    predicted_delay_from_friction: Mapped[int | None] = mapped_column(Integer)
    actual_delay_sofar: Mapped[int | None] = mapped_column(Integer)
    primary_friction_points: Mapped[dict | None] = mapped_column(JSON)

    # Stakeholder Interactions
    housing_mind_queries: Mapped[int] = mapped_column(Integer, default=0)
    top_query_categories: Mapped[dict | None] = mapped_column(JSON)
    public_meetings_attended: Mapped[int] = mapped_column(Integer, default=0)
    variance_hearings: Mapped[int] = mapped_column(Integer, default=0)
    design_review_iterations: Mapped[int] = mapped_column(Integer, default=0)
    neighbor_opposition_level: Mapped[NeighborOpposition | None] = mapped_column(
        Enum(NeighborOpposition)
    )
    appeals_filed: Mapped[int] = mapped_column(Integer, default=0)

    # Risk Factors
    risk_factors: Mapped[dict | None] = mapped_column(JSON)
    risk_score: Mapped[float | None] = mapped_column(Float)
    pending_approvals: Mapped[dict | None] = mapped_column(JSON)
    active_oppositions: Mapped[dict | None] = mapped_column(JSON)

    # Data Quality & Provenance
    data_source: Mapped[DataSource | None] = mapped_column(Enum(DataSource))
    data_quality_score: Mapped[float | None] = mapped_column(Float)
    last_verified: Mapped[date | None] = mapped_column(Date)
    verified_by: Mapped[str | None] = mapped_column(String(200))
    data_completeness: Mapped[float | None] = mapped_column(Float)

    # Metadata
    created_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow)
    updated_at: Mapped[datetime] = mapped_column(
        DateTime, default=datetime.utcnow, onupdate=datetime.utcnow
    )
    created_by: Mapped[str | None] = mapped_column(String(200))
    is_public: Mapped[bool] = mapped_column(Boolean, default=False)
    notes: Mapped[str | None] = mapped_column(Text)

    # Relationships
    funding_sources = relationship(
        "FundingSource", back_populates="project", cascade="all, delete-orphan"
    )
    barriers = relationship(
        "ProjectBarrier", back_populates="project", cascade="all, delete-orphan"
    )

    def get_stage_duration(self, stage: str) -> int | None:
        """Return duration in days for a given pipeline stage.

        Args:
            stage: Stage name such as ``"entitlement"``, or an enum member
                whose ``.value`` is that name.

        Returns:
            The stored ``<stage>_duration_days`` value, or ``None`` when the
            value is unset or the stage has no duration column.
        """
        # Resolve enum members to their value first: newer Python versions
        # format a (str, Enum) member as "PipelineStage.CONCEPT" rather than
        # "concept", which would make the attribute lookup miss silently.
        stage_key = getattr(stage, "value", stage)
        return getattr(self, f"{stage_key}_duration_days", None)

    def __repr__(self) -> str:
        """Return a short identifying string for logs and debugging."""
        return f"<Project {self.project_id} slug={self.project_slug!r}>"
@celery_app.task(name="src.tasks.alert_generation.generate_daily_alerts")
def generate_daily_alerts() -> dict:
    """Generate alerts for projects that need attention.

    Three alert families are produced:

    * ``health_warning`` - projects whose health is at risk, delayed or
      stalled and that are not in operations or abandoned.
    * ``milestone_upcoming`` - projects with a milestone due within the
      next 14 days (today included).
    * ``funding_gap`` - pre-development, entitlement or financing projects
      whose funding gap exceeds 20% of total development cost.

    Returns:
        A dict with ``total_alerts``, a ``by_severity`` count for
        high/medium/low, the ``alerts`` list itself and a UTC ``timestamp``.
    """
    SessionLocal = get_session_factory()
    db = SessionLocal()

    try:
        alerts = []
        # Read the clock once so every comparison in this run uses the same
        # day, even if the task straddles midnight.
        today = date.today()

        # Alert 1: Projects with unhealthy status. DELAYED sits between
        # AT_RISK and STALLED and must be included, otherwise a project that
        # is worse than "at risk" but not yet stalled raises no alert.
        unhealthy = (
            db.query(Project)
            .filter(
                Project.overall_health.in_(
                    [
                        OverallHealth.AT_RISK,
                        OverallHealth.DELAYED,
                        OverallHealth.STALLED,
                    ]
                ),
                Project.current_stage.notin_(
                    [PipelineStage.OPERATIONS, PipelineStage.ABANDONED]
                ),
            )
            .all()
        )

        for project in unhealthy:
            is_stalled = project.overall_health == OverallHealth.STALLED
            alerts.append({
                "type": "health_warning",
                "severity": "high" if is_stalled else "medium",
                "project_id": str(project.project_id),
                "project_name": project.project_name,
                "message": (
                    f"Project '{project.project_name}' is {project.overall_health.value} "
                    f"in {project.current_stage.value} stage "
                    f"({project.days_in_current_stage or 0} days)"
                ),
            })

        # Alert 2: Upcoming milestones (next 14 days)
        upcoming_deadline = today + timedelta(days=14)
        upcoming = (
            db.query(Project)
            .filter(
                Project.next_milestone_date.isnot(None),
                Project.next_milestone_date <= upcoming_deadline,
                Project.next_milestone_date >= today,
            )
            .all()
        )

        for project in upcoming:
            days_until = (project.next_milestone_date - today).days
            alerts.append({
                "type": "milestone_upcoming",
                "severity": "low" if days_until > 7 else "medium",
                "project_id": str(project.project_id),
                "project_name": project.project_name,
                "message": (
                    f"Project '{project.project_name}' has milestone "
                    f"'{project.next_milestone_type}' in {days_until} days"
                ),
            })

        # Alert 3: Large funding gaps
        funding_gap_projects = (
            db.query(Project)
            .filter(
                Project.funding_gap > 0,
                Project.current_stage.in_(
                    [
                        PipelineStage.PRE_DEVELOPMENT,
                        PipelineStage.ENTITLEMENT,
                        PipelineStage.FINANCING,
                    ]
                ),
            )
            .all()
        )

        for project in funding_gap_projects:
            # A missing or zero total cost makes the percentage meaningless.
            if not (project.funding_gap and project.total_development_cost):
                continue
            gap_amount = float(project.funding_gap)
            gap_pct = (gap_amount / float(project.total_development_cost)) * 100
            if gap_pct > 20:
                alerts.append({
                    "type": "funding_gap",
                    "severity": "high",
                    "project_id": str(project.project_id),
                    "project_name": project.project_name,
                    "message": (
                        f"Project '{project.project_name}' has a "
                        f"{gap_pct:.0f}% funding gap "
                        f"(${gap_amount:,.0f})"
                    ),
                })

        # Tally severities in a single pass; all three keys are always present.
        by_severity = {"high": 0, "medium": 0, "low": 0}
        for alert in alerts:
            by_severity[alert["severity"]] += 1

        result = {
            "total_alerts": len(alerts),
            "by_severity": by_severity,
            "alerts": alerts,
            "timestamp": datetime.utcnow().isoformat(),
        }

        logger.info(
            "Generated %d alerts: %d high, %d medium, %d low",
            len(alerts),
            by_severity["high"],
            by_severity["medium"],
            by_severity["low"],
        )
        return result

    finally:
        db.close()
def _recalculate_peer_group(db, pg: PeerGroup) -> None:
    """Recalculate benchmark stats for a peer group.

    Selects the projects matching the group's jurisdiction, unit-count range
    and building type, then overwrites the group's cached statistics. Any
    statistic with no usable data is reset to ``None`` so values from an
    earlier run never sit next to a fresh ``project_count`` and
    ``last_calculated``.

    Args:
        db: Open database session used to query projects. The caller commits.
        pg: Peer group whose cached statistics are updated in place.
    """
    query = db.query(Project)

    if pg.jurisdiction:
        query = query.filter(Project.jurisdiction == pg.jurisdiction)
    if pg.unit_count_min is not None:
        query = query.filter(Project.total_units >= pg.unit_count_min)
    if pg.unit_count_max is not None:
        query = query.filter(Project.total_units <= pg.unit_count_max)
    if pg.building_type:
        query = query.filter(Project.building_type == pg.building_type)
    # NOTE(review): pg.ami_mix_category is not applied as a filter here -
    # confirm whether that criterion is meant to narrow the cohort.

    projects = query.all()
    pg.project_count = len(projects)

    def _safe_median(values):
        """Median of the positive, non-null values as an int, else None."""
        usable = [v for v in values if v is not None and v > 0]
        return int(np.median(usable)) if usable else None

    # With an empty cohort every statistic below resolves to None, which
    # clears stale numbers instead of returning early and keeping them.
    pg.median_concept_duration = _safe_median(
        p.concept_duration_days for p in projects
    )
    pg.median_pre_dev_duration = _safe_median(
        p.pre_development_duration_days for p in projects
    )
    pg.median_entitlement_duration = _safe_median(
        p.entitlement_duration_days for p in projects
    )
    pg.median_financing_duration = _safe_median(
        p.financing_duration_days for p in projects
    )
    pg.median_construction_duration = _safe_median(
        p.construction_duration_days for p in projects
    )

    total_durations = [p.concept_to_co_days for p in projects if p.concept_to_co_days]
    pg.median_total_duration = (
        int(np.median(total_durations)) if total_durations else None
    )

    costs = [float(p.cost_per_unit) for p in projects if p.cost_per_unit]
    if costs:
        pg.median_cost_per_unit = float(np.median(costs))
        pg.p25_cost_per_unit = float(np.percentile(costs, 25))
        pg.p75_cost_per_unit = float(np.percentile(costs, 75))
    else:
        pg.median_cost_per_unit = None
        pg.p25_cost_per_unit = None
        pg.p75_cost_per_unit = None

    pg.last_calculated = datetime.utcnow()
@celery_app.task(name="src.tasks.health_checks.run_health_checks")
def run_health_checks() -> dict:
    """Run health assessments on all active projects.

    Each project in an active stage is re-scored against the stage-duration
    benchmarks. A failure on one project is logged and does not stop the
    others. All changes are committed together at the end.

    Returns:
        A dict with the number of active projects, how many were checked,
        how many moved from on-track to at-risk/delayed, how many became
        stalled, and a UTC ``timestamp``.
    """
    SessionLocal = get_session_factory()
    db = SessionLocal()

    try:
        projects = (
            db.query(Project)
            .filter(Project.current_stage.in_(ACTIVE_STAGES))
            .all()
        )

        # Parse the benchmark file once per run rather than once per project.
        stage_durations = _load_stage_benchmarks()

        checked = 0
        newly_at_risk = 0
        newly_stalled = 0

        for project in projects:
            try:
                old_health = project.overall_health
                _update_health(project, stage_durations)

                if (
                    old_health == OverallHealth.ON_TRACK
                    and project.overall_health in [OverallHealth.AT_RISK, OverallHealth.DELAYED]
                ):
                    newly_at_risk += 1

                if (
                    project.overall_health == OverallHealth.STALLED
                    and old_health != OverallHealth.STALLED
                ):
                    newly_stalled += 1

                checked += 1
            except Exception:
                logger.exception(f"Error checking health for {project.project_slug}")

        db.commit()

        result = {
            "total_active_projects": len(projects),
            "checked": checked,
            "newly_at_risk": newly_at_risk,
            "newly_stalled": newly_stalled,
            "timestamp": datetime.utcnow().isoformat(),
        }
        logger.info(f"Health checks complete: {result}")
        return result

    finally:
        db.close()


def _load_stage_benchmarks() -> dict:
    """Return the per-stage duration benchmarks, or ``{}`` if unavailable.

    Reads ``config/national_benchmarks.yaml`` under the configured project
    root and returns its ``national_benchmarks.stage_durations`` mapping.
    """
    # Imported here, as in the original helper, to keep them off module import.
    from config.settings import get_settings
    import yaml

    benchmarks_path = get_settings().project_root / "config" / "national_benchmarks.yaml"

    try:
        with open(benchmarks_path, encoding="utf-8") as f:
            benchmarks = yaml.safe_load(f)
    except FileNotFoundError:
        return {}

    if not benchmarks:
        return {}
    national = benchmarks.get("national_benchmarks") or {}
    return national.get("stage_durations") or {}


def _update_health(project: Project, stage_durations: dict | None = None) -> None:
    """Update the health status and days-in-stage for a project.

    Args:
        project: Project to update in place.
        stage_durations: Mapping of stage name to benchmark dict (only the
            ``p90`` entry is read). When omitted the benchmarks are loaded
            from disk, so one-argument callers keep working.
    """
    # Update days in current stage
    if project.stage_entry_date:
        project.days_in_current_stage = (date.today() - project.stage_entry_date).days

    if project.days_in_current_stage is None:
        return

    if stage_durations is None:
        stage_durations = _load_stage_benchmarks()

    stage_benchmarks = stage_durations.get(project.current_stage.value) or {}
    p90 = stage_benchmarks.get("p90")
    if not p90:
        # No benchmark for this stage: leave the existing health untouched.
        return

    # Simple heuristic: time in stage relative to the stage's P90 benchmark.
    ratio = project.days_in_current_stage / p90
    if ratio < 0.5:
        project.overall_health = OverallHealth.ON_TRACK
        project.health_score = min(100, 100 - (ratio * 40))
    elif ratio < 0.8:
        project.overall_health = OverallHealth.ON_TRACK
        project.health_score = max(60, 100 - (ratio * 50))
    elif ratio < 1.0:
        project.overall_health = OverallHealth.AT_RISK
        project.health_score = max(40, 80 - (ratio * 40))
    elif ratio < 1.5:
        project.overall_health = OverallHealth.DELAYED
        project.health_score = max(20, 60 - (ratio * 30))
    else:
        project.overall_health = OverallHealth.STALLED
        project.health_score = max(0, 30 - (ratio * 10))
def _update_single_prediction(db, project: Project) -> None:
    """Update prediction fields on a single project (internal helper).

    Calls ``predict_project_timeline(db, project)`` and, when the result has
    no ``"error"`` key, converts the month figures in its
    ``"predicted_timeline"`` mapping into approximate dates (30 days per
    month) and writes them onto ``project``. The caller is responsible for
    committing the session.

    A date field is only written when its month figure is present and not
    None. Previously a missing figure defaulted to 0 months, which produced a
    "prediction" equal to the anchor date, and a None figure raised TypeError.

    Args:
        db: Database session passed through to the predictor.
        project: The project whose prediction fields are updated in place.
    """
    from datetime import date, timedelta

    from src.analytics.timeline_prediction import predict_project_timeline

    result = predict_project_timeline(db, project)
    if "error" in result:
        # NOTE(review): assumes a failed prediction leaves every prediction
        # field untouched, including confidence and last-updated - confirm
        # against the original nesting.
        return

    timeline = result.get("predicted_timeline") or {}

    def _span(key: str, extra_days: int = 0):
        """Return the timeline figure for *key* as a timedelta, or None if absent."""
        months = timeline.get(key)
        if months is None:
            return None
        return timedelta(days=int(months * 30 + extra_days))

    # The 180 extra days come from the original formula; presumably an
    # allowance for the remaining pre-entitlement work - TODO confirm.
    entitlement_span = _span("entitlement_months", extra_days=180)
    financing_span = _span("financing_months")
    total_span = _span("total_concept_to_co_months")

    early_stages = [PipelineStage.CONCEPT, PipelineStage.PRE_DEVELOPMENT]
    if project.current_stage in early_stages and entitlement_span is not None:
        project.predicted_entitlement_complete = date.today() + entitlement_span

    if financing_span is not None:
        # Prefer the actual entitlement completion date over the predicted one.
        if project.entitlement_complete:
            project.predicted_financing_complete = project.entitlement_complete + financing_span
        elif project.predicted_entitlement_complete:
            project.predicted_financing_complete = (
                project.predicted_entitlement_complete + financing_span
            )

    if total_span is not None and project.concept_start:
        project.predicted_co = project.concept_start + total_span

    project.prediction_confidence = result.get("confidence_intervals", {}).get(
        "confidence_level", 0.5
    )
    project.prediction_last_updated = datetime.utcnow()
"format_duration", + "format_percent", + "find_comparable_jurisdictions", + "normalize_jurisdiction", + "calculate_percentile", + "mean", + "median", + "safe_divide", + "weighted_average", +] diff --git a/src/utils/date_helpers.py b/src/utils/date_helpers.py new file mode 100644 index 0000000..af53215 --- /dev/null +++ b/src/utils/date_helpers.py @@ -0,0 +1,78 @@ +"""Date manipulation utilities for pipeline timeline analysis.""" + +from datetime import date, datetime, timedelta + + +def calculate_duration_days(start: date | None, end: date | None) -> int | None: + """Calculate duration in days between two dates.""" + if start is None or end is None: + return None + return (end - start).days + + +def months_between(start: date, end: date) -> float: + """Calculate approximate months between two dates.""" + days = (end - start).days + return round(days / 30.44, 1) + + +def date_to_quarter(d: date) -> str: + """Convert a date to a quarter string like '2025-Q3'.""" + quarter = (d.month - 1) // 3 + 1 + return f"{d.year}-Q{quarter}" + + +def parse_timeframe(timeframe: str) -> tuple[date, date]: + """Parse a timeframe string into a date range. 
+ + Supported formats: + - 'last_N_months' (e.g., 'last_24_months') + - 'last_N_years' (e.g., 'last_2_years') + - 'YYYY-MM-DD:YYYY-MM-DD' (explicit range) + - 'YYYY' (full year) + """ + today = date.today() + + if timeframe.startswith("last_"): + parts = timeframe.split("_") + n = int(parts[1]) + unit = parts[2] + if unit == "months": + start = today - timedelta(days=n * 30) + elif unit == "years": + start = today - timedelta(days=n * 365) + else: + raise ValueError(f"Unknown timeframe unit: {unit}") + return (start, today) + + if ":" in timeframe: + start_str, end_str = timeframe.split(":") + return ( + datetime.strptime(start_str, "%Y-%m-%d").date(), + datetime.strptime(end_str, "%Y-%m-%d").date(), + ) + + if len(timeframe) == 4 and timeframe.isdigit(): + year = int(timeframe) + return (date(year, 1, 1), date(year, 12, 31)) + + raise ValueError(f"Cannot parse timeframe: {timeframe}") + + +def days_to_months(days: int | float) -> float: + """Convert days to months (approximate).""" + return round(days / 30.44, 1) + + +def quarters_ahead(n: int) -> list[str]: + """Return the next N quarter strings from today.""" + today = date.today() + result = [] + current = today + for _ in range(n): + result.append(date_to_quarter(current)) + month = current.month + 3 + year = current.year + (month - 1) // 12 + month = (month - 1) % 12 + 1 + current = date(year, month, 1) + return result diff --git a/src/utils/formatting.py b/src/utils/formatting.py new file mode 100644 index 0000000..365282f --- /dev/null +++ b/src/utils/formatting.py @@ -0,0 +1,49 @@ +"""Output formatting utilities.""" + + +def format_currency(amount: float | int | None, include_cents: bool = False) -> str: + """Format a number as USD currency.""" + if amount is None: + return "N/A" + if include_cents: + return f"${amount:,.2f}" + return f"${amount:,.0f}" + + +def format_duration(days: int | float | None) -> str: + """Format a duration in days as a human-readable string.""" + if days is None: + return "N/A" + 
days = int(days) + if days < 30: + return f"{days} days" + months = days / 30.44 + if months < 12: + return f"{months:.1f} months" + years = months / 12 + remaining_months = months % 12 + if remaining_months < 0.5: + return f"{int(years)} years" + return f"{int(years)} years, {int(remaining_months)} months" + + +def format_percent(value: float | None, decimals: int = 1) -> str: + """Format a float as a percentage string.""" + if value is None: + return "N/A" + return f"{value:.{decimals}f}%" + + +def format_change(value: float | None, decimals: int = 1) -> str: + """Format a change value with a + or - prefix.""" + if value is None: + return "N/A" + sign = "+" if value > 0 else "" + return f"{sign}{value:.{decimals}f}%" + + +def truncate(text: str, max_length: int = 100) -> str: + """Truncate text to a maximum length with ellipsis.""" + if len(text) <= max_length: + return text + return text[: max_length - 3] + "..." diff --git a/src/utils/geography.py b/src/utils/geography.py new file mode 100644 index 0000000..c523b63 --- /dev/null +++ b/src/utils/geography.py @@ -0,0 +1,76 @@ +"""Geography and jurisdiction utilities.""" + +import re + +# Mapping of state abbreviations to regions for peer comparison +STATE_REGIONS: dict[str, str] = { + "CT": "northeast", "ME": "northeast", "MA": "northeast", "NH": "northeast", + "RI": "northeast", "VT": "northeast", "NJ": "northeast", "NY": "northeast", + "PA": "northeast", + "IL": "midwest", "IN": "midwest", "MI": "midwest", "OH": "midwest", + "WI": "midwest", "IA": "midwest", "KS": "midwest", "MN": "midwest", + "MO": "midwest", "NE": "midwest", "ND": "midwest", "SD": "midwest", + "DE": "south", "FL": "south", "GA": "south", "MD": "south", + "NC": "south", "SC": "south", "VA": "south", "DC": "south", + "WV": "south", "AL": "south", "KY": "south", "MS": "south", + "TN": "south", "AR": "south", "LA": "south", "OK": "south", "TX": "south", + "AZ": "west", "CO": "west", "ID": "west", "MT": "west", + "NV": "west", "NM": "west", "UT": 
"west", "WY": "west", + "AK": "west", "CA": "west", "HI": "west", "OR": "west", "WA": "west", +} + +# Population tiers for jurisdiction comparison +POPULATION_TIERS = { + "small": (0, 50_000), + "medium": (50_000, 250_000), + "large": (250_000, 1_000_000), + "major": (1_000_000, float("inf")), +} + + +def normalize_jurisdiction(name: str) -> str: + """Normalize a jurisdiction name for consistent matching. + + Strips whitespace, lowercases, removes common suffixes like 'City of'. + """ + name = name.strip().lower() + # Remove common prefixes + for prefix in ["city of ", "town of ", "village of ", "county of "]: + if name.startswith(prefix): + name = name[len(prefix):] + # Remove trailing state abbreviation patterns like ", CA" + name = re.sub(r",\s*[a-z]{2}$", "", name) + return name.strip() + + +def get_region(state: str) -> str | None: + """Return the census region for a state abbreviation.""" + return STATE_REGIONS.get(state.upper()) + + +def find_comparable_jurisdictions( + jurisdiction: str, + state: str | None = None, + max_results: int = 5, +) -> list[str]: + """Find comparable jurisdictions for benchmarking. + + In production this would query the database for jurisdictions with similar: + - Population size + - Regional location + - Housing market characteristics + + This is a placeholder that returns an empty list until real data is available. + """ + # In production, this queries the database for jurisdictions with similar + # population, region, and housing market characteristics. + # For now, return empty - the caller handles the case of no peers. + return [] + + +def is_same_metro(jurisdiction_a: str, jurisdiction_b: str) -> bool: + """Check if two jurisdictions are in the same metropolitan area. + + Placeholder for MSA/CBSA lookup integration. 
+ """ + return normalize_jurisdiction(jurisdiction_a) == normalize_jurisdiction(jurisdiction_b) diff --git a/src/utils/statistical_helpers.py b/src/utils/statistical_helpers.py new file mode 100644 index 0000000..4d6a065 --- /dev/null +++ b/src/utils/statistical_helpers.py @@ -0,0 +1,82 @@ +"""Statistical helper functions for pipeline analytics.""" + +from typing import Sequence + +import numpy as np + + +def mean(values: Sequence[float | int]) -> float: + """Calculate arithmetic mean, returning 0 for empty sequences.""" + if not values: + return 0.0 + return float(np.mean(values)) + + +def median(values: Sequence[float | int]) -> float: + """Calculate median, returning 0 for empty sequences.""" + if not values: + return 0.0 + return float(np.median(values)) + + +def calculate_percentile(values: Sequence[float | int], pct: float) -> float: + """Calculate a given percentile of a sequence.""" + if not values: + return 0.0 + return float(np.percentile(values, pct)) + + +def safe_divide(numerator: float, denominator: float, default: float = 0.0) -> float: + """Divide safely, returning default when denominator is zero.""" + if denominator == 0: + return default + return numerator / denominator + + +def weighted_average(values: Sequence[float], weights: Sequence[float]) -> float: + """Compute a weighted average.""" + if not values or not weights or len(values) != len(weights): + return 0.0 + total_weight = sum(weights) + if total_weight == 0: + return 0.0 + return sum(v * w for v, w in zip(values, weights)) / total_weight + + +def variance(values: Sequence[float | int]) -> float: + """Calculate population variance.""" + if len(values) < 2: + return 0.0 + return float(np.var(values)) + + +def std_dev(values: Sequence[float | int]) -> float: + """Calculate population standard deviation.""" + if len(values) < 2: + return 0.0 + return float(np.std(values)) + + +def coefficient_of_variation(values: Sequence[float | int]) -> float: + """Calculate coefficient of variation (std 
/ mean).""" + m = mean(values) + if m == 0: + return 0.0 + return std_dev(values) / m + + +def rank_in_group(value: float, values: Sequence[float | int], ascending: bool = True) -> int: + """Return 1-based rank of a value within a group.""" + sorted_vals = sorted(values, reverse=not ascending) + for i, v in enumerate(sorted_vals): + if v >= value: + return i + 1 + return len(sorted_vals) + + +def percentile_rank(value: float, values: Sequence[float | int]) -> float: + """Return the percentile rank (0-100) of a value within a distribution.""" + if not values: + return 0.0 + count_below = sum(1 for v in values if v < value) + return (count_below / len(values)) * 100 diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..80472d2 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,365 @@ +"""Shared pytest fixtures for the HousingHand test suite. + +Provides an in-memory SQLite database, session management, and factory +functions for creating sample projects, funding sources, and barriers. +""" + +import uuid +from collections.abc import Generator +from datetime import date, datetime, timedelta + +import pytest +from sqlalchemy import create_engine, event +from sqlalchemy.orm import Session, sessionmaker +from sqlalchemy.pool import StaticPool + +from src.database.connection import Base + +# Force all models to be registered on Base.metadata before create_all. 
@pytest.fixture(scope="session")
def engine():
    """Provide one in-memory SQLite engine for the whole test session.

    All tables registered on ``Base.metadata`` are created before the first
    test and dropped once the session ends.
    """

    def _enable_foreign_keys(dbapi_connection, connection_record):
        # Turn on foreign-key enforcement for each new DBAPI connection.
        pragma_cursor = dbapi_connection.cursor()
        pragma_cursor.execute("PRAGMA foreign_keys=ON")
        pragma_cursor.close()

    test_engine = create_engine(
        "sqlite:///:memory:",
        poolclass=StaticPool,
        connect_args={"check_same_thread": False},
    )
    # Registered before create_all so the first connection is covered too.
    event.listen(test_engine, "connect", _enable_foreign_keys)

    Base.metadata.create_all(test_engine)
    yield test_engine
    Base.metadata.drop_all(test_engine)


@pytest.fixture()
def db(engine) -> Generator[Session, None, None]:
    """Yield a session bound to a connection whose transaction is rolled back.

    Each test works inside its own outer transaction, which is rolled back
    afterwards so rows created by one test are not visible to the next.
    """
    conn = engine.connect()
    outer_transaction = conn.begin()
    test_session = Session(bind=conn)

    yield test_session

    # Teardown order: session first, then undo the work, then release the connection.
    test_session.close()
    outer_transaction.rollback()
    conn.close()


# ---------------------------------------------------------------------------
# Factory helpers
# ---------------------------------------------------------------------------

def _make_project_id() -> uuid.UUID:
    """Return a fresh random UUID for use as a project id."""
    fresh_id = uuid.uuid4()
    return fresh_id
health_score=82.0, + last_milestone_date=date.today() - timedelta(days=30), + next_milestone_date=date.today() + timedelta(days=60), + next_milestone_type="Design Review Hearing", + concept_start=date(2023, 1, 15), + concept_complete=date(2023, 4, 1), + concept_duration_days=76, + pre_development_start=date(2023, 4, 1), + pre_development_complete=date(2023, 10, 15), + pre_development_duration_days=197, + entitlement_start=date(2023, 10, 15), + total_development_cost=42_000_000.00, + cost_per_unit=350_000.00, + cost_per_square_foot=450.00, + land_acquisition_cost=6_000_000.00, + hard_costs=28_000_000.00, + soft_costs=5_000_000.00, + original_budget=40_000_000.00, + current_budget=42_000_000.00, + budget_variance_dollars=2_000_000.00, + budget_variance_percent=5.0, + total_funding_committed=35_000_000.00, + funding_gap=7_000_000.00, + jurisdiction_friction_score=45, + neighbor_opposition_level=NeighborOpposition.LOW, + appeals_filed=0, + risk_score=25.0, + data_source=DataSource.DEVELOPER_PORTAL, + data_quality_score=0.85, + data_completeness=0.90, + is_public=True, + ) + db.add(project) + db.flush() + return project + + +@pytest.fixture() +def sample_funding_source(db: Session, sample_project: Project) -> FundingSource: + """Insert and return a LIHTC 4% funding source attached to sample_project.""" + fs = FundingSource( + funding_id=uuid.uuid4(), + project_id=sample_project.project_id, + source_type=FundingSourceType.LIHTC_4PCT, + source_name="California LIHTC 4%", + provider_organization="California Tax Credit Allocation Committee", + amount=15_000_000.00, + status=FundingSourceStatus.COMMITTED, + application_date=date(2023, 6, 1), + award_date=date(2023, 9, 15), + compliance_period_years=15, + affordability_period_years=55, + ) + db.add(fs) + db.flush() + return fs + + +@pytest.fixture() +def sample_barrier(db: Session, sample_project: Project) -> ProjectBarrier: + """Insert and return a zoning barrier attached to sample_project.""" + barrier = ProjectBarrier( + 
@pytest.fixture()
def project_factory(db: Session):
    """Return a callable that builds, adds and flushes a ``Project``.

    Keyword arguments passed to the callable override the defaults.

    Usage in tests::

        p = project_factory(project_name="Test", current_stage=PipelineStage.FINANCING)
    """

    def _create(**overrides) -> Project:
        fields = {
            "project_id": _make_project_id(),
            "project_name": f"Test Project {uuid.uuid4().hex[:6]}",
            "project_slug": f"test-project-{uuid.uuid4().hex[:8]}",
            "city": "Oakland",
            "state": "CA",
            "jurisdiction": "City of Oakland",
            "total_units": 100,
            "affordable_units": 90,
            "market_units": 10,
            "current_stage": PipelineStage.CONCEPT,
            "building_type": BuildingType.NEW_CONSTRUCTION,
            "structure_type": StructureType.WOOD_FRAME,
            "stories": 4,
            "is_public": True,
        }
        # Caller-supplied values win over the defaults.
        new_project = Project(**{**fields, **overrides})
        db.add(new_project)
        db.flush()
        return new_project

    return _create


@pytest.fixture()
def funding_source_factory(db: Session):
    """Return a callable that builds, adds and flushes a ``FundingSource`` for a project id."""

    def _create(project_id: uuid.UUID, **overrides) -> FundingSource:
        fields = {
            "funding_id": uuid.uuid4(),
            "project_id": project_id,
            "source_type": FundingSourceType.LIHTC_4PCT,
            "source_name": f"Test Funding {uuid.uuid4().hex[:6]}",
            "amount": 5_000_000.00,
            "status": FundingSourceStatus.ANTICIPATED,
        }
        new_source = FundingSource(**{**fields, **overrides})
        db.add(new_source)
        db.flush()
        return new_source

    return _create


@pytest.fixture()
def barrier_factory(db: Session):
    """Return a callable that builds, adds and flushes a ``ProjectBarrier`` for a project id."""

    def _create(project_id: uuid.UUID, **overrides) -> ProjectBarrier:
        fields = {
            "barrier_id": uuid.uuid4(),
            "project_id": project_id,
            "barrier_type": "Generic Regulatory Barrier",
            "jurisdiction": "City of Oakland",
            "friction_score": 50,
            "days_delayed": 30,
            "cost_impact": 100_000.00,
            "stage_encountered": BarrierStage.ENTITLEMENT,
        }
        new_barrier = ProjectBarrier(**{**fields, **overrides})
        db.add(new_barrier)
        db.flush()
        return new_barrier

    return _create
+ """ + from src.analytics.peer_benchmarking import PeerBenchmarkResult, StageBenchmark + + return PeerBenchmarkResult( + peer_group_id=None, + peer_group_name="test_national_benchmarks", + project_count=0, + stage_benchmarks={ + "concept": StageBenchmark( + stage="concept", median_days=60.0, mean_days=65.0, + p25_days=30.0, p75_days=90.0, p90_days=120.0, + std_days=25.0, sample_size=50, + ), + "pre_development": StageBenchmark( + stage="pre_development", median_days=180.0, mean_days=190.0, + p25_days=120.0, p75_days=270.0, p90_days=365.0, + std_days=60.0, sample_size=50, + ), + "entitlement": StageBenchmark( + stage="entitlement", median_days=240.0, mean_days=260.0, + p25_days=150.0, p75_days=365.0, p90_days=540.0, + std_days=80.0, sample_size=50, + ), + "financing": StageBenchmark( + stage="financing", median_days=180.0, mean_days=195.0, + p25_days=120.0, p75_days=270.0, p90_days=365.0, + std_days=55.0, sample_size=50, + ), + "construction": StageBenchmark( + stage="construction", median_days=540.0, mean_days=560.0, + p25_days=365.0, p75_days=720.0, p90_days=900.0, + std_days=120.0, sample_size=50, + ), + "lease_up": StageBenchmark( + stage="lease_up", median_days=120.0, mean_days=130.0, + p25_days=60.0, p75_days=180.0, p90_days=270.0, + std_days=40.0, sample_size=50, + ), + }, + median_total_duration_days=1320.0, + median_cost_per_unit=350_000.0, + p25_cost_per_unit=250_000.0, + p75_cost_per_unit=475_000.0, + computed_at=datetime.utcnow().isoformat(), + ) diff --git a/tests/fixtures/__init__.py b/tests/fixtures/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/fixtures/sample_data.py b/tests/fixtures/sample_data.py new file mode 100644 index 0000000..abdc98e --- /dev/null +++ b/tests/fixtures/sample_data.py @@ -0,0 +1,358 @@ +"""Sample data dictionaries for HousingHand tests. 
+ +These are raw data dicts (not ORM objects) that can be used to construct +test payloads, compare against API responses, and seed test databases +via the factory fixtures in ``conftest.py``. +""" + +from datetime import date, timedelta + + +# --------------------------------------------------------------------------- +# Sample project data dicts +# --------------------------------------------------------------------------- + +SAMPLE_PROJECT_MINIMAL = { + "project_name": "Minimal Test Project", + "total_units": 50, +} + +SAMPLE_PROJECT_CREATE_PAYLOAD = { + "project_name": "Sunrise Village Apartments", + "city": "Oakland", + "state": "CA", + "jurisdiction": "City of Oakland", + "total_units": 120, + "affordable_units": 108, + "market_units": 12, + "building_type": "new_construction", + "structure_type": "wood_frame", + "stories": 4, + "current_stage": "entitlement", + "developer_org": "Community Housing Partners", + "total_development_cost": 42_000_000.00, + "is_public": True, +} + +SAMPLE_PROJECT_FULL = { + "project_name": "Harbor View Family Housing", + "city": "San Francisco", + "county": "San Francisco", + "state": "CA", + "zip": "94107", + "jurisdiction": "City and County of San Francisco", + "neighborhood": "Mission Bay", + "latitude": 37.7749, + "longitude": -122.4194, + "developer_org": "Tenderloin Neighborhood Development Corp", + "architect": "Leddy Maytum Stacy Architects", + "general_contractor": "Cahill Contractors", + "property_manager": "John Stewart Company", + "site_acres": 1.8, + "building_type": "new_construction", + "structure_type": "concrete", + "stories": 8, + "parking_spaces": 40, + "total_units": 200, + "affordable_units": 200, + "market_units": 0, + "studio_units": 20, + "one_br_units": 60, + "two_br_units": 80, + "three_br_units": 30, + "four_plus_br_units": 10, + "ami_30_units": 40, + "ami_40_units": 20, + "ami_50_units": 60, + "ami_60_units": 60, + "ami_80_units": 20, + "senior_units": 0, + "family_units": 200, + "homeless_set_aside": 
def make_on_track_project_data() -> dict:
    """Build the payload for a healthy project: on schedule, under budget, team complete."""
    days_in_stage = 60
    entered_stage = date.today() - timedelta(days=days_in_stage)
    payload = {
        "project_name": "Greenfield Heights On-Track",
        "city": "Oakland",
        "state": "CA",
        "jurisdiction": "City of Oakland",
        "total_units": 80,
        "affordable_units": 72,
        "market_units": 8,
        "building_type": "new_construction",
        "structure_type": "wood_frame",
        "stories": 3,
        "current_stage": "entitlement",
        "days_in_current_stage": days_in_stage,
        "stage_entry_date": str(entered_stage),
        "developer_org": "Oakland Community Builders",
        "architect": "Modern Arc Studio",
        "general_contractor": "Bay Builders Inc",
        "property_manager": "Westside Management",
        "total_development_cost": 28_000_000.00,
        "original_budget": 28_000_000.00,
        "current_budget": 27_500_000.00,
        "budget_variance_percent": -1.8,
        "risk_score": 15.0,
        "jurisdiction_friction_score": 30,
        "health_score": 85.0,
        "overall_health": "on_track",
        "data_completeness": 0.95,
    }
    return payload


def make_at_risk_project_data() -> dict:
    """Build the payload for a project with emerging risks: behind schedule, moderate opposition."""
    days_in_stage = 350
    entered_stage = date.today() - timedelta(days=days_in_stage)
    payload = {
        "project_name": "Eastside Commons At-Risk",
        "city": "Oakland",
        "state": "CA",
        "jurisdiction": "City of Oakland",
        "total_units": 60,
        "affordable_units": 54,
        "market_units": 6,
        "building_type": "substantial_rehab",
        "structure_type": "mixed",
        "stories": 3,
        "current_stage": "entitlement",
        "days_in_current_stage": days_in_stage,
        "stage_entry_date": str(entered_stage),
        "developer_org": "East Bay Housing Corp",
        "total_development_cost": 22_000_000.00,
        "original_budget": 20_000_000.00,
        "current_budget": 22_000_000.00,
        "budget_variance_percent": 10.0,
        "risk_score": 55.0,
        "jurisdiction_friction_score": 60,
        "neighbor_opposition_level": "moderate",
        "health_score": 55.0,
        "overall_health": "at_risk",
    }
    return payload


def make_delayed_project_data() -> dict:
    """Build the payload for a significantly delayed project: over budget, legal challenges."""
    days_in_stage = 500
    entered_stage = date.today() - timedelta(days=days_in_stage)
    payload = {
        "project_name": "West End Towers Delayed",
        "city": "Berkeley",
        "state": "CA",
        "jurisdiction": "City of Berkeley",
        "total_units": 150,
        "affordable_units": 135,
        "market_units": 15,
        "building_type": "new_construction",
        "structure_type": "concrete",
        "stories": 6,
        "current_stage": "entitlement",
        "days_in_current_stage": days_in_stage,
        "stage_entry_date": str(entered_stage),
        "developer_org": "Regional Housing Alliance",
        "total_development_cost": 65_000_000.00,
        "original_budget": 50_000_000.00,
        "current_budget": 65_000_000.00,
        "budget_variance_percent": 30.0,
        "risk_score": 75.0,
        "jurisdiction_friction_score": 80,
        "neighbor_opposition_level": "high",
        "appeals_filed": 2,
        "health_score": 35.0,
        "overall_health": "delayed",
    }
    return payload
"total_development_cost": 40_000_000.00, + "original_budget": 30_000_000.00, + "current_budget": 40_000_000.00, + "budget_variance_percent": 33.3, + "risk_score": 92.0, + "jurisdiction_friction_score": 90, + "neighbor_opposition_level": "severe", + "appeals_filed": 3, + "health_score": 12.0, + "overall_health": "stalled", + } + + +# --------------------------------------------------------------------------- +# Sample funding source data +# --------------------------------------------------------------------------- + +SAMPLE_FUNDING_SOURCES = [ + { + "source_type": "LIHTC_4pct", + "source_name": "California LIHTC 4%", + "provider_organization": "CTCAC", + "amount": 15_000_000.00, + "status": "committed", + }, + { + "source_type": "HOME", + "source_name": "HOME Investment Partnership", + "provider_organization": "HUD", + "amount": 3_000_000.00, + "status": "awarded", + }, + { + "source_type": "construction_loan", + "source_name": "Wells Fargo Construction Loan", + "provider_organization": "Wells Fargo Bank", + "amount": 20_000_000.00, + "status": "anticipated", + }, + { + "source_type": "local_trust_fund", + "source_name": "Oakland Housing Trust Fund", + "provider_organization": "City of Oakland", + "amount": 2_500_000.00, + "status": "applied", + }, +] + + +# --------------------------------------------------------------------------- +# Sample barrier data +# --------------------------------------------------------------------------- + +SAMPLE_BARRIERS = [ + { + "barrier_type": "Minimum Parking Requirements", + "barrier_description": "City requires 1.5 spaces/unit; project designed for 0.5.", + "jurisdiction": "City of Oakland", + "friction_score": 65, + "stage_encountered": "entitlement", + "days_delayed": 45, + "cost_impact": 250_000.00, + "variance_required": True, + }, + { + "barrier_type": "Height Limit Restriction", + "barrier_description": "Zoning caps height at 35 feet; project needs 55 feet.", + "jurisdiction": "City of Oakland", + "friction_score": 70, + 
"stage_encountered": "entitlement", + "days_delayed": 90, + "cost_impact": 500_000.00, + "variance_required": True, + }, + { + "barrier_type": "Environmental Review (CEQA)", + "barrier_description": "Full EIR required due to neighbor objection.", + "jurisdiction": "City of Berkeley", + "friction_score": 80, + "stage_encountered": "pre_development", + "days_delayed": 180, + "cost_impact": 750_000.00, + "variance_required": False, + }, + { + "barrier_type": "Design Review Iterations", + "barrier_description": "Planning commission required 4 rounds of design changes.", + "jurisdiction": "City of San Francisco", + "friction_score": 55, + "stage_encountered": "entitlement", + "days_delayed": 120, + "cost_impact": 350_000.00, + "variance_required": False, + }, +] + + +# --------------------------------------------------------------------------- +# Sample reform data +# --------------------------------------------------------------------------- + +SAMPLE_REFORM = { + "jurisdiction": "City of Oakland", + "reform_name": "Parking Minimum Elimination", + "reform_description": "Eliminated minimum parking requirements for projects within 0.5mi of transit.", + "reform_type": "parking_reform", + "effective_date": str(date(2023, 7, 1)), + "projects_pre_reform": 15, + "projects_post_reform": 8, + "pre_reform_median_days": 300, + "post_reform_median_days": 210, + "days_saved_per_project": 90, + "percent_improvement": 30.0, + "total_cost_savings": 3_600_000.00, + "units_enabled": 200, +} + + +# --------------------------------------------------------------------------- +# HousingLens mock API responses +# --------------------------------------------------------------------------- + +HOUSING_LENS_JURISDICTION_RESPONSE = { + "overall_score": 62, + "topics": [ + { + "name": "parking_minimums", + "friction_score": 75, + "jurisdiction_rank": 8, + "national_percentile": 82.0, + "description": "Minimum parking requirements above transit-oriented thresholds.", + }, + { + "name": 
"height_limits", + "friction_score": 60, + "jurisdiction_rank": 15, + "national_percentile": 68.0, + "description": "Restrictive height limits in high-opportunity zones.", + }, + { + "name": "density_caps", + "friction_score": 45, + "jurisdiction_rank": 22, + "national_percentile": 55.0, + "description": "Per-acre unit caps below state density bonus allowances.", + }, + ], + "last_updated": "2024-06-15T00:00:00", +} + +HOUSING_LENS_RELATED_TOPICS_RESPONSE = { + "topics": [ + { + "name": "parking_minimums", + "friction_score": 75, + "jurisdiction_rank": 8, + }, + ], +} diff --git a/tests/test_analytics/__init__.py b/tests/test_analytics/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_analytics/test_bottleneck_detection.py b/tests/test_analytics/test_bottleneck_detection.py new file mode 100644 index 0000000..1b5fa59 --- /dev/null +++ b/tests/test_analytics/test_bottleneck_detection.py @@ -0,0 +1,275 @@ +"""Tests for bottleneck detection and analysis. + +Validates that the bottleneck analysis endpoints and helper functions +correctly identify stage accumulation, barrier impacts, and +jurisdiction friction rankings. 
class TestStageBottlenecks:
    """Exercise ``_compute_stage_bottlenecks`` against seeded pipelines."""

    @staticmethod
    def _bottlenecks(db, jurisdiction=None):
        """Run the aggregation with the 180-day stall threshold every test here uses.

        Returns the ``(rows, total_active)`` pair produced by
        ``_compute_stage_bottlenecks``.
        """
        return _compute_stage_bottlenecks(
            db, stall_threshold_days=180, jurisdiction=jurisdiction, state=None,
        )

    @staticmethod
    def _row_for(rows, stage):
        """Return the first result row whose ``stage`` equals *stage*."""
        return next(row for row in rows if row.stage == stage)

    def test_empty_database_returns_zero_totals(self, db):
        """With nothing seeded, the active total and every stage count are zero."""
        rows, active = self._bottlenecks(db)
        assert active == 0
        assert all(row.project_count == 0 for row in rows)

    def test_single_project_counted_in_correct_stage(self, db, project_factory):
        """One entitlement-stage project shows up in the entitlement row."""
        days = 100
        project_factory(
            current_stage=PipelineStage.ENTITLEMENT,
            days_in_current_stage=days,
            stage_entry_date=date.today() - timedelta(days=days),
        )

        rows, active = self._bottlenecks(db)
        assert active == 1

        entitlement = self._row_for(rows, PipelineStage.ENTITLEMENT)
        assert entitlement.project_count == 1
        assert entitlement.avg_days is not None

    def test_stall_threshold_detection(self, db, project_factory):
        """Only the project past the 180-day threshold counts as stalled."""
        # 250 days is over the threshold, 50 days is under it.
        for days in (250, 50):
            project_factory(
                current_stage=PipelineStage.FINANCING,
                days_in_current_stage=days,
                stage_entry_date=date.today() - timedelta(days=days),
            )

        rows, _active = self._bottlenecks(db)
        financing = self._row_for(rows, PipelineStage.FINANCING)
        assert financing.project_count == 2
        assert financing.stalled_count == 1

    def test_jurisdiction_filter_applies(self, db, project_factory):
        """Filtering on Oakland leaves the Berkeley project out of the total."""
        for place in ("City of Oakland", "City of Berkeley"):
            project_factory(
                current_stage=PipelineStage.CONSTRUCTION,
                jurisdiction=place,
            )

        _rows, active = self._bottlenecks(db, jurisdiction="City of Oakland")
        assert active == 1

    def test_percentage_of_pipeline_computed(self, db, project_factory):
        """``pct_of_pipeline`` across all stages sums to roughly 100."""
        seed_plan = (
            (PipelineStage.ENTITLEMENT, 3),
            (PipelineStage.FINANCING, 2),
        )
        for stage, how_many in seed_plan:
            for _ in range(how_many):
                project_factory(current_stage=stage)

        rows, active = self._bottlenecks(db)
        assert active == 5

        pct_sum = sum(row.pct_of_pipeline for row in rows)
        assert abs(pct_sum - 100.0) < 1.0  # Rounding tolerance
def test_barrier_jurisdiction_filter(self, db, project_factory, barrier_factory):
    """Only barriers recorded for the requested jurisdiction are summarized."""
    project = project_factory()

    # Same barrier type in two jurisdictions; the filter should keep only
    # the Oakland row and therefore only its 30 days of delay.
    for place, delay in (("City of Oakland", 30), ("City of Berkeley", 40)):
        barrier_factory(
            project.project_id,
            barrier_type="Setback Requirements",
            days_delayed=delay,
            jurisdiction=place,
        )

    summaries = _compute_barrier_summaries(
        db, jurisdiction="City of Oakland", state=None, top_n=10,
    )
    assert len(summaries) == 1
    oakland_only = summaries[0]
    assert oakland_only.total_days_delayed == 30
class TestJurisdictionFriction:
    """Exercise ``_compute_jurisdiction_friction`` grouping, ranking, and filtering."""

    def test_empty_database_returns_empty(self, db):
        """With nothing seeded the ranking is an empty list."""
        ranking = _compute_jurisdiction_friction(db, state=None, top_n=10)
        assert ranking == []

    def test_jurisdiction_aggregation(self, db, project_factory):
        """Projects are grouped per jurisdiction and the highest friction ranks first."""
        # (jurisdiction, friction score, health, stage, entitlement days)
        seed_rows = (
            ("City of Oakland", 70, OverallHealth.AT_RISK, PipelineStage.ENTITLEMENT, 300),
            ("City of Oakland", 60, OverallHealth.ON_TRACK, PipelineStage.FINANCING, 200),
            ("City of Berkeley", 40, OverallHealth.ON_TRACK, PipelineStage.CONSTRUCTION, 150),
        )
        for place, friction, health, stage, entitlement_days in seed_rows:
            project_factory(
                jurisdiction=place,
                jurisdiction_friction_score=friction,
                overall_health=health,
                current_stage=stage,
                entitlement_duration_days=entitlement_days,
            )

        ranking = _compute_jurisdiction_friction(db, state=None, top_n=10)
        assert len(ranking) == 2

        # Oakland averages (70 + 60) / 2 = 65, above Berkeley's 40.
        top = ranking[0]
        assert top.jurisdiction == "City of Oakland"
        assert top.project_count == 2
        assert top.avg_friction_score == 65.0
        assert top.at_risk_count == 1

    def test_state_filter_applies(self, db, project_factory):
        """Filtering on CA leaves the Oregon jurisdiction out of the ranking."""
        for place, state_code, friction in (
            ("City of Oakland", "CA", 60),
            ("City of Portland", "OR", 50),
        ):
            project_factory(
                jurisdiction=place,
                state=state_code,
                jurisdiction_friction_score=friction,
            )

        ranking = _compute_jurisdiction_friction(db, state="CA", top_n=10)
        assert len(ranking) == 1
        assert ranking[0].jurisdiction == "City of Oakland"
class TestTimelineScoring:
    """Checks for ``_score_timeline``, the timeline dimension of the health score.

    NOTE(review): the day counts below assume ``mock_peer_benchmark`` uses a
    240-day entitlement median, as the inline comments state; confirm
    against the fixture.
    """

    @staticmethod
    def _score_at(project, benchmark, days):
        """Set ``days_in_current_stage`` to *days* and return the timeline result."""
        project.days_in_current_stage = days
        return _score_timeline(project, benchmark)

    def test_on_pace_project_scores_100(self, sample_project, mock_peer_benchmark):
        """A project at or below the peer median gets the full 100."""
        # 90 days against a 240-day median is a ratio of 0.375 -> score 100
        outcome = self._score_at(sample_project, mock_peer_benchmark, 90)
        assert outcome["raw_score"] == 100.0
        assert outcome["dimension"] == "timeline"

    def test_double_median_scores_zero(self, sample_project, mock_peer_benchmark):
        """A project at twice the peer median bottoms out at 0."""
        outcome = self._score_at(sample_project, mock_peer_benchmark, 480)  # 2x 240
        assert outcome["raw_score"] == 0.0

    def test_1_5x_median_scores_50(self, sample_project, mock_peer_benchmark):
        """A project at 1.5x the peer median lands near 50."""
        outcome = self._score_at(sample_project, mock_peer_benchmark, 360)  # 1.5x 240
        assert 45.0 <= outcome["raw_score"] <= 55.0

    def test_no_days_data_gets_default_score(self, sample_project, mock_peer_benchmark):
        """A missing ``days_in_current_stage`` yields the neutral default of 50."""
        outcome = self._score_at(sample_project, mock_peer_benchmark, None)
        assert outcome["raw_score"] == 50.0

    def test_stalled_stage_gets_default_score(self, sample_project, mock_peer_benchmark):
        """A project in the STALLED stage gets the default of 50."""
        sample_project.current_stage = PipelineStage.STALLED
        outcome = _score_timeline(sample_project, mock_peer_benchmark)
        assert outcome["raw_score"] == 50.0

    def test_weight_is_applied_correctly(self, sample_project, mock_peer_benchmark):
        """``weighted_score`` equals ``raw_score`` times the timeline weight, rounded to 2 places."""
        outcome = self._score_at(sample_project, mock_peer_benchmark, 90)
        timeline_weight = HEALTH_WEIGHTS["timeline"]
        expected_weighted = round(outcome["raw_score"] * timeline_weight, 2)
        assert outcome["weighted_score"] == expected_weighted
class TestRiskScoring:
    """Checks for ``_score_risk``, the risk dimension of the health score."""

    @staticmethod
    def _configure(project, **fields):
        """Assign each keyword as an attribute on *project*, in the order given."""
        for attr_name, attr_value in fields.items():
            setattr(project, attr_name, attr_value)
        return project

    def test_low_risk_scores_high(self, sample_project):
        """Low risk inputs across the board score at least 80."""
        self._configure(
            sample_project,
            risk_score=10.0,
            jurisdiction_friction_score=20,
            neighbor_opposition_level=NeighborOpposition.NONE,
            appeals_filed=0,
        )
        outcome = _score_risk(sample_project)
        assert outcome["raw_score"] >= 80.0

    def test_high_risk_scores_low(self, sample_project):
        """Severe risk inputs across the board score at most 30."""
        self._configure(
            sample_project,
            risk_score=90.0,
            jurisdiction_friction_score=85,
            neighbor_opposition_level=NeighborOpposition.SEVERE,
            appeals_filed=3,
        )
        outcome = _score_risk(sample_project)
        assert outcome["raw_score"] <= 30.0

    def test_no_risk_data_gets_default(self, sample_project):
        """With every optional risk input unset the score is exactly 75."""
        self._configure(
            sample_project,
            risk_score=None,
            jurisdiction_friction_score=None,
            neighbor_opposition_level=None,
            appeals_filed=0,
        )
        outcome = _score_risk(sample_project)
        assert outcome["raw_score"] == 75.0

    def test_opposition_levels_are_graded(self, sample_project):
        """LOW neighbor opposition scores strictly higher than HIGH."""
        # Blank out the other inputs so opposition is the only thing varying.
        self._configure(
            sample_project,
            risk_score=None,
            jurisdiction_friction_score=None,
            appeals_filed=0,
        )

        raw_by_level = {}
        for level in (NeighborOpposition.LOW, NeighborOpposition.HIGH):
            sample_project.neighbor_opposition_level = level
            raw_by_level[level] = _score_risk(sample_project)["raw_score"]

        assert raw_by_level[NeighborOpposition.LOW] > raw_by_level[NeighborOpposition.HIGH]
class TestAssessProjectHealth:
    """End-to-end checks of ``assess_project_health`` with benchmark lookups patched.

    Every test patches ``compute_peer_benchmarks`` and
    ``_load_national_benchmarks`` in ``src.analytics.health_assessment``.
    ``patch`` decorators inject bottom-up, so ``mock_benchmarks`` is the
    national-benchmark loader and ``mock_compute`` is the peer computation.
    """

    @staticmethod
    def _prime(mock_benchmarks, mock_compute, peer_benchmark):
        """Give both patched collaborators deterministic return values."""
        # A fresh dict per call keeps tests from sharing mutable state.
        mock_benchmarks.return_value = {
            "health_thresholds": {"on_track": 80, "at_risk": 60, "delayed": 40}
        }
        mock_compute.return_value = peer_benchmark

    @patch("src.analytics.health_assessment.compute_peer_benchmarks")
    @patch("src.analytics.health_assessment._load_national_benchmarks")
    def test_on_track_project_assessment(
        self, mock_benchmarks, mock_compute, db, sample_project, mock_peer_benchmark
    ):
        """The sample project yields a well-formed result covering all five dimensions."""
        self._prime(mock_benchmarks, mock_compute, mock_peer_benchmark)

        report = assess_project_health(db, sample_project.project_id, mock_peer_benchmark)

        assert report["project_id"] == str(sample_project.project_id)
        assert report["project_name"] == "Sunrise Village Apartments"
        assert report["current_stage"] == "entitlement"
        assert 0.0 <= report["composite_score"] <= 100.0

        valid_health_values = [member.value for member in OverallHealth]
        assert report["overall_health"] in valid_health_values

        assert "dimensions" in report
        expected_dimensions = {"timeline", "budget", "funding", "risk", "team"}
        assert set(report["dimensions"].keys()) == expected_dimensions

    @patch("src.analytics.health_assessment.compute_peer_benchmarks")
    @patch("src.analytics.health_assessment._load_national_benchmarks")
    def test_terminal_project_returns_fixed_score(
        self, mock_benchmarks, mock_compute, db, project_factory, mock_peer_benchmark
    ):
        """A project in the OPERATIONS stage is reported as 100 / on_track."""
        self._prime(mock_benchmarks, mock_compute, mock_peer_benchmark)
        operating = project_factory(current_stage=PipelineStage.OPERATIONS)

        report = assess_project_health(db, operating.project_id, mock_peer_benchmark)

        assert report["composite_score"] == 100.0
        assert report["overall_health"] == OverallHealth.ON_TRACK.value

    @patch("src.analytics.health_assessment.compute_peer_benchmarks")
    @patch("src.analytics.health_assessment._load_national_benchmarks")
    def test_abandoned_project_returns_zero_stalled(
        self, mock_benchmarks, mock_compute, db, project_factory, mock_peer_benchmark
    ):
        """A project in the ABANDONED stage is reported as 0 / stalled."""
        self._prime(mock_benchmarks, mock_compute, mock_peer_benchmark)
        abandoned = project_factory(current_stage=PipelineStage.ABANDONED)

        report = assess_project_health(db, abandoned.project_id, mock_peer_benchmark)

        assert report["composite_score"] == 0.0
        assert report["overall_health"] == OverallHealth.STALLED.value

    @patch("src.analytics.health_assessment.compute_peer_benchmarks")
    @patch("src.analytics.health_assessment._load_national_benchmarks")
    def test_nonexistent_project_raises(
        self, mock_benchmarks, mock_compute, db, mock_peer_benchmark
    ):
        """An unknown project id raises ``ValueError`` mentioning "not found"."""
        from uuid import uuid4

        self._prime(mock_benchmarks, mock_compute, mock_peer_benchmark)

        # A random UUID cannot match any row in the test database.
        missing_id = uuid4()
        with pytest.raises(ValueError, match="not found"):
            assess_project_health(db, missing_id, mock_peer_benchmark)

    @patch("src.analytics.health_assessment.compute_peer_benchmarks")
    @patch("src.analytics.health_assessment._load_national_benchmarks")
    def test_score_trend_detection(
        self, mock_benchmarks, mock_compute, db, sample_project, mock_peer_benchmark
    ):
        """The result reports a trend label and echoes the previous score.

        NOTE(review): the expected 82.0 presumably comes from the
        ``sample_project`` fixture's stored ``health_score``; confirm there.
        """
        self._prime(mock_benchmarks, mock_compute, mock_peer_benchmark)

        report = assess_project_health(db, sample_project.project_id, mock_peer_benchmark)

        assert report["score_trend"] in ("improving", "stable", "declining")
        assert report["previous_score"] == 82.0
"weight": 0.3, "weighted_score": 12.0, "detail": ""}, + "budget": {"dimension": "budget", "raw_score": 90.0, "weight": 0.25, "weighted_score": 22.5, "detail": ""}, + "funding": {"dimension": "funding", "raw_score": 90.0, "weight": 0.2, "weighted_score": 18.0, "detail": ""}, + "risk": {"dimension": "risk", "raw_score": 90.0, "weight": 0.15, "weighted_score": 13.5, "detail": ""}, + "team": {"dimension": "team", "raw_score": 90.0, "weight": 0.1, "weighted_score": 9.0, "detail": ""}, + } + recs = _generate_recommendations(sample_project, dimensions, OverallHealth.AT_RISK) + assert len(recs) > 0 + assert any("timeline" in r.lower() or "schedule" in r.lower() for r in recs) + + @patch("src.analytics.health_assessment._load_national_benchmarks") + def test_all_high_scores_produces_no_recommendations(self, mock_benchmarks, sample_project): + mock_benchmarks.return_value = { + "health_thresholds": {"on_track": 80, "at_risk": 60, "delayed": 40} + } + dimensions = { + "timeline": {"dimension": "timeline", "raw_score": 95.0, "weight": 0.3, "weighted_score": 28.5, "detail": ""}, + "budget": {"dimension": "budget", "raw_score": 95.0, "weight": 0.25, "weighted_score": 23.75, "detail": ""}, + "funding": {"dimension": "funding", "raw_score": 95.0, "weight": 0.2, "weighted_score": 19.0, "detail": ""}, + "risk": {"dimension": "risk", "raw_score": 95.0, "weight": 0.15, "weighted_score": 14.25, "detail": ""}, + "team": {"dimension": "team", "raw_score": 95.0, "weight": 0.1, "weighted_score": 9.5, "detail": ""}, + } + recs = _generate_recommendations(sample_project, dimensions, OverallHealth.ON_TRACK) + assert len(recs) == 0 + + @patch("src.analytics.health_assessment._load_national_benchmarks") + def test_recommendations_capped_at_five(self, mock_benchmarks, sample_project): + mock_benchmarks.return_value = { + "health_thresholds": {"on_track": 80, "at_risk": 60, "delayed": 40} + } + dimensions = { + "timeline": {"dimension": "timeline", "raw_score": 20.0, "weight": 0.3, 
"weighted_score": 6.0, "detail": ""}, + "budget": {"dimension": "budget", "raw_score": 20.0, "weight": 0.25, "weighted_score": 5.0, "detail": ""}, + "funding": {"dimension": "funding", "raw_score": 20.0, "weight": 0.2, "weighted_score": 4.0, "detail": ""}, + "risk": {"dimension": "risk", "raw_score": 20.0, "weight": 0.15, "weighted_score": 3.0, "detail": ""}, + "team": {"dimension": "team", "raw_score": 20.0, "weight": 0.1, "weighted_score": 2.0, "detail": ""}, + } + recs = _generate_recommendations(sample_project, dimensions, OverallHealth.STALLED) + assert len(recs) <= 5 diff --git a/tests/test_analytics/test_portfolio_intelligence.py b/tests/test_analytics/test_portfolio_intelligence.py new file mode 100644 index 0000000..9799fba --- /dev/null +++ b/tests/test_analytics/test_portfolio_intelligence.py @@ -0,0 +1,150 @@ +"""Tests for portfolio intelligence dashboard generation.""" + +from datetime import date, timedelta + +import pytest + +from src.models.enums import ( + BuildingType, + FundingSourceStatus, + FundingSourceType, + OverallHealth, + PipelineStage, +) +from src.models.funding_source import FundingSource +from src.models.project import Project + + +@pytest.fixture +def portfolio_projects(db): + """Create a portfolio of projects in various stages.""" + projects_data = [ + { + "project_name": "Downtown Apts", + "project_slug": "downtown-apts", + "jurisdiction": "Portland, OR", + "city": "Portland", + "state": "OR", + "total_units": 80, + "affordable_units": 72, + "current_stage": PipelineStage.CONSTRUCTION, + "overall_health": OverallHealth.ON_TRACK, + "health_score": 85.0, + "total_development_cost": 30_000_000, + "construction_start": date.today() - timedelta(days=90), + "concept_start": date.today() - timedelta(days=600), + "predicted_co": date.today() + timedelta(days=365), + }, + { + "project_name": "Hillside Senior", + "project_slug": "hillside-senior", + "jurisdiction": "Portland, OR", + "city": "Portland", + "state": "OR", + "total_units": 60, + 
"affordable_units": 60, + "current_stage": PipelineStage.ENTITLEMENT, + "overall_health": OverallHealth.AT_RISK, + "health_score": 55.0, + "total_development_cost": 24_000_000, + "funding_gap": 3_000_000, + "concept_start": date.today() - timedelta(days=300), + }, + { + "project_name": "River View Family", + "project_slug": "river-view-family", + "jurisdiction": "Portland, OR", + "city": "Portland", + "state": "OR", + "total_units": 45, + "affordable_units": 45, + "current_stage": PipelineStage.FINANCING, + "overall_health": OverallHealth.ON_TRACK, + "health_score": 78.0, + "total_development_cost": 18_000_000, + "concept_start": date.today() - timedelta(days=450), + "predicted_co": date.today() + timedelta(days=540), + }, + ] + + projects = [] + for data in projects_data: + p = Project(**data) + db.add(p) + projects.append(p) + + db.flush() + + # Add funding source + db.add(FundingSource( + project_id=projects[0].project_id, + source_type=FundingSourceType.LIHTC_9PCT, + source_name="9% LIHTC", + amount=10_000_000, + status=FundingSourceStatus.CLOSED, + )) + + db.commit() + return projects + + +class TestPortfolioIntelligence: + """Tests for generate_portfolio_intelligence (or generate_portfolio_dashboard).""" + + def test_portfolio_returns_summary(self, db, portfolio_projects): + """Portfolio should return summary with total projects and units.""" + from src.analytics.portfolio_intelligence import ( + generate_portfolio_dashboard, + ) + + result = generate_portfolio_dashboard( + db=db, + geography_filter={"jurisdiction": "Portland, OR"}, + stakeholder_type="city", + ) + assert isinstance(result, dict) + summary = result.get("portfolio_summary", {}) + assert summary.get("total_projects", 0) >= 3 + assert summary.get("total_units", 0) >= 185 + + def test_portfolio_pipeline_snapshot(self, db, portfolio_projects): + """Should include pipeline snapshot with stage breakdown.""" + from src.analytics.portfolio_intelligence import ( + generate_portfolio_dashboard, + ) + + 
result = generate_portfolio_dashboard( + db=db, + geography_filter={"jurisdiction": "Portland, OR"}, + stakeholder_type="city", + ) + snapshot = result.get("pipeline_snapshot", {}) + assert "units_by_stage" in snapshot or "projects_by_stage" in snapshot + + def test_portfolio_health_distribution(self, db, portfolio_projects): + """Should include health distribution counts.""" + from src.analytics.portfolio_intelligence import ( + generate_portfolio_dashboard, + ) + + result = generate_portfolio_dashboard( + db=db, + geography_filter={"jurisdiction": "Portland, OR"}, + stakeholder_type="city", + ) + health = result.get("health_distribution", {}) + assert isinstance(health, dict) + + def test_empty_portfolio(self, db): + """Should handle empty portfolio gracefully.""" + from src.analytics.portfolio_intelligence import ( + generate_portfolio_dashboard, + ) + + result = generate_portfolio_dashboard( + db=db, + geography_filter={"jurisdiction": "Nonexistent, XX"}, + stakeholder_type="city", + ) + summary = result.get("portfolio_summary", {}) + assert summary.get("total_projects", 0) == 0 diff --git a/tests/test_analytics/test_reform_impact.py b/tests/test_analytics/test_reform_impact.py new file mode 100644 index 0000000..aec3683 --- /dev/null +++ b/tests/test_analytics/test_reform_impact.py @@ -0,0 +1,110 @@ +"""Tests for policy reform impact measurement.""" + +from datetime import date, timedelta + +import pytest + +from src.models.enums import BuildingType, PipelineStage +from src.models.project import Project + + +@pytest.fixture +def pre_reform_projects(db): + """Create projects that completed entitlement before a reform.""" + reform_date = date.today() - timedelta(days=365) + projects = [] + for i in range(6): + p = Project( + project_name=f"Pre Reform {i}", + project_slug=f"pre-reform-{i}", + jurisdiction="TestCity, CA", + city="TestCity", + state="CA", + total_units=50, + affordable_units=50, + current_stage=PipelineStage.CONSTRUCTION, + 
entitlement_start=reform_date - timedelta(days=400 + i * 10), + entitlement_complete=reform_date - timedelta(days=100 + i * 10), + entitlement_duration_days=300 + i * 10, + ) + db.add(p) + projects.append(p) + db.commit() + return projects + + +@pytest.fixture +def post_reform_projects(db): + """Create projects that started entitlement after a reform.""" + reform_date = date.today() - timedelta(days=365) + projects = [] + for i in range(4): + p = Project( + project_name=f"Post Reform {i}", + project_slug=f"post-reform-{i}", + jurisdiction="TestCity, CA", + city="TestCity", + state="CA", + total_units=50, + affordable_units=50, + current_stage=PipelineStage.FINANCING, + entitlement_start=reform_date + timedelta(days=60 + i * 10), + entitlement_complete=reform_date + timedelta(days=260 + i * 10), + entitlement_duration_days=200 + i * 10, + ) + db.add(p) + projects.append(p) + db.commit() + return projects + + +class TestReformImpact: + """Tests for measure_reform_impact.""" + + def test_reform_impact_returns_result( + self, db, pre_reform_projects, post_reform_projects + ): + """Should return reform impact analysis.""" + from src.analytics.reform_impact import measure_reform_impact + + reform_date = date.today() - timedelta(days=365) + result = measure_reform_impact( + db=db, + jurisdiction="TestCity, CA", + reform_date=reform_date, + reform_description="Streamlined design review", + ) + assert isinstance(result, dict) + + def test_insufficient_data_returns_error(self, db): + """Should indicate insufficient data when too few projects.""" + from src.analytics.reform_impact import measure_reform_impact + + result = measure_reform_impact( + db=db, + jurisdiction="EmptyCity, XX", + reform_date=date.today() - timedelta(days=180), + reform_description="Some reform", + ) + assert isinstance(result, dict) + # Should handle gracefully either with error key or low confidence + + def test_reform_with_improvement( + self, db, pre_reform_projects, post_reform_projects + ): + 
"""Post-reform projects with shorter durations should show improvement.""" + from src.analytics.reform_impact import measure_reform_impact + + reform_date = date.today() - timedelta(days=365) + result = measure_reform_impact( + db=db, + jurisdiction="TestCity, CA", + reform_date=reform_date, + reform_description="Streamlined design review", + ) + # The post-reform projects have ~200 day durations vs ~300 pre-reform + # So we expect to see improvement + if "analysis" in result: + analysis = result["analysis"] + if "days_saved_per_project" in analysis: + assert analysis["days_saved_per_project"] >= 0 diff --git a/tests/test_analytics/test_timeline_prediction.py b/tests/test_analytics/test_timeline_prediction.py new file mode 100644 index 0000000..94db798 --- /dev/null +++ b/tests/test_analytics/test_timeline_prediction.py @@ -0,0 +1,298 @@ +"""Tests for the analytics timeline prediction module. + +Validates single-project predictions, friction-to-multiplier conversion, +risk adjustments, remaining-stage calculation, and terminal project +handling. 
+""" + +from datetime import date, timedelta +from unittest.mock import patch + +import pytest + +from src.analytics.timeline_prediction import ( + _friction_to_multiplier, + _get_remaining_stages, + _risk_to_multiplier, + predict_project_timeline, +) +from src.models.enums import ( + BuildingType, + NeighborOpposition, + OverallHealth, + PipelineStage, +) +from src.models.project import Project + + +# --------------------------------------------------------------------------- +# Friction-to-multiplier conversion +# --------------------------------------------------------------------------- + + +class TestFrictionToMultiplier: + """Test the friction score to duration multiplier mapping.""" + + def test_median_friction_is_1x(self): + """A friction score of 50 (median) should produce 1.0x multiplier.""" + assert _friction_to_multiplier(50) == 1.0 + + def test_high_friction_increases_multiplier(self): + """A friction score of 100 should produce approximately 1.8x.""" + result = _friction_to_multiplier(100) + assert result == pytest.approx(1.8, abs=0.05) + + def test_low_friction_decreases_multiplier(self): + """A friction score of 1 should produce approximately 0.7x.""" + result = _friction_to_multiplier(1) + assert result == pytest.approx(0.706, abs=0.05) + + def test_none_friction_returns_1x(self): + """None friction score should default to 1.0x.""" + assert _friction_to_multiplier(None) == 1.0 + + def test_multiplier_is_monotonic(self): + """Higher friction should always produce a higher multiplier.""" + prev = _friction_to_multiplier(1) + for score in range(10, 101, 10): + current = _friction_to_multiplier(score) + assert current >= prev, ( + f"Multiplier decreased from score {score - 10} to {score}" + ) + prev = current + + +# --------------------------------------------------------------------------- +# Risk-to-multiplier conversion +# --------------------------------------------------------------------------- + + +class TestRiskToMultiplier: + """Test the 
project risk factor to duration multiplier mapping.""" + + def test_low_risk_project(self, sample_project): + """A low-risk project should have a multiplier near 1.0.""" + sample_project.risk_score = 15.0 + sample_project.appeals_filed = 0 + sample_project.neighbor_opposition_level = NeighborOpposition.NONE + sample_project.design_review_iterations = 1 + result = _risk_to_multiplier(sample_project) + assert 0.8 <= result <= 1.1 + + def test_high_risk_project(self, sample_project): + """A high-risk project should have an elevated multiplier.""" + sample_project.risk_score = 90.0 + sample_project.appeals_filed = 3 + sample_project.neighbor_opposition_level = NeighborOpposition.SEVERE + sample_project.design_review_iterations = 6 + result = _risk_to_multiplier(sample_project) + assert result > 1.2 + + def test_no_risk_data_returns_1x(self, sample_project): + """A project with no risk data should default to 1.0.""" + sample_project.risk_score = None + sample_project.appeals_filed = 0 + sample_project.neighbor_opposition_level = None + sample_project.design_review_iterations = 0 + assert _risk_to_multiplier(sample_project) == 1.0 + + def test_multiplier_is_capped(self, sample_project): + """Multiplier should not exceed 1.5 regardless of inputs.""" + sample_project.risk_score = 100.0 + sample_project.appeals_filed = 10 + sample_project.neighbor_opposition_level = NeighborOpposition.SEVERE + sample_project.design_review_iterations = 20 + result = _risk_to_multiplier(sample_project) + assert result <= 1.5 + + +# --------------------------------------------------------------------------- +# Remaining stages calculation +# --------------------------------------------------------------------------- + + +class TestGetRemainingStages: + """Test the remaining stages computation.""" + + def test_concept_returns_all_stages(self): + """A concept-stage project should have all stages remaining.""" + remaining = _get_remaining_stages("concept") + assert remaining == [ + "concept", 
"pre_development", "entitlement", + "financing", "construction", "lease_up", + ] + + def test_construction_returns_construction_and_beyond(self): + """A construction-stage project should return construction + lease_up.""" + remaining = _get_remaining_stages("construction") + assert remaining == ["construction", "lease_up"] + + def test_lease_up_returns_only_lease_up(self): + """A lease_up-stage project should return just lease_up.""" + remaining = _get_remaining_stages("lease_up") + assert remaining == ["lease_up"] + + def test_unknown_stage_returns_full_pipeline(self): + """An unknown stage should return the full pipeline.""" + remaining = _get_remaining_stages("something_weird") + assert len(remaining) == 6 + + +# --------------------------------------------------------------------------- +# Single project prediction (integration) +# --------------------------------------------------------------------------- + + +class TestPredictProjectTimeline: + """Integration tests for predict_project_timeline.""" + + @patch("src.analytics.timeline_prediction._load_national_benchmarks") + @patch("src.analytics.timeline_prediction.compute_peer_benchmarks") + def test_entitlement_project_prediction( + self, mock_compute, mock_benchmarks, db, sample_project, mock_peer_benchmark + ): + """A mid-pipeline project should get stage-by-stage predictions.""" + mock_compute.return_value = mock_peer_benchmark + mock_benchmarks.return_value = { + "stage_durations": { + "entitlement": {"median": 240}, + "financing": {"median": 180}, + "construction": {"median": 540}, + "lease_up": {"median": 120}, + }, + "holding_costs": {"daily_per_unit_during_entitlement": 20}, + } + + result = predict_project_timeline( + db, sample_project.project_id, mock_peer_benchmark + ) + + assert result["project_id"] == str(sample_project.project_id) + assert result["current_stage"] == "entitlement" + assert result["predicted_remaining_days"] > 0 + assert 0.0 < result["confidence"] <= 1.0 + assert 
len(result["stage_predictions"]) >= 1 + assert result["method"] in ("peer_adjusted", "national_adjusted") + + @patch("src.analytics.timeline_prediction._load_national_benchmarks") + @patch("src.analytics.timeline_prediction.compute_peer_benchmarks") + def test_terminal_project_returns_zero_remaining( + self, mock_compute, mock_benchmarks, db, project_factory, mock_peer_benchmark + ): + """A project in operations should predict 0 remaining days.""" + mock_compute.return_value = mock_peer_benchmark + mock_benchmarks.return_value = {"stage_durations": {}, "holding_costs": {}} + proj = project_factory(current_stage=PipelineStage.OPERATIONS) + + result = predict_project_timeline( + db, proj.project_id, mock_peer_benchmark + ) + + assert result["predicted_remaining_days"] == 0.0 + assert result["method"] == "terminal" + assert result["confidence"] == 1.0 + + @patch("src.analytics.timeline_prediction._load_national_benchmarks") + @patch("src.analytics.timeline_prediction.compute_peer_benchmarks") + def test_nonexistent_project_raises( + self, mock_compute, mock_benchmarks, db, mock_peer_benchmark + ): + """Predicting for a nonexistent project should raise ValueError.""" + import uuid + mock_compute.return_value = mock_peer_benchmark + mock_benchmarks.return_value = {"stage_durations": {}, "holding_costs": {}} + + with pytest.raises(ValueError, match="not found"): + predict_project_timeline(db, uuid.uuid4(), mock_peer_benchmark) + + @patch("src.analytics.timeline_prediction._load_national_benchmarks") + @patch("src.analytics.timeline_prediction.compute_peer_benchmarks") + def test_prediction_includes_confidence_interval( + self, mock_compute, mock_benchmarks, db, sample_project, mock_peer_benchmark + ): + """Prediction should include a confidence interval tuple.""" + mock_compute.return_value = mock_peer_benchmark + mock_benchmarks.return_value = { + "stage_durations": {"entitlement": {"median": 240}}, + "holding_costs": {}, + } + + result = predict_project_timeline( + db, 
sample_project.project_id, mock_peer_benchmark + ) + + ci = result["confidence_interval_days"] + assert isinstance(ci, tuple) + assert len(ci) == 2 + low, high = ci + assert low <= high + + @patch("src.analytics.timeline_prediction._load_national_benchmarks") + @patch("src.analytics.timeline_prediction.compute_peer_benchmarks") + def test_high_friction_extends_prediction( + self, mock_compute, mock_benchmarks, db, project_factory, mock_peer_benchmark + ): + """Higher friction scores should produce longer predictions.""" + mock_compute.return_value = mock_peer_benchmark + mock_benchmarks.return_value = { + "stage_durations": {"entitlement": {"median": 240}}, + "holding_costs": {}, + } + + low_friction = project_factory( + current_stage=PipelineStage.ENTITLEMENT, + jurisdiction_friction_score=20, + days_in_current_stage=0, + ) + high_friction = project_factory( + current_stage=PipelineStage.ENTITLEMENT, + jurisdiction_friction_score=90, + days_in_current_stage=0, + ) + + result_low = predict_project_timeline( + db, low_friction.project_id, mock_peer_benchmark + ) + result_high = predict_project_timeline( + db, high_friction.project_id, mock_peer_benchmark + ) + + assert ( + result_high["predicted_remaining_days"] + > result_low["predicted_remaining_days"] + ) + + @patch("src.analytics.timeline_prediction._load_national_benchmarks") + @patch("src.analytics.timeline_prediction.compute_peer_benchmarks") + def test_days_in_stage_subtracted_from_current_stage( + self, mock_compute, mock_benchmarks, db, project_factory, mock_peer_benchmark + ): + """Days already spent in current stage should be subtracted.""" + mock_compute.return_value = mock_peer_benchmark + mock_benchmarks.return_value = { + "stage_durations": {"entitlement": {"median": 240}}, + "holding_costs": {}, + } + + fresh = project_factory( + current_stage=PipelineStage.ENTITLEMENT, + days_in_current_stage=0, + jurisdiction_friction_score=50, + ) + midway = project_factory( + 
current_stage=PipelineStage.ENTITLEMENT, + days_in_current_stage=120, + jurisdiction_friction_score=50, + ) + + result_fresh = predict_project_timeline( + db, fresh.project_id, mock_peer_benchmark + ) + result_midway = predict_project_timeline( + db, midway.project_id, mock_peer_benchmark + ) + + assert ( + result_midway["predicted_remaining_days"] + < result_fresh["predicted_remaining_days"] + ) diff --git a/tests/test_api/__init__.py b/tests/test_api/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_api/test_projects.py b/tests/test_api/test_projects.py new file mode 100644 index 0000000..6060e82 --- /dev/null +++ b/tests/test_api/test_projects.py @@ -0,0 +1,104 @@ +"""Tests for project CRUD API endpoints.""" + +import pytest +from fastapi.testclient import TestClient + +from src.api.app import app +from src.database.connection import get_db +from src.models.enums import PipelineStage +from src.models.project import Project + + +@pytest.fixture +def client(db): + """Create a FastAPI TestClient with overridden DB dependency.""" + + def override_get_db(): + try: + yield db + finally: + pass + + app.dependency_overrides[get_db] = override_get_db + with TestClient(app) as c: + yield c + app.dependency_overrides.clear() + + +@pytest.fixture +def existing_project(db): + """Create a project in the database.""" + p = Project( + project_name="Test API Project", + project_slug="test-api-project", + city="Portland", + state="OR", + jurisdiction="Portland, OR", + total_units=50, + affordable_units=50, + current_stage=PipelineStage.CONCEPT, + ) + db.add(p) + db.commit() + db.refresh(p) + return p + + +class TestListProjects: + """Tests for GET /api/v1/projects.""" + + def test_list_projects_empty(self, client): + """Should return empty list when no projects exist.""" + response = client.get("/api/v1/projects") + assert response.status_code == 200 + data = response.json() + assert isinstance(data, (list, dict)) + + def 
test_list_projects_with_data(self, client, existing_project): + """Should return projects when they exist.""" + response = client.get("/api/v1/projects") + assert response.status_code == 200 + + def test_list_projects_filter_by_state(self, client, existing_project): + """Should filter projects by state.""" + response = client.get("/api/v1/projects", params={"state": "OR"}) + assert response.status_code == 200 + + +class TestGetProject: + """Tests for GET /api/v1/projects/{id}.""" + + def test_get_existing_project(self, client, existing_project): + """Should return project details by ID.""" + response = client.get(f"/api/v1/projects/{existing_project.project_id}") + assert response.status_code == 200 + + def test_get_nonexistent_project(self, client): + """Should return 404 for nonexistent project.""" + import uuid + + fake_id = uuid.uuid4() + response = client.get(f"/api/v1/projects/{fake_id}") + assert response.status_code == 404 + + +class TestCreateProject: + """Tests for POST /api/v1/projects.""" + + def test_create_project(self, client): + """Should create a new project.""" + payload = { + "project_name": "New Test Project", + "city": "Sacramento", + "state": "CA", + "total_units": 100, + "affordable_units": 80, + } + response = client.post("/api/v1/projects", json=payload) + assert response.status_code in [200, 201] + + def test_create_project_missing_name(self, client): + """Should reject project without name.""" + payload = {"total_units": 50} + response = client.post("/api/v1/projects", json=payload) + assert response.status_code == 422 diff --git a/tests/test_integrations/__init__.py b/tests/test_integrations/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_integrations/test_housing_lens.py b/tests/test_integrations/test_housing_lens.py new file mode 100644 index 0000000..b4a7dd4 --- /dev/null +++ b/tests/test_integrations/test_housing_lens.py @@ -0,0 +1,81 @@ +"""Tests for HousingLens integration client.""" + +import pytest 
+ +from src.integrations.housing_lens import ( + FrictionTopic, + HousingLensClient, + JurisdictionFrictionData, +) + + +class TestJurisdictionFrictionData: + """Tests for the JurisdictionFrictionData dataclass.""" + + def test_get_topic_score_existing(self): + """Should return score for an existing topic.""" + data = JurisdictionFrictionData( + jurisdiction="TestCity, CA", + overall_score=650, + topics=[ + FrictionTopic( + name="parking_requirements", + friction_score=720, + jurisdiction_rank=15, + ), + FrictionTopic( + name="design_review", + friction_score=500, + jurisdiction_rank=42, + ), + ], + ) + assert data.get_topic_score("parking_requirements") == 720 + assert data.get_topic_score("design_review") == 500 + + def test_get_topic_score_missing(self): + """Should return 0 for a non-existent topic.""" + data = JurisdictionFrictionData( + jurisdiction="TestCity, CA", + overall_score=650, + topics=[], + ) + assert data.get_topic_score("nonexistent") == 0 + + def test_get_topic_existing(self): + """Should return FrictionTopic object for an existing topic.""" + topic = FrictionTopic( + name="zoning_variances", + friction_score=800, + jurisdiction_rank=5, + ) + data = JurisdictionFrictionData( + jurisdiction="TestCity, CA", + overall_score=700, + topics=[topic], + ) + result = data.get_topic("zoning_variances") + assert result is not None + assert result.friction_score == 800 + + def test_get_topic_missing(self): + """Should return None for a non-existent topic.""" + data = JurisdictionFrictionData( + jurisdiction="TestCity, CA", + overall_score=700, + topics=[], + ) + assert data.get_topic("nonexistent") is None + + +class TestHousingLensClient: + """Tests for HousingLensClient initialization.""" + + def test_client_creation(self): + """Should create client with custom URL and key.""" + client = HousingLensClient( + base_url="http://test:8001/api/v1", + api_key="test-key", + ) + assert client.base_url == "http://test:8001/api/v1" + assert client.api_key == 
"test-key" diff --git a/tests/test_ml/__init__.py b/tests/test_ml/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_ml/test_feature_engineering.py b/tests/test_ml/test_feature_engineering.py new file mode 100644 index 0000000..3d91759 --- /dev/null +++ b/tests/test_ml/test_feature_engineering.py @@ -0,0 +1,84 @@ +"""Tests for ML feature engineering.""" + +from datetime import date, timedelta + +import pytest + +from src.models.enums import BuildingType, PipelineStage +from src.models.project import Project + + +@pytest.fixture +def feature_project(db): + """Create a project for feature extraction tests.""" + p = Project( + project_name="Feature Test Project", + project_slug="feature-test-project", + city="Denver", + state="CO", + jurisdiction="Denver, CO", + total_units=100, + affordable_units=80, + market_units=20, + ami_30_units=20, + ami_40_units=10, + ami_50_units=30, + ami_60_units=20, + stories=5, + parking_spaces=75, + site_acres=2.0, + building_type=BuildingType.NEW_CONSTRUCTION, + current_stage=PipelineStage.ENTITLEMENT, + concept_start=date.today() - timedelta(days=300), + ) + db.add(p) + db.commit() + return p + + +class TestFeatureEngineering: + """Tests for feature extraction from projects.""" + + def test_extract_features_returns_dict(self, db, feature_project): + """Feature extraction should return a dict of features.""" + from src.ml.feature_engineering import extract_features + + features = extract_features(feature_project) + assert isinstance(features, dict) + assert "total_units" in features + + def test_feature_values_are_numeric(self, db, feature_project): + """All feature values should be numeric.""" + from src.ml.feature_engineering import extract_features + + features = extract_features(feature_project) + for key, value in features.items(): + assert isinstance(value, (int, float)), ( + f"Feature {key} has non-numeric value: {value}" + ) + + def test_affordable_pct_calculation(self, db, feature_project): + 
"""Should correctly calculate affordable unit percentage.""" + from src.ml.feature_engineering import extract_features + + features = extract_features(feature_project) + if "affordable_units_pct" in features: + assert 0.0 <= features["affordable_units_pct"] <= 1.0 + assert abs(features["affordable_units_pct"] - 0.8) < 0.01 + + def test_deep_affordability_pct(self, db, feature_project): + """Should calculate deep affordability percentage.""" + from src.ml.feature_engineering import extract_features + + features = extract_features(feature_project) + if "deep_affordability_pct" in features: + # 20 + 10 = 30 out of 100 + assert abs(features["deep_affordability_pct"] - 0.30) < 0.01 + + def test_parking_ratio(self, db, feature_project): + """Should calculate parking ratio.""" + from src.ml.feature_engineering import extract_features + + features = extract_features(feature_project) + if "parking_ratio" in features: + assert abs(features["parking_ratio"] - 0.75) < 0.01 From 72c887d0a4b35b8f46f00465d2c639bf129ade81 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 7 Feb 2026 19:37:07 +0000 Subject: [PATCH 2/5] Enhance reform impact tests with helper function unit tests and integration coverage https://claude.ai/code/session_01K1NfMrvDoARx7PAB1cvHbZ --- tests/test_analytics/test_reform_impact.py | 378 ++++++++++++++++----- 1 file changed, 289 insertions(+), 89 deletions(-) diff --git a/tests/test_analytics/test_reform_impact.py b/tests/test_analytics/test_reform_impact.py index aec3683..7e7d0b9 100644 --- a/tests/test_analytics/test_reform_impact.py +++ b/tests/test_analytics/test_reform_impact.py @@ -1,110 +1,310 @@ -"""Tests for policy reform impact measurement.""" +"""Tests for the policy reform impact measurement module. +Validates pre/post comparison, statistical testing, cost savings estimates, +confidence level determination, and helper functions. 
+""" + +import uuid from datetime import date, timedelta +from unittest.mock import patch +import numpy as np import pytest -from src.models.enums import BuildingType, PipelineStage +from src.analytics.reform_impact import ( + _bootstrap_ci_diff, + _compute_slope, + _determine_confidence, + _extract_durations, + _stall_rate, + measure_reform_impact, +) +from src.models.enums import ( + BarrierStage, + BuildingType, + ConfidenceLevel, + PipelineStage, + ReformType, +) from src.models.project import Project +from src.models.reform import PolicyReform -@pytest.fixture -def pre_reform_projects(db): - """Create projects that completed entitlement before a reform.""" - reform_date = date.today() - timedelta(days=365) - projects = [] - for i in range(6): - p = Project( - project_name=f"Pre Reform {i}", - project_slug=f"pre-reform-{i}", - jurisdiction="TestCity, CA", - city="TestCity", - state="CA", - total_units=50, - affordable_units=50, - current_stage=PipelineStage.CONSTRUCTION, - entitlement_start=reform_date - timedelta(days=400 + i * 10), - entitlement_complete=reform_date - timedelta(days=100 + i * 10), - entitlement_duration_days=300 + i * 10, - ) - db.add(p) - projects.append(p) - db.commit() - return projects - - -@pytest.fixture -def post_reform_projects(db): - """Create projects that started entitlement after a reform.""" - reform_date = date.today() - timedelta(days=365) - projects = [] - for i in range(4): - p = Project( - project_name=f"Post Reform {i}", - project_slug=f"post-reform-{i}", - jurisdiction="TestCity, CA", - city="TestCity", - state="CA", - total_units=50, - affordable_units=50, - current_stage=PipelineStage.FINANCING, - entitlement_start=reform_date + timedelta(days=60 + i * 10), - entitlement_complete=reform_date + timedelta(days=260 + i * 10), - entitlement_duration_days=200 + i * 10, - ) - db.add(p) - projects.append(p) - db.commit() - return projects +# --------------------------------------------------------------------------- +# Helper 
function unit tests +# --------------------------------------------------------------------------- -class TestReformImpact: - """Tests for measure_reform_impact.""" +class TestExtractDurations: + """Test the _extract_durations helper.""" - def test_reform_impact_returns_result( - self, db, pre_reform_projects, post_reform_projects - ): - """Should return reform impact analysis.""" - from src.analytics.reform_impact import measure_reform_impact + def test_extracts_entitlement_durations(self, project_factory): + """Should extract non-null entitlement durations from projects.""" + p1 = project_factory(entitlement_duration_days=250) + p2 = project_factory(entitlement_duration_days=300) + p3 = project_factory(entitlement_duration_days=None) + + result = _extract_durations([p1, p2, p3], "entitlement") + assert result == [250.0, 300.0] + + def test_empty_list_returns_empty(self): + """No projects should yield an empty list.""" + assert _extract_durations([], "entitlement") == [] + + def test_all_none_returns_empty(self, project_factory): + """All-null durations should yield an empty list.""" + p = project_factory(entitlement_duration_days=None) + assert _extract_durations([p], "entitlement") == [] + + +class TestComputeSlope: + """Test the simple linear slope computation.""" + + def test_increasing_values(self): + """Slope of increasing series should be positive.""" + values = [100.0, 120.0, 140.0, 160.0] + slope = _compute_slope(values) + assert slope > 0 + + def test_decreasing_values(self): + """Slope of decreasing series should be negative.""" + values = [300.0, 250.0, 200.0, 150.0] + slope = _compute_slope(values) + assert slope < 0 + + def test_constant_values(self): + """Slope of constant series should be approximately zero.""" + values = [200.0, 200.0, 200.0, 200.0] + slope = _compute_slope(values) + assert abs(slope) < 0.01 + + def test_single_value_returns_zero(self): + """Single value should return slope 0.""" + assert _compute_slope([100.0]) == 0.0 + + def 
test_empty_returns_zero(self): + """Empty list should return slope 0.""" + assert _compute_slope([]) == 0.0 + + +class TestDetermineConfidence: + """Test confidence level determination.""" + + def test_high_confidence(self): + """Large sample, significant, medium+ effect -> HIGH.""" + result = _determine_confidence(20, 15, True, "large") + assert result == ConfidenceLevel.HIGH + + def test_moderate_confidence(self): + """Moderate sample, significant, small effect -> MODERATE.""" + result = _determine_confidence(8, 6, True, "small") + assert result == ConfidenceLevel.MODERATE + + def test_low_confidence(self): + """Small sample, not significant -> LOW.""" + result = _determine_confidence(2, 3, False, "negligible") + assert result == ConfidenceLevel.LOW + + def test_insufficient_data_is_low(self): + """Very small samples should always be LOW confidence.""" + result = _determine_confidence(1, 1, False, "insufficient_data") + assert result == ConfidenceLevel.LOW + + +class TestBootstrapCIDiff: + """Test the bootstrap confidence interval for median difference.""" + + def test_positive_difference(self): + """Pre > post should give a positive CI.""" + pre = [300.0, 320.0, 280.0, 310.0, 290.0, 340.0, 350.0, 305.0] + post = [200.0, 210.0, 190.0, 220.0, 195.0, 215.0, 205.0, 198.0] + low, high = _bootstrap_ci_diff(pre, post) + assert low > 0 # Pre is clearly larger + + def test_overlapping_distributions(self): + """Overlapping distributions should have CI spanning zero.""" + pre = [200.0, 210.0, 190.0, 205.0, 195.0] + post = [198.0, 208.0, 192.0, 203.0, 197.0] + low, high = _bootstrap_ci_diff(pre, post) + assert low <= high + + def test_returns_tuple_of_two(self): + """Should return a (lower, upper) tuple.""" + pre = [100.0, 120.0, 110.0] + post = [90.0, 100.0, 95.0] + result = _bootstrap_ci_diff(pre, post) + assert isinstance(result, tuple) + assert len(result) == 2 + + +class TestStallRate: + """Test the stall rate computation.""" + + def test_no_stalled_projects(self, 
project_factory): + """No stalled/abandoned projects should give rate 0.""" + projects = [ + project_factory(current_stage=PipelineStage.ENTITLEMENT), + project_factory(current_stage=PipelineStage.ENTITLEMENT), + ] + rate = _stall_rate(projects, "entitlement") + assert rate == 0.0 + def test_some_stalled(self, project_factory): + """Mix of active and stalled should give proportional rate.""" + projects = [ + project_factory(current_stage=PipelineStage.ENTITLEMENT), + project_factory(current_stage=PipelineStage.STALLED), + project_factory(current_stage=PipelineStage.ABANDONED), + ] + rate = _stall_rate(projects, "entitlement") + assert 0.0 < rate <= 1.0 + + +# --------------------------------------------------------------------------- +# Full reform impact measurement (integration) +# --------------------------------------------------------------------------- + + +class TestMeasureReformImpact: + """Integration tests for measure_reform_impact.""" + + @pytest.fixture() + def reform_with_data(self, db, project_factory): + """Create a reform and pre/post projects in the DB.""" reform_date = date.today() - timedelta(days=365) - result = measure_reform_impact( - db=db, - jurisdiction="TestCity, CA", - reform_date=reform_date, - reform_description="Streamlined design review", + jurisdiction = "City of Oakland" + + reform = PolicyReform( + reform_id=uuid.uuid4(), + jurisdiction=jurisdiction, + reform_name="Parking Minimum Elimination", + reform_type=ReformType.PARKING_REFORM, + effective_date=reform_date, + implementation_buffer_days=60, ) - assert isinstance(result, dict) + db.add(reform) + + # Pre-reform projects: completed entitlement before the reform + for i in range(6): + project_factory( + jurisdiction=jurisdiction, + current_stage=PipelineStage.CONSTRUCTION, + entitlement_start=reform_date - timedelta(days=600 + i * 20), + entitlement_complete=reform_date - timedelta(days=300 + i * 10), + entitlement_duration_days=300 + i * 10, + ) - def 
test_insufficient_data_returns_error(self, db): - """Should indicate insufficient data when too few projects.""" - from src.analytics.reform_impact import measure_reform_impact + # Post-reform projects: started entitlement after buffer + for i in range(5): + project_factory( + jurisdiction=jurisdiction, + current_stage=PipelineStage.FINANCING, + entitlement_start=reform_date + timedelta(days=90 + i * 30), + entitlement_complete=reform_date + timedelta(days=290 + i * 20), + entitlement_duration_days=200 + i * 5, + ) + + db.flush() + return reform + + @patch("src.analytics.reform_impact._load_national_benchmarks") + def test_reform_impact_basic_structure( + self, mock_benchmarks, db, reform_with_data + ): + """Should return a well-formed ReformImpactResult.""" + mock_benchmarks.return_value = { + "holding_costs": {"daily_per_unit_during_entitlement": 20}, + } + + result = measure_reform_impact(db, reform_with_data.reform_id) + + assert result["reform_name"] == "Parking Minimum Elimination" + assert result["jurisdiction"] == "City of Oakland" + assert result["pre_reform_n"] >= 1 + assert result["post_reform_n"] >= 1 + assert "test_used" in result + assert "p_value" in result + assert "confidence_level" in result + assert result["measured_at"] is not None + + @patch("src.analytics.reform_impact._load_national_benchmarks") + def test_pre_reform_longer_than_post( + self, mock_benchmarks, db, reform_with_data + ): + """Pre-reform durations (~300-350) should be longer than post (~200-220).""" + mock_benchmarks.return_value = { + "holding_costs": {"daily_per_unit_during_entitlement": 20}, + } - result = measure_reform_impact( - db=db, - jurisdiction="EmptyCity, XX", - reform_date=date.today() - timedelta(days=180), - reform_description="Some reform", + result = measure_reform_impact(db, reform_with_data.reform_id) + + assert result["pre_reform_median_days"] > result["post_reform_median_days"] + assert result["days_saved_per_project"] > 0 + assert 
result["percent_improvement"] > 0 + + @patch("src.analytics.reform_impact._load_national_benchmarks") + def test_cost_savings_positive( + self, mock_benchmarks, db, reform_with_data + ): + """Days saved should translate to positive cost savings.""" + mock_benchmarks.return_value = { + "holding_costs": {"daily_per_unit_during_entitlement": 20}, + } + + result = measure_reform_impact(db, reform_with_data.reform_id) + + if result["days_saved_per_project"] > 0: + assert result["total_cost_savings"] > 0 + + @patch("src.analytics.reform_impact._load_national_benchmarks") + def test_nonexistent_reform_raises(self, mock_benchmarks, db): + """Measuring a nonexistent reform should raise ValueError.""" + mock_benchmarks.return_value = {"holding_costs": {}} + + with pytest.raises(ValueError, match="not found"): + measure_reform_impact(db, uuid.uuid4()) + + @patch("src.analytics.reform_impact._load_national_benchmarks") + def test_reform_without_effective_date_raises( + self, mock_benchmarks, db + ): + """A reform with no effective_date should raise ValueError.""" + mock_benchmarks.return_value = {"holding_costs": {}} + + reform = PolicyReform( + reform_id=uuid.uuid4(), + jurisdiction="City of Oakland", + reform_name="Draft Reform", + reform_type=ReformType.PARKING_REFORM, + effective_date=None, ) - assert isinstance(result, dict) - # Should handle gracefully either with error key or low confidence + db.add(reform) + db.flush() - def test_reform_with_improvement( - self, db, pre_reform_projects, post_reform_projects + with pytest.raises(ValueError, match="no effective_date"): + measure_reform_impact(db, reform.reform_id) + + @patch("src.analytics.reform_impact._load_national_benchmarks") + def test_insufficient_data_produces_caveats( + self, mock_benchmarks, db ): - """Post-reform projects with shorter durations should show improvement.""" - from src.analytics.reform_impact import measure_reform_impact + """A reform with <2 projects should produce caveats.""" + 
mock_benchmarks.return_value = { + "holding_costs": {"daily_per_unit_during_entitlement": 10}, + } - reform_date = date.today() - timedelta(days=365) - result = measure_reform_impact( - db=db, - jurisdiction="TestCity, CA", - reform_date=reform_date, - reform_description="Streamlined design review", + reform = PolicyReform( + reform_id=uuid.uuid4(), + jurisdiction="City of Nowhere", + reform_name="Empty Reform", + reform_type=ReformType.DENSITY_UPZONE, + effective_date=date.today() - timedelta(days=180), + implementation_buffer_days=30, ) - # The post-reform projects have ~200 day durations vs ~300 pre-reform - # So we expect to see improvement - if "analysis" in result: - analysis = result["analysis"] - if "days_saved_per_project" in analysis: - assert analysis["days_saved_per_project"] >= 0 + db.add(reform) + db.flush() + + result = measure_reform_impact(db, reform.reform_id) + assert result["test_used"] == "insufficient_data" + assert len(result["caveats"]) > 0 + assert result["confidence_level"] == ConfidenceLevel.LOW.value From 38f6c92adb68947126ea2d319da3f06ac2f4cea9 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 7 Feb 2026 19:37:29 +0000 Subject: [PATCH 3/5] Enhance portfolio intelligence tests with comprehensive dashboard validation https://claude.ai/code/session_01K1NfMrvDoARx7PAB1cvHbZ --- .../test_portfolio_intelligence.py | 481 +++++++++++++----- 1 file changed, 362 insertions(+), 119 deletions(-) diff --git a/tests/test_analytics/test_portfolio_intelligence.py b/tests/test_analytics/test_portfolio_intelligence.py index 9799fba..a4e4e19 100644 --- a/tests/test_analytics/test_portfolio_intelligence.py +++ b/tests/test_analytics/test_portfolio_intelligence.py @@ -1,150 +1,393 @@ -"""Tests for portfolio intelligence dashboard generation.""" +"""Tests for the portfolio intelligence dashboard module. 
-from datetime import date, timedelta +Validates dashboard generation, stage distribution computation, health +distribution, geographic breakdown, velocity metrics, at-risk +identification, and empty-portfolio handling. +""" + +import uuid +from datetime import date, datetime, timedelta +from unittest.mock import patch import pytest +from src.analytics.portfolio_intelligence import ( + _compute_geographic_breakdown, + _compute_health_distribution, + _compute_stage_distribution, + _empty_dashboard, + _identify_at_risk_projects, + _projects_expected_co, + _projects_expected_groundbreaking, + generate_portfolio_dashboard, +) from src.models.enums import ( BuildingType, FundingSourceStatus, FundingSourceType, OverallHealth, PipelineStage, + StakeholderType, ) from src.models.funding_source import FundingSource from src.models.project import Project -@pytest.fixture -def portfolio_projects(db): - """Create a portfolio of projects in various stages.""" - projects_data = [ - { - "project_name": "Downtown Apts", - "project_slug": "downtown-apts", - "jurisdiction": "Portland, OR", - "city": "Portland", - "state": "OR", - "total_units": 80, - "affordable_units": 72, - "current_stage": PipelineStage.CONSTRUCTION, - "overall_health": OverallHealth.ON_TRACK, - "health_score": 85.0, - "total_development_cost": 30_000_000, - "construction_start": date.today() - timedelta(days=90), - "concept_start": date.today() - timedelta(days=600), - "predicted_co": date.today() + timedelta(days=365), - }, - { - "project_name": "Hillside Senior", - "project_slug": "hillside-senior", - "jurisdiction": "Portland, OR", - "city": "Portland", - "state": "OR", - "total_units": 60, - "affordable_units": 60, - "current_stage": PipelineStage.ENTITLEMENT, - "overall_health": OverallHealth.AT_RISK, - "health_score": 55.0, - "total_development_cost": 24_000_000, - "funding_gap": 3_000_000, - "concept_start": date.today() - timedelta(days=300), - }, - { - "project_name": "River View Family", - 
"project_slug": "river-view-family", - "jurisdiction": "Portland, OR", - "city": "Portland", - "state": "OR", - "total_units": 45, - "affordable_units": 45, - "current_stage": PipelineStage.FINANCING, - "overall_health": OverallHealth.ON_TRACK, - "health_score": 78.0, - "total_development_cost": 18_000_000, - "concept_start": date.today() - timedelta(days=450), - "predicted_co": date.today() + timedelta(days=540), - }, - ] - - projects = [] - for data in projects_data: - p = Project(**data) - db.add(p) - projects.append(p) - - db.flush() - - # Add funding source - db.add(FundingSource( - project_id=projects[0].project_id, - source_type=FundingSourceType.LIHTC_9PCT, - source_name="9% LIHTC", - amount=10_000_000, - status=FundingSourceStatus.CLOSED, - )) - - db.commit() - return projects - - -class TestPortfolioIntelligence: - """Tests for generate_portfolio_intelligence (or generate_portfolio_dashboard).""" - - def test_portfolio_returns_summary(self, db, portfolio_projects): - """Portfolio should return summary with total projects and units.""" - from src.analytics.portfolio_intelligence import ( - generate_portfolio_dashboard, - ) +# --------------------------------------------------------------------------- +# Stage distribution computation +# --------------------------------------------------------------------------- - result = generate_portfolio_dashboard( - db=db, - geography_filter={"jurisdiction": "Portland, OR"}, - stakeholder_type="city", + +class TestComputeStageDistribution: + """Test the per-stage project/unit aggregation.""" + + def test_single_stage(self, project_factory): + """All projects in one stage should produce a single entry.""" + projects = [ + project_factory( + current_stage=PipelineStage.ENTITLEMENT, + total_units=50, + affordable_units=45, + days_in_current_stage=100, + ), + project_factory( + current_stage=PipelineStage.ENTITLEMENT, + total_units=80, + affordable_units=72, + days_in_current_stage=200, + ), + ] + + result = 
_compute_stage_distribution(projects) + assert len(result) == 1 + assert result[0]["stage"] == "entitlement" + assert result[0]["project_count"] == 2 + assert result[0]["total_units"] == 130 + assert result[0]["affordable_units"] == 117 + assert result[0]["median_days_in_stage"] == 150.0 + + def test_multiple_stages(self, project_factory): + """Projects across stages should produce multiple entries.""" + project_factory(current_stage=PipelineStage.CONCEPT, total_units=30) + project_factory(current_stage=PipelineStage.FINANCING, total_units=60) + + projects = [ + project_factory(current_stage=PipelineStage.CONCEPT, total_units=30), + project_factory(current_stage=PipelineStage.FINANCING, total_units=60), + ] + + result = _compute_stage_distribution(projects) + stages = {sd["stage"] for sd in result} + assert "concept" in stages + assert "financing" in stages + + def test_empty_projects_list(self): + """Empty project list should give empty result.""" + assert _compute_stage_distribution([]) == [] + + +# --------------------------------------------------------------------------- +# Health distribution computation +# --------------------------------------------------------------------------- + + +class TestComputeHealthDistribution: + """Test health status grouping.""" + + def test_mixed_health_statuses(self, project_factory): + """Projects with different health should produce multiple entries.""" + projects = [ + project_factory(overall_health=OverallHealth.ON_TRACK, total_units=50), + project_factory(overall_health=OverallHealth.ON_TRACK, total_units=60), + project_factory(overall_health=OverallHealth.AT_RISK, total_units=40), + project_factory(overall_health=OverallHealth.DELAYED, total_units=30), + ] + + result = _compute_health_distribution(projects) + health_map = {hd["health_status"]: hd for hd in result} + + assert health_map["on_track"]["project_count"] == 2 + assert health_map["on_track"]["total_units"] == 110 + assert health_map["at_risk"]["project_count"] 
== 1 + assert health_map["delayed"]["project_count"] == 1 + + def test_percentages_sum_to_100(self, project_factory): + """Percentages across all health statuses should sum to ~100.""" + projects = [ + project_factory(overall_health=OverallHealth.ON_TRACK), + project_factory(overall_health=OverallHealth.AT_RISK), + project_factory(overall_health=OverallHealth.DELAYED), + project_factory(overall_health=OverallHealth.STALLED), + ] + + result = _compute_health_distribution(projects) + total_pct = sum(hd["percentage"] for hd in result) + assert abs(total_pct - 100.0) < 1.0 + + def test_empty_projects(self): + """Empty list should produce empty result.""" + assert _compute_health_distribution([]) == [] + + +# --------------------------------------------------------------------------- +# Geographic breakdown +# --------------------------------------------------------------------------- + + +class TestComputeGeographicBreakdown: + """Test jurisdiction-level geographic aggregation.""" + + def test_groups_by_jurisdiction(self, project_factory): + """Should group projects by jurisdiction.""" + projects = [ + project_factory(jurisdiction="City of Oakland", total_units=50, health_score=80.0), + project_factory(jurisdiction="City of Oakland", total_units=70, health_score=60.0), + project_factory(jurisdiction="City of Berkeley", total_units=40, health_score=90.0), + ] + + result = _compute_geographic_breakdown(projects) + geo_map = {g["area"]: g for g in result} + + assert "City of Oakland" in geo_map + assert geo_map["City of Oakland"]["project_count"] == 2 + assert geo_map["City of Oakland"]["total_units"] == 120 + assert geo_map["City of Oakland"]["average_health_score"] == 70.0 + + def test_at_risk_counted(self, project_factory): + """At-risk/delayed/stalled projects should increment at_risk_count.""" + projects = [ + project_factory( + jurisdiction="City of Oakland", + overall_health=OverallHealth.AT_RISK, + ), + project_factory( + jurisdiction="City of Oakland", + 
overall_health=OverallHealth.DELAYED, + ), + project_factory( + jurisdiction="City of Oakland", + overall_health=OverallHealth.ON_TRACK, + ), + ] + + result = _compute_geographic_breakdown(projects) + oakland = next(g for g in result if g["area"] == "City of Oakland") + assert oakland["at_risk_count"] == 2 + + +# --------------------------------------------------------------------------- +# At-risk project identification +# --------------------------------------------------------------------------- + + +class TestIdentifyAtRiskProjects: + """Test at-risk project identification.""" + + def test_only_at_risk_included(self, project_factory): + """Only projects with at_risk/delayed/stalled health should appear.""" + projects = [ + project_factory( + overall_health=OverallHealth.ON_TRACK, + health_score=90.0, + ), + project_factory( + overall_health=OverallHealth.AT_RISK, + health_score=55.0, + funding_gap=3_000_000.0, + ), + project_factory( + overall_health=OverallHealth.STALLED, + health_score=10.0, + ), + ] + + result = _identify_at_risk_projects(projects) + assert len(result) == 2 + # Sorted by health_score ascending (worst first) + assert result[0]["health_score"] == 10.0 + assert result[1]["health_score"] == 55.0 + + def test_on_track_excluded(self, project_factory): + """On-track projects should not appear in at-risk list.""" + projects = [ + project_factory(overall_health=OverallHealth.ON_TRACK), + ] + result = _identify_at_risk_projects(projects) + assert len(result) == 0 + + +# --------------------------------------------------------------------------- +# Expected CO / groundbreaking projections +# --------------------------------------------------------------------------- + + +class TestProjectionCounts: + """Test CO and groundbreaking projection counting.""" + + def test_projects_expected_co_within_window(self, project_factory): + """Projects with predicted_co within 12m should be counted.""" + projects = [ + project_factory( + predicted_co=date.today() + 
timedelta(days=180), + total_units=50, + ), + project_factory( + predicted_co=date.today() + timedelta(days=400), + total_units=60, + ), + project_factory(predicted_co=None, total_units=70), + ] + + count, units = _projects_expected_co(projects, months=12) + assert count == 1 + assert units == 50 + + def test_groundbreaking_within_window(self, project_factory): + """Projects with predicted_groundbreaking within 6m should be counted.""" + projects = [ + project_factory( + predicted_groundbreaking=date.today() + timedelta(days=90), + ), + project_factory( + predicted_groundbreaking=date.today() + timedelta(days=300), + ), + ] + + count = _projects_expected_groundbreaking(projects, months=6) + assert count == 1 + + +# --------------------------------------------------------------------------- +# Empty dashboard +# --------------------------------------------------------------------------- + + +class TestEmptyDashboard: + """Test the empty dashboard factory.""" + + def test_empty_dashboard_structure(self): + """Empty dashboard should have all required keys with zero values.""" + result = _empty_dashboard(None, "Test", StakeholderType.CITY) + + assert result["total_projects"] == 0 + assert result["total_units"] == 0 + assert result["stage_distribution"] == [] + assert result["health_distribution"] == [] + assert result["funding_breakdown"] == [] + assert result["at_risk_projects"] == [] + assert result["stalled_projects"] == [] + assert result["portfolio_name"] == "Test" + assert result["stakeholder_type"] == "city" + + def test_empty_dashboard_has_velocity(self): + """Empty dashboard should still include velocity metrics.""" + result = _empty_dashboard(None, "Test", StakeholderType.FUNDER) + assert result["velocity"]["throughput_units_per_month"] == 0.0 + + +# --------------------------------------------------------------------------- +# Full dashboard generation (integration) +# --------------------------------------------------------------------------- + + +class 
TestGeneratePortfolioDashboard: + """Integration tests for the portfolio dashboard generator.""" + + @pytest.fixture() + def portfolio_projects(self, db, project_factory, funding_source_factory): + """Create a portfolio of projects in various stages.""" + p1 = project_factory( + project_name="Downtown Apts", + jurisdiction="Portland, OR", + city="Portland", + state="OR", + total_units=80, + affordable_units=72, + current_stage=PipelineStage.CONSTRUCTION, + overall_health=OverallHealth.ON_TRACK, + health_score=85.0, + total_development_cost=30_000_000, ) - assert isinstance(result, dict) - summary = result.get("portfolio_summary", {}) - assert summary.get("total_projects", 0) >= 3 - assert summary.get("total_units", 0) >= 185 - - def test_portfolio_pipeline_snapshot(self, db, portfolio_projects): - """Should include pipeline snapshot with stage breakdown.""" - from src.analytics.portfolio_intelligence import ( - generate_portfolio_dashboard, + p2 = project_factory( + project_name="Hillside Senior", + jurisdiction="Portland, OR", + city="Portland", + state="OR", + total_units=60, + affordable_units=60, + current_stage=PipelineStage.ENTITLEMENT, + overall_health=OverallHealth.AT_RISK, + health_score=55.0, + funding_gap=3_000_000, + ) + p3 = project_factory( + project_name="River View Family", + jurisdiction="Portland, OR", + city="Portland", + state="OR", + total_units=45, + affordable_units=45, + current_stage=PipelineStage.FINANCING, + overall_health=OverallHealth.ON_TRACK, + health_score=78.0, ) + funding_source_factory( + p1.project_id, + source_type=FundingSourceType.LIHTC_9PCT, + source_name="9% LIHTC", + amount=10_000_000, + status=FundingSourceStatus.CLOSED, + ) + return [p1, p2, p3] + + def test_dashboard_summary(self, db, portfolio_projects): + """Dashboard should include correct project and unit totals.""" result = generate_portfolio_dashboard( - db=db, - geography_filter={"jurisdiction": "Portland, OR"}, - stakeholder_type="city", + db, + 
jurisdiction="Portland, OR", + stakeholder_type=StakeholderType.CITY, ) - snapshot = result.get("pipeline_snapshot", {}) - assert "units_by_stage" in snapshot or "projects_by_stage" in snapshot + assert result["total_projects"] >= 3 + assert result["total_units"] >= 185 - def test_portfolio_health_distribution(self, db, portfolio_projects): - """Should include health distribution counts.""" - from src.analytics.portfolio_intelligence import ( - generate_portfolio_dashboard, + def test_dashboard_stage_distribution(self, db, portfolio_projects): + """Dashboard should include stage distribution.""" + result = generate_portfolio_dashboard( + db, + jurisdiction="Portland, OR", + stakeholder_type=StakeholderType.CITY, ) + assert len(result["stage_distribution"]) >= 1 + stages = {sd["stage"] for sd in result["stage_distribution"]} + assert "construction" in stages or "entitlement" in stages + def test_dashboard_health_distribution(self, db, portfolio_projects): + """Dashboard should include health distribution.""" result = generate_portfolio_dashboard( - db=db, - geography_filter={"jurisdiction": "Portland, OR"}, - stakeholder_type="city", + db, + jurisdiction="Portland, OR", + stakeholder_type=StakeholderType.CITY, ) - health = result.get("health_distribution", {}) - assert isinstance(health, dict) + assert len(result["health_distribution"]) >= 1 - def test_empty_portfolio(self, db): - """Should handle empty portfolio gracefully.""" - from src.analytics.portfolio_intelligence import ( - generate_portfolio_dashboard, + def test_dashboard_at_risk_projects(self, db, portfolio_projects): + """Dashboard should identify at-risk projects.""" + result = generate_portfolio_dashboard( + db, + jurisdiction="Portland, OR", + stakeholder_type=StakeholderType.CITY, ) + assert len(result["at_risk_projects"]) >= 1 + at_risk_names = [ar["project_name"] for ar in result["at_risk_projects"]] + assert "Hillside Senior" in at_risk_names + def 
test_empty_jurisdiction_returns_empty_dashboard(self, db): + """Nonexistent jurisdiction should return empty dashboard.""" result = generate_portfolio_dashboard( - db=db, - geography_filter={"jurisdiction": "Nonexistent, XX"}, - stakeholder_type="city", + db, + jurisdiction="Nonexistent, XX", + stakeholder_type=StakeholderType.CITY, ) - summary = result.get("portfolio_summary", {}) - assert summary.get("total_projects", 0) == 0 + assert result["total_projects"] == 0 + assert result["stage_distribution"] == [] From 67478a6147a87b1d0869ad4b317b9940862b78f1 Mon Sep 17 00:00:00 2001 From: zachurban Date: Sat, 7 Feb 2026 13:01:42 -0700 Subject: [PATCH 4/5] Update README.md --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 2f3bad3..13bd412 100644 --- a/README.md +++ b/README.md @@ -74,4 +74,5 @@ docs/ # Documentation ## License -MIT +©️ 2026 Zachary Urban +All Rights Reserved From e81705c57c62cda4cfb18f592dce8645074f8c59 Mon Sep 17 00:00:00 2001 From: zachurban Date: Sat, 7 Feb 2026 13:03:16 -0700 Subject: [PATCH 5/5] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 13bd412..8297591 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # HousingHand -Development Pipeline Intelligence Platform for the HousingMind ecosystem. +Affordable Housing Development Pipeline Intelligence Platform for the HousingMind ecosystem. HousingHand tracks every affordable housing project from concept to certificate of occupancy, quantifying where development pipelines break down and connecting regulatory friction to real production outcomes. It creates the first comprehensive affordable housing development pipeline database, enabling stakeholders to predict timelines, identify bottlenecks, measure policy reform impact, and optimize portfolio performance.