diff --git a/.github/scripts/validate_readmes/test-readme-check.sh b/.github/scripts/validate_readmes/test-readme-check.sh index d138e0a04..aca5540e3 100755 --- a/.github/scripts/validate_readmes/test-readme-check.sh +++ b/.github/scripts/validate_readmes/test-readme-check.sh @@ -50,17 +50,38 @@ for target_dir in "${TARGET_DIRS[@]}"; do # Determine if it's a component or pipeline if [[ "$target_dir" == components/* ]]; then TYPE_FLAG="--component" + ASSET_FILE="component.py" elif [[ "$target_dir" == pipelines/* ]]; then TYPE_FLAG="--pipeline" + ASSET_FILE="pipeline.py" else print_error "Invalid directory: $target_dir. Must be in components/ or pipelines/" exit 2 fi - echo "Checking $target_dir..." - # Run in check mode (no --fix flag). Exit code 1 means diffs detected. - if ! uv run python -m scripts.generate_readme $TYPE_FLAG "$target_dir"; then - HAS_ERRORS=1 + # Check if this is a direct component/pipeline or a subcategory + if [[ -f "$target_dir/$ASSET_FILE" ]]; then + # Direct component/pipeline + echo "Checking $target_dir..." + if ! uv run python -m scripts.generate_readme $TYPE_FLAG "$target_dir"; then + HAS_ERRORS=1 + fi + else + # This might be a subcategory - find components inside + found_assets=0 + for subdir in "$target_dir"/*/; do + if [[ -f "$subdir$ASSET_FILE" ]]; then + found_assets=1 + echo "Checking $subdir..." + if ! uv run python -m scripts.generate_readme $TYPE_FLAG "${subdir%/}"; then + HAS_ERRORS=1 + fi + fi + done + if [[ $found_assets -eq 0 ]]; then + print_error "'$target_dir' does not contain a $ASSET_FILE file and has no subdirectories with one" + exit 2 + fi fi done diff --git a/AGENTS.md b/AGENTS.md index f6230ee4f..b0f3965d0 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -34,10 +34,10 @@ Agents typically interact with this repository in three modes. Use the mode to d - **Reuse-first**: search `components//` and `pipelines//` for similar functionality; prefer extending/composing instead of duplicating. 
- **Create scaffolding**: use the Make targets in `Makefile`: - - `make component CATEGORY=<cat> NAME=<name> [NO_TESTS]` - - `make pipeline CATEGORY=<cat> NAME=<name> [NO_TESTS]` - - `make tests TYPE=component|pipeline CATEGORY=<cat> NAME=<name>` - - `make readme TYPE=component|pipeline CATEGORY=<cat> NAME=<name>` + - `make component CATEGORY=<cat> NAME=<name> [SUBCATEGORY=<subcat>] [NO_TESTS=true] [CREATE_SHARED=true]` + - `make pipeline CATEGORY=<cat> NAME=<name> [SUBCATEGORY=<subcat>] [NO_TESTS=true] [CREATE_SHARED=true]` + - `make tests TYPE=component|pipeline CATEGORY=<cat> NAME=<name> [SUBCATEGORY=<subcat>]` + - `make readme TYPE=component|pipeline CATEGORY=<cat> NAME=<name> [SUBCATEGORY=<subcat>]` - **Validate like CI**: follow [`CONTRIBUTING.md` (Testing and Quality)](docs/CONTRIBUTING.md#testing-and-quality) and reference the workflows under `.github/workflows/` (example: [`.github/workflows/python-lint.yml`](.github/workflows/python-lint.yml)). - **New assets require approval**: for initial contributions (introducing a new component/pipeline to the catalog), @@ -66,7 +66,9 @@ Good places to look: #### Establish the target location and naming - Components live under `components/<category>/<component-name>/`. +- Components can optionally use subcategories: `components/<category>/<subcategory>/<component-name>/`. - Pipelines live under `pipelines/<category>/<pipeline-name>/`. +- Pipelines can optionally use subcategories: `pipelines/<category>/<subcategory>/<pipeline-name>/`. - Use `snake_case` directory names (per `CONTRIBUTING.md`). ### Required files @@ -95,7 +97,7 @@ Process (expected for agents): Use this prompt pattern: "Search `components/` for similar functionality and reuse if possible. If a new component is needed, create it under -`components/<category>/<component-name>/` using `make component CATEGORY=<cat> NAME=<name> [NO_TESTS]`, then implement +`components/<category>/<component-name>/` using `make component CATEGORY=<cat> NAME=<name> [NO_TESTS=true]`, then implement `component.py` following repository lint rules (including import guard). Create `metadata.yaml` that conforms to the metadata schema defined in [`CONTRIBUTING.md`](docs/CONTRIBUTING.md#metadatayaml-schema) (required field order, fresh `lastVerified`).
Generate/validate `README.md` using `make readme TYPE=component CATEGORY=<cat> NAME=<name>`. Add unit tests using `.python_func()` and a @@ -103,17 +105,35 @@ LocalRunner test using `setup_and_teardown_subprocess_runner` (you can generate `make tests TYPE=component CATEGORY=<cat> NAME=<name>`). Reference an existing component like `components/data_processing/yoda_data_processor/` for patterns." +#### Add a component in a subcategory + +Use this prompt pattern when creating related components that should share ownership or utilities: + +"Create a component in a subcategory using `make component CATEGORY=<cat> SUBCATEGORY=<subcat> NAME=<name>`. This +automatically creates the subcategory structure with OWNERS and README.md if it doesn't exist. For shared utilities, +add `CREATE_SHARED=true` to create a `shared/` package. Update the subcategory OWNERS and README.md with appropriate +maintainers and documentation. Follow the same component implementation patterns as above." + #### Add a new pipeline (reuse-first, compliant) Use this prompt pattern: "Search `pipelines/` for similar functionality and reuse if possible. If a new pipeline is needed, create it under -`pipelines/<category>/<pipeline-name>/` using `make pipeline CATEGORY=<cat> NAME=<name> [NO_TESTS]`, then implement +`pipelines/<category>/<pipeline-name>/` using `make pipeline CATEGORY=<cat> NAME=<name> [NO_TESTS=true]`, then implement `pipeline.py` following repository lint rules (including import guard). Create `metadata.yaml` that conforms to the metadata schema defined in [`CONTRIBUTING.md`](docs/CONTRIBUTING.md#metadatayaml-schema) (required field order, fresh `lastVerified`). Generate/validate `README.md` using `make readme TYPE=pipeline CATEGORY=<cat> NAME=<name>`. Add tests (you can generate tests via `make tests TYPE=pipeline CATEGORY=<cat> NAME=<name>`)." +#### Add a pipeline in a subcategory + +Use this prompt pattern when creating related pipelines that should share ownership or utilities: + +"Create a pipeline in a subcategory using `make pipeline CATEGORY=<cat> SUBCATEGORY=<subcat> NAME=<name>`.
This +automatically creates the subcategory structure with OWNERS and README.md if it doesn't exist. For shared utilities, +add `CREATE_SHARED=true` to create a `shared/` package. Update the subcategory OWNERS and README.md with appropriate +maintainers and documentation. Follow the same pipeline implementation patterns as above." + #### Update an existing component safely "Find the existing component directory. Make the minimal change needed. Update docstrings and regenerate the README diff --git a/Makefile b/Makefile index 923d85ecd..8e9fc9ff7 100644 --- a/Makefile +++ b/Makefile @@ -4,7 +4,7 @@ RUFF ?= $(UVRUN) ruff YAMLLINT ?= $(UVRUN) yamllint PYTEST ?= $(UVRUN) pytest -.PHONY: format fix lint lint-format lint-python lint-markdown lint-yaml lint-imports test test-coverage component pipeline tests readme +.PHONY: format fix lint lint-format lint-python lint-markdown lint-yaml lint-imports test test-coverage component pipeline tests readme sync-packages format: $(RUFF) format components pipelines scripts @@ -38,37 +38,68 @@ test-coverage: cd .github/scripts && $(PYTEST) */tests/ --cov=. --cov-report=term-missing -v $(ARGS) component: - @if [ -z "$(CATEGORY)" ]; then echo "Error: CATEGORY is required. Usage: make component CATEGORY=data_processing NAME=my_component [NO_TESTS]"; exit 1; fi - @if [ -z "$(NAME)" ]; then echo "Error: NAME is required. Usage: make component CATEGORY=data_processing NAME=my_component [NO_TESTS]"; exit 1; fi - @if [ -n "$(NO_TESTS)" ]; then \ - $(UVRUN) scripts/generate_skeleton/generate_skeleton.py --type=component --category=$(CATEGORY) --name=$(NAME) --no-tests; \ + @if [ -z "$(CATEGORY)" ]; then echo "Error: CATEGORY is required. Usage: make component CATEGORY=data_processing NAME=my_component [SUBCATEGORY=x] [NO_TESTS=true] [CREATE_SHARED=true]"; exit 1; fi + @if [ -z "$(NAME)" ]; then echo "Error: NAME is required. 
Usage: make component CATEGORY=data_processing NAME=my_component [SUBCATEGORY=x] [NO_TESTS=true] [CREATE_SHARED=true]"; exit 1; fi + @SUBCATEGORY_ARG=""; \ + if [ -n "$(SUBCATEGORY)" ]; then SUBCATEGORY_ARG="--subcategory=$(SUBCATEGORY)"; fi; \ + NO_TESTS_ARG=""; \ + if [ "$(NO_TESTS)" = "true" ]; then NO_TESTS_ARG="--no-tests"; fi; \ + CREATE_SHARED_ARG=""; \ + if [ "$(CREATE_SHARED)" = "true" ]; then CREATE_SHARED_ARG="--create-shared"; fi; \ + $(UVRUN) scripts/generate_skeleton/generate_skeleton.py --type=component --category=$(CATEGORY) --name=$(NAME) $$SUBCATEGORY_ARG $$NO_TESTS_ARG $$CREATE_SHARED_ARG; \ + echo ""; \ + echo "Generating READMEs..."; \ + if [ -n "$(SUBCATEGORY)" ]; then \ + $(UVRUN) -m scripts.generate_readme --component components/$(CATEGORY)/$(SUBCATEGORY)/$(NAME) --fix; \ else \ - $(UVRUN) scripts/generate_skeleton/generate_skeleton.py --type=component --category=$(CATEGORY) --name=$(NAME); \ + $(UVRUN) -m scripts.generate_readme --component components/$(CATEGORY)/$(NAME) --fix; \ fi + @$(MAKE) --no-print-directory sync-packages pipeline: - @if [ -z "$(CATEGORY)" ]; then echo "Error: CATEGORY is required. Usage: make pipeline CATEGORY=training NAME=my_pipeline [NO_TESTS]"; exit 1; fi - @if [ -z "$(NAME)" ]; then echo "Error: NAME is required. Usage: make pipeline CATEGORY=training NAME=my_pipeline [NO_TESTS]"; exit 1; fi - @if [ -n "$(NO_TESTS)" ]; then \ - $(UVRUN) scripts/generate_skeleton/generate_skeleton.py --type=pipeline --category=$(CATEGORY) --name=$(NAME) --no-tests; \ + @if [ -z "$(CATEGORY)" ]; then echo "Error: CATEGORY is required. Usage: make pipeline CATEGORY=training NAME=my_pipeline [SUBCATEGORY=x] [NO_TESTS=true] [CREATE_SHARED=true]"; exit 1; fi + @if [ -z "$(NAME)" ]; then echo "Error: NAME is required. 
Usage: make pipeline CATEGORY=training NAME=my_pipeline [SUBCATEGORY=x] [NO_TESTS=true] [CREATE_SHARED=true]"; exit 1; fi + @SUBCATEGORY_ARG=""; \ + if [ -n "$(SUBCATEGORY)" ]; then SUBCATEGORY_ARG="--subcategory=$(SUBCATEGORY)"; fi; \ + NO_TESTS_ARG=""; \ + if [ "$(NO_TESTS)" = "true" ]; then NO_TESTS_ARG="--no-tests"; fi; \ + CREATE_SHARED_ARG=""; \ + if [ "$(CREATE_SHARED)" = "true" ]; then CREATE_SHARED_ARG="--create-shared"; fi; \ + $(UVRUN) scripts/generate_skeleton/generate_skeleton.py --type=pipeline --category=$(CATEGORY) --name=$(NAME) $$SUBCATEGORY_ARG $$NO_TESTS_ARG $$CREATE_SHARED_ARG; \ + echo ""; \ + echo "Generating READMEs..."; \ + if [ -n "$(SUBCATEGORY)" ]; then \ + $(UVRUN) -m scripts.generate_readme --pipeline pipelines/$(CATEGORY)/$(SUBCATEGORY)/$(NAME) --fix; \ else \ - $(UVRUN) scripts/generate_skeleton/generate_skeleton.py --type=pipeline --category=$(CATEGORY) --name=$(NAME); \ + $(UVRUN) -m scripts.generate_readme --pipeline pipelines/$(CATEGORY)/$(NAME) --fix; \ fi + @$(MAKE) --no-print-directory sync-packages tests: - @if [ -z "$(TYPE)" ]; then echo "Error: TYPE is required. Usage: make tests TYPE=component|pipeline CATEGORY=data_processing NAME=my_component"; exit 1; fi - @if [ -z "$(CATEGORY)" ]; then echo "Error: CATEGORY is required. Usage: make tests TYPE=component|pipeline CATEGORY=data_processing NAME=my_component"; exit 1; fi - @if [ -z "$(NAME)" ]; then echo "Error: NAME is required. Usage: make tests TYPE=component|pipeline CATEGORY=data_processing NAME=my_component"; exit 1; fi - $(UVRUN) scripts/generate_skeleton/generate_skeleton.py --type=$(TYPE) --category=$(CATEGORY) --name=$(NAME) --tests-only + @if [ -z "$(TYPE)" ]; then echo "Error: TYPE is required. Usage: make tests TYPE=component|pipeline CATEGORY=data_processing NAME=my_component [SUBCATEGORY=x]"; exit 1; fi + @if [ -z "$(CATEGORY)" ]; then echo "Error: CATEGORY is required. 
Usage: make tests TYPE=component|pipeline CATEGORY=data_processing NAME=my_component [SUBCATEGORY=x]"; exit 1; fi + @if [ -z "$(NAME)" ]; then echo "Error: NAME is required. Usage: make tests TYPE=component|pipeline CATEGORY=data_processing NAME=my_component [SUBCATEGORY=x]"; exit 1; fi + @if [ "$(TYPE)" = "component" ] || [ "$(TYPE)" = "pipeline" ]; then \ + SUBCATEGORY_ARG=""; \ + if [ -n "$(SUBCATEGORY)" ]; then SUBCATEGORY_ARG="--subcategory=$(SUBCATEGORY)"; fi; \ + $(UVRUN) scripts/generate_skeleton/generate_skeleton.py --type=$(TYPE) --category=$(CATEGORY) --name=$(NAME) $$SUBCATEGORY_ARG --tests-only; \ + else \ + echo "Error: TYPE must be either 'component' or 'pipeline'"; exit 1; \ + fi readme: - @if [ -z "$(TYPE)" ]; then echo "Error: TYPE is required. Usage: make readme TYPE=component|pipeline CATEGORY=data_processing NAME=my_component"; exit 1; fi - @if [ -z "$(CATEGORY)" ]; then echo "Error: CATEGORY is required. Usage: make readme TYPE=component|pipeline CATEGORY=data_processing NAME=my_component"; exit 1; fi - @if [ -z "$(NAME)" ]; then echo "Error: NAME is required. Usage: make readme TYPE=component|pipeline CATEGORY=data_processing NAME=my_component"; exit 1; fi - @if [ "$(TYPE)" = "component" ]; then \ - $(UVRUN) -m scripts.generate_readme --component $(TYPE)s/$(CATEGORY)/$(NAME) --fix; \ - elif [ "$(TYPE)" = "pipeline" ]; then \ - $(UVRUN) -m scripts.generate_readme --pipeline $(TYPE)s/$(CATEGORY)/$(NAME) --fix; \ + @if [ -z "$(TYPE)" ]; then echo "Error: TYPE is required. Usage: make readme TYPE=component|pipeline CATEGORY=data_processing NAME=my_component [SUBCATEGORY=x]"; exit 1; fi + @if [ -z "$(CATEGORY)" ]; then echo "Error: CATEGORY is required. Usage: make readme TYPE=component|pipeline CATEGORY=data_processing NAME=my_component [SUBCATEGORY=x]"; exit 1; fi + @if [ -z "$(NAME)" ]; then echo "Error: NAME is required. 
Usage: make readme TYPE=component|pipeline CATEGORY=data_processing NAME=my_component [SUBCATEGORY=x]"; exit 1; fi + @if [ "$(TYPE)" = "component" ] || [ "$(TYPE)" = "pipeline" ]; then \ + if [ -n "$(SUBCATEGORY)" ]; then \ + $(UVRUN) -m scripts.generate_readme --$(TYPE) $(TYPE)s/$(CATEGORY)/$(SUBCATEGORY)/$(NAME) --fix; \ + else \ + $(UVRUN) -m scripts.generate_readme --$(TYPE) $(TYPE)s/$(CATEGORY)/$(NAME) --fix; \ + fi; \ else \ echo "Error: TYPE must be either 'component' or 'pipeline'"; exit 1; \ fi + +sync-packages: + @$(UVRUN) scripts/sync_packages.py diff --git a/components/training/README.md b/components/training/README.md new file mode 100644 index 000000000..c301e306c --- /dev/null +++ b/components/training/README.md @@ -0,0 +1,7 @@ +# Training Components + +This directory contains components in the **Training** category: + +## Subcategories + +- [Sklearn Models](./sklearn_models/README.md) diff --git a/components/training/sklearn_models/OWNERS b/components/training/sklearn_models/OWNERS new file mode 100644 index 000000000..3423f6e47 --- /dev/null +++ b/components/training/sklearn_models/OWNERS @@ -0,0 +1,7 @@ +approvers: + # TODO: Add your GitHub username here (must be a Kubeflow community member) + # - your-github-username +reviewers: + # TODO: Add reviewers' GitHub usernames here + # - reviewer1 + # - reviewer2 \ No newline at end of file diff --git a/components/training/sklearn_models/README.md b/components/training/sklearn_models/README.md new file mode 100644 index 000000000..43209e35d --- /dev/null +++ b/components/training/sklearn_models/README.md @@ -0,0 +1,5 @@ +# Sklearn Models + +This subcategory contains components in the **Sklearn Models** group: + +- [Logistic Regression](./logistic_regression/README.md): Logistic Regression component. 
diff --git a/components/training/sklearn_models/__init__.py b/components/training/sklearn_models/__init__.py new file mode 100644 index 000000000..3dbd4594d --- /dev/null +++ b/components/training/sklearn_models/__init__.py @@ -0,0 +1 @@ +"""Assets in the sklearn_models subcategory.""" diff --git a/components/training/sklearn_models/logistic_regression/OWNERS b/components/training/sklearn_models/logistic_regression/OWNERS new file mode 100644 index 000000000..3423f6e47 --- /dev/null +++ b/components/training/sklearn_models/logistic_regression/OWNERS @@ -0,0 +1,7 @@ +approvers: + # TODO: Add your GitHub username here (must be a Kubeflow community member) + # - your-github-username +reviewers: + # TODO: Add reviewers' GitHub usernames here + # - reviewer1 + # - reviewer2 \ No newline at end of file diff --git a/components/training/sklearn_models/logistic_regression/README.md b/components/training/sklearn_models/logistic_regression/README.md new file mode 100644 index 000000000..dc321ee25 --- /dev/null +++ b/components/training/sklearn_models/logistic_regression/README.md @@ -0,0 +1,39 @@ +# Logistic Regression ✨ + +> ⚠️ **Stability: alpha** — This asset is not yet stable and may change. + +## Overview 🧾 + +Logistic Regression component. + +TODO: Add a detailed description of what this component does. + +Args: input_param: Description of the component parameter. # Add descriptions for other parameters + +Returns: Description of what the component returns.
+ +## Inputs 📥 + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `input_param` | `str` | `None` | | + +## Outputs 📤 + +| Name | Type | Description | +|------|------|-------------| +| Output | `str` | | + +## Metadata 🗂️ + +- **Name**: logistic_regression +- **Stability**: alpha +- **Dependencies**: + - Kubeflow: + - Name: Pipelines, Version: >=2.15.2 +- **Tags**: + - training +- **Last Verified**: 2026-02-11 20:18:36+00:00 +- **Owners**: + - Approvers: None + - Reviewers: None diff --git a/components/training/sklearn_models/logistic_regression/__init__.py b/components/training/sklearn_models/logistic_regression/__init__.py new file mode 100644 index 000000000..9474e0323 --- /dev/null +++ b/components/training/sklearn_models/logistic_regression/__init__.py @@ -0,0 +1,3 @@ +from .component import logistic_regression + +__all__ = ["logistic_regression"] diff --git a/components/training/sklearn_models/logistic_regression/component.py b/components/training/sklearn_models/logistic_regression/component.py new file mode 100644 index 000000000..8a0b38ee9 --- /dev/null +++ b/components/training/sklearn_models/logistic_regression/component.py @@ -0,0 +1,34 @@ +from kfp import dsl + + +@dsl.component( + base_image="python:3.11", + # packages_to_install=["numpy", "pandas"], # Add your dependencies here +) +def logistic_regression( + # Add your component parameters here + input_param: str, + # Add your output artifacts here + # output_artifact: dsl.Output[dsl.Artifact] +) -> str: # Specify your return type + """Logistic Regression component. + + TODO: Add a detailed description of what this component does. + + Args: + input_param: Description of the component parameter. + # Add descriptions for other parameters + + Returns: + Description of what the component returns.
+ """ + # TODO: Implement your component logic here + + +if __name__ == "__main__": + from kfp.compiler import Compiler + + Compiler().compile( + logistic_regression, + package_path=__file__.replace(".py", "_component.yaml"), + ) diff --git a/components/training/sklearn_models/logistic_regression/metadata.yaml b/components/training/sklearn_models/logistic_regression/metadata.yaml new file mode 100644 index 000000000..23ed8dd88 --- /dev/null +++ b/components/training/sklearn_models/logistic_regression/metadata.yaml @@ -0,0 +1,17 @@ +--- +name: logistic_regression +stability: alpha # New component without proven track record +dependencies: + kubeflow: + - name: Pipelines + version: '>=2.15.2' + # external_services: # Add if component uses external services + # - name: Example Service + # version: ">=1.0.0" +tags: + - training + # Add more relevant tags here +lastVerified: 2026-02-11T20:18:36Z +# links: # Add relevant links +# documentation: https://your-docs-url.com +# issue_tracker: https://github.com/kubeflow/pipelines-components/issues diff --git a/components/training/sklearn_models/logistic_regression/tests/__init__.py b/components/training/sklearn_models/logistic_regression/tests/__init__.py new file mode 100644 index 000000000..92cbc4c96 --- /dev/null +++ b/components/training/sklearn_models/logistic_regression/tests/__init__.py @@ -0,0 +1 @@ +# Test package for component tests diff --git a/components/training/sklearn_models/logistic_regression/tests/test_component_local.py b/components/training/sklearn_models/logistic_regression/tests/test_component_local.py new file mode 100644 index 000000000..ec36beff0 --- /dev/null +++ b/components/training/sklearn_models/logistic_regression/tests/test_component_local.py @@ -0,0 +1,17 @@ +"""Local runner tests for the logistic_regression component.""" + +from ..component import logistic_regression + + +class TestLogisticRegressionLocalRunner: + """Test component with LocalRunner (subprocess execution).""" + + def 
test_local_execution(self, setup_and_teardown_subprocess_runner): # noqa: F811 + """Test component execution with LocalRunner.""" + # TODO: Implement local runner tests for your component + + # Example test structure: + result = logistic_regression(input_param="test_value") + + # Add assertions about expected outputs if needed + assert result is not None diff --git a/components/training/sklearn_models/logistic_regression/tests/test_component_unit.py b/components/training/sklearn_models/logistic_regression/tests/test_component_unit.py new file mode 100644 index 000000000..2acffb144 --- /dev/null +++ b/components/training/sklearn_models/logistic_regression/tests/test_component_unit.py @@ -0,0 +1,27 @@ +"""Tests for the logistic_regression component.""" + +from ..component import logistic_regression + + +class TestLogisticRegressionUnitTests: + """Unit tests for component logic.""" + + def test_component_function_exists(self): + """Test that the component function is properly imported.""" + assert callable(logistic_regression) + assert hasattr(logistic_regression, "python_func") + + def test_component_with_default_parameters(self): + """Test component with valid input parameters.""" + # TODO: Implement unit tests for your component + + # Example test structure: + result = logistic_regression.python_func(input_param="test_value") + assert isinstance(result, str) + assert "test_value" in result + + # TODO: Add more comprehensive unit tests + # @mock.patch("external_library.some_function") + # def test_component_with_mocked_dependencies(self, mock_function): + # """Test component behavior with mocked external calls.""" + # pass diff --git a/components/training/sklearn_models/shared/__init__.py b/components/training/sklearn_models/shared/__init__.py new file mode 100644 index 000000000..48e5b99a9 --- /dev/null +++ b/components/training/sklearn_models/shared/__init__.py @@ -0,0 +1 @@ +"""Shared utilities for the sklearn_models subcategory.""" diff --git 
a/components/training/sklearn_models/shared/sklearn_models_utils.py b/components/training/sklearn_models/shared/sklearn_models_utils.py new file mode 100644 index 000000000..3e8a8f588 --- /dev/null +++ b/components/training/sklearn_models/shared/sklearn_models_utils.py @@ -0,0 +1,4 @@ +"""Shared utility functions for the sklearn_models subcategory.""" + + +# TODO: Add shared utility functions, classes, or constants here. diff --git a/docs/CONTRIBUTING.md b/docs/CONTRIBUTING.md index f6345be3e..62c05e28a 100644 --- a/docs/CONTRIBUTING.md +++ b/docs/CONTRIBUTING.md @@ -128,6 +128,53 @@ Components must be organized by category under `components/<category>/`. Pipelines must be organized by category under `pipelines/<category>/`. +### Subcategories + +For better organization of related components or pipelines, you can create **subcategories** within a category. +Subcategories provide: + +- **Logical grouping** of related assets (e.g., all sklearn-based trainers, related ML workflows) +- **Dedicated ownership** via a subcategory-level OWNERS file +- **Shared utilities** via an optional `shared/` package + +**Component subcategory structure:** + +```text +components/<category>/<subcategory>/ +├── __init__.py # Subcategory package +├── OWNERS # Subcategory maintainers +├── README.md # Subcategory documentation +├── shared/ # Optional shared utilities package +│ ├── __init__.py +│ └── training_utils.py # Common code for components in this subcategory +└── <component-name>/ # Individual component + ├── __init__.py + ├── component.py + ├── metadata.yaml + ├── OWNERS + ├── README.md + └── tests/ +``` + +**Pipeline subcategory structure:** + +```text +pipelines/<category>/<subcategory>/ +├── __init__.py # Subcategory package +├── OWNERS # Subcategory maintainers +├── README.md # Subcategory documentation +├── shared/ # Optional shared utilities package +│ ├── __init__.py +│ └── workflow_utils.py # Common code for pipelines in this
subcategory +โ””โ”€โ”€ / # Individual pipeline + โ”œโ”€โ”€ __init__.py + โ”œโ”€โ”€ pipeline.py + โ”œโ”€โ”€ metadata.yaml + โ”œโ”€โ”€ OWNERS + โ”œโ”€โ”€ README.md + โ””โ”€โ”€ tests/ +``` + ## Naming Conventions - **Components and pipelines** use `snake_case` (e.g., `data_preprocessing`, `model_trainer`) @@ -275,14 +322,20 @@ For rapid development, this repository provides convenient make commands that au The following make targets simplify the development workflow: -| Command | Description | -|--------------------------------------------------------|---------------------------------------------------| -| `make component CATEGORY= NAME= [NO_TESTS]` | Create a new component skeleton | -| `make pipeline CATEGORY= NAME= [NO_TESTS]` | Create a new pipeline skeleton | -| `make tests TYPE= CATEGORY= NAME=` | Add tests to existing component/pipeline | -| `make readme TYPE= CATEGORY= NAME=` | Generate/update README from code | -| `make format` | Auto-fix code formatting and linting issues | -| `make lint` | Check code quality (formatting, linting, imports) | +| Command | Description | +|---------|-------------| +| `make component CATEGORY= NAME=` | Create a new component skeleton | +| `make pipeline CATEGORY= NAME=` | Create a new pipeline skeleton | +| `make tests TYPE= CATEGORY= NAME=` | Add tests to existing component/pipeline | +| `make readme TYPE= CATEGORY= NAME=` | Generate/update README from code | +| `make format` | Auto-fix code formatting and linting issues | +| `make lint` | Check code quality (formatting, linting, imports) | + +**Optional flags** (append to component/pipeline commands): + +- `SUBCATEGORY=` - Create asset in a subcategory +- `NO_TESTS=true` - Skip test file generation +- `CREATE_SHARED=true` - Create shared utilities package (requires SUBCATEGORY) @@ -301,8 +354,8 @@ make pipeline CATEGORY=training NAME=my_training_pipeline **Create without tests (for rapid prototyping):** ```bash -make component CATEGORY=data_processing NAME=my_prototype 
NO_TESTS -make pipeline CATEGORY=training NAME=my_prototype NO_TESTS +make component CATEGORY=data_processing NAME=my_prototype NO_TESTS=true +make pipeline CATEGORY=training NAME=my_prototype NO_TESTS=true ``` This generates the complete directory structure: @@ -320,6 +373,70 @@ components/data_processing/my_data_processor/ └── test_component_local.py # Integration test template ``` +**Create a component within a subcategory:** + +```bash +# Create component in a subcategory (subcategory files created automatically) +make component CATEGORY=training SUBCATEGORY=sklearn_trainer NAME=logistic_regression + +# Create component in subcategory with shared utilities package +make component CATEGORY=training SUBCATEGORY=sklearn_trainer NAME=random_forest CREATE_SHARED=true +``` + +This generates a nested structure: + +```text +components/training/sklearn_trainer/ +├── __init__.py # Subcategory package +├── OWNERS # Subcategory maintainers +├── README.md # Subcategory documentation +├── shared/ # (if CREATE_SHARED=true) Shared utilities +│ ├── __init__.py +│ └── sklearn_trainer_utils.py # Placeholder utility file +└── logistic_regression/ # Your component + ├── __init__.py + ├── component.py + ├── metadata.yaml + ├── OWNERS + ├── README.md + └── tests/ + ├── __init__.py + ├── test_component_local.py + └── test_component_unit.py +``` + +**Create a pipeline within a subcategory:** + +```bash +# Create pipeline in a subcategory (subcategory files created automatically) +make pipeline CATEGORY=training SUBCATEGORY=ml_workflows NAME=batch_training + +# Create pipeline in subcategory with shared utilities package +make pipeline CATEGORY=training SUBCATEGORY=ml_workflows NAME=inference CREATE_SHARED=true +``` + +This generates a nested structure: + +```text +pipelines/training/ml_workflows/ +├── __init__.py # Subcategory package +├── OWNERS # Subcategory
maintainers +├── README.md # Subcategory documentation +├── shared/ # (if CREATE_SHARED=true) Shared utilities +│ ├── __init__.py +│ └── ml_workflows_utils.py # Placeholder utility file +└── batch_training/ # Your pipeline + ├── __init__.py + ├── pipeline.py + ├── metadata.yaml + ├── OWNERS + ├── README.md + └── tests/ + ├── __init__.py + ├── test_pipeline_local.py + └── test_pipeline_unit.py +``` +
๐Ÿ”ง Alternative: Manual Creation @@ -477,6 +594,35 @@ git push origin component/csv-cleaner This workflow typically takes just a few minutes to set up the complete component structure with documentation and tests. +#### Example Workflow with Subcategory + +When creating related components that share ownership or utilities: + +```bash +# 1. Create feature branch +git checkout -b component/sklearn-logistic-regression upstream/main + +# 2. Create component in subcategory (first component also creates subcategory structure) +make component CATEGORY=training SUBCATEGORY=sklearn_trainer NAME=logistic_regression + +# 3. Edit the component and subcategory files: +# - components/training/sklearn_trainer/logistic_regression/component.py (your logic) +# - components/training/sklearn_trainer/OWNERS (subcategory maintainers) +# - components/training/sklearn_trainer/README.md (subcategory docs) + +# 4. Generate documentation +make readme TYPE=component CATEGORY=training SUBCATEGORY=sklearn_trainer NAME=logistic_regression + +# 5. Format, lint, test, and submit (same as above) +make format +make lint +pytest components/training/sklearn_trainer/logistic_regression/tests/ -v +pre-commit run +git add . +git commit -m "feat(training): add logistic_regression component in sklearn_trainer subcategory" +git push origin component/sklearn-logistic-regression +``` + ## Testing and Quality ### Running Tests Locally @@ -753,21 +899,6 @@ pytest tests/ --cov=. --cov-report=html - **Dependencies**: Mock external services in unit tests; use real dependencies in local runner tests - **Cleanup**: Use provided fixtures to ensure proper test environment cleanup -### Package Validation - -The validation script ensures the `packages` list in `pyproject.toml` stays in sync with the actual -Python package structure. It discovers all packages in `components/` and `pipelines/` and compares -them with the declared packages in `pyproject.toml`. 
- -Run the validation locally: - -```bash -uv run python -m scripts.validate_package_entries.validate_package_entries -``` - -If validation fails, update the `packages` list in `pyproject.toml` under `[tool.setuptools]` to -include any missing packages. The script will report exactly which packages are missing or extra. - ### Building Custom Container Images If your component uses a custom image, test the container build: diff --git a/pipelines/training/README.md b/pipelines/training/README.md new file mode 100644 index 000000000..d7f473063 --- /dev/null +++ b/pipelines/training/README.md @@ -0,0 +1,7 @@ +# Training Pipelines + +This directory contains pipelines in the **Training** category: + +## Subcategories + +- [Ml Workflows](./ml_workflows/README.md) diff --git a/pipelines/training/ml_workflows/OWNERS b/pipelines/training/ml_workflows/OWNERS new file mode 100644 index 000000000..3423f6e47 --- /dev/null +++ b/pipelines/training/ml_workflows/OWNERS @@ -0,0 +1,7 @@ +approvers: + # TODO: Add your GitHub username here (must be a Kubeflow community member) + # - your-github-username +reviewers: + # TODO: Add reviewers' GitHub usernames here + # - reviewer1 + # - reviewer2 \ No newline at end of file diff --git a/pipelines/training/ml_workflows/README.md b/pipelines/training/ml_workflows/README.md new file mode 100644 index 000000000..82615ec09 --- /dev/null +++ b/pipelines/training/ml_workflows/README.md @@ -0,0 +1,5 @@ +# Ml Workflows + +This subcategory contains pipelines in the **Ml Workflows** group: + +- [Batch Training](./batch_training/README.md): TODO: Add pipeline description. 
diff --git a/pipelines/training/ml_workflows/__init__.py b/pipelines/training/ml_workflows/__init__.py new file mode 100644 index 000000000..562b1f6b0 --- /dev/null +++ b/pipelines/training/ml_workflows/__init__.py @@ -0,0 +1 @@ +"""Assets in the ml_workflows subcategory.""" diff --git a/pipelines/training/ml_workflows/batch_training/OWNERS b/pipelines/training/ml_workflows/batch_training/OWNERS new file mode 100644 index 000000000..3423f6e47 --- /dev/null +++ b/pipelines/training/ml_workflows/batch_training/OWNERS @@ -0,0 +1,7 @@ +approvers: + # TODO: Add your GitHub username here (must be a Kubeflow community member) + # - your-github-username +reviewers: + # TODO: Add reviewers' GitHub usernames here + # - reviewer1 + # - reviewer2 \ No newline at end of file diff --git a/pipelines/training/ml_workflows/batch_training/README.md b/pipelines/training/ml_workflows/batch_training/README.md new file mode 100644 index 000000000..62c3dd5ec --- /dev/null +++ b/pipelines/training/ml_workflows/batch_training/README.md @@ -0,0 +1,30 @@ +# Batch Training ✨ + +> ⚠️ **Stability: alpha** — This asset is not yet stable and may change. + +## Overview 🧾 + +TODO: Add pipeline description. + +TODO: Add a detailed description of what this pipeline does. + +## Inputs 📥 + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `input_param` | `str` | `default_value` | Description of the pipeline parameter. 
| + +## Metadata 🗂️ + +- **Name**: batch_training +- **Stability**: alpha +- **Dependencies**: + - Kubeflow: + - Name: Pipelines, Version: >=2.15.2 +- **Tags**: + - training + - pipeline +- **Last Verified**: 2026-02-11 20:18:46+00:00 +- **Owners**: + - Approvers: None + - Reviewers: None diff --git a/pipelines/training/ml_workflows/batch_training/__init__.py b/pipelines/training/ml_workflows/batch_training/__init__.py new file mode 100644 index 000000000..6a2953237 --- /dev/null +++ b/pipelines/training/ml_workflows/batch_training/__init__.py @@ -0,0 +1,3 @@ +from .pipeline import batch_training + +__all__ = ["batch_training"] diff --git a/pipelines/training/ml_workflows/batch_training/metadata.yaml b/pipelines/training/ml_workflows/batch_training/metadata.yaml new file mode 100644 index 000000000..3394eddf8 --- /dev/null +++ b/pipelines/training/ml_workflows/batch_training/metadata.yaml @@ -0,0 +1,18 @@ +--- +name: batch_training +stability: alpha # New pipeline without proven track record +dependencies: + kubeflow: + - name: Pipelines + version: '>=2.15.2' + # external_services: # Add if pipeline uses external services + # - name: Example Service + # version: ">=1.0.0" +tags: + - training + - pipeline + # Add more relevant tags here +lastVerified: 2026-02-11T20:18:46Z +# links: # Add relevant links +# documentation: https://your-docs-url.com +# issue_tracker: https://github.com/kubeflow/pipelines-components/issues diff --git a/pipelines/training/ml_workflows/batch_training/pipeline.py b/pipelines/training/ml_workflows/batch_training/pipeline.py new file mode 100644 index 000000000..b43e7814c --- /dev/null +++ b/pipelines/training/ml_workflows/batch_training/pipeline.py @@ -0,0 +1,31 @@ +from kfp import dsl + + +@dsl.pipeline( + name="batch-training", + description="TODO: Add a brief description of this pipeline", +) +def batch_training( + # Add your pipeline parameters here + input_param: str = "default_value", +): + """TODO: Add pipeline description. 
+ + TODO: Add a detailed description of what this pipeline does. + + Args: + input_param: Description of the pipeline parameter. + + Returns: + Pipeline outputs or None if no outputs are needed. + """ + # TODO: Implement your pipeline logic here + + +if __name__ == "__main__": + from kfp.compiler import Compiler + + Compiler().compile( + batch_training, + package_path=__file__.replace(".py", "_pipeline.yaml"), + ) diff --git a/pipelines/training/ml_workflows/batch_training/tests/__init__.py b/pipelines/training/ml_workflows/batch_training/tests/__init__.py new file mode 100644 index 000000000..d9eb4f378 --- /dev/null +++ b/pipelines/training/ml_workflows/batch_training/tests/__init__.py @@ -0,0 +1 @@ +# Test package for pipeline tests diff --git a/pipelines/training/ml_workflows/batch_training/tests/test_pipeline_local.py b/pipelines/training/ml_workflows/batch_training/tests/test_pipeline_local.py new file mode 100644 index 000000000..43558a7e8 --- /dev/null +++ b/pipelines/training/ml_workflows/batch_training/tests/test_pipeline_local.py @@ -0,0 +1,17 @@ +"""Local runner tests for the batch_training pipeline.""" + +from ..pipeline import batch_training + + +class TestBatchTrainingLocalRunner: + """Test pipeline with LocalRunner (subprocess execution).""" + + def test_local_execution(self, setup_and_teardown_subprocess_runner): # noqa: F811 + """Test pipeline execution with LocalRunner.""" + # TODO: Implement local runner tests for your pipeline + + # Example test structure: + result = batch_training(input_param="test_value") + + # Add assertions about expected outputs if needed + assert result is not None diff --git a/pipelines/training/ml_workflows/batch_training/tests/test_pipeline_unit.py b/pipelines/training/ml_workflows/batch_training/tests/test_pipeline_unit.py new file mode 100644 index 000000000..650ef559c --- /dev/null +++ b/pipelines/training/ml_workflows/batch_training/tests/test_pipeline_unit.py @@ -0,0 +1,27 @@ +"""Tests for the batch_training 
pipeline.""" + +from ..pipeline import batch_training + + +class TestBatchTrainingUnitTests: + """Unit tests for pipeline logic.""" + + def test_pipeline_function_exists(self): + """Test that the pipeline function is properly imported.""" + assert callable(batch_training) + assert hasattr(batch_training, "python_func") + + def test_pipeline_with_default_parameters(self): + """Test pipeline with valid input parameters.""" + # TODO: Implement unit tests for your pipeline + + # Example test structure: + result = batch_training.python_func(input_param="test_value") + assert isinstance(result, str) + assert "test_value" in result + + # TODO: Add more comprehensive unit tests + # @mock.patch("external_library.some_function") + # def test_pipeline_with_mocked_dependencies(self, mock_function): + # """Test pipeline behavior with mocked external calls.""" + # pass diff --git a/pipelines/training/ml_workflows/shared/__init__.py b/pipelines/training/ml_workflows/shared/__init__.py new file mode 100644 index 000000000..1c2bc45d9 --- /dev/null +++ b/pipelines/training/ml_workflows/shared/__init__.py @@ -0,0 +1 @@ +"""Shared utilities for the ml_workflows subcategory.""" diff --git a/pipelines/training/ml_workflows/shared/ml_workflows_utils.py b/pipelines/training/ml_workflows/shared/ml_workflows_utils.py new file mode 100644 index 000000000..ad3af8a7c --- /dev/null +++ b/pipelines/training/ml_workflows/shared/ml_workflows_utils.py @@ -0,0 +1,4 @@ +"""Shared utility functions for the ml_workflows subcategory.""" + + +# TODO: Add shared utility functions, classes, or constants here. 
diff --git a/pyproject.toml b/pyproject.toml index 5e6179932..e52fa9139 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,16 +50,22 @@ Issues = "https://github.com/kubeflow/pipelines-components/issues" packages = [ "kfp_components", "kfp_components.components", - "kfp_components.components.training", - "kfp_components.components.evaluation", "kfp_components.components.data_processing", "kfp_components.components.data_processing.yoda_data_processor", "kfp_components.components.deployment", + "kfp_components.components.evaluation", + "kfp_components.components.training", + "kfp_components.components.training.sklearn_models", + "kfp_components.components.training.sklearn_models.logistic_regression", + "kfp_components.components.training.sklearn_models.shared", "kfp_components.pipelines", - "kfp_components.pipelines.training", - "kfp_components.pipelines.evaluation", "kfp_components.pipelines.data_processing", "kfp_components.pipelines.deployment", + "kfp_components.pipelines.evaluation", + "kfp_components.pipelines.training", + "kfp_components.pipelines.training.ml_workflows", + "kfp_components.pipelines.training.ml_workflows.batch_training", + "kfp_components.pipelines.training.ml_workflows.shared", ] [tool.setuptools.package-dir] diff --git a/scripts/generate_readme/category_index_generator.py b/scripts/generate_readme/category_index_generator.py index 0b202d43e..ed20a339d 100644 --- a/scripts/generate_readme/category_index_generator.py +++ b/scripts/generate_readme/category_index_generator.py @@ -1,34 +1,41 @@ -"""Category index generator for KFP components and pipelines.""" +"""Category and subcategory index generators for KFP components and pipelines.""" import logging from pathlib import Path -from typing import Dict, List, Optional +from typing import Dict, List, Optional, Tuple import yaml from jinja2 import Environment, FileSystemLoader -from scripts.generate_readme.constants import CATEGORY_README_TEMPLATE, MAX_LINE_LENGTH +from 
scripts.generate_readme.constants import ( + CATEGORY_README_TEMPLATE, + MAX_LINE_LENGTH, + SUBCATEGORY_README_TEMPLATE, +) from scripts.generate_readme.metadata_parser import MetadataParser from scripts.generate_readme.utils import format_title logger = logging.getLogger(__name__) -class CategoryIndexGenerator: - """Generates category-level README.md that indexes all components/pipelines in a category.""" +class _BaseIndexGenerator: + """Base class for index generators with shared Jinja2 setup and item extraction.""" - def __init__(self, category_dir: Path, is_component: bool = True): - """Initialize the category index generator. + def __init__(self, directory: Path, template_name: str, is_component: bool = True): + """Initialize the base index generator. Args: - category_dir: Path to the category directory (e.g., components/dev/). + directory: Path to the directory to index. + template_name: Name of the Jinja2 template to use. is_component: True if indexing components, False if indexing pipelines. """ - self.category_dir = category_dir - if category_dir.exists() is False: - raise ValueError(f"Required category directory not found: {category_dir}") + if not directory.exists(): + raise ValueError(f"Required directory not found: {directory}") + + self.directory = directory self.is_component = is_component - self.category_name = category_dir.name + self.type_name = "Components" if is_component else "Pipelines" + self._target_file = "component.py" if is_component else "pipeline.py" # Set up Jinja2 environment template_dir = Path(__file__).parent / "templates" @@ -37,38 +44,20 @@ def __init__(self, category_dir: Path, is_component: bool = True): trim_blocks=True, lstrip_blocks=True, ) - self.template = self.env.get_template(CATEGORY_README_TEMPLATE) - - def _find_items_in_category(self) -> List[Path]: - """Find all component/pipeline directories within the category. - - Returns: - List of paths to component/pipeline directories. 
- """ - items = [] - - # Look for subdirectories containing component.py or pipeline.py - target_file = "component.py" if self.is_component else "pipeline.py" - - for subdir in self.category_dir.iterdir(): - if subdir.is_dir() and not subdir.name.startswith("__"): - target_path = subdir / target_file - metadata_path = subdir / "metadata.yaml" - if target_path.exists() and metadata_path.exists(): - items.append(subdir) - - return items + self.template = self.env.get_template(template_name) def _get_display_name(self, item_dir: Path) -> str: - """Get the display name for an item, retrieved from the `name` field in metadata.yaml. + """Get the display name for an item from the `name` field in metadata.yaml. Args: item_dir: Path to the component/pipeline directory. Returns: The display name to use. + + Raises: + ValueError: If the `name` field is not found in metadata.yaml. """ - # Try to load metadata.yaml metadata_file = item_dir / "metadata.yaml" try: with open(metadata_file, "r", encoding="utf-8") as f: @@ -90,34 +79,26 @@ def _extract_item_info(self, item_dir: Path) -> Optional[Dict[str, str]]: Dictionary with 'name', 'overview', and 'link' keys, or None if extraction fails. 
""" try: - # Determine source file and parser - if self.is_component: - source_file = item_dir / "component.py" - parser = MetadataParser(source_file, "component") - else: - source_file = item_dir / "pipeline.py" - parser = MetadataParser(source_file, "pipeline") - - # Find the function + source_file = item_dir / self._target_file + parser_type = "component" if self.is_component else "pipeline" + parser = MetadataParser(source_file, parser_type) + function_name = parser.find_function() if not function_name: logger.warning(f"No function found in {source_file}") return None - # Extract metadata function_metadata = parser.extract_metadata(function_name) if not function_metadata: logger.warning(f"Could not extract function metadata from {source_file}") return None + name = self._get_display_name(item_dir) - # Format name to match individual README titles formatted_name = format_title(name) - # Get overview from docstring - overview = function_metadata.get("overview") + overview = function_metadata.get("overview", "") overview = overview.split("\n")[0].strip() - # Create relative link to the item's README link = f"./{item_dir.name}/README.md" return { @@ -130,30 +111,153 @@ def _extract_item_info(self, item_dir: Path) -> Optional[Dict[str, str]]: logger.warning(f"Error extracting info from {item_dir}: {e}") return None - def generate(self) -> str: - """Generate the category index README content. + def _find_asset_dirs(self) -> List[Path]: + """Find component/pipeline directories that contain a target file and metadata.yaml. + + Skips directories starting with '__' and directories named 'shared'. Returns: - Complete README.md content for the category index. + List of paths to component/pipeline directories. 
""" - # Find all items in the category - item_dirs = self._find_items_in_category() + items = [] + for subdir in self.directory.iterdir(): + if subdir.is_dir() and not subdir.name.startswith("__") and subdir.name != "shared": + if (subdir / self._target_file).exists() and (subdir / "metadata.yaml").exists(): + items.append(subdir) + return items + + def _collect_items(self, item_dirs: List[Path]) -> List[Dict[str, str]]: + """Extract and sort item info from a list of directories. - # Extract info for each item + Args: + item_dirs: List of paths to component/pipeline directories. + + Returns: + Sorted list of item info dictionaries. + """ items = [] for item_dir in item_dirs: item_info = self._extract_item_info(item_dir) if item_info: items.append(item_info) - - # Sort items by display name items.sort(key=lambda x: x["name"]) + return items + + +class CategoryIndexGenerator(_BaseIndexGenerator): + """Generates category-level README.md that indexes all components/pipelines in a category.""" + + def __init__(self, category_dir: Path, is_component: bool = True): + """Initialize the category index generator. + + Args: + category_dir: Path to the category directory (e.g., components/dev/). + is_component: True if indexing components, False if indexing pipelines. + """ + super().__init__(category_dir, CATEGORY_README_TEMPLATE, is_component) + self.category_dir = category_dir + self.category_name = category_dir.name + + def _is_subcategory(self, subdir: Path) -> bool: + """Check if a directory is a subcategory (contains child dirs with component.py/pipeline.py). + + Args: + subdir: Path to check. + + Returns: + True if the directory is a subcategory. 
+ """ + for child in subdir.iterdir(): + if child.is_dir() and not child.name.startswith("__") and child.name != "shared": + if (child / self._target_file).exists(): + return True + return False + + def _find_items_in_category(self) -> Tuple[List[Path], List[Path]]: + """Find all component/pipeline directories and subcategories within the category. + + Returns: + Tuple of (direct_items, subcategories) where each is a list of paths. + """ + direct_items = [] + subcategories = [] + + for subdir in self.category_dir.iterdir(): + if subdir.is_dir() and not subdir.name.startswith("__"): + if (subdir / self._target_file).exists() and (subdir / "metadata.yaml").exists(): + direct_items.append(subdir) + elif self._is_subcategory(subdir): + subcategories.append(subdir) + + return direct_items, subcategories + + @staticmethod + def _extract_subcategory_info(subcat_dir: Path) -> Dict[str, str]: + """Extract display info for a subcategory directory. + + Args: + subcat_dir: Path to the subcategory directory. + + Returns: + Dictionary with 'name' and 'link' keys. + """ + return { + "name": format_title(subcat_dir.name), + "link": f"./{subcat_dir.name}/README.md", + } + + def generate(self) -> str: + """Generate the category index README content. + + Returns: + Complete README.md content for the category index. 
+ """ + item_dirs, subcategory_dirs = self._find_items_in_category() + + items = self._collect_items(item_dirs) + + subcategories = [self._extract_subcategory_info(d) for d in subcategory_dirs] + subcategories.sort(key=lambda x: x["name"]) - # Prepare template context context = { "category_name": format_title(self.category_name), "is_component": self.is_component, - "type_name": "Components" if self.is_component else "Pipelines", + "type_name": self.type_name, + "items": items, + "subcategories": subcategories, + } + + return self.template.render(**context) + + +class SubcategoryIndexGenerator(_BaseIndexGenerator): + """Generates subcategory-level README.md that indexes all components/pipelines in a subcategory.""" + + def __init__(self, subcategory_dir: Path, is_component: bool = True): + """Initialize the subcategory index generator. + + Args: + subcategory_dir: Path to the subcategory directory + (e.g., components/training/sklearn_trainer/). + is_component: True if indexing components, False if indexing pipelines. + """ + super().__init__(subcategory_dir, SUBCATEGORY_README_TEMPLATE, is_component) + self.subcategory_dir = subcategory_dir + self.subcategory_name = subcategory_dir.name + + def generate(self) -> str: + """Generate the subcategory index README content. + + Returns: + Complete README.md content for the subcategory index. 
+ """ + item_dirs = self._find_asset_dirs() + items = self._collect_items(item_dirs) + + context = { + "subcategory_name": format_title(self.subcategory_name), + "is_component": self.is_component, + "type_name": self.type_name, "items": items, } diff --git a/scripts/generate_readme/constants.py b/scripts/generate_readme/constants.py index dce99a1e4..209902167 100644 --- a/scripts/generate_readme/constants.py +++ b/scripts/generate_readme/constants.py @@ -5,6 +5,7 @@ # README Templates CATEGORY_README_TEMPLATE = "CATEGORY_README.md.j2" +SUBCATEGORY_README_TEMPLATE = "SUBCATEGORY_README.md.j2" README_TEMPLATE = "README.md.j2" # Exit codes diff --git a/scripts/generate_readme/templates/CATEGORY_README.md.j2 b/scripts/generate_readme/templates/CATEGORY_README.md.j2 index 02e889ded..b6844e1bf 100644 --- a/scripts/generate_readme/templates/CATEGORY_README.md.j2 +++ b/scripts/generate_readme/templates/CATEGORY_README.md.j2 @@ -5,3 +5,10 @@ This directory contains {{ type_name | lower }} in the **{{ category_name }}** c {% for item in items %} - [{{ item.name }}]({{ item.link }}): {{ item.overview }} {% endfor %} +{% if subcategories %} +## Subcategories + +{% for sub in subcategories %} +- [{{ sub.name }}]({{ sub.link }}) +{% endfor %} +{% endif %} diff --git a/scripts/generate_readme/templates/SUBCATEGORY_README.md.j2 b/scripts/generate_readme/templates/SUBCATEGORY_README.md.j2 new file mode 100644 index 000000000..aaed6b8ee --- /dev/null +++ b/scripts/generate_readme/templates/SUBCATEGORY_README.md.j2 @@ -0,0 +1,7 @@ +# {{ subcategory_name }} + +This subcategory contains {{ type_name | lower }} in the **{{ subcategory_name }}** group: + +{% for item in items %} +- [{{ item.name }}]({{ item.link }}): {{ item.overview }} +{% endfor %} diff --git a/scripts/generate_readme/writer.py b/scripts/generate_readme/writer.py index b54bc5a85..a935716d5 100644 --- a/scripts/generate_readme/writer.py +++ b/scripts/generate_readme/writer.py @@ -5,7 +5,10 @@ from pathlib import Path 
from typing import Optional -from scripts.generate_readme.category_index_generator import CategoryIndexGenerator +from scripts.generate_readme.category_index_generator import ( + CategoryIndexGenerator, + SubcategoryIndexGenerator, +) from scripts.generate_readme.constants import CUSTOM_CONTENT_MARKER, EXIT_ERROR from scripts.generate_readme.content_generator import ReadmeContentGenerator from scripts.generate_readme.metadata_parser import MetadataParser @@ -48,8 +51,21 @@ def __init__( self.source_file = pipeline_dir / "pipeline.py" self.function_type = "pipeline" - self.category_dir = self.source_dir.parent + self.subcategory_dir = None + parent = self.source_dir.parent + try: + if parent.parent.parent.name in {"components", "pipelines"}: + # 3-level: components//// + self.subcategory_dir = parent + self.category_dir = parent.parent + else: + # 2-level: components/// + self.category_dir = parent + except (AttributeError, ValueError): + self.category_dir = parent + self.category_index_file = self.category_dir / "README.md" + self.subcategory_index_file = self.subcategory_dir / "README.md" if self.subcategory_dir else None self.parser = MetadataParser(self.source_file, self.function_type) self.metadata_file = self.source_dir / "metadata.yaml" @@ -111,6 +127,22 @@ def _has_diff(self, expected: str, actual: Optional[str]) -> bool: return True return expected != actual + def _check_index_file(self, index_file: Path, expected_content: str) -> bool: + """Check if an index README matches expected content. + + Args: + index_file: Path to the index README file. + expected_content: The expected content. + + Returns: + True if there's a diff, False if content matches. 
+ """ + actual_content = self._read_file_content(index_file) + has_diff = self._has_diff(expected_content, actual_content) + if has_diff: + logger.warning(f"Out of sync: {index_file}") + return has_diff + def _check_category_index(self, category_content: str) -> bool: """Check if category index matches expected content. @@ -120,33 +152,60 @@ def _check_category_index(self, category_content: str) -> bool: Returns: True if there's a diff, False if content matches. """ - actual_content = self._read_file_content(self.category_index_file) - has_diff = self._has_diff(category_content, actual_content) - if has_diff: - logger.warning(f"Out of sync: {self.category_index_file}") - return has_diff + return self._check_index_file(self.category_index_file, category_content) - def _write_category_index(self, category_content: str) -> None: - """Write the category-level README index. + def _check_subcategory_index(self, subcategory_content: str) -> bool: + """Check if subcategory index matches expected content. Args: - category_content: The generated category index content to write. + subcategory_content: The expected subcategory index content. + + Returns: + True if there's a diff, False if content matches. """ - if self.category_index_file.exists(): - logger.info(f"Category index exists at {self.category_index_file}, regenerating entries.") - else: - logger.info(f"Category index does not exist yet at {self.category_index_file}, creating new file") + if self.subcategory_index_file is None: + return False + return self._check_index_file(self.subcategory_index_file, subcategory_content) - try: - with open(self.category_index_file, "w", encoding="utf-8") as f: - f.write(category_content) + def _write_index_file(self, index_file: Path, content: str, label: str) -> None: + """Write an index README file. - logger.info(f"Category index generated at {self.category_index_file}") + Args: + index_file: Path to the index README file. + content: The generated content to write. 
+ label: Human-readable label for log messages (e.g., "Category index"). + """ + if index_file.exists(): + logger.info(f"{label} exists at {index_file}, regenerating entries.") + else: + logger.info(f"{label} does not exist yet at {index_file}, creating new file") + try: + with open(index_file, "w", encoding="utf-8") as f: + f.write(content) + logger.info(f"{label} generated at {index_file}") except Exception as e: - logger.error(f"Could not write category index: {e}") + logger.error(f"Could not write {label.lower()}: {e}") sys.exit(EXIT_ERROR) + def _write_category_index(self, category_content: str) -> None: + """Write the category-level README index. + + Args: + category_content: The generated category index content to write. + """ + self._write_index_file(self.category_index_file, category_content, "Category index") + + def _write_subcategory_index(self, subcategory_content: str) -> None: + """Write the subcategory-level README index. + + Args: + subcategory_content: The generated subcategory index content to write. + """ + if self.subcategory_index_file is None: + return + self._write_index_file(self.subcategory_index_file, subcategory_content, "Subcategory index") + def _check_readme_file(self, readme_content: str) -> bool: """Check if README matches expected content. @@ -195,6 +254,11 @@ def _write_readme_file(self, readme_content: str) -> None: def generate(self, fix: bool = False) -> bool: """Generate the README documentation. + Generates up to 3 README files: + 1. The component/pipeline README (always) + 2. The subcategory index README (if in a subcategory) + 3. The category index README (always) + Args: fix: If True, write/update README files. If False, only check for diffs without writing files. 
@@ -227,19 +291,30 @@ def generate(self, fix: bool = False) -> bool: readme_content_generator = ReadmeContentGenerator(metadata, self.source_dir) readme_content = readme_content_generator.generate_readme() - # Generate category index content - index_generator = CategoryIndexGenerator(self.category_dir, self.is_component) - index_content = index_generator.generate() - - # Check for diffs (in both modes) + # Check component/pipeline README for diffs readme_has_diff = self._check_readme_file(readme_content) - category_has_diff = self._check_category_index(index_content) - has_diff = readme_has_diff or category_has_diff + has_diff = readme_has_diff + + # Generate subcategory index if we're in a subcategory + subcategory_content = None + if self.subcategory_dir: + subcategory_generator = SubcategoryIndexGenerator(self.subcategory_dir, self.is_component) + subcategory_content = subcategory_generator.generate() + subcategory_has_diff = self._check_subcategory_index(subcategory_content) + has_diff = has_diff or subcategory_has_diff + + # Generate category index content + category_generator = CategoryIndexGenerator(self.category_dir, self.is_component) + category_content = category_generator.generate() + category_has_diff = self._check_category_index(category_content) + has_diff = has_diff or category_has_diff if has_diff and fix: # Fix mode: write files self._write_readme_file(readme_content) - self._write_category_index(index_content) + if subcategory_content is not None: + self._write_subcategory_index(subcategory_content) + self._write_category_index(category_content) # Log metadata statistics logger.debug(f"README content length: {len(readme_content)} characters") diff --git a/scripts/generate_skeleton/generate_skeleton.py b/scripts/generate_skeleton/generate_skeleton.py index 79d8f07a3..686715b6e 100755 --- a/scripts/generate_skeleton/generate_skeleton.py +++ b/scripts/generate_skeleton/generate_skeleton.py @@ -7,8 +7,12 @@ Usage: python 
scripts/generate_skeleton/generate_skeleton.py --type=component \\ --category=data_processing --name=my_processor + python scripts/generate_skeleton/generate_skeleton.py --type=component \\ + --category=training --subcategory=sklearn_trainer --name=logistic_regression python scripts/generate_skeleton/generate_skeleton.py --type=pipeline \\ --category=ml_workflows --name=my_training_pipeline + python scripts/generate_skeleton/generate_skeleton.py --type=pipeline \\ + --category=training --subcategory=ml_workflows --name=batch_training """ import argparse @@ -95,6 +99,34 @@ def validate_category(category: str) -> None: raise ValueError("Category can only contain letters, numbers, and underscores") +def validate_subcategory(subcategory: str) -> None: + """Validate subcategory name for security. + + Args: + subcategory: The subcategory to validate + + Raises: + ValueError: If subcategory is invalid + """ + if not subcategory: + raise ValueError("Subcategory cannot be empty") + + # Check for directory traversal attempts + if "/" in subcategory or "\\" in subcategory: + raise ValueError("Subcategory cannot contain path separators (/, \\)") + + if "." in subcategory: + raise ValueError("Subcategory cannot contain dots (.)") + + # Enforce snake_case (no uppercase letters, allow underscores) + if subcategory != subcategory.lower(): + raise ValueError("Subcategory must be lowercase (snake_case)") + + # Allow letters, numbers, and underscores + if not subcategory.replace("_", "").isalnum() or not subcategory[0].isalpha(): + raise ValueError("Subcategory can only contain letters, numbers, and underscores") + + def get_existing_categories(skeleton_type: str) -> list[str]: """Get list of existing categories for the given skeleton type. 
@@ -111,6 +143,112 @@ def get_existing_categories(skeleton_type: str) -> list[str]: return [item.name for item in base_dir.iterdir() if item.is_dir() and not item.name.startswith((".", "_"))] +def build_skeleton_path(skeleton_type: str, category: str, name: str, subcategory: str | None = None) -> Path: + """Build the path for a skeleton directory. + + Args: + skeleton_type: Type of skeleton ('component' or 'pipeline') + category: Category name + name: Skeleton name + subcategory: Optional subcategory name + + Returns: + Path to the skeleton directory + """ + if subcategory: + return Path(f"{skeleton_type}s/{category}/{subcategory}/{name}") + return Path(f"{skeleton_type}s/{category}/{name}") + + +def generate_subcategory_files(subcategory: str) -> dict[str, str]: + """Generate files for a new subcategory (OWNERS and README.md). + + Args: + subcategory: Subcategory name + + Returns: + Dict of filename to content mappings + """ + env = _get_template_env() + + files = {} + + # Generate OWNERS for subcategory + template = env.get_template("OWNERS.j2") + files["OWNERS"] = template.render({"name": subcategory}) + + # Generate a simple README for subcategory + readme_content = f"""# {subcategory.replace("_", " ").title()} + +This subcategory contains related assets. + +## Overview + +TODO: Add description of what this subcategory contains. + +## Assets + +TODO: List components/pipelines in this subcategory. + +## Shared Utilities + +If this subcategory has a `shared/` directory, document the shared utilities here. +""" + files["README.md"] = readme_content + + return files + + +def ensure_subcategory_exists(skeleton_type: str, category: str, subcategory: str, create_shared: bool = False) -> Path: + """Ensure subcategory directory exists with required files. + + Creates the subcategory directory and its OWNERS/README files if they don't exist. 
+ + Args: + skeleton_type: Type of skeleton ('component' or 'pipeline') + category: Category name + subcategory: Subcategory name + create_shared: Whether to create a shared/ package directory + + Returns: + Path to the subcategory directory + """ + subcategory_dir = Path(f"{skeleton_type}s/{category}/{subcategory}") + + subcategory_dir.mkdir(parents=True, exist_ok=True) + + # Create any missing subcategory-level files + subcategory_files = generate_subcategory_files(subcategory) + for filename, content in subcategory_files.items(): + file_path = subcategory_dir / filename + if not file_path.exists(): + file_path.write_text(content) + + # Create __init__.py for the subcategory package + init_path = subcategory_dir / "__init__.py" + if not init_path.exists(): + init_path.write_text(f'"""Assets in the {subcategory} subcategory."""\n') + + # Optionally create shared/ package + if create_shared: + shared_dir = subcategory_dir / "shared" + shared_dir.mkdir(exist_ok=True) + shared_init = shared_dir / "__init__.py" + if not shared_init.exists(): + shared_init.write_text(f'"""Shared utilities for the {subcategory} subcategory."""\n') + # Create a placeholder utility file + utils_file = shared_dir / f"{subcategory}_utils.py" + if not utils_file.exists(): + utils_file.write_text( + f'"""Shared utility functions for the {subcategory} subcategory."""\n' + "\n" + "\n" + "# TODO: Add shared utility functions, classes, or constants here.\n" + ) + + return subcategory_dir + + def generate_core_files(skeleton_type: str, category: str, name: str) -> dict[str, str]: """Generate core files for skeleton based on type. @@ -153,6 +291,19 @@ def generate_core_files(skeleton_type: str, category: str, name: str) -> dict[st template = env.get_template("OWNERS.j2") files["OWNERS"] = template.render(context) + # Generate placeholder README.md + title = name.replace("_", " ").title() + files["README.md"] = f"""# {title} + +## Overview + +TODO: Add description of this {skeleton_type}. 
+ +## Usage + +TODO: Add usage examples. +""" + return files @@ -192,20 +343,33 @@ def generate_test_files(skeleton_type: str, name: str) -> dict[str, str]: return files -def create_skeleton(skeleton_type: str, category: str, name: str, create_tests: bool = True): +def create_skeleton( + skeleton_type: str, + category: str, + name: str, + subcategory: str | None = None, + create_tests: bool = True, + create_shared: bool = False, +): """Create skeleton files for a component or pipeline. Args: skeleton_type: Type of skeleton ('component' or 'pipeline') category: Category name (e.g., 'data_processing', 'training') name: Skeleton name (e.g., 'my_processor') + subcategory: Optional subcategory name (e.g., 'sklearn_trainer') create_tests: Whether to create test files (default: True) + create_shared: Whether to create shared/ package in subcategory (default: False) Returns: Path to created directory """ + # Ensure subcategory exists with required files if specified + if subcategory: + ensure_subcategory_exists(skeleton_type, category, subcategory, create_shared) + # Create directory structure - skeleton_dir = Path(f"{skeleton_type}s/{category}/{name}") + skeleton_dir = build_skeleton_path(skeleton_type, category, name, subcategory) skeleton_dir.mkdir(parents=True, exist_ok=True) tests_dir = skeleton_dir / "tests" @@ -226,13 +390,14 @@ def create_skeleton(skeleton_type: str, category: str, name: str, create_tests: return skeleton_dir -def create_tests_only(skeleton_type: str, category: str, name: str): +def create_tests_only(skeleton_type: str, category: str, name: str, subcategory: str | None = None): """Create test files for an existing skeleton. 
Args: skeleton_type: Type of skeleton ('component' or 'pipeline') category: Category name (e.g., 'data_processing', 'training') name: Skeleton name (e.g., 'my_processor') + subcategory: Optional subcategory name (e.g., 'sklearn_trainer') Returns: Path to created tests directory @@ -240,27 +405,42 @@ def create_tests_only(skeleton_type: str, category: str, name: str): Raises: ValueError: If the skeleton directory or required files don't exist """ - skeleton_dir = Path(f"{skeleton_type}s/{category}/{name}") + skeleton_dir = build_skeleton_path(skeleton_type, category, name, subcategory) main_file = skeleton_dir / f"{skeleton_type}.py" + # Build the command hint with subcategory if provided + subcategory_arg = f" --subcategory={subcategory}" if subcategory else "" + make_subcategory_arg = f" SUBCATEGORY={subcategory}" if subcategory else "" + # Check if skeleton directory exists if not skeleton_dir.exists(): + location = f"subcategory '{subcategory}' of category '{category}'" if subcategory else f"category '{category}'" + script_cmd = ( + f"python scripts/generate_skeleton/generate_skeleton.py " + f"--type={skeleton_type} --category={category}{subcategory_arg} --name={name}" + ) + make_cmd = f"make {skeleton_type} CATEGORY={category}{make_subcategory_arg} NAME={name}" raise ValueError( f""" -Error: {skeleton_type.title()} '{name}' does not exist in category '{category}'. +Error: {skeleton_type.title()} '{name}' does not exist in {location}. 
Expected directory: {skeleton_dir} To create this {skeleton_type} first, run: - python scripts/generate_skeleton/generate_skeleton.py --type={skeleton_type} --category={category} --name={name} + {script_cmd} Or use the Makefile: - make {skeleton_type} CATEGORY={category} NAME={name} + {make_cmd} """.strip() ) # Check if the main skeleton file exists if not main_file.exists(): + script_cmd = ( + f"python scripts/generate_skeleton/generate_skeleton.py " + f"--type={skeleton_type} --category={category}{subcategory_arg} --name={name}" + ) + make_cmd = f"make {skeleton_type} CATEGORY={category}{make_subcategory_arg} NAME={name}" raise ValueError( f""" Error: {skeleton_type.title()} '{name}' directory exists but missing main file. @@ -268,10 +448,10 @@ def create_tests_only(skeleton_type: str, category: str, name: str): Expected file: {main_file} The {skeleton_type} directory exists but appears incomplete. Please recreate it: - python scripts/generate_skeleton/generate_skeleton.py --type={skeleton_type} --category={category} --name={name} + {script_cmd} Or use the Makefile: - make {skeleton_type} CATEGORY={category} NAME={name} + {make_cmd} """.strip() ) @@ -301,6 +481,11 @@ def main(): %(prog)s --type=component --category=data_processing --name=my_processor %(prog)s --type=pipeline --category=ml_workflows --name=training_pipeline %(prog)s --type=component --category=training --name=bert_trainer + +With subcategory: + %(prog)s --type=component --category=training --subcategory=sklearn_trainer --name=logistic_regression + %(prog)s --type=component --category=training --subcategory=sklearn_trainer --name=random_forest --create-shared + %(prog)s --type=pipeline --category=training --subcategory=ml_workflows --name=batch_training """, ) @@ -312,6 +497,13 @@ def main(): help="Category for the component/pipeline (e.g., 'data_processing', 'training', 'ml_workflows')", ) + parser.add_argument( + "--subcategory", + required=False, + default=None, + help="Optional subcategory 
within the category (e.g., 'sklearn_trainer')", + ) + parser.add_argument( "--name", required=True, help="Name of the component/pipeline (use snake_case, e.g., 'my_processor')" ) @@ -322,16 +514,29 @@ def main(): "--tests-only", action="store_true", help="Create only test files for an existing component/pipeline" ) + parser.add_argument( + "--create-shared", + action="store_true", + help="Create a shared/ package in the subcategory for common utilities (only with --subcategory)", + ) + args = parser.parse_args() # Validate input parameters using comprehensive validation try: validate_name(args.name) validate_category(args.category) + if args.subcategory: + validate_subcategory(args.subcategory) except ValueError as e: print(f"Error: {e}") sys.exit(1) + # Validate --create-shared requires --subcategory + if args.create_shared and not args.subcategory: + print("Error: --create-shared requires --subcategory to be specified") + sys.exit(1) + # Validate that category exists (for new skeletons) or provide helpful guidance if not args.tests_only: existing_categories = get_existing_categories(args.type) @@ -348,10 +553,13 @@ def main(): print("Error: --no-tests and --tests-only cannot be used together") sys.exit(1) + # Build command hints for output messages + make_subcategory_arg = f" SUBCATEGORY={args.subcategory}" if args.subcategory else "" + try: if args.tests_only: # Create tests for existing skeleton - created_dir = create_tests_only(args.type, args.category, args.name) + created_dir = create_tests_only(args.type, args.category, args.name, args.subcategory) print(f"✅ Test files created successfully at: {created_dir}") print(f""" Next steps: @@ -360,14 +568,16 @@ def main(): """) else: # Check if directory already exists for new skeleton - target_dir = Path(f"{args.type}s/{args.category}/{args.name}") + target_dir = build_skeleton_path(args.type, args.category, args.name, args.subcategory) if target_dir.exists(): print(f"Error: Directory '{target_dir}' already 
exists.") sys.exit(1) # Create new skeleton create_tests = not args.no_tests - created_dir = create_skeleton(args.type, args.category, args.name, create_tests) + created_dir = create_skeleton( + args.type, args.category, args.name, args.subcategory, create_tests, args.create_shared + ) print(f"✅ {args.type.title()} skeleton created successfully at: {created_dir}") next_steps = f""" @@ -376,18 +586,33 @@ def main(): 2. Implement the logic in {created_dir}/{args.type}.py 3. Update {created_dir}/metadata.yaml with correct dependencies and tags""" + if args.subcategory: + subcategory_dir = Path(f"{args.type}s/{args.category}/{args.subcategory}") + next_steps += f""" +4. Update {subcategory_dir}/OWNERS with subcategory owners +5. Update {subcategory_dir}/README.md with subcategory documentation""" + step_offset = 6 + else: + step_offset = 4 + if create_tests: - readme_cmd = f"make readme TYPE={args.type} CATEGORY={args.category} NAME={args.name}" + readme_cmd = ( + f"make readme TYPE={args.type} CATEGORY={args.category}{make_subcategory_arg} NAME={args.name}" + ) next_steps += f""" -4. Write comprehensive tests in {created_dir}/tests/ -5. Update {created_dir}/README.md with actual documentation or run: {readme_cmd} -6. Run tests: pytest {created_dir}/tests/ -v""" +{step_offset}. Write comprehensive tests in {created_dir}/tests/ +{step_offset + 1}. Update {created_dir}/README.md with actual documentation or run: {readme_cmd} +{step_offset + 2}. Run tests: pytest {created_dir}/tests/ -v""" else: - readme_cmd = f"make readme TYPE={args.type} CATEGORY={args.category} NAME={args.name}" - tests_cmd = f"make tests TYPE={args.type} CATEGORY={args.category} NAME={args.name}" + readme_cmd = ( + f"make readme TYPE={args.type} CATEGORY={args.category}{make_subcategory_arg} NAME={args.name}" + ) + tests_cmd = ( + f"make tests TYPE={args.type} CATEGORY={args.category}{make_subcategory_arg} NAME={args.name}" + ) next_steps += f""" -4. 
Update {created_dir}/README.md with actual documentation or run: {readme_cmd} -5. Add tests later with: {tests_cmd}""" +{step_offset}. Update {created_dir}/README.md with actual documentation or run: {readme_cmd} +{step_offset + 1}. Add tests later with: {tests_cmd}""" print(next_steps) diff --git a/scripts/generate_skeleton/tests/test_generate_skeleton.py b/scripts/generate_skeleton/tests/test_generate_skeleton.py index 02366f191..16a51307d 100644 --- a/scripts/generate_skeleton/tests/test_generate_skeleton.py +++ b/scripts/generate_skeleton/tests/test_generate_skeleton.py @@ -7,13 +7,17 @@ import pytest from ..generate_skeleton import ( + build_skeleton_path, create_skeleton, create_tests_only, + ensure_subcategory_exists, generate_core_files, + generate_subcategory_files, generate_test_files, get_existing_categories, validate_category, validate_name, + validate_subcategory, ) @@ -25,7 +29,7 @@ def test_generate_component_files(self): files = generate_core_files("component", "data_processing", "my_processor") # Check all expected files are generated - expected_files = ["__init__.py", "component.py", "metadata.yaml", "OWNERS"] + expected_files = ["__init__.py", "component.py", "metadata.yaml", "OWNERS", "README.md"] assert set(files.keys()) == set(expected_files) # Check content contains expected elements @@ -39,7 +43,7 @@ def test_generate_pipeline_files(self): files = generate_core_files("pipeline", "training", "my_pipeline") # Check all expected files are generated - expected_files = ["__init__.py", "pipeline.py", "metadata.yaml", "OWNERS"] + expected_files = ["__init__.py", "pipeline.py", "metadata.yaml", "OWNERS", "README.md"] assert set(files.keys()) == set(expected_files) # Check content contains expected elements @@ -314,3 +318,327 @@ def test_category_with_underscores(self): # Check that category underscores are converted to hyphens in tags assert "- data-processing" in files["metadata.yaml"] + + +class TestSubcategoryValidation: + """Test subcategory 
validation functions.""" + + def test_validate_subcategory_valid_cases(self): + """Test validate_subcategory with valid subcategory names.""" + valid_subcategories = ["sklearn_trainer", "pytorch_models", "utils", "model_v2"] + for subcategory in valid_subcategories: + validate_subcategory(subcategory) # Should not raise + + @pytest.mark.parametrize( + "invalid_subcategory", + [ + "", # Empty + "../malicious", # Path traversal + "path/traversal", # Forward slash + "windows\\path", # Backslash + "subcategory.with.dots", # Dots + "SklearnTrainer", # Uppercase + "CamelCase", # Mixed case + "subcategory!", # Invalid character + "subcategory-with-hyphens", # Hyphens + "subcategory with spaces", # Spaces + ], + ) + def test_validate_subcategory_invalid_cases(self, invalid_subcategory): + """Test validate_subcategory raises ValueError for invalid names.""" + with pytest.raises(ValueError): + validate_subcategory(invalid_subcategory) + + +class TestBuildSkeletonPath: + """Test the build_skeleton_path helper function.""" + + def test_path_without_subcategory(self): + """Test building path without subcategory.""" + path = build_skeleton_path("component", "training", "my_trainer") + assert path == Path("components/training/my_trainer") + + def test_path_with_subcategory(self): + """Test building path with subcategory.""" + path = build_skeleton_path("component", "training", "logistic_regression", "sklearn_trainer") + assert path == Path("components/training/sklearn_trainer/logistic_regression") + + def test_pipeline_path_without_subcategory(self): + """Test building pipeline path without subcategory.""" + path = build_skeleton_path("pipeline", "ml_workflows", "training_pipeline") + assert path == Path("pipelines/ml_workflows/training_pipeline") + + def test_pipeline_path_with_subcategory(self): + """Test building pipeline path with subcategory.""" + path = build_skeleton_path("pipeline", "training", "batch_training", "ml_workflows") + assert path == 
Path("pipelines/training/ml_workflows/batch_training") + + +class TestGenerateSubcategoryFiles: + """Test subcategory file generation.""" + + def test_generates_required_files(self): + """Test that OWNERS and README.md are generated for subcategory.""" + files = generate_subcategory_files("sklearn_trainer") + + assert "OWNERS" in files + assert "README.md" in files + + def test_readme_contains_subcategory_name(self): + """Test that README contains the subcategory name.""" + files = generate_subcategory_files("sklearn_trainer") + + assert "Sklearn Trainer" in files["README.md"] + + +class TestEnsureSubcategoryExists: + """Test the ensure_subcategory_exists function.""" + + def test_creates_new_subcategory(self): + """Test creating a new subcategory directory with required files.""" + with tempfile.TemporaryDirectory() as temp_dir: + original_cwd = Path.cwd() + os.chdir(temp_dir) + + try: + # Create category directory first + Path("components/training").mkdir(parents=True) + + # Ensure subcategory exists + subcategory_dir = ensure_subcategory_exists("component", "training", "sklearn_trainer") + + # Check directory was created + assert subcategory_dir.exists() + assert (subcategory_dir / "OWNERS").exists() + assert (subcategory_dir / "README.md").exists() + assert (subcategory_dir / "__init__.py").exists() + + finally: + os.chdir(original_cwd) + + def test_creates_shared_package_when_requested(self): + """Test creating shared package in subcategory.""" + with tempfile.TemporaryDirectory() as temp_dir: + original_cwd = Path.cwd() + os.chdir(temp_dir) + + try: + Path("components/training").mkdir(parents=True) + + subcategory_dir = ensure_subcategory_exists( + "component", "training", "sklearn_trainer", create_shared=True + ) + + # Check shared directory was created + shared_dir = subcategory_dir / "shared" + assert shared_dir.exists() + assert (shared_dir / "__init__.py").exists() + + finally: + os.chdir(original_cwd) + + def 
test_does_not_overwrite_existing_files(self): + """Test that existing subcategory files are not overwritten.""" + with tempfile.TemporaryDirectory() as temp_dir: + original_cwd = Path.cwd() + os.chdir(temp_dir) + + try: + # Create subcategory with custom OWNERS + subcategory_dir = Path("components/training/sklearn_trainer") + subcategory_dir.mkdir(parents=True) + custom_owners = "approvers:\n - custom_owner\n" + (subcategory_dir / "OWNERS").write_text(custom_owners) + + # Call ensure_subcategory_exists + ensure_subcategory_exists("component", "training", "sklearn_trainer") + + # Verify OWNERS was not overwritten + assert (subcategory_dir / "OWNERS").read_text() == custom_owners + + finally: + os.chdir(original_cwd) + + def test_creates_pipeline_subcategory(self): + """Test creating a new pipeline subcategory directory with required files.""" + with tempfile.TemporaryDirectory() as temp_dir: + original_cwd = Path.cwd() + os.chdir(temp_dir) + + try: + Path("pipelines/training").mkdir(parents=True) + + subcategory_dir = ensure_subcategory_exists("pipeline", "training", "ml_workflows") + + assert subcategory_dir.exists() + assert subcategory_dir == Path("pipelines/training/ml_workflows") + assert (subcategory_dir / "OWNERS").exists() + assert (subcategory_dir / "README.md").exists() + assert (subcategory_dir / "__init__.py").exists() + + finally: + os.chdir(original_cwd) + + +class TestCreateSkeletonWithSubcategory: + """Test create_skeleton with subcategory support.""" + + def test_create_component_with_subcategory(self): + """Test creating a component within a subcategory.""" + with tempfile.TemporaryDirectory() as temp_dir: + original_cwd = Path.cwd() + os.chdir(temp_dir) + + try: + # Create category directory + Path("components/training").mkdir(parents=True) + + # Create skeleton with subcategory + result_dir = create_skeleton( + "component", "training", "logistic_regression", subcategory="sklearn_trainer", create_tests=True + ) + + # Check component directory 
structure + assert result_dir.exists() + assert result_dir == Path("components/training/sklearn_trainer/logistic_regression") + assert (result_dir / "component.py").exists() + assert (result_dir / "metadata.yaml").exists() + assert (result_dir / "OWNERS").exists() + assert (result_dir / "tests").exists() + + # Check subcategory files were created + subcategory_dir = Path("components/training/sklearn_trainer") + assert (subcategory_dir / "OWNERS").exists() + assert (subcategory_dir / "README.md").exists() + assert (subcategory_dir / "__init__.py").exists() + + finally: + os.chdir(original_cwd) + + def test_create_component_with_subcategory_and_shared(self): + """Test creating a component with subcategory and shared package.""" + with tempfile.TemporaryDirectory() as temp_dir: + original_cwd = Path.cwd() + os.chdir(temp_dir) + + try: + Path("components/training").mkdir(parents=True) + + create_skeleton( + "component", + "training", + "logistic_regression", + subcategory="sklearn_trainer", + create_tests=False, + create_shared=True, + ) + + # Check shared package was created + shared_dir = Path("components/training/sklearn_trainer/shared") + assert shared_dir.exists() + assert (shared_dir / "__init__.py").exists() + + finally: + os.chdir(original_cwd) + + def test_create_pipeline_with_subcategory(self): + """Test creating a pipeline within a subcategory.""" + with tempfile.TemporaryDirectory() as temp_dir: + original_cwd = Path.cwd() + os.chdir(temp_dir) + + try: + Path("pipelines/training").mkdir(parents=True) + + result_dir = create_skeleton( + "pipeline", "training", "batch_training", subcategory="ml_workflows", create_tests=True + ) + + # Check pipeline directory structure + assert result_dir.exists() + assert result_dir == Path("pipelines/training/ml_workflows/batch_training") + assert (result_dir / "pipeline.py").exists() + assert (result_dir / "metadata.yaml").exists() + assert (result_dir / "OWNERS").exists() + assert (result_dir / "tests").exists() + + # 
Check subcategory files were created + subcategory_dir = Path("pipelines/training/ml_workflows") + assert (subcategory_dir / "OWNERS").exists() + assert (subcategory_dir / "README.md").exists() + assert (subcategory_dir / "__init__.py").exists() + + finally: + os.chdir(original_cwd) + + +class TestCreateTestsOnlyWithSubcategory: + """Test create_tests_only with subcategory support.""" + + def test_create_tests_for_subcategory_component(self): + """Test creating tests for a component in a subcategory.""" + with tempfile.TemporaryDirectory() as temp_dir: + original_cwd = Path.cwd() + os.chdir(temp_dir) + + try: + # First create skeleton without tests + Path("components/training").mkdir(parents=True) + create_skeleton( + "component", "training", "logistic_regression", subcategory="sklearn_trainer", create_tests=False + ) + + # Now create tests + tests_dir = create_tests_only( + "component", "training", "logistic_regression", subcategory="sklearn_trainer" + ) + + # Check tests were created + assert tests_dir.exists() + assert (tests_dir / "__init__.py").exists() + assert (tests_dir / "test_component_unit.py").exists() + assert (tests_dir / "test_component_local.py").exists() + + finally: + os.chdir(original_cwd) + + def test_create_tests_only_missing_subcategory_component(self): + """Test error when trying to create tests for non-existent subcategory component.""" + with tempfile.TemporaryDirectory() as temp_dir: + original_cwd = Path.cwd() + os.chdir(temp_dir) + + try: + with pytest.raises(ValueError) as exc_info: + create_tests_only("component", "training", "nonexistent", subcategory="sklearn_trainer") + + assert "does not exist" in str(exc_info.value) + assert "sklearn_trainer" in str(exc_info.value) + + finally: + os.chdir(original_cwd) + + def test_create_tests_for_subcategory_pipeline(self): + """Test creating tests for a pipeline in a subcategory.""" + with tempfile.TemporaryDirectory() as temp_dir: + original_cwd = Path.cwd() + os.chdir(temp_dir) + + try: + # 
First create pipeline skeleton without tests + Path("pipelines/training").mkdir(parents=True) + create_skeleton( + "pipeline", "training", "batch_training", subcategory="ml_workflows", create_tests=False + ) + + # Now create tests + tests_dir = create_tests_only("pipeline", "training", "batch_training", subcategory="ml_workflows") + + # Check tests were created + assert tests_dir.exists() + assert (tests_dir / "__init__.py").exists() + assert (tests_dir / "test_pipeline_unit.py").exists() + assert (tests_dir / "test_pipeline_local.py").exists() + + finally: + os.chdir(original_cwd) diff --git a/scripts/lib/discovery.py b/scripts/lib/discovery.py index 05d4f5315..bc270d220 100644 --- a/scripts/lib/discovery.py +++ b/scripts/lib/discovery.py @@ -58,7 +58,8 @@ def discover_assets(base_dir: Path, asset_type: str) -> list[dict[str, Any]]: asset_type: Either 'component' or 'pipeline' Returns: - List of dicts with 'path', 'category', 'name', and 'module_path' keys + List of dicts with 'path', 'category', 'subcategory', 'name', and 'module_path' keys. + 'subcategory' is None for direct category assets. 
""" assets = [] filename = f"{asset_type}.py" @@ -70,20 +71,39 @@ def discover_assets(base_dir: Path, asset_type: str) -> list[dict[str, Any]]: if not category_dir.is_dir() or category_dir.name.startswith(("_", ".")): continue - for asset_dir in category_dir.iterdir(): - if not asset_dir.is_dir() or asset_dir.name.startswith(("_", ".")): + for item_dir in category_dir.iterdir(): + if not item_dir.is_dir() or item_dir.name.startswith(("_", ".")): continue - asset_file = asset_dir / filename + # Check if this is a direct asset + asset_file = item_dir / filename if asset_file.exists(): assets.append( { "path": asset_file, "category": category_dir.name, - "name": asset_dir.name, + "subcategory": None, + "name": item_dir.name, "module_path": str(asset_file), } ) + else: + # This might be a subcategory + for subitem_dir in item_dir.iterdir(): + if not subitem_dir.is_dir() or subitem_dir.name.startswith(("_", ".")): + continue + + sub_asset_file = subitem_dir / filename + if sub_asset_file.exists(): + assets.append( + { + "path": sub_asset_file, + "category": category_dir.name, + "subcategory": item_dir.name, + "name": subitem_dir.name, + "module_path": str(sub_asset_file), + } + ) return assets @@ -96,7 +116,8 @@ def find_assets_with_metadata(asset_type: str, base_path: Path | None = None) -> base_path: Optional base path, defaults to current directory Returns: - List of asset paths like 'components/training/my_component' + List of asset paths like 'components/training/my_component' or + 'components/training/sklearn_trainer/logistic_regression' """ assets = [] if base_path is None: @@ -110,12 +131,21 @@ def find_assets_with_metadata(asset_type: str, base_path: Path | None = None) -> if not category.is_dir() or category.name.startswith((".", "_")): continue - for asset in category.iterdir(): - if not asset.is_dir() or asset.name.startswith((".", "_")): + for item in category.iterdir(): + if not item.is_dir() or item.name.startswith((".", "_")): continue - if (asset / 
"metadata.yaml").exists(): - assets.append(f"{asset_type}/{category.name}/{asset.name}") + # Check if this is a direct asset + if (item / "metadata.yaml").exists(): + assets.append(f"{asset_type}/{category.name}/{item.name}") + else: + # This might be a subcategory + for subitem in item.iterdir(): + if not subitem.is_dir() or subitem.name.startswith((".", "_")): + continue + + if (subitem / "metadata.yaml").exists(): + assets.append(f"{asset_type}/{category.name}/{item.name}/{subitem.name}") return assets @@ -217,15 +247,47 @@ def resolve_pipeline_path(repo_root: Path, raw: str) -> Path: def _build_asset_dict_from_repo_path( repo_root: Path, asset_root: str, asset_file: Path, expected_filename: str ) -> dict[str, Any]: + """Build asset metadata dictionary from a file path. + + Args: + repo_root: Repository root directory. + asset_root: Either 'components' or 'pipelines'. + asset_file: Path to the asset file (component.py or pipeline.py). + expected_filename: Expected filename (component.py or pipeline.py). + + Returns: + Dictionary containing path, category, subcategory, name, and module_path. + 'subcategory' is None for direct category assets. + + Raises: + ValueError: If the path structure is invalid. 
+ """ root = (repo_root / asset_root).resolve() resolved = asset_file.resolve() if resolved.name != expected_filename: raise ValueError(f"Expected {expected_filename} under {asset_root}: {asset_file}") rel = resolved.relative_to(root) - if len(rel.parts) < 3: - raise ValueError(f"Path must be {asset_root}///{expected_filename}: {asset_file}") - category, name = rel.parts[0], rel.parts[1] - return {"path": asset_file, "category": category, "name": name, "module_path": str(asset_file)} + + if len(rel.parts) == 3: + # Direct category asset: category/name/filename + category, name = rel.parts[0], rel.parts[1] + subcategory = None + elif len(rel.parts) == 4: + # Subcategory asset: category/subcategory/name/filename + category, subcategory, name = rel.parts[0], rel.parts[1], rel.parts[2] + else: + raise ValueError( + f"Path must be {asset_root}///{expected_filename} or " + f"{asset_root}////{expected_filename}: {asset_file}" + ) + + return { + "path": asset_file, + "category": category, + "subcategory": subcategory, + "name": name, + "module_path": str(asset_file), + } def build_component_asset(repo_root: Path, component_file: Path) -> dict[str, Any]: @@ -236,7 +298,8 @@ def build_component_asset(repo_root: Path, component_file: Path) -> dict[str, An component_file: Path to the component.py file. Returns: - Dictionary containing path, category, name, and module_path. + Dictionary containing path, category, subcategory, name, and module_path. + 'subcategory' is None for direct category components. """ return _build_asset_dict_from_repo_path(repo_root, "components", component_file, _COMPONENT_FILENAME) @@ -249,6 +312,7 @@ def build_pipeline_asset(repo_root: Path, pipeline_file: Path) -> dict[str, Any] pipeline_file: Path to the pipeline.py file. Returns: - Dictionary containing path, category, name, and module_path. + Dictionary containing path, category, subcategory, name, and module_path. + 'subcategory' is None for direct category pipelines. 
""" return _build_asset_dict_from_repo_path(repo_root, "pipelines", pipeline_file, _PIPELINE_FILENAME) diff --git a/scripts/sync_packages.py b/scripts/sync_packages.py new file mode 100644 index 000000000..318f0840a --- /dev/null +++ b/scripts/sync_packages.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python3 +"""Sync the packages list in pyproject.toml with discovered packages. + +Discovers packages under components/ and pipelines/, maps them to the +kfp_components.* namespace, and updates the static packages list in +pyproject.toml. + +Usage: + uv run scripts/sync_packages.py +""" + +import re +from pathlib import Path + +from setuptools import find_packages + +REPO_ROOT = Path(__file__).resolve().parent.parent + + +def discover_packages() -> list[str]: + """Discover packages and map to kfp_components namespace.""" + physical = find_packages( + where=str(REPO_ROOT), + include=["components", "components.*", "pipelines", "pipelines.*"], + exclude=["*.tests", "*.tests.*"], + ) + return sorted(["kfp_components"] + [f"kfp_components.{p}" for p in physical]) + + +def sync_packages() -> None: + """Update the packages list in pyproject.toml.""" + pyproject = REPO_ROOT / "pyproject.toml" + content = pyproject.read_text() + packages = discover_packages() + + lines = ",\n".join([f' "{p}"' for p in packages]) + new_block = f"packages = [\n{lines},\n]" + + updated = re.sub( + r"packages\s*=\s*\[.*?\]", + new_block, + content, + count=1, + flags=re.DOTALL, + ) + + if updated == content: + print("pyproject.toml packages already in sync.") + return + + pyproject.write_text(updated) + print(f"Synced {len(packages)} packages in pyproject.toml") + + +if __name__ == "__main__": + sync_packages() diff --git a/scripts/validate_metadata/validate_metadata.py b/scripts/validate_metadata/validate_metadata.py index b27132f55..9b7801784 100644 --- a/scripts/validate_metadata/validate_metadata.py +++ b/scripts/validate_metadata/validate_metadata.py @@ -62,20 +62,20 @@ def parse_args() -> 
argparse.Namespace: "--dir", type=validate_dir, required=True, - help="Path to the component or pipeline directory (must contain OWNERS and metadata.yaml files)", + help="Path to a component/pipeline directory or a subcategory containing multiple components/pipelines", ) return parser.parse_args() def validate_dir(path: str) -> Path: - """Validate that the input path is a valid directory and contains required files. + """Validate that the input path is a valid directory. Args: - path: String representation of the path to the component or pipeline directory. + path: String representation of the path to the component, pipeline, or subcategory directory. Returns: - Path: Validated Path object to the component or pipeline directory. + Path: Validated Path object to the directory. Raises: argparse.ArgumentTypeError: If validation fails. @@ -87,15 +87,38 @@ def validate_dir(path: str) -> Path: if not path.is_dir(): raise argparse.ArgumentTypeError(f"'{path}' is not a directory") - file_path = path / OWNERS - if not file_path.exists(): - raise argparse.ArgumentTypeError(f"{path} does not contain an {OWNERS} file") + return path - metadata_file = path / METADATA - if not metadata_file.exists(): - raise argparse.ArgumentTypeError(f"'{path}' does not contain a {METADATA} file") - return path +def find_dirs_to_validate(input_dir: Path) -> list[Path]: + """Find all directories that need validation (handles both components and subcategories). + + Args: + input_dir: Path to a component/pipeline directory or a subcategory directory. + + Returns: + List of Path objects to directories containing metadata.yaml files. + + Raises: + argparse.ArgumentTypeError: If no valid directories are found. 
+ """ + # Check if this directory has metadata.yaml + if (input_dir / METADATA).exists(): + return [input_dir] + + # This might be a subcategory - find subdirectories with metadata.yaml + dirs_to_validate = [] + for subdir in input_dir.iterdir(): + if subdir.is_dir() and (subdir / METADATA).exists(): + dirs_to_validate.append(subdir) + + if not dirs_to_validate: + raise argparse.ArgumentTypeError( + f"'{input_dir}' does not contain a {METADATA} file and has no subdirectories with one. " + f"If this is a subcategory, ensure it contains component directories." + ) + + return dirs_to_validate def validate_owners_file(filepath: Path): @@ -353,25 +376,41 @@ def main(): args = parse_args() input_dir = args.dir - # Validate OWNERS + # Find all directories to validate (handles subcategories) try: - owners_file_path = input_dir / OWNERS - validate_owners_file(owners_file_path) - except ValidationError as e: - logging.error("Validation Error: %s", e) + dirs_to_validate = find_dirs_to_validate(input_dir) + except argparse.ArgumentTypeError as e: + logging.error("Error: %s", e) sys.exit(1) - # Validate metadata.yaml - try: - metadata_file_path = input_dir / METADATA - validate_metadata_yaml(metadata_file_path) - except ValidationError as e: - logging.error("Validation Error: %s", e) - sys.exit(1) + has_errors = False + for dir_path in dirs_to_validate: + print(f"Validating {dir_path}...") + + # Validate OWNERS + try: + owners_file_path = dir_path / OWNERS + validate_owners_file(owners_file_path) + except ValidationError as e: + logging.error("Validation Error: %s", e) + has_errors = True + continue - # Validation successful. 
- logging.info(f"Validation successful for {input_dir}.") - print(f"Validation successful for {input_dir}.") + # Validate metadata.yaml + try: + metadata_file_path = dir_path / METADATA + validate_metadata_yaml(metadata_file_path) + except ValidationError as e: + logging.error("Validation Error: %s", e) + has_errors = True + continue + + # Validation successful for this directory. + logging.info(f"Validation successful for {dir_path}.") + print(f"Validation successful for {dir_path}.") + + if has_errors: + sys.exit(1) if __name__ == "__main__":