From 056ead45d5b6568e331f20ae5c4fb9112c0a6187 Mon Sep 17 00:00:00 2001 From: Vani Haripriya Mudadla Date: Wed, 4 Feb 2026 17:00:49 -0600 Subject: [PATCH 1/3] feat: Add Component Subcategories Support Signed-off-by: Vani Haripriya Mudadla --- .../validate_readmes/test-readme-check.sh | 29 +- AGENTS.md | 24 +- Makefile | 48 ++-- docs/CONTRIBUTING.md | 108 +++++++- .../generate_skeleton/generate_skeleton.py | 249 ++++++++++++++++-- .../tests/test_generate_skeleton.py | 248 +++++++++++++++++ scripts/lib/discovery.py | 96 +++++-- .../validate_metadata/validate_metadata.py | 89 +++++-- 8 files changed, 792 insertions(+), 99 deletions(-) diff --git a/.github/scripts/validate_readmes/test-readme-check.sh b/.github/scripts/validate_readmes/test-readme-check.sh index d138e0a04..aca5540e3 100755 --- a/.github/scripts/validate_readmes/test-readme-check.sh +++ b/.github/scripts/validate_readmes/test-readme-check.sh @@ -50,17 +50,38 @@ for target_dir in "${TARGET_DIRS[@]}"; do # Determine if it's a component or pipeline if [[ "$target_dir" == components/* ]]; then TYPE_FLAG="--component" + ASSET_FILE="component.py" elif [[ "$target_dir" == pipelines/* ]]; then TYPE_FLAG="--pipeline" + ASSET_FILE="pipeline.py" else print_error "Invalid directory: $target_dir. Must be in components/ or pipelines/" exit 2 fi - echo "Checking $target_dir..." - # Run in check mode (no --fix flag). Exit code 1 means diffs detected. - if ! uv run python -m scripts.generate_readme $TYPE_FLAG "$target_dir"; then - HAS_ERRORS=1 + # Check if this is a direct component/pipeline or a subcategory + if [[ -f "$target_dir/$ASSET_FILE" ]]; then + # Direct component/pipeline + echo "Checking $target_dir..." + if ! uv run python -m scripts.generate_readme $TYPE_FLAG "$target_dir"; then + HAS_ERRORS=1 + fi + else + # This might be a subcategory - find components inside + found_assets=0 + for subdir in "$target_dir"/*/; do + if [[ -f "$subdir$ASSET_FILE" ]]; then + found_assets=1 + echo "Checking $subdir..." + if ! 
uv run python -m scripts.generate_readme $TYPE_FLAG "${subdir%/}"; then + HAS_ERRORS=1 + fi + fi + done + if [[ $found_assets -eq 0 ]]; then + print_error "'$target_dir' does not contain a $ASSET_FILE file and has no subdirectories with one" + exit 2 + fi fi done diff --git a/AGENTS.md b/AGENTS.md index f6230ee4f..bb3fae2dc 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -34,10 +34,10 @@ Agents typically interact with this repository in three modes. Use the mode to d - **Reuse-first**: search `components//` and `pipelines//` for similar functionality; prefer extending/composing instead of duplicating. - **Create scaffolding**: use the Make targets in `Makefile`: - - `make component CATEGORY= NAME= [NO_TESTS]` - - `make pipeline CATEGORY= NAME= [NO_TESTS]` - - `make tests TYPE=component|pipeline CATEGORY= NAME=` - - `make readme TYPE=component|pipeline CATEGORY= NAME=` + - `make component CATEGORY= NAME= [SUBCATEGORY=] [NO_TESTS=true] [CREATE_SHARED=true]` + - `make pipeline CATEGORY= NAME= [NO_TESTS=true]` + - `make tests TYPE=component|pipeline CATEGORY= NAME= [SUBCATEGORY=]` + - `make readme TYPE=component|pipeline CATEGORY= NAME= [SUBCATEGORY=]` - **Validate like CI**: follow [`CONTRIBUTING.md` (Testing and Quality)](docs/CONTRIBUTING.md#testing-and-quality) and reference the workflows under `.github/workflows/` (example: [`.github/workflows/python-lint.yml`](.github/workflows/python-lint.yml)). - **New assets require approval**: for initial contributions (introducing a new component/pipeline to the catalog), @@ -66,7 +66,8 @@ Good places to look: #### Establish the target location and naming - Components live under `components///`. -- Pipelines live under `pipelines///`. +- Components can optionally use subcategories: `components////`. +- Pipelines live under `pipelines///` (subcategories not supported for pipelines). - Use `snake_case` directory names (per `CONTRIBUTING.md`). 
### Required files @@ -95,7 +96,7 @@ Process (expected for agents): Use this prompt pattern: "Search `components/` for similar functionality and reuse if possible. If a new component is needed, create it under -`components///` using `make component CATEGORY= NAME= [NO_TESTS]`, then implement +`components///` using `make component CATEGORY= NAME= [NO_TESTS=true]`, then implement `component.py` following repository lint rules (including import guard). Create `metadata.yaml` that conforms to the metadata schema defined in [`CONTRIBUTING.md`](docs/CONTRIBUTING.md#metadatayaml-schema) (required field order, fresh `lastVerified`). Generate/validate `README.md` using `make readme TYPE=component CATEGORY= NAME=`. Add unit tests using `.python_func()` and a @@ -103,12 +104,21 @@ LocalRunner test using `setup_and_teardown_subprocess_runner` (you can generate `make tests TYPE=component CATEGORY= NAME=`). Reference an existing component like `components/data_processing/yoda_data_processor/` for patterns." +#### Add a component in a subcategory + +Use this prompt pattern when creating related components that should share ownership or utilities: + +"Create a component in a subcategory using `make component CATEGORY= SUBCATEGORY= NAME=`. This +automatically creates the subcategory structure with OWNERS and README.md if it doesn't exist. For shared utilities, +add `CREATE_SHARED=true` to create a `shared/` package. Update the subcategory OWNERS and README.md with appropriate +maintainers and documentation. Follow the same component implementation patterns as above." + #### Add a new pipeline (reuse-first, compliant) Use this prompt pattern: "Search `pipelines/` for similar functionality and reuse if possible. 
If a new pipeline is needed, create it under -`pipelines///` using `make pipeline CATEGORY= NAME= [NO_TESTS]`, then implement +`pipelines///` using `make pipeline CATEGORY= NAME= [NO_TESTS=true]`, then implement `pipeline.py` following repository lint rules (including import guard). Create `metadata.yaml` that conforms to the metadata schema defined in [`CONTRIBUTING.md`](docs/CONTRIBUTING.md#metadatayaml-schema) (required field order, fresh `lastVerified`). Generate/validate `README.md` using `make readme TYPE=pipeline CATEGORY= NAME=`. Add tests diff --git a/Makefile b/Makefile index 923d85ecd..e7e75afe2 100644 --- a/Makefile +++ b/Makefile @@ -38,17 +38,19 @@ test-coverage: cd .github/scripts && $(PYTEST) */tests/ --cov=. --cov-report=term-missing -v $(ARGS) component: - @if [ -z "$(CATEGORY)" ]; then echo "Error: CATEGORY is required. Usage: make component CATEGORY=data_processing NAME=my_component [NO_TESTS]"; exit 1; fi - @if [ -z "$(NAME)" ]; then echo "Error: NAME is required. Usage: make component CATEGORY=data_processing NAME=my_component [NO_TESTS]"; exit 1; fi - @if [ -n "$(NO_TESTS)" ]; then \ - $(UVRUN) scripts/generate_skeleton/generate_skeleton.py --type=component --category=$(CATEGORY) --name=$(NAME) --no-tests; \ - else \ - $(UVRUN) scripts/generate_skeleton/generate_skeleton.py --type=component --category=$(CATEGORY) --name=$(NAME); \ - fi + @if [ -z "$(CATEGORY)" ]; then echo "Error: CATEGORY is required. Usage: make component CATEGORY=data_processing NAME=my_component [SUBCATEGORY=x] [NO_TESTS=true] [CREATE_SHARED=true]"; exit 1; fi + @if [ -z "$(NAME)" ]; then echo "Error: NAME is required. 
Usage: make component CATEGORY=data_processing NAME=my_component [SUBCATEGORY=x] [NO_TESTS=true] [CREATE_SHARED=true]"; exit 1; fi + @SUBCATEGORY_ARG=""; \ + if [ -n "$(SUBCATEGORY)" ]; then SUBCATEGORY_ARG="--subcategory=$(SUBCATEGORY)"; fi; \ + NO_TESTS_ARG=""; \ + if [ -n "$(NO_TESTS)" ]; then NO_TESTS_ARG="--no-tests"; fi; \ + CREATE_SHARED_ARG=""; \ + if [ -n "$(CREATE_SHARED)" ]; then CREATE_SHARED_ARG="--create-shared"; fi; \ + $(UVRUN) scripts/generate_skeleton/generate_skeleton.py --type=component --category=$(CATEGORY) --name=$(NAME) $$SUBCATEGORY_ARG $$NO_TESTS_ARG $$CREATE_SHARED_ARG pipeline: - @if [ -z "$(CATEGORY)" ]; then echo "Error: CATEGORY is required. Usage: make pipeline CATEGORY=training NAME=my_pipeline [NO_TESTS]"; exit 1; fi - @if [ -z "$(NAME)" ]; then echo "Error: NAME is required. Usage: make pipeline CATEGORY=training NAME=my_pipeline [NO_TESTS]"; exit 1; fi + @if [ -z "$(CATEGORY)" ]; then echo "Error: CATEGORY is required. Usage: make pipeline CATEGORY=training NAME=my_pipeline [NO_TESTS=true]"; exit 1; fi + @if [ -z "$(NAME)" ]; then echo "Error: NAME is required. Usage: make pipeline CATEGORY=training NAME=my_pipeline [NO_TESTS=true]"; exit 1; fi @if [ -n "$(NO_TESTS)" ]; then \ $(UVRUN) scripts/generate_skeleton/generate_skeleton.py --type=pipeline --category=$(CATEGORY) --name=$(NAME) --no-tests; \ else \ @@ -56,17 +58,29 @@ pipeline: fi tests: - @if [ -z "$(TYPE)" ]; then echo "Error: TYPE is required. Usage: make tests TYPE=component|pipeline CATEGORY=data_processing NAME=my_component"; exit 1; fi - @if [ -z "$(CATEGORY)" ]; then echo "Error: CATEGORY is required. Usage: make tests TYPE=component|pipeline CATEGORY=data_processing NAME=my_component"; exit 1; fi - @if [ -z "$(NAME)" ]; then echo "Error: NAME is required. 
Usage: make tests TYPE=component|pipeline CATEGORY=data_processing NAME=my_component"; exit 1; fi - $(UVRUN) scripts/generate_skeleton/generate_skeleton.py --type=$(TYPE) --category=$(CATEGORY) --name=$(NAME) --tests-only + @if [ -z "$(TYPE)" ]; then echo "Error: TYPE is required. Usage: make tests TYPE=component|pipeline CATEGORY=data_processing NAME=my_component [SUBCATEGORY=x (component only)]"; exit 1; fi + @if [ -z "$(CATEGORY)" ]; then echo "Error: CATEGORY is required. Usage: make tests TYPE=component|pipeline CATEGORY=data_processing NAME=my_component [SUBCATEGORY=x (component only)]"; exit 1; fi + @if [ -z "$(NAME)" ]; then echo "Error: NAME is required. Usage: make tests TYPE=component|pipeline CATEGORY=data_processing NAME=my_component [SUBCATEGORY=x (component only)]"; exit 1; fi + @if [ "$(TYPE)" = "component" ]; then \ + SUBCATEGORY_ARG=""; \ + if [ -n "$(SUBCATEGORY)" ]; then SUBCATEGORY_ARG="--subcategory=$(SUBCATEGORY)"; fi; \ + $(UVRUN) scripts/generate_skeleton/generate_skeleton.py --type=$(TYPE) --category=$(CATEGORY) --name=$(NAME) $$SUBCATEGORY_ARG --tests-only; \ + elif [ "$(TYPE)" = "pipeline" ]; then \ + $(UVRUN) scripts/generate_skeleton/generate_skeleton.py --type=$(TYPE) --category=$(CATEGORY) --name=$(NAME) --tests-only; \ + else \ + echo "Error: TYPE must be either 'component' or 'pipeline'"; exit 1; \ + fi readme: - @if [ -z "$(TYPE)" ]; then echo "Error: TYPE is required. Usage: make readme TYPE=component|pipeline CATEGORY=data_processing NAME=my_component"; exit 1; fi - @if [ -z "$(CATEGORY)" ]; then echo "Error: CATEGORY is required. Usage: make readme TYPE=component|pipeline CATEGORY=data_processing NAME=my_component"; exit 1; fi - @if [ -z "$(NAME)" ]; then echo "Error: NAME is required. Usage: make readme TYPE=component|pipeline CATEGORY=data_processing NAME=my_component"; exit 1; fi + @if [ -z "$(TYPE)" ]; then echo "Error: TYPE is required. 
Usage: make readme TYPE=component|pipeline CATEGORY=data_processing NAME=my_component [SUBCATEGORY=x (component only)]"; exit 1; fi + @if [ -z "$(CATEGORY)" ]; then echo "Error: CATEGORY is required. Usage: make readme TYPE=component|pipeline CATEGORY=data_processing NAME=my_component [SUBCATEGORY=x (component only)]"; exit 1; fi + @if [ -z "$(NAME)" ]; then echo "Error: NAME is required. Usage: make readme TYPE=component|pipeline CATEGORY=data_processing NAME=my_component [SUBCATEGORY=x (component only)]"; exit 1; fi @if [ "$(TYPE)" = "component" ]; then \ - $(UVRUN) -m scripts.generate_readme --component $(TYPE)s/$(CATEGORY)/$(NAME) --fix; \ + if [ -n "$(SUBCATEGORY)" ]; then \ + $(UVRUN) -m scripts.generate_readme --component $(TYPE)s/$(CATEGORY)/$(SUBCATEGORY)/$(NAME) --fix; \ + else \ + $(UVRUN) -m scripts.generate_readme --component $(TYPE)s/$(CATEGORY)/$(NAME) --fix; \ + fi; \ elif [ "$(TYPE)" = "pipeline" ]; then \ $(UVRUN) -m scripts.generate_readme --pipeline $(TYPE)s/$(CATEGORY)/$(NAME) --fix; \ else \ diff --git a/docs/CONTRIBUTING.md b/docs/CONTRIBUTING.md index f6345be3e..7aff80c89 100644 --- a/docs/CONTRIBUTING.md +++ b/docs/CONTRIBUTING.md @@ -128,6 +128,32 @@ Components must be organized by category under `components//`. Pipelines must be organized by category under `pipelines//`. +### Subcategories (Components Only) + +For better organization of related components, you can create **subcategories** within a category. 
+Subcategories provide: + +- **Logical grouping** of related components (e.g., all sklearn-based trainers) +- **Dedicated ownership** via subcategory-level OWNERS file +- **Shared utilities** via an optional `shared/` package + +```text +components/// +├── __init__.py # Subcategory package +├── OWNERS # Subcategory maintainers +├── README.md # Subcategory documentation +├── shared/ # Optional shared utilities package +│ ├── __init__.py +│ └── training_utils.py # Common code for components in this subcategory +└── / # Individual component + ├── __init__.py + ├── component.py + ├── metadata.yaml + ├── OWNERS + ├── README.md + └── tests/ +``` + ## Naming Conventions - **Components and pipelines** use `snake_case` (e.g., `data_preprocessing`, `model_trainer`) @@ -275,14 +301,20 @@ For rapid development, this repository provides convenient make commands that au The following make targets simplify the development workflow: -| Command | Description | -|--------------------------------------------------------|---------------------------------------------------| -| `make component CATEGORY= NAME= [NO_TESTS]` | Create a new component skeleton | -| `make pipeline CATEGORY= NAME= [NO_TESTS]` | Create a new pipeline skeleton | -| `make tests TYPE= CATEGORY= NAME=` | Add tests to existing component/pipeline | -| `make readme TYPE= CATEGORY= NAME=` | Generate/update README from code | -| `make format` | Auto-fix code formatting and linting issues | -| `make lint` | Check code quality (formatting, linting, imports) | +| Command | Description | +|---------|-------------| +| `make component CATEGORY= NAME=` | Create a new component skeleton | +| `make pipeline CATEGORY= NAME=` | Create a new pipeline skeleton | +| `make tests TYPE= CATEGORY= NAME=` | Add tests to existing component/pipeline | +| `make readme TYPE= CATEGORY= NAME=` | Generate/update README from code | +| `make format` | Auto-fix code formatting and linting issues | +| `make lint` | Check code quality (formatting, 
linting, imports) | + +**Optional flags** (append to the commands above): + +- `SUBCATEGORY=` - Use a subcategory (components only; works with `component`, `tests`, and `readme` targets) +- `NO_TESTS=true` - Skip test file generation (`component` and `pipeline` targets) +- `CREATE_SHARED=true` - Create shared utilities package (`component` target; requires SUBCATEGORY) @@ -301,8 +333,8 @@ make pipeline CATEGORY=training NAME=my_training_pipeline **Create without tests (for rapid prototyping):** ```bash -make component CATEGORY=data_processing NAME=my_prototype NO_TESTS -make pipeline CATEGORY=training NAME=my_prototype NO_TESTS +make component CATEGORY=data_processing NAME=my_prototype NO_TESTS=true +make pipeline CATEGORY=training NAME=my_prototype NO_TESTS=true ``` This generates the complete directory structure: @@ -320,6 +352,33 @@ components/data_processing/my_data_processor/ └── test_component_local.py # Integration test template ``` +**Create a component within a subcategory:** + +```bash +# Create component in a subcategory (subcategory files created automatically) +make component CATEGORY=training SUBCATEGORY=sklearn_trainer NAME=logistic_regression + +# Create component in subcategory with shared utilities package +make component CATEGORY=training SUBCATEGORY=sklearn_trainer NAME=random_forest CREATE_SHARED=true +``` + +This generates a nested structure: + +```text +components/training/sklearn_trainer/ +├── __init__.py # Subcategory package +├── OWNERS # Subcategory maintainers +├── README.md # Subcategory documentation +├── shared/ # (if CREATE_SHARED=true) Shared utilities +│ └── __init__.py +└── logistic_regression/ # Your component + ├── __init__.py + ├── component.py + ├── metadata.yaml + ├── OWNERS + └── tests/ +``` +
🔧 Alternative: Manual Creation @@ -477,6 +536,35 @@ git push origin component/csv-cleaner This workflow typically takes just a few minutes to set up the complete component structure with documentation and tests. +#### Example Workflow with Subcategory + +When creating related components that share ownership or utilities: + +```bash +# 1. Create feature branch +git checkout -b component/sklearn-logistic-regression upstream/main + +# 2. Create component in subcategory (first component also creates subcategory structure) +make component CATEGORY=training SUBCATEGORY=sklearn_trainer NAME=logistic_regression + +# 3. Edit the component and subcategory files: +# - components/training/sklearn_trainer/logistic_regression/component.py (your logic) +# - components/training/sklearn_trainer/OWNERS (subcategory maintainers) +# - components/training/sklearn_trainer/README.md (subcategory docs) + +# 4. Generate documentation +make readme TYPE=component CATEGORY=training SUBCATEGORY=sklearn_trainer NAME=logistic_regression + +# 5. Format, lint, test, and submit (same as above) +make format +make lint +pytest components/training/sklearn_trainer/logistic_regression/tests/ -v +pre-commit run +git add . 
+git commit -m "feat(training): add logistic_regression component in sklearn_trainer subcategory" +git push origin component/sklearn-logistic-regression +``` + ## Testing and Quality ### Running Tests Locally diff --git a/scripts/generate_skeleton/generate_skeleton.py b/scripts/generate_skeleton/generate_skeleton.py index 79d8f07a3..a2b03e5a8 100755 --- a/scripts/generate_skeleton/generate_skeleton.py +++ b/scripts/generate_skeleton/generate_skeleton.py @@ -7,6 +7,8 @@ Usage: python scripts/generate_skeleton/generate_skeleton.py --type=component \\ --category=data_processing --name=my_processor + python scripts/generate_skeleton/generate_skeleton.py --type=component \\ + --category=training --subcategory=sklearn_trainer --name=logistic_regression python scripts/generate_skeleton/generate_skeleton.py --type=pipeline \\ --category=ml_workflows --name=my_training_pipeline """ @@ -95,6 +97,34 @@ def validate_category(category: str) -> None: raise ValueError("Category can only contain letters, numbers, and underscores") +def validate_subcategory(subcategory: str) -> None: + """Validate subcategory name for security. + + Args: + subcategory: The subcategory to validate + + Raises: + ValueError: If subcategory is invalid + """ + if not subcategory: + raise ValueError("Subcategory cannot be empty") + + # Check for directory traversal attempts + if "/" in subcategory or "\\" in subcategory: + raise ValueError("Subcategory cannot contain path separators (/, \\)") + + if "." 
in subcategory: + raise ValueError("Subcategory cannot contain dots (.)") + + # Enforce snake_case (no uppercase letters, allow underscores) + if subcategory != subcategory.lower(): + raise ValueError("Subcategory must be lowercase (snake_case)") + + # Allow letters, numbers, and underscores + if not subcategory.replace("_", "").isalnum() or not subcategory[0].isalpha(): + raise ValueError("Subcategory can only contain letters, numbers, and underscores") + + def get_existing_categories(skeleton_type: str) -> list[str]: """Get list of existing categories for the given skeleton type. @@ -111,6 +141,107 @@ def get_existing_categories(skeleton_type: str) -> list[str]: return [item.name for item in base_dir.iterdir() if item.is_dir() and not item.name.startswith((".", "_"))] +def build_skeleton_path(skeleton_type: str, category: str, name: str, subcategory: str | None = None) -> Path: + """Build the path for a skeleton directory. + + Args: + skeleton_type: Type of skeleton ('component' or 'pipeline') + category: Category name + name: Skeleton name + subcategory: Optional subcategory name + + Returns: + Path to the skeleton directory + """ + if subcategory: + return Path(f"{skeleton_type}s/{category}/{subcategory}/{name}") + return Path(f"{skeleton_type}s/{category}/{name}") + + +def generate_subcategory_files(subcategory: str) -> dict[str, str]: + """Generate files for a new subcategory (OWNERS and README.md). + + Args: + subcategory: Subcategory name + + Returns: + Dict of filename to content mappings + """ + env = _get_template_env() + + files = {} + + # Generate OWNERS for subcategory + template = env.get_template("OWNERS.j2") + files["OWNERS"] = template.render({"name": subcategory}) + + # Generate a simple README for subcategory + readme_content = f"""# {subcategory.replace("_", " ").title()} + +This subcategory contains related components. + +## Overview + +TODO: Add description of what this subcategory contains. 
+ +## Components + +TODO: List components in this subcategory. + +## Shared Utilities + +If this subcategory has a `shared/` directory, document the shared utilities here. +""" + files["README.md"] = readme_content + + return files + + +def ensure_subcategory_exists(skeleton_type: str, category: str, subcategory: str, create_shared: bool = False) -> Path: + """Ensure subcategory directory exists with required files. + + Creates the subcategory directory and its OWNERS/README files if they don't exist. + + Args: + skeleton_type: Type of skeleton ('component' or 'pipeline') + category: Category name + subcategory: Subcategory name + create_shared: Whether to create a shared/ package directory + + Returns: + Path to the subcategory directory + """ + subcategory_dir = Path(f"{skeleton_type}s/{category}/{subcategory}") + + # Check if this is a new subcategory + is_new_subcategory = not subcategory_dir.exists() + + subcategory_dir.mkdir(parents=True, exist_ok=True) + + # Create subcategory-level files only if this is a new subcategory + if is_new_subcategory: + subcategory_files = generate_subcategory_files(subcategory) + for filename, content in subcategory_files.items(): + file_path = subcategory_dir / filename + if not file_path.exists(): + file_path.write_text(content) + + # Create __init__.py for the subcategory package + init_path = subcategory_dir / "__init__.py" + if not init_path.exists(): + init_path.write_text(f'"""Components in the {subcategory} subcategory."""\n') + + # Optionally create shared/ package + if create_shared: + shared_dir = subcategory_dir / "shared" + shared_dir.mkdir(exist_ok=True) + shared_init = shared_dir / "__init__.py" + if not shared_init.exists(): + shared_init.write_text(f'"""Shared utilities for {subcategory} components."""\n') + + return subcategory_dir + + def generate_core_files(skeleton_type: str, category: str, name: str) -> dict[str, str]: """Generate core files for skeleton based on type. 
@@ -192,20 +323,33 @@ def generate_test_files(skeleton_type: str, name: str) -> dict[str, str]: return files -def create_skeleton(skeleton_type: str, category: str, name: str, create_tests: bool = True): +def create_skeleton( + skeleton_type: str, + category: str, + name: str, + subcategory: str | None = None, + create_tests: bool = True, + create_shared: bool = False, +): """Create skeleton files for a component or pipeline. Args: skeleton_type: Type of skeleton ('component' or 'pipeline') category: Category name (e.g., 'data_processing', 'training') name: Skeleton name (e.g., 'my_processor') + subcategory: Optional subcategory name (e.g., 'sklearn_trainer') create_tests: Whether to create test files (default: True) + create_shared: Whether to create shared/ package in subcategory (default: False) Returns: Path to created directory """ + # Ensure subcategory exists with required files if specified + if subcategory: + ensure_subcategory_exists(skeleton_type, category, subcategory, create_shared) + # Create directory structure - skeleton_dir = Path(f"{skeleton_type}s/{category}/{name}") + skeleton_dir = build_skeleton_path(skeleton_type, category, name, subcategory) skeleton_dir.mkdir(parents=True, exist_ok=True) tests_dir = skeleton_dir / "tests" @@ -226,13 +370,14 @@ def create_skeleton(skeleton_type: str, category: str, name: str, create_tests: return skeleton_dir -def create_tests_only(skeleton_type: str, category: str, name: str): +def create_tests_only(skeleton_type: str, category: str, name: str, subcategory: str | None = None): """Create test files for an existing skeleton. 
Args: skeleton_type: Type of skeleton ('component' or 'pipeline') category: Category name (e.g., 'data_processing', 'training') name: Skeleton name (e.g., 'my_processor') + subcategory: Optional subcategory name (e.g., 'sklearn_trainer') Returns: Path to created tests directory @@ -240,27 +385,42 @@ def create_tests_only(skeleton_type: str, category: str, name: str): Raises: ValueError: If the skeleton directory or required files don't exist """ - skeleton_dir = Path(f"{skeleton_type}s/{category}/{name}") + skeleton_dir = build_skeleton_path(skeleton_type, category, name, subcategory) main_file = skeleton_dir / f"{skeleton_type}.py" + # Build the command hint with subcategory if provided + subcategory_arg = f" --subcategory={subcategory}" if subcategory else "" + make_subcategory_arg = f" SUBCATEGORY={subcategory}" if subcategory else "" + # Check if skeleton directory exists if not skeleton_dir.exists(): + location = f"subcategory '{subcategory}' of category '{category}'" if subcategory else f"category '{category}'" + script_cmd = ( + f"python scripts/generate_skeleton/generate_skeleton.py " + f"--type={skeleton_type} --category={category}{subcategory_arg} --name={name}" + ) + make_cmd = f"make {skeleton_type} CATEGORY={category}{make_subcategory_arg} NAME={name}" raise ValueError( f""" -Error: {skeleton_type.title()} '{name}' does not exist in category '{category}'. +Error: {skeleton_type.title()} '{name}' does not exist in {location}. 
Expected directory: {skeleton_dir} To create this {skeleton_type} first, run: - python scripts/generate_skeleton/generate_skeleton.py --type={skeleton_type} --category={category} --name={name} + {script_cmd} Or use the Makefile: - make {skeleton_type} CATEGORY={category} NAME={name} + {make_cmd} """.strip() ) # Check if the main skeleton file exists if not main_file.exists(): + script_cmd = ( + f"python scripts/generate_skeleton/generate_skeleton.py " + f"--type={skeleton_type} --category={category}{subcategory_arg} --name={name}" + ) + make_cmd = f"make {skeleton_type} CATEGORY={category}{make_subcategory_arg} NAME={name}" raise ValueError( f""" Error: {skeleton_type.title()} '{name}' directory exists but missing main file. @@ -268,10 +428,10 @@ def create_tests_only(skeleton_type: str, category: str, name: str): Expected file: {main_file} The {skeleton_type} directory exists but appears incomplete. Please recreate it: - python scripts/generate_skeleton/generate_skeleton.py --type={skeleton_type} --category={category} --name={name} + {script_cmd} Or use the Makefile: - make {skeleton_type} CATEGORY={category} NAME={name} + {make_cmd} """.strip() ) @@ -301,6 +461,10 @@ def main(): %(prog)s --type=component --category=data_processing --name=my_processor %(prog)s --type=pipeline --category=ml_workflows --name=training_pipeline %(prog)s --type=component --category=training --name=bert_trainer + +With subcategory: + %(prog)s --type=component --category=training --subcategory=sklearn_trainer --name=logistic_regression + %(prog)s --type=component --category=training --subcategory=sklearn_trainer --name=random_forest --create-shared """, ) @@ -312,6 +476,13 @@ def main(): help="Category for the component/pipeline (e.g., 'data_processing', 'training', 'ml_workflows')", ) + parser.add_argument( + "--subcategory", + required=False, + default=None, + help="Optional subcategory within the category (components only, e.g., 'sklearn_trainer')", + ) + parser.add_argument( 
"--name", required=True, help="Name of the component/pipeline (use snake_case, e.g., 'my_processor')" ) @@ -322,16 +493,34 @@ def main(): "--tests-only", action="store_true", help="Create only test files for an existing component/pipeline" ) + parser.add_argument( + "--create-shared", + action="store_true", + help="Create a shared/ package in the subcategory for common utilities (only with --subcategory)", + ) + args = parser.parse_args() # Validate input parameters using comprehensive validation try: validate_name(args.name) validate_category(args.category) + if args.subcategory: + validate_subcategory(args.subcategory) except ValueError as e: print(f"Error: {e}") sys.exit(1) + # Validate --create-shared requires --subcategory + if args.create_shared and not args.subcategory: + print("Error: --create-shared requires --subcategory to be specified") + sys.exit(1) + + # Validate --subcategory is only allowed for components (not pipelines) + if args.subcategory and args.type == "pipeline": + print("Error: --subcategory is only supported for components, not pipelines") + sys.exit(1) + # Validate that category exists (for new skeletons) or provide helpful guidance if not args.tests_only: existing_categories = get_existing_categories(args.type) @@ -348,10 +537,13 @@ def main(): print("Error: --no-tests and --tests-only cannot be used together") sys.exit(1) + # Build command hints for output messages + make_subcategory_arg = f" SUBCATEGORY={args.subcategory}" if args.subcategory else "" + try: if args.tests_only: # Create tests for existing skeleton - created_dir = create_tests_only(args.type, args.category, args.name) + created_dir = create_tests_only(args.type, args.category, args.name, args.subcategory) print(f"✅ Test files created successfully at: {created_dir}") print(f""" Next steps: @@ -360,14 +552,16 @@ def main(): """) else: # Check if directory already exists for new skeleton - target_dir = Path(f"{args.type}s/{args.category}/{args.name}") + target_dir = 
build_skeleton_path(args.type, args.category, args.name, args.subcategory) if target_dir.exists(): print(f"Error: Directory '{target_dir}' already exists.") sys.exit(1) # Create new skeleton create_tests = not args.no_tests - created_dir = create_skeleton(args.type, args.category, args.name, create_tests) + created_dir = create_skeleton( + args.type, args.category, args.name, args.subcategory, create_tests, args.create_shared + ) print(f"✅ {args.type.title()} skeleton created successfully at: {created_dir}") next_steps = f""" @@ -376,18 +570,33 @@ def main(): 2. Implement the logic in {created_dir}/{args.type}.py 3. Update {created_dir}/metadata.yaml with correct dependencies and tags""" + if args.subcategory: + subcategory_dir = Path(f"{args.type}s/{args.category}/{args.subcategory}") + next_steps += f""" +4. Update {subcategory_dir}/OWNERS with subcategory owners +5. Update {subcategory_dir}/README.md with subcategory documentation""" + step_offset = 6 + else: + step_offset = 4 + if create_tests: - readme_cmd = f"make readme TYPE={args.type} CATEGORY={args.category} NAME={args.name}" + readme_cmd = ( + f"make readme TYPE={args.type} CATEGORY={args.category}{make_subcategory_arg} NAME={args.name}" + ) next_steps += f""" -4. Write comprehensive tests in {created_dir}/tests/ -5. Update {created_dir}/README.md with actual documentation or run: {readme_cmd} -6. Run tests: pytest {created_dir}/tests/ -v""" +{step_offset}. Write comprehensive tests in {created_dir}/tests/ +{step_offset + 1}. Update {created_dir}/README.md with actual documentation or run: {readme_cmd} +{step_offset + 2}. 
Run tests: pytest {created_dir}/tests/ -v""" else: - readme_cmd = f"make readme TYPE={args.type} CATEGORY={args.category} NAME={args.name}" - tests_cmd = f"make tests TYPE={args.type} CATEGORY={args.category} NAME={args.name}" + readme_cmd = ( + f"make readme TYPE={args.type} CATEGORY={args.category}{make_subcategory_arg} NAME={args.name}" + ) + tests_cmd = ( + f"make tests TYPE={args.type} CATEGORY={args.category}{make_subcategory_arg} NAME={args.name}" + ) next_steps += f""" -4. Update {created_dir}/README.md with actual documentation or run: {readme_cmd} -5. Add tests later with: {tests_cmd}""" +{step_offset}. Update {created_dir}/README.md with actual documentation or run: {readme_cmd} +{step_offset + 1}. Add tests later with: {tests_cmd}""" print(next_steps) diff --git a/scripts/generate_skeleton/tests/test_generate_skeleton.py b/scripts/generate_skeleton/tests/test_generate_skeleton.py index 02366f191..441ed8e06 100644 --- a/scripts/generate_skeleton/tests/test_generate_skeleton.py +++ b/scripts/generate_skeleton/tests/test_generate_skeleton.py @@ -7,13 +7,17 @@ import pytest from ..generate_skeleton import ( + build_skeleton_path, create_skeleton, create_tests_only, + ensure_subcategory_exists, generate_core_files, + generate_subcategory_files, generate_test_files, get_existing_categories, validate_category, validate_name, + validate_subcategory, ) @@ -314,3 +318,247 @@ def test_category_with_underscores(self): # Check that category underscores are converted to hyphens in tags assert "- data-processing" in files["metadata.yaml"] + + +class TestSubcategoryValidation: + """Test subcategory validation functions.""" + + def test_validate_subcategory_valid_cases(self): + """Test validate_subcategory with valid subcategory names.""" + valid_subcategories = ["sklearn_trainer", "pytorch_models", "utils", "model_v2"] + for subcategory in valid_subcategories: + validate_subcategory(subcategory) # Should not raise + + @pytest.mark.parametrize( + "invalid_subcategory", 
+ [ + "", # Empty + "../malicious", # Path traversal + "path/traversal", # Forward slash + "windows\\path", # Backslash + "subcategory.with.dots", # Dots + "SklearnTrainer", # Uppercase + "CamelCase", # Mixed case + "subcategory!", # Invalid character + "subcategory-with-hyphens", # Hyphens + "subcategory with spaces", # Spaces + ], + ) + def test_validate_subcategory_invalid_cases(self, invalid_subcategory): + """Test validate_subcategory raises ValueError for invalid names.""" + with pytest.raises(ValueError): + validate_subcategory(invalid_subcategory) + + +class TestBuildSkeletonPath: + """Test the build_skeleton_path helper function.""" + + def test_path_without_subcategory(self): + """Test building path without subcategory.""" + path = build_skeleton_path("component", "training", "my_trainer") + assert path == Path("components/training/my_trainer") + + def test_path_with_subcategory(self): + """Test building path with subcategory.""" + path = build_skeleton_path("component", "training", "logistic_regression", "sklearn_trainer") + assert path == Path("components/training/sklearn_trainer/logistic_regression") + + def test_pipeline_path_without_subcategory(self): + """Test building pipeline path without subcategory.""" + path = build_skeleton_path("pipeline", "ml_workflows", "training_pipeline") + assert path == Path("pipelines/ml_workflows/training_pipeline") + + +class TestGenerateSubcategoryFiles: + """Test subcategory file generation.""" + + def test_generates_required_files(self): + """Test that OWNERS and README.md are generated for subcategory.""" + files = generate_subcategory_files("sklearn_trainer") + + assert "OWNERS" in files + assert "README.md" in files + + def test_readme_contains_subcategory_name(self): + """Test that README contains the subcategory name.""" + files = generate_subcategory_files("sklearn_trainer") + + assert "Sklearn Trainer" in files["README.md"] + + +class TestEnsureSubcategoryExists: + """Test the ensure_subcategory_exists 
function.""" + + def test_creates_new_subcategory(self): + """Test creating a new subcategory directory with required files.""" + with tempfile.TemporaryDirectory() as temp_dir: + original_cwd = Path.cwd() + os.chdir(temp_dir) + + try: + # Create category directory first + Path("components/training").mkdir(parents=True) + + # Ensure subcategory exists + subcategory_dir = ensure_subcategory_exists("component", "training", "sklearn_trainer") + + # Check directory was created + assert subcategory_dir.exists() + assert (subcategory_dir / "OWNERS").exists() + assert (subcategory_dir / "README.md").exists() + assert (subcategory_dir / "__init__.py").exists() + + finally: + os.chdir(original_cwd) + + def test_creates_shared_package_when_requested(self): + """Test creating shared package in subcategory.""" + with tempfile.TemporaryDirectory() as temp_dir: + original_cwd = Path.cwd() + os.chdir(temp_dir) + + try: + Path("components/training").mkdir(parents=True) + + subcategory_dir = ensure_subcategory_exists( + "component", "training", "sklearn_trainer", create_shared=True + ) + + # Check shared directory was created + shared_dir = subcategory_dir / "shared" + assert shared_dir.exists() + assert (shared_dir / "__init__.py").exists() + + finally: + os.chdir(original_cwd) + + def test_does_not_overwrite_existing_files(self): + """Test that existing subcategory files are not overwritten.""" + with tempfile.TemporaryDirectory() as temp_dir: + original_cwd = Path.cwd() + os.chdir(temp_dir) + + try: + # Create subcategory with custom OWNERS + subcategory_dir = Path("components/training/sklearn_trainer") + subcategory_dir.mkdir(parents=True) + custom_owners = "approvers:\n - custom_owner\n" + (subcategory_dir / "OWNERS").write_text(custom_owners) + + # Call ensure_subcategory_exists + ensure_subcategory_exists("component", "training", "sklearn_trainer") + + # Verify OWNERS was not overwritten + assert (subcategory_dir / "OWNERS").read_text() == custom_owners + + finally: + 
os.chdir(original_cwd) + + +class TestCreateSkeletonWithSubcategory: + """Test create_skeleton with subcategory support.""" + + def test_create_component_with_subcategory(self): + """Test creating a component within a subcategory.""" + with tempfile.TemporaryDirectory() as temp_dir: + original_cwd = Path.cwd() + os.chdir(temp_dir) + + try: + # Create category directory + Path("components/training").mkdir(parents=True) + + # Create skeleton with subcategory + result_dir = create_skeleton( + "component", "training", "logistic_regression", subcategory="sklearn_trainer", create_tests=True + ) + + # Check component directory structure + assert result_dir.exists() + assert result_dir == Path("components/training/sklearn_trainer/logistic_regression") + assert (result_dir / "component.py").exists() + assert (result_dir / "metadata.yaml").exists() + assert (result_dir / "OWNERS").exists() + assert (result_dir / "tests").exists() + + # Check subcategory files were created + subcategory_dir = Path("components/training/sklearn_trainer") + assert (subcategory_dir / "OWNERS").exists() + assert (subcategory_dir / "README.md").exists() + assert (subcategory_dir / "__init__.py").exists() + + finally: + os.chdir(original_cwd) + + def test_create_component_with_subcategory_and_shared(self): + """Test creating a component with subcategory and shared package.""" + with tempfile.TemporaryDirectory() as temp_dir: + original_cwd = Path.cwd() + os.chdir(temp_dir) + + try: + Path("components/training").mkdir(parents=True) + + create_skeleton( + "component", + "training", + "logistic_regression", + subcategory="sklearn_trainer", + create_tests=False, + create_shared=True, + ) + + # Check shared package was created + shared_dir = Path("components/training/sklearn_trainer/shared") + assert shared_dir.exists() + assert (shared_dir / "__init__.py").exists() + + finally: + os.chdir(original_cwd) + + +class TestCreateTestsOnlyWithSubcategory: + """Test create_tests_only with subcategory 
support.""" + + def test_create_tests_for_subcategory_component(self): + """Test creating tests for a component in a subcategory.""" + with tempfile.TemporaryDirectory() as temp_dir: + original_cwd = Path.cwd() + os.chdir(temp_dir) + + try: + # First create skeleton without tests + Path("components/training").mkdir(parents=True) + create_skeleton( + "component", "training", "logistic_regression", subcategory="sklearn_trainer", create_tests=False + ) + + # Now create tests + tests_dir = create_tests_only( + "component", "training", "logistic_regression", subcategory="sklearn_trainer" + ) + + # Check tests were created + assert tests_dir.exists() + assert (tests_dir / "__init__.py").exists() + assert (tests_dir / "test_component_unit.py").exists() + assert (tests_dir / "test_component_local.py").exists() + + finally: + os.chdir(original_cwd) + + def test_create_tests_only_missing_subcategory_component(self): + """Test error when trying to create tests for non-existent subcategory component.""" + with tempfile.TemporaryDirectory() as temp_dir: + original_cwd = Path.cwd() + os.chdir(temp_dir) + + try: + with pytest.raises(ValueError) as exc_info: + create_tests_only("component", "training", "nonexistent", subcategory="sklearn_trainer") + + assert "does not exist" in str(exc_info.value) + assert "sklearn_trainer" in str(exc_info.value) + + finally: + os.chdir(original_cwd) diff --git a/scripts/lib/discovery.py b/scripts/lib/discovery.py index 05d4f5315..bc270d220 100644 --- a/scripts/lib/discovery.py +++ b/scripts/lib/discovery.py @@ -58,7 +58,8 @@ def discover_assets(base_dir: Path, asset_type: str) -> list[dict[str, Any]]: asset_type: Either 'component' or 'pipeline' Returns: - List of dicts with 'path', 'category', 'name', and 'module_path' keys + List of dicts with 'path', 'category', 'subcategory', 'name', and 'module_path' keys. + 'subcategory' is None for direct category assets. 
""" assets = [] filename = f"{asset_type}.py" @@ -70,20 +71,39 @@ def discover_assets(base_dir: Path, asset_type: str) -> list[dict[str, Any]]: if not category_dir.is_dir() or category_dir.name.startswith(("_", ".")): continue - for asset_dir in category_dir.iterdir(): - if not asset_dir.is_dir() or asset_dir.name.startswith(("_", ".")): + for item_dir in category_dir.iterdir(): + if not item_dir.is_dir() or item_dir.name.startswith(("_", ".")): continue - asset_file = asset_dir / filename + # Check if this is a direct asset + asset_file = item_dir / filename if asset_file.exists(): assets.append( { "path": asset_file, "category": category_dir.name, - "name": asset_dir.name, + "subcategory": None, + "name": item_dir.name, "module_path": str(asset_file), } ) + else: + # This might be a subcategory + for subitem_dir in item_dir.iterdir(): + if not subitem_dir.is_dir() or subitem_dir.name.startswith(("_", ".")): + continue + + sub_asset_file = subitem_dir / filename + if sub_asset_file.exists(): + assets.append( + { + "path": sub_asset_file, + "category": category_dir.name, + "subcategory": item_dir.name, + "name": subitem_dir.name, + "module_path": str(sub_asset_file), + } + ) return assets @@ -96,7 +116,8 @@ def find_assets_with_metadata(asset_type: str, base_path: Path | None = None) -> base_path: Optional base path, defaults to current directory Returns: - List of asset paths like 'components/training/my_component' + List of asset paths like 'components/training/my_component' or + 'components/training/sklearn_trainer/logistic_regression' """ assets = [] if base_path is None: @@ -110,12 +131,21 @@ def find_assets_with_metadata(asset_type: str, base_path: Path | None = None) -> if not category.is_dir() or category.name.startswith((".", "_")): continue - for asset in category.iterdir(): - if not asset.is_dir() or asset.name.startswith((".", "_")): + for item in category.iterdir(): + if not item.is_dir() or item.name.startswith((".", "_")): continue - if (asset / 
"metadata.yaml").exists(): - assets.append(f"{asset_type}/{category.name}/{asset.name}") + # Check if this is a direct asset + if (item / "metadata.yaml").exists(): + assets.append(f"{asset_type}/{category.name}/{item.name}") + else: + # This might be a subcategory + for subitem in item.iterdir(): + if not subitem.is_dir() or subitem.name.startswith((".", "_")): + continue + + if (subitem / "metadata.yaml").exists(): + assets.append(f"{asset_type}/{category.name}/{item.name}/{subitem.name}") return assets @@ -217,15 +247,47 @@ def resolve_pipeline_path(repo_root: Path, raw: str) -> Path: def _build_asset_dict_from_repo_path( repo_root: Path, asset_root: str, asset_file: Path, expected_filename: str ) -> dict[str, Any]: + """Build asset metadata dictionary from a file path. + + Args: + repo_root: Repository root directory. + asset_root: Either 'components' or 'pipelines'. + asset_file: Path to the asset file (component.py or pipeline.py). + expected_filename: Expected filename (component.py or pipeline.py). + + Returns: + Dictionary containing path, category, subcategory, name, and module_path. + 'subcategory' is None for direct category assets. + + Raises: + ValueError: If the path structure is invalid. 
+ """ root = (repo_root / asset_root).resolve() resolved = asset_file.resolve() if resolved.name != expected_filename: raise ValueError(f"Expected {expected_filename} under {asset_root}: {asset_file}") rel = resolved.relative_to(root) - if len(rel.parts) < 3: - raise ValueError(f"Path must be {asset_root}///{expected_filename}: {asset_file}") - category, name = rel.parts[0], rel.parts[1] - return {"path": asset_file, "category": category, "name": name, "module_path": str(asset_file)} + + if len(rel.parts) == 3: + # Direct category asset: category/name/filename + category, name = rel.parts[0], rel.parts[1] + subcategory = None + elif len(rel.parts) == 4: + # Subcategory asset: category/subcategory/name/filename + category, subcategory, name = rel.parts[0], rel.parts[1], rel.parts[2] + else: + raise ValueError( + f"Path must be {asset_root}///{expected_filename} or " + f"{asset_root}////{expected_filename}: {asset_file}" + ) + + return { + "path": asset_file, + "category": category, + "subcategory": subcategory, + "name": name, + "module_path": str(asset_file), + } def build_component_asset(repo_root: Path, component_file: Path) -> dict[str, Any]: @@ -236,7 +298,8 @@ def build_component_asset(repo_root: Path, component_file: Path) -> dict[str, An component_file: Path to the component.py file. Returns: - Dictionary containing path, category, name, and module_path. + Dictionary containing path, category, subcategory, name, and module_path. + 'subcategory' is None for direct category components. """ return _build_asset_dict_from_repo_path(repo_root, "components", component_file, _COMPONENT_FILENAME) @@ -249,6 +312,7 @@ def build_pipeline_asset(repo_root: Path, pipeline_file: Path) -> dict[str, Any] pipeline_file: Path to the pipeline.py file. Returns: - Dictionary containing path, category, name, and module_path. + Dictionary containing path, category, subcategory, name, and module_path. + 'subcategory' is None for direct category pipelines. 
""" return _build_asset_dict_from_repo_path(repo_root, "pipelines", pipeline_file, _PIPELINE_FILENAME) diff --git a/scripts/validate_metadata/validate_metadata.py b/scripts/validate_metadata/validate_metadata.py index b27132f55..e84c483b9 100644 --- a/scripts/validate_metadata/validate_metadata.py +++ b/scripts/validate_metadata/validate_metadata.py @@ -69,13 +69,13 @@ def parse_args() -> argparse.Namespace: def validate_dir(path: str) -> Path: - """Validate that the input path is a valid directory and contains required files. + """Validate that the input path is a valid directory. Args: - path: String representation of the path to the component or pipeline directory. + path: String representation of the path to the component, pipeline, or subcategory directory. Returns: - Path: Validated Path object to the component or pipeline directory. + Path: Validated Path object to the directory. Raises: argparse.ArgumentTypeError: If validation fails. @@ -87,15 +87,38 @@ def validate_dir(path: str) -> Path: if not path.is_dir(): raise argparse.ArgumentTypeError(f"'{path}' is not a directory") - file_path = path / OWNERS - if not file_path.exists(): - raise argparse.ArgumentTypeError(f"{path} does not contain an {OWNERS} file") + return path - metadata_file = path / METADATA - if not metadata_file.exists(): - raise argparse.ArgumentTypeError(f"'{path}' does not contain a {METADATA} file") - return path +def find_dirs_to_validate(input_dir: Path) -> list[Path]: + """Find all directories that need validation (handles both components and subcategories). + + Args: + input_dir: Path to a component/pipeline directory or a subcategory directory. + + Returns: + List of Path objects to directories containing metadata.yaml files. + + Raises: + argparse.ArgumentTypeError: If no valid directories are found. 
+ """ + # Check if this directory has metadata.yaml + if (input_dir / METADATA).exists(): + return [input_dir] + + # This might be a subcategory - find subdirectories with metadata.yaml + dirs_to_validate = [] + for subdir in input_dir.iterdir(): + if subdir.is_dir() and (subdir / METADATA).exists(): + dirs_to_validate.append(subdir) + + if not dirs_to_validate: + raise argparse.ArgumentTypeError( + f"'{input_dir}' does not contain a {METADATA} file and has no subdirectories with one. " + f"If this is a subcategory, ensure it contains component directories." + ) + + return dirs_to_validate def validate_owners_file(filepath: Path): @@ -353,25 +376,41 @@ def main(): args = parse_args() input_dir = args.dir - # Validate OWNERS + # Find all directories to validate (handles subcategories) try: - owners_file_path = input_dir / OWNERS - validate_owners_file(owners_file_path) - except ValidationError as e: - logging.error("Validation Error: %s", e) + dirs_to_validate = find_dirs_to_validate(input_dir) + except argparse.ArgumentTypeError as e: + logging.error("Error: %s", e) sys.exit(1) - # Validate metadata.yaml - try: - metadata_file_path = input_dir / METADATA - validate_metadata_yaml(metadata_file_path) - except ValidationError as e: - logging.error("Validation Error: %s", e) - sys.exit(1) + has_errors = False + for dir_path in dirs_to_validate: + print(f"Validating {dir_path}...") + + # Validate OWNERS + try: + owners_file_path = dir_path / OWNERS + validate_owners_file(owners_file_path) + except ValidationError as e: + logging.error("Validation Error: %s", e) + has_errors = True + continue - # Validation successful. 
- logging.info(f"Validation successful for {input_dir}.") - print(f"Validation successful for {input_dir}.") + # Validate metadata.yaml + try: + metadata_file_path = dir_path / METADATA + validate_metadata_yaml(metadata_file_path) + except ValidationError as e: + logging.error("Validation Error: %s", e) + has_errors = True + continue + + # Validation successful for this directory. + logging.info(f"Validation successful for {dir_path}.") + print(f"Validation successful for {dir_path}.") + + if has_errors: + sys.exit(1) if __name__ == "__main__": From fabffe1c42389fb29f85c129a401d356d83f3304 Mon Sep 17 00:00:00 2001 From: Vani Haripriya Mudadla Date: Tue, 10 Feb 2026 23:55:05 -0600 Subject: [PATCH 2/3] Enable subcategory support for pipelines Signed-off-by: Vani Haripriya Mudadla --- AGENTS.md | 14 +- Makefile | 63 +++-- docs/CONTRIBUTING.md | 93 ++++++-- pyproject.toml | 8 +- .../category_index_generator.py | 222 +++++++++++++----- scripts/generate_readme/constants.py | 1 + .../templates/CATEGORY_README.md.j2 | 7 + .../templates/SUBCATEGORY_README.md.j2 | 7 + scripts/generate_readme/writer.py | 129 +++++++--- .../generate_skeleton/generate_skeleton.py | 80 ++++--- .../tests/test_generate_skeleton.py | 84 ++++++- scripts/sync_packages.py | 56 +++++ .../validate_metadata/validate_metadata.py | 2 +- 13 files changed, 591 insertions(+), 175 deletions(-) create mode 100644 scripts/generate_readme/templates/SUBCATEGORY_README.md.j2 create mode 100644 scripts/sync_packages.py diff --git a/AGENTS.md b/AGENTS.md index bb3fae2dc..b0f3965d0 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -35,7 +35,7 @@ Agents typically interact with this repository in three modes. Use the mode to d extending/composing instead of duplicating. 
- **Create scaffolding**: use the Make targets in `Makefile`: - `make component CATEGORY= NAME= [SUBCATEGORY=] [NO_TESTS=true] [CREATE_SHARED=true]` - - `make pipeline CATEGORY= NAME= [NO_TESTS=true]` + - `make pipeline CATEGORY= NAME= [SUBCATEGORY=] [NO_TESTS=true] [CREATE_SHARED=true]` - `make tests TYPE=component|pipeline CATEGORY= NAME= [SUBCATEGORY=]` - `make readme TYPE=component|pipeline CATEGORY= NAME= [SUBCATEGORY=]` - **Validate like CI**: follow [`CONTRIBUTING.md` (Testing and Quality)](docs/CONTRIBUTING.md#testing-and-quality) and @@ -67,7 +67,8 @@ Good places to look: - Components live under `components///`. - Components can optionally use subcategories: `components////`. -- Pipelines live under `pipelines///` (subcategories not supported for pipelines). +- Pipelines live under `pipelines///`. +- Pipelines can optionally use subcategories: `pipelines////`. - Use `snake_case` directory names (per `CONTRIBUTING.md`). ### Required files @@ -124,6 +125,15 @@ metadata schema defined in [`CONTRIBUTING.md`](docs/CONTRIBUTING.md#metadatayaml `lastVerified`). Generate/validate `README.md` using `make readme TYPE=pipeline CATEGORY= NAME=`. Add tests (you can generate tests via `make tests TYPE=pipeline CATEGORY= NAME=`)." +#### Add a pipeline in a subcategory + +Use this prompt pattern when creating related pipelines that should share ownership or utilities: + +"Create a pipeline in a subcategory using `make pipeline CATEGORY= SUBCATEGORY= NAME=`. This +automatically creates the subcategory structure with OWNERS and README.md if it doesn't exist. For shared utilities, +add `CREATE_SHARED=true` to create a `shared/` package. Update the subcategory OWNERS and README.md with appropriate +maintainers and documentation. Follow the same pipeline implementation patterns as above." + #### Update an existing component safely "Find the existing component directory. Make the minimal change needed. 
Update docstrings and regenerate the README diff --git a/Makefile b/Makefile index e7e75afe2..8e9fc9ff7 100644 --- a/Makefile +++ b/Makefile @@ -4,7 +4,7 @@ RUFF ?= $(UVRUN) ruff YAMLLINT ?= $(UVRUN) yamllint PYTEST ?= $(UVRUN) pytest -.PHONY: format fix lint lint-format lint-python lint-markdown lint-yaml lint-imports test test-coverage component pipeline tests readme +.PHONY: format fix lint lint-format lint-python lint-markdown lint-yaml lint-imports test test-coverage component pipeline tests readme sync-packages format: $(RUFF) format components pipelines scripts @@ -43,46 +43,63 @@ component: @SUBCATEGORY_ARG=""; \ if [ -n "$(SUBCATEGORY)" ]; then SUBCATEGORY_ARG="--subcategory=$(SUBCATEGORY)"; fi; \ NO_TESTS_ARG=""; \ - if [ -n "$(NO_TESTS)" ]; then NO_TESTS_ARG="--no-tests"; fi; \ + if [ "$(NO_TESTS)" = "true" ]; then NO_TESTS_ARG="--no-tests"; fi; \ CREATE_SHARED_ARG=""; \ - if [ -n "$(CREATE_SHARED)" ]; then CREATE_SHARED_ARG="--create-shared"; fi; \ - $(UVRUN) scripts/generate_skeleton/generate_skeleton.py --type=component --category=$(CATEGORY) --name=$(NAME) $$SUBCATEGORY_ARG $$NO_TESTS_ARG $$CREATE_SHARED_ARG + if [ "$(CREATE_SHARED)" = "true" ]; then CREATE_SHARED_ARG="--create-shared"; fi; \ + $(UVRUN) scripts/generate_skeleton/generate_skeleton.py --type=component --category=$(CATEGORY) --name=$(NAME) $$SUBCATEGORY_ARG $$NO_TESTS_ARG $$CREATE_SHARED_ARG; \ + echo ""; \ + echo "Generating READMEs..."; \ + if [ -n "$(SUBCATEGORY)" ]; then \ + $(UVRUN) -m scripts.generate_readme --component components/$(CATEGORY)/$(SUBCATEGORY)/$(NAME) --fix; \ + else \ + $(UVRUN) -m scripts.generate_readme --component components/$(CATEGORY)/$(NAME) --fix; \ + fi + @$(MAKE) --no-print-directory sync-packages pipeline: - @if [ -z "$(CATEGORY)" ]; then echo "Error: CATEGORY is required. Usage: make pipeline CATEGORY=training NAME=my_pipeline [NO_TESTS=true]"; exit 1; fi - @if [ -z "$(NAME)" ]; then echo "Error: NAME is required. 
Usage: make pipeline CATEGORY=training NAME=my_pipeline [NO_TESTS=true]"; exit 1; fi - @if [ -n "$(NO_TESTS)" ]; then \ - $(UVRUN) scripts/generate_skeleton/generate_skeleton.py --type=pipeline --category=$(CATEGORY) --name=$(NAME) --no-tests; \ + @if [ -z "$(CATEGORY)" ]; then echo "Error: CATEGORY is required. Usage: make pipeline CATEGORY=training NAME=my_pipeline [SUBCATEGORY=x] [NO_TESTS=true] [CREATE_SHARED=true]"; exit 1; fi + @if [ -z "$(NAME)" ]; then echo "Error: NAME is required. Usage: make pipeline CATEGORY=training NAME=my_pipeline [SUBCATEGORY=x] [NO_TESTS=true] [CREATE_SHARED=true]"; exit 1; fi + @SUBCATEGORY_ARG=""; \ + if [ -n "$(SUBCATEGORY)" ]; then SUBCATEGORY_ARG="--subcategory=$(SUBCATEGORY)"; fi; \ + NO_TESTS_ARG=""; \ + if [ "$(NO_TESTS)" = "true" ]; then NO_TESTS_ARG="--no-tests"; fi; \ + CREATE_SHARED_ARG=""; \ + if [ "$(CREATE_SHARED)" = "true" ]; then CREATE_SHARED_ARG="--create-shared"; fi; \ + $(UVRUN) scripts/generate_skeleton/generate_skeleton.py --type=pipeline --category=$(CATEGORY) --name=$(NAME) $$SUBCATEGORY_ARG $$NO_TESTS_ARG $$CREATE_SHARED_ARG; \ + echo ""; \ + echo "Generating READMEs..."; \ + if [ -n "$(SUBCATEGORY)" ]; then \ + $(UVRUN) -m scripts.generate_readme --pipeline pipelines/$(CATEGORY)/$(SUBCATEGORY)/$(NAME) --fix; \ else \ - $(UVRUN) scripts/generate_skeleton/generate_skeleton.py --type=pipeline --category=$(CATEGORY) --name=$(NAME); \ + $(UVRUN) -m scripts.generate_readme --pipeline pipelines/$(CATEGORY)/$(NAME) --fix; \ fi + @$(MAKE) --no-print-directory sync-packages tests: - @if [ -z "$(TYPE)" ]; then echo "Error: TYPE is required. Usage: make tests TYPE=component|pipeline CATEGORY=data_processing NAME=my_component [SUBCATEGORY=x (component only)]"; exit 1; fi - @if [ -z "$(CATEGORY)" ]; then echo "Error: CATEGORY is required. 
Usage: make tests TYPE=component|pipeline CATEGORY=data_processing NAME=my_component [SUBCATEGORY=x (component only)]"; exit 1; fi - @if [ -z "$(NAME)" ]; then echo "Error: NAME is required. Usage: make tests TYPE=component|pipeline CATEGORY=data_processing NAME=my_component [SUBCATEGORY=x (component only)]"; exit 1; fi - @if [ "$(TYPE)" = "component" ]; then \ + @if [ -z "$(TYPE)" ]; then echo "Error: TYPE is required. Usage: make tests TYPE=component|pipeline CATEGORY=data_processing NAME=my_component [SUBCATEGORY=x]"; exit 1; fi + @if [ -z "$(CATEGORY)" ]; then echo "Error: CATEGORY is required. Usage: make tests TYPE=component|pipeline CATEGORY=data_processing NAME=my_component [SUBCATEGORY=x]"; exit 1; fi + @if [ -z "$(NAME)" ]; then echo "Error: NAME is required. Usage: make tests TYPE=component|pipeline CATEGORY=data_processing NAME=my_component [SUBCATEGORY=x]"; exit 1; fi + @if [ "$(TYPE)" = "component" ] || [ "$(TYPE)" = "pipeline" ]; then \ SUBCATEGORY_ARG=""; \ if [ -n "$(SUBCATEGORY)" ]; then SUBCATEGORY_ARG="--subcategory=$(SUBCATEGORY)"; fi; \ $(UVRUN) scripts/generate_skeleton/generate_skeleton.py --type=$(TYPE) --category=$(CATEGORY) --name=$(NAME) $$SUBCATEGORY_ARG --tests-only; \ - elif [ "$(TYPE)" = "pipeline" ]; then \ - $(UVRUN) scripts/generate_skeleton/generate_skeleton.py --type=$(TYPE) --category=$(CATEGORY) --name=$(NAME) --tests-only; \ else \ echo "Error: TYPE must be either 'component' or 'pipeline'"; exit 1; \ fi readme: - @if [ -z "$(TYPE)" ]; then echo "Error: TYPE is required. Usage: make readme TYPE=component|pipeline CATEGORY=data_processing NAME=my_component [SUBCATEGORY=x (component only)]"; exit 1; fi - @if [ -z "$(CATEGORY)" ]; then echo "Error: CATEGORY is required. Usage: make readme TYPE=component|pipeline CATEGORY=data_processing NAME=my_component [SUBCATEGORY=x (component only)]"; exit 1; fi - @if [ -z "$(NAME)" ]; then echo "Error: NAME is required. 
Usage: make readme TYPE=component|pipeline CATEGORY=data_processing NAME=my_component [SUBCATEGORY=x (component only)]"; exit 1; fi - @if [ "$(TYPE)" = "component" ]; then \ + @if [ -z "$(TYPE)" ]; then echo "Error: TYPE is required. Usage: make readme TYPE=component|pipeline CATEGORY=data_processing NAME=my_component [SUBCATEGORY=x]"; exit 1; fi + @if [ -z "$(CATEGORY)" ]; then echo "Error: CATEGORY is required. Usage: make readme TYPE=component|pipeline CATEGORY=data_processing NAME=my_component [SUBCATEGORY=x]"; exit 1; fi + @if [ -z "$(NAME)" ]; then echo "Error: NAME is required. Usage: make readme TYPE=component|pipeline CATEGORY=data_processing NAME=my_component [SUBCATEGORY=x]"; exit 1; fi + @if [ "$(TYPE)" = "component" ] || [ "$(TYPE)" = "pipeline" ]; then \ if [ -n "$(SUBCATEGORY)" ]; then \ - $(UVRUN) -m scripts.generate_readme --component $(TYPE)s/$(CATEGORY)/$(SUBCATEGORY)/$(NAME) --fix; \ + $(UVRUN) -m scripts.generate_readme --$(TYPE) $(TYPE)s/$(CATEGORY)/$(SUBCATEGORY)/$(NAME) --fix; \ else \ - $(UVRUN) -m scripts.generate_readme --component $(TYPE)s/$(CATEGORY)/$(NAME) --fix; \ + $(UVRUN) -m scripts.generate_readme --$(TYPE) $(TYPE)s/$(CATEGORY)/$(NAME) --fix; \ fi; \ - elif [ "$(TYPE)" = "pipeline" ]; then \ - $(UVRUN) -m scripts.generate_readme --pipeline $(TYPE)s/$(CATEGORY)/$(NAME) --fix; \ else \ echo "Error: TYPE must be either 'component' or 'pipeline'"; exit 1; \ fi + +sync-packages: + @$(UVRUN) scripts/sync_packages.py diff --git a/docs/CONTRIBUTING.md b/docs/CONTRIBUTING.md index 7aff80c89..62c05e28a 100644 --- a/docs/CONTRIBUTING.md +++ b/docs/CONTRIBUTING.md @@ -128,15 +128,17 @@ Components must be organized by category under `components//`. Pipelines must be organized by category under `pipelines//`. -### Subcategories (Components Only) +### Subcategories -For better organization of related components, you can create **subcategories** within a category. 
+For better organization of related components or pipelines, you can create **subcategories** within a category. Subcategories provide: -- **Logical grouping** of related components (e.g., all sklearn-based trainers) +- **Logical grouping** of related assets (e.g., all sklearn-based trainers, related ML workflows) - **Dedicated ownership** via subcategory-level OWNERS file - **Shared utilities** via an optional `shared/` package +**Component subcategory structure:** + ```text components/// ├── __init__.py # Subcategory package @@ -154,6 +156,25 @@ components/// └── tests/ ``` +**Pipeline subcategory structure:** + +```text +pipelines/// +├── __init__.py # Subcategory package +├── OWNERS # Subcategory maintainers +├── README.md # Subcategory documentation +├── shared/ # Optional shared utilities package +│ ├── __init__.py +│ └── workflow_utils.py # Common code for pipelines in this subcategory +└── / # Individual pipeline + ├── __init__.py + ├── pipeline.py + ├── metadata.yaml + ├── OWNERS + ├── README.md + └── tests/ +``` + ## Naming Conventions - **Components and pipelines** use `snake_case` (e.g., `data_preprocessing`, `model_trainer`) @@ -312,7 +333,7 @@ The following make targets simplify the development workflow: **Optional flags** (append to component/pipeline commands): -- `SUBCATEGORY=` - Create component in a subcategory (components only) +- `SUBCATEGORY=` - Create asset in a subcategory - `NO_TESTS=true` - Skip test file generation - `CREATE_SHARED=true` - Create shared utilities package (requires SUBCATEGORY) @@ -366,17 +387,54 @@ This generates a nested structure: ```text components/training/sklearn_trainer/ -├── __init__.py # Subcategory package -├── OWNERS # Subcategory maintainers -├── README.md # Subcategory documentation -├── shared/ # (if CREATE_SHARED) Shared utilities -│ └── __init__.py -└── logistic_regression/ # Your component +├── __init__.py # Subcategory package +├── OWNERS # Subcategory maintainers +├── README.md # Subcategory 
documentation +├── shared/ # (if CREATE_SHARED=true) Shared utilities +│ ├── __init__.py +│ └── sklearn_trainer_utils.py # Placeholder utility file +└── logistic_regression/ # Your component ├── __init__.py ├── component.py ├── metadata.yaml ├── OWNERS + ├── README.md + └── tests/ + ├── __init__.py + ├── test_component_local.py + └── test_component_unit.py +``` + +**Create a pipeline within a subcategory:** + +```bash +# Create pipeline in a subcategory (subcategory files created automatically) +make pipeline CATEGORY=training SUBCATEGORY=ml_workflows NAME=batch_training + +# Create pipeline in subcategory with shared utilities package +make pipeline CATEGORY=training SUBCATEGORY=ml_workflows NAME=inference CREATE_SHARED=true +``` + +This generates a nested structure: + +```text +pipelines/training/ml_workflows/ +├── __init__.py # Subcategory package +├── OWNERS # Subcategory maintainers +├── README.md # Subcategory documentation +├── shared/ # (if CREATE_SHARED=true) Shared utilities +│ ├── __init__.py +│ └── ml_workflows_utils.py # Placeholder utility file +└── batch_training/ # Your pipeline + ├── __init__.py + ├── pipeline.py + ├── metadata.yaml + ├── OWNERS + ├── README.md └── tests/ + ├── __init__.py + ├── test_pipeline_local.py + └── test_pipeline_unit.py ```
@@ -841,21 +899,6 @@ pytest tests/ --cov=. --cov-report=html - **Dependencies**: Mock external services in unit tests; use real dependencies in local runner tests - **Cleanup**: Use provided fixtures to ensure proper test environment cleanup -### Package Validation - -The validation script ensures the `packages` list in `pyproject.toml` stays in sync with the actual -Python package structure. It discovers all packages in `components/` and `pipelines/` and compares -them with the declared packages in `pyproject.toml`. - -Run the validation locally: - -```bash -uv run python -m scripts.validate_package_entries.validate_package_entries -``` - -If validation fails, update the `packages` list in `pyproject.toml` under `[tool.setuptools]` to -include any missing packages. The script will report exactly which packages are missing or extra. - ### Building Custom Container Images If your component uses a custom image, test the container build: diff --git a/pyproject.toml b/pyproject.toml index 5e6179932..16ab88a6c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,16 +50,16 @@ Issues = "https://github.com/kubeflow/pipelines-components/issues" packages = [ "kfp_components", "kfp_components.components", - "kfp_components.components.training", - "kfp_components.components.evaluation", "kfp_components.components.data_processing", "kfp_components.components.data_processing.yoda_data_processor", "kfp_components.components.deployment", + "kfp_components.components.evaluation", + "kfp_components.components.training", "kfp_components.pipelines", - "kfp_components.pipelines.training", - "kfp_components.pipelines.evaluation", "kfp_components.pipelines.data_processing", "kfp_components.pipelines.deployment", + "kfp_components.pipelines.evaluation", + "kfp_components.pipelines.training", ] [tool.setuptools.package-dir] diff --git a/scripts/generate_readme/category_index_generator.py b/scripts/generate_readme/category_index_generator.py index 0b202d43e..ed20a339d 100644 --- 
a/scripts/generate_readme/category_index_generator.py +++ b/scripts/generate_readme/category_index_generator.py @@ -1,34 +1,41 @@ -"""Category index generator for KFP components and pipelines.""" +"""Category and subcategory index generators for KFP components and pipelines.""" import logging from pathlib import Path -from typing import Dict, List, Optional +from typing import Dict, List, Optional, Tuple import yaml from jinja2 import Environment, FileSystemLoader -from scripts.generate_readme.constants import CATEGORY_README_TEMPLATE, MAX_LINE_LENGTH +from scripts.generate_readme.constants import ( + CATEGORY_README_TEMPLATE, + MAX_LINE_LENGTH, + SUBCATEGORY_README_TEMPLATE, +) from scripts.generate_readme.metadata_parser import MetadataParser from scripts.generate_readme.utils import format_title logger = logging.getLogger(__name__) -class CategoryIndexGenerator: - """Generates category-level README.md that indexes all components/pipelines in a category.""" +class _BaseIndexGenerator: + """Base class for index generators with shared Jinja2 setup and item extraction.""" - def __init__(self, category_dir: Path, is_component: bool = True): - """Initialize the category index generator. + def __init__(self, directory: Path, template_name: str, is_component: bool = True): + """Initialize the base index generator. Args: - category_dir: Path to the category directory (e.g., components/dev/). + directory: Path to the directory to index. + template_name: Name of the Jinja2 template to use. is_component: True if indexing components, False if indexing pipelines. 
""" - self.category_dir = category_dir - if category_dir.exists() is False: - raise ValueError(f"Required category directory not found: {category_dir}") + if not directory.exists(): + raise ValueError(f"Required directory not found: {directory}") + + self.directory = directory self.is_component = is_component - self.category_name = category_dir.name + self.type_name = "Components" if is_component else "Pipelines" + self._target_file = "component.py" if is_component else "pipeline.py" # Set up Jinja2 environment template_dir = Path(__file__).parent / "templates" @@ -37,38 +44,20 @@ def __init__(self, category_dir: Path, is_component: bool = True): trim_blocks=True, lstrip_blocks=True, ) - self.template = self.env.get_template(CATEGORY_README_TEMPLATE) - - def _find_items_in_category(self) -> List[Path]: - """Find all component/pipeline directories within the category. - - Returns: - List of paths to component/pipeline directories. - """ - items = [] - - # Look for subdirectories containing component.py or pipeline.py - target_file = "component.py" if self.is_component else "pipeline.py" - - for subdir in self.category_dir.iterdir(): - if subdir.is_dir() and not subdir.name.startswith("__"): - target_path = subdir / target_file - metadata_path = subdir / "metadata.yaml" - if target_path.exists() and metadata_path.exists(): - items.append(subdir) - - return items + self.template = self.env.get_template(template_name) def _get_display_name(self, item_dir: Path) -> str: - """Get the display name for an item, retrieved from the `name` field in metadata.yaml. + """Get the display name for an item from the `name` field in metadata.yaml. Args: item_dir: Path to the component/pipeline directory. Returns: The display name to use. + + Raises: + ValueError: If the `name` field is not found in metadata.yaml. 
""" - # Try to load metadata.yaml metadata_file = item_dir / "metadata.yaml" try: with open(metadata_file, "r", encoding="utf-8") as f: @@ -90,34 +79,26 @@ def _extract_item_info(self, item_dir: Path) -> Optional[Dict[str, str]]: Dictionary with 'name', 'overview', and 'link' keys, or None if extraction fails. """ try: - # Determine source file and parser - if self.is_component: - source_file = item_dir / "component.py" - parser = MetadataParser(source_file, "component") - else: - source_file = item_dir / "pipeline.py" - parser = MetadataParser(source_file, "pipeline") - - # Find the function + source_file = item_dir / self._target_file + parser_type = "component" if self.is_component else "pipeline" + parser = MetadataParser(source_file, parser_type) + function_name = parser.find_function() if not function_name: logger.warning(f"No function found in {source_file}") return None - # Extract metadata function_metadata = parser.extract_metadata(function_name) if not function_metadata: logger.warning(f"Could not extract function metadata from {source_file}") return None + name = self._get_display_name(item_dir) - # Format name to match individual README titles formatted_name = format_title(name) - # Get overview from docstring - overview = function_metadata.get("overview") + overview = function_metadata.get("overview", "") overview = overview.split("\n")[0].strip() - # Create relative link to the item's README link = f"./{item_dir.name}/README.md" return { @@ -130,30 +111,153 @@ def _extract_item_info(self, item_dir: Path) -> Optional[Dict[str, str]]: logger.warning(f"Error extracting info from {item_dir}: {e}") return None - def generate(self) -> str: - """Generate the category index README content. + def _find_asset_dirs(self) -> List[Path]: + """Find component/pipeline directories that contain a target file and metadata.yaml. + + Skips directories starting with '__' and directories named 'shared'. Returns: - Complete README.md content for the category index. 
+ List of paths to component/pipeline directories. """ - # Find all items in the category - item_dirs = self._find_items_in_category() + items = [] + for subdir in self.directory.iterdir(): + if subdir.is_dir() and not subdir.name.startswith("__") and subdir.name != "shared": + if (subdir / self._target_file).exists() and (subdir / "metadata.yaml").exists(): + items.append(subdir) + return items + + def _collect_items(self, item_dirs: List[Path]) -> List[Dict[str, str]]: + """Extract and sort item info from a list of directories. - # Extract info for each item + Args: + item_dirs: List of paths to component/pipeline directories. + + Returns: + Sorted list of item info dictionaries. + """ items = [] for item_dir in item_dirs: item_info = self._extract_item_info(item_dir) if item_info: items.append(item_info) - - # Sort items by display name items.sort(key=lambda x: x["name"]) + return items + + +class CategoryIndexGenerator(_BaseIndexGenerator): + """Generates category-level README.md that indexes all components/pipelines in a category.""" + + def __init__(self, category_dir: Path, is_component: bool = True): + """Initialize the category index generator. + + Args: + category_dir: Path to the category directory (e.g., components/dev/). + is_component: True if indexing components, False if indexing pipelines. + """ + super().__init__(category_dir, CATEGORY_README_TEMPLATE, is_component) + self.category_dir = category_dir + self.category_name = category_dir.name + + def _is_subcategory(self, subdir: Path) -> bool: + """Check if a directory is a subcategory (contains child dirs with component.py/pipeline.py). + + Args: + subdir: Path to check. + + Returns: + True if the directory is a subcategory. 
+ """ + for child in subdir.iterdir(): + if child.is_dir() and not child.name.startswith("__") and child.name != "shared": + if (child / self._target_file).exists(): + return True + return False + + def _find_items_in_category(self) -> Tuple[List[Path], List[Path]]: + """Find all component/pipeline directories and subcategories within the category. + + Returns: + Tuple of (direct_items, subcategories) where each is a list of paths. + """ + direct_items = [] + subcategories = [] + + for subdir in self.category_dir.iterdir(): + if subdir.is_dir() and not subdir.name.startswith("__"): + if (subdir / self._target_file).exists() and (subdir / "metadata.yaml").exists(): + direct_items.append(subdir) + elif self._is_subcategory(subdir): + subcategories.append(subdir) + + return direct_items, subcategories + + @staticmethod + def _extract_subcategory_info(subcat_dir: Path) -> Dict[str, str]: + """Extract display info for a subcategory directory. + + Args: + subcat_dir: Path to the subcategory directory. + + Returns: + Dictionary with 'name' and 'link' keys. + """ + return { + "name": format_title(subcat_dir.name), + "link": f"./{subcat_dir.name}/README.md", + } + + def generate(self) -> str: + """Generate the category index README content. + + Returns: + Complete README.md content for the category index. 
+ """ + item_dirs, subcategory_dirs = self._find_items_in_category() + + items = self._collect_items(item_dirs) + + subcategories = [self._extract_subcategory_info(d) for d in subcategory_dirs] + subcategories.sort(key=lambda x: x["name"]) - # Prepare template context context = { "category_name": format_title(self.category_name), "is_component": self.is_component, - "type_name": "Components" if self.is_component else "Pipelines", + "type_name": self.type_name, + "items": items, + "subcategories": subcategories, + } + + return self.template.render(**context) + + +class SubcategoryIndexGenerator(_BaseIndexGenerator): + """Generates subcategory-level README.md that indexes all components/pipelines in a subcategory.""" + + def __init__(self, subcategory_dir: Path, is_component: bool = True): + """Initialize the subcategory index generator. + + Args: + subcategory_dir: Path to the subcategory directory + (e.g., components/training/sklearn_trainer/). + is_component: True if indexing components, False if indexing pipelines. + """ + super().__init__(subcategory_dir, SUBCATEGORY_README_TEMPLATE, is_component) + self.subcategory_dir = subcategory_dir + self.subcategory_name = subcategory_dir.name + + def generate(self) -> str: + """Generate the subcategory index README content. + + Returns: + Complete README.md content for the subcategory index. 
+ """ + item_dirs = self._find_asset_dirs() + items = self._collect_items(item_dirs) + + context = { + "subcategory_name": format_title(self.subcategory_name), + "is_component": self.is_component, + "type_name": self.type_name, "items": items, } diff --git a/scripts/generate_readme/constants.py b/scripts/generate_readme/constants.py index dce99a1e4..209902167 100644 --- a/scripts/generate_readme/constants.py +++ b/scripts/generate_readme/constants.py @@ -5,6 +5,7 @@ # README Templates CATEGORY_README_TEMPLATE = "CATEGORY_README.md.j2" +SUBCATEGORY_README_TEMPLATE = "SUBCATEGORY_README.md.j2" README_TEMPLATE = "README.md.j2" # Exit codes diff --git a/scripts/generate_readme/templates/CATEGORY_README.md.j2 b/scripts/generate_readme/templates/CATEGORY_README.md.j2 index 02e889ded..b6844e1bf 100644 --- a/scripts/generate_readme/templates/CATEGORY_README.md.j2 +++ b/scripts/generate_readme/templates/CATEGORY_README.md.j2 @@ -5,3 +5,10 @@ This directory contains {{ type_name | lower }} in the **{{ category_name }}** c {% for item in items %} - [{{ item.name }}]({{ item.link }}): {{ item.overview }} {% endfor %} +{% if subcategories %} +## Subcategories + +{% for sub in subcategories %} +- [{{ sub.name }}]({{ sub.link }}) +{% endfor %} +{% endif %} diff --git a/scripts/generate_readme/templates/SUBCATEGORY_README.md.j2 b/scripts/generate_readme/templates/SUBCATEGORY_README.md.j2 new file mode 100644 index 000000000..aaed6b8ee --- /dev/null +++ b/scripts/generate_readme/templates/SUBCATEGORY_README.md.j2 @@ -0,0 +1,7 @@ +# {{ subcategory_name }} + +This subcategory contains {{ type_name | lower }} in the **{{ subcategory_name }}** group: + +{% for item in items %} +- [{{ item.name }}]({{ item.link }}): {{ item.overview }} +{% endfor %} diff --git a/scripts/generate_readme/writer.py b/scripts/generate_readme/writer.py index b54bc5a85..a935716d5 100644 --- a/scripts/generate_readme/writer.py +++ b/scripts/generate_readme/writer.py @@ -5,7 +5,10 @@ from pathlib import Path 
from typing import Optional -from scripts.generate_readme.category_index_generator import CategoryIndexGenerator +from scripts.generate_readme.category_index_generator import ( + CategoryIndexGenerator, + SubcategoryIndexGenerator, +) from scripts.generate_readme.constants import CUSTOM_CONTENT_MARKER, EXIT_ERROR from scripts.generate_readme.content_generator import ReadmeContentGenerator from scripts.generate_readme.metadata_parser import MetadataParser @@ -48,8 +51,21 @@ def __init__( self.source_file = pipeline_dir / "pipeline.py" self.function_type = "pipeline" - self.category_dir = self.source_dir.parent + self.subcategory_dir = None + parent = self.source_dir.parent + try: + if parent.parent.parent.name in {"components", "pipelines"}: + # 3-level: components//// + self.subcategory_dir = parent + self.category_dir = parent.parent + else: + # 2-level: components/// + self.category_dir = parent + except (AttributeError, ValueError): + self.category_dir = parent + self.category_index_file = self.category_dir / "README.md" + self.subcategory_index_file = self.subcategory_dir / "README.md" if self.subcategory_dir else None self.parser = MetadataParser(self.source_file, self.function_type) self.metadata_file = self.source_dir / "metadata.yaml" @@ -111,6 +127,22 @@ def _has_diff(self, expected: str, actual: Optional[str]) -> bool: return True return expected != actual + def _check_index_file(self, index_file: Path, expected_content: str) -> bool: + """Check if an index README matches expected content. + + Args: + index_file: Path to the index README file. + expected_content: The expected content. + + Returns: + True if there's a diff, False if content matches. 
+ """ + actual_content = self._read_file_content(index_file) + has_diff = self._has_diff(expected_content, actual_content) + if has_diff: + logger.warning(f"Out of sync: {index_file}") + return has_diff + def _check_category_index(self, category_content: str) -> bool: """Check if category index matches expected content. @@ -120,33 +152,60 @@ def _check_category_index(self, category_content: str) -> bool: Returns: True if there's a diff, False if content matches. """ - actual_content = self._read_file_content(self.category_index_file) - has_diff = self._has_diff(category_content, actual_content) - if has_diff: - logger.warning(f"Out of sync: {self.category_index_file}") - return has_diff + return self._check_index_file(self.category_index_file, category_content) - def _write_category_index(self, category_content: str) -> None: - """Write the category-level README index. + def _check_subcategory_index(self, subcategory_content: str) -> bool: + """Check if subcategory index matches expected content. Args: - category_content: The generated category index content to write. + subcategory_content: The expected subcategory index content. + + Returns: + True if there's a diff, False if content matches. """ - if self.category_index_file.exists(): - logger.info(f"Category index exists at {self.category_index_file}, regenerating entries.") - else: - logger.info(f"Category index does not exist yet at {self.category_index_file}, creating new file") + if self.subcategory_index_file is None: + return False + return self._check_index_file(self.subcategory_index_file, subcategory_content) - try: - with open(self.category_index_file, "w", encoding="utf-8") as f: - f.write(category_content) + def _write_index_file(self, index_file: Path, content: str, label: str) -> None: + """Write an index README file. - logger.info(f"Category index generated at {self.category_index_file}") + Args: + index_file: Path to the index README file. + content: The generated content to write. 
+ label: Human-readable label for log messages (e.g., "Category index"). + """ + if index_file.exists(): + logger.info(f"{label} exists at {index_file}, regenerating entries.") + else: + logger.info(f"{label} does not exist yet at {index_file}, creating new file") + try: + with open(index_file, "w", encoding="utf-8") as f: + f.write(content) + logger.info(f"{label} generated at {index_file}") except Exception as e: - logger.error(f"Could not write category index: {e}") + logger.error(f"Could not write {label.lower()}: {e}") sys.exit(EXIT_ERROR) + def _write_category_index(self, category_content: str) -> None: + """Write the category-level README index. + + Args: + category_content: The generated category index content to write. + """ + self._write_index_file(self.category_index_file, category_content, "Category index") + + def _write_subcategory_index(self, subcategory_content: str) -> None: + """Write the subcategory-level README index. + + Args: + subcategory_content: The generated subcategory index content to write. + """ + if self.subcategory_index_file is None: + return + self._write_index_file(self.subcategory_index_file, subcategory_content, "Subcategory index") + def _check_readme_file(self, readme_content: str) -> bool: """Check if README matches expected content. @@ -195,6 +254,11 @@ def _write_readme_file(self, readme_content: str) -> None: def generate(self, fix: bool = False) -> bool: """Generate the README documentation. + Generates up to 3 README files: + 1. The component/pipeline README (always) + 2. The subcategory index README (if in a subcategory) + 3. The category index README (always) + Args: fix: If True, write/update README files. If False, only check for diffs without writing files. 
@@ -227,19 +291,30 @@ def generate(self, fix: bool = False) -> bool: readme_content_generator = ReadmeContentGenerator(metadata, self.source_dir) readme_content = readme_content_generator.generate_readme() - # Generate category index content - index_generator = CategoryIndexGenerator(self.category_dir, self.is_component) - index_content = index_generator.generate() - - # Check for diffs (in both modes) + # Check component/pipeline README for diffs readme_has_diff = self._check_readme_file(readme_content) - category_has_diff = self._check_category_index(index_content) - has_diff = readme_has_diff or category_has_diff + has_diff = readme_has_diff + + # Generate subcategory index if we're in a subcategory + subcategory_content = None + if self.subcategory_dir: + subcategory_generator = SubcategoryIndexGenerator(self.subcategory_dir, self.is_component) + subcategory_content = subcategory_generator.generate() + subcategory_has_diff = self._check_subcategory_index(subcategory_content) + has_diff = has_diff or subcategory_has_diff + + # Generate category index content + category_generator = CategoryIndexGenerator(self.category_dir, self.is_component) + category_content = category_generator.generate() + category_has_diff = self._check_category_index(category_content) + has_diff = has_diff or category_has_diff if has_diff and fix: # Fix mode: write files self._write_readme_file(readme_content) - self._write_category_index(index_content) + if subcategory_content is not None: + self._write_subcategory_index(subcategory_content) + self._write_category_index(category_content) # Log metadata statistics logger.debug(f"README content length: {len(readme_content)} characters") diff --git a/scripts/generate_skeleton/generate_skeleton.py b/scripts/generate_skeleton/generate_skeleton.py index a2b03e5a8..686715b6e 100755 --- a/scripts/generate_skeleton/generate_skeleton.py +++ b/scripts/generate_skeleton/generate_skeleton.py @@ -11,6 +11,8 @@ --category=training 
--subcategory=sklearn_trainer --name=logistic_regression python scripts/generate_skeleton/generate_skeleton.py --type=pipeline \\ --category=ml_workflows --name=my_training_pipeline + python scripts/generate_skeleton/generate_skeleton.py --type=pipeline \\ + --category=training --subcategory=ml_workflows --name=batch_training """ import argparse @@ -178,15 +180,15 @@ def generate_subcategory_files(subcategory: str) -> dict[str, str]: # Generate a simple README for subcategory readme_content = f"""# {subcategory.replace("_", " ").title()} -This subcategory contains related components. +This subcategory contains related assets. ## Overview TODO: Add description of what this subcategory contains. -## Components +## Assets -TODO: List components in this subcategory. +TODO: List components/pipelines in this subcategory. ## Shared Utilities @@ -213,31 +215,36 @@ def ensure_subcategory_exists(skeleton_type: str, category: str, subcategory: st """ subcategory_dir = Path(f"{skeleton_type}s/{category}/{subcategory}") - # Check if this is a new subcategory - is_new_subcategory = not subcategory_dir.exists() - subcategory_dir.mkdir(parents=True, exist_ok=True) - # Create subcategory-level files only if this is a new subcategory - if is_new_subcategory: - subcategory_files = generate_subcategory_files(subcategory) - for filename, content in subcategory_files.items(): - file_path = subcategory_dir / filename - if not file_path.exists(): - file_path.write_text(content) - - # Create __init__.py for the subcategory package - init_path = subcategory_dir / "__init__.py" - if not init_path.exists(): - init_path.write_text(f'"""Components in the {subcategory} subcategory."""\n') - - # Optionally create shared/ package - if create_shared: - shared_dir = subcategory_dir / "shared" - shared_dir.mkdir(exist_ok=True) - shared_init = shared_dir / "__init__.py" - if not shared_init.exists(): - shared_init.write_text(f'"""Shared utilities for {subcategory} components."""\n') + # Create any 
missing subcategory-level files + subcategory_files = generate_subcategory_files(subcategory) + for filename, content in subcategory_files.items(): + file_path = subcategory_dir / filename + if not file_path.exists(): + file_path.write_text(content) + + # Create __init__.py for the subcategory package + init_path = subcategory_dir / "__init__.py" + if not init_path.exists(): + init_path.write_text(f'"""Assets in the {subcategory} subcategory."""\n') + + # Optionally create shared/ package + if create_shared: + shared_dir = subcategory_dir / "shared" + shared_dir.mkdir(exist_ok=True) + shared_init = shared_dir / "__init__.py" + if not shared_init.exists(): + shared_init.write_text(f'"""Shared utilities for the {subcategory} subcategory."""\n') + # Create a placeholder utility file + utils_file = shared_dir / f"{subcategory}_utils.py" + if not utils_file.exists(): + utils_file.write_text( + f'"""Shared utility functions for the {subcategory} subcategory."""\n' + "\n" + "\n" + "# TODO: Add shared utility functions, classes, or constants here.\n" + ) return subcategory_dir @@ -284,6 +291,19 @@ def generate_core_files(skeleton_type: str, category: str, name: str) -> dict[st template = env.get_template("OWNERS.j2") files["OWNERS"] = template.render(context) + # Generate placeholder README.md + title = name.replace("_", " ").title() + files["README.md"] = f"""# {title} + +## Overview + +TODO: Add description of this {skeleton_type}. + +## Usage + +TODO: Add usage examples. 
+""" + return files @@ -465,6 +485,7 @@ def main(): With subcategory: %(prog)s --type=component --category=training --subcategory=sklearn_trainer --name=logistic_regression %(prog)s --type=component --category=training --subcategory=sklearn_trainer --name=random_forest --create-shared + %(prog)s --type=pipeline --category=training --subcategory=ml_workflows --name=batch_training """, ) @@ -480,7 +501,7 @@ def main(): "--subcategory", required=False, default=None, - help="Optional subcategory within the category (components only, e.g., 'sklearn_trainer')", + help="Optional subcategory within the category (e.g., 'sklearn_trainer')", ) parser.add_argument( @@ -516,11 +537,6 @@ def main(): print("Error: --create-shared requires --subcategory to be specified") sys.exit(1) - # Validate --subcategory is only allowed for components (not pipelines) - if args.subcategory and args.type == "pipeline": - print("Error: --subcategory is only supported for components, not pipelines") - sys.exit(1) - # Validate that category exists (for new skeletons) or provide helpful guidance if not args.tests_only: existing_categories = get_existing_categories(args.type) diff --git a/scripts/generate_skeleton/tests/test_generate_skeleton.py b/scripts/generate_skeleton/tests/test_generate_skeleton.py index 441ed8e06..16a51307d 100644 --- a/scripts/generate_skeleton/tests/test_generate_skeleton.py +++ b/scripts/generate_skeleton/tests/test_generate_skeleton.py @@ -29,7 +29,7 @@ def test_generate_component_files(self): files = generate_core_files("component", "data_processing", "my_processor") # Check all expected files are generated - expected_files = ["__init__.py", "component.py", "metadata.yaml", "OWNERS"] + expected_files = ["__init__.py", "component.py", "metadata.yaml", "OWNERS", "README.md"] assert set(files.keys()) == set(expected_files) # Check content contains expected elements @@ -43,7 +43,7 @@ def test_generate_pipeline_files(self): files = generate_core_files("pipeline", "training", 
"my_pipeline") # Check all expected files are generated - expected_files = ["__init__.py", "pipeline.py", "metadata.yaml", "OWNERS"] + expected_files = ["__init__.py", "pipeline.py", "metadata.yaml", "OWNERS", "README.md"] assert set(files.keys()) == set(expected_files) # Check content contains expected elements @@ -368,6 +368,11 @@ def test_pipeline_path_without_subcategory(self): path = build_skeleton_path("pipeline", "ml_workflows", "training_pipeline") assert path == Path("pipelines/ml_workflows/training_pipeline") + def test_pipeline_path_with_subcategory(self): + """Test building pipeline path with subcategory.""" + path = build_skeleton_path("pipeline", "training", "batch_training", "ml_workflows") + assert path == Path("pipelines/training/ml_workflows/batch_training") + class TestGenerateSubcategoryFiles: """Test subcategory file generation.""" @@ -454,6 +459,26 @@ def test_does_not_overwrite_existing_files(self): finally: os.chdir(original_cwd) + def test_creates_pipeline_subcategory(self): + """Test creating a new pipeline subcategory directory with required files.""" + with tempfile.TemporaryDirectory() as temp_dir: + original_cwd = Path.cwd() + os.chdir(temp_dir) + + try: + Path("pipelines/training").mkdir(parents=True) + + subcategory_dir = ensure_subcategory_exists("pipeline", "training", "ml_workflows") + + assert subcategory_dir.exists() + assert subcategory_dir == Path("pipelines/training/ml_workflows") + assert (subcategory_dir / "OWNERS").exists() + assert (subcategory_dir / "README.md").exists() + assert (subcategory_dir / "__init__.py").exists() + + finally: + os.chdir(original_cwd) + class TestCreateSkeletonWithSubcategory: """Test create_skeleton with subcategory support.""" @@ -516,6 +541,36 @@ def test_create_component_with_subcategory_and_shared(self): finally: os.chdir(original_cwd) + def test_create_pipeline_with_subcategory(self): + """Test creating a pipeline within a subcategory.""" + with tempfile.TemporaryDirectory() as temp_dir: + 
original_cwd = Path.cwd() + os.chdir(temp_dir) + + try: + Path("pipelines/training").mkdir(parents=True) + + result_dir = create_skeleton( + "pipeline", "training", "batch_training", subcategory="ml_workflows", create_tests=True + ) + + # Check pipeline directory structure + assert result_dir.exists() + assert result_dir == Path("pipelines/training/ml_workflows/batch_training") + assert (result_dir / "pipeline.py").exists() + assert (result_dir / "metadata.yaml").exists() + assert (result_dir / "OWNERS").exists() + assert (result_dir / "tests").exists() + + # Check subcategory files were created + subcategory_dir = Path("pipelines/training/ml_workflows") + assert (subcategory_dir / "OWNERS").exists() + assert (subcategory_dir / "README.md").exists() + assert (subcategory_dir / "__init__.py").exists() + + finally: + os.chdir(original_cwd) + class TestCreateTestsOnlyWithSubcategory: """Test create_tests_only with subcategory support.""" @@ -562,3 +617,28 @@ def test_create_tests_only_missing_subcategory_component(self): finally: os.chdir(original_cwd) + + def test_create_tests_for_subcategory_pipeline(self): + """Test creating tests for a pipeline in a subcategory.""" + with tempfile.TemporaryDirectory() as temp_dir: + original_cwd = Path.cwd() + os.chdir(temp_dir) + + try: + # First create pipeline skeleton without tests + Path("pipelines/training").mkdir(parents=True) + create_skeleton( + "pipeline", "training", "batch_training", subcategory="ml_workflows", create_tests=False + ) + + # Now create tests + tests_dir = create_tests_only("pipeline", "training", "batch_training", subcategory="ml_workflows") + + # Check tests were created + assert tests_dir.exists() + assert (tests_dir / "__init__.py").exists() + assert (tests_dir / "test_pipeline_unit.py").exists() + assert (tests_dir / "test_pipeline_local.py").exists() + + finally: + os.chdir(original_cwd) diff --git a/scripts/sync_packages.py b/scripts/sync_packages.py new file mode 100644 index 000000000..318f0840a 
--- /dev/null +++ b/scripts/sync_packages.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python3 +"""Sync the packages list in pyproject.toml with discovered packages. + +Discovers packages under components/ and pipelines/, maps them to the +kfp_components.* namespace, and updates the static packages list in +pyproject.toml. + +Usage: + uv run scripts/sync_packages.py +""" + +import re +from pathlib import Path + +from setuptools import find_packages + +REPO_ROOT = Path(__file__).resolve().parent.parent + + +def discover_packages() -> list[str]: + """Discover packages and map to kfp_components namespace.""" + physical = find_packages( + where=str(REPO_ROOT), + include=["components", "components.*", "pipelines", "pipelines.*"], + exclude=["*.tests", "*.tests.*"], + ) + return sorted(["kfp_components"] + [f"kfp_components.{p}" for p in physical]) + + +def sync_packages() -> None: + """Update the packages list in pyproject.toml.""" + pyproject = REPO_ROOT / "pyproject.toml" + content = pyproject.read_text() + packages = discover_packages() + + lines = ",\n".join([f' "{p}"' for p in packages]) + new_block = f"packages = [\n{lines},\n]" + + updated = re.sub( + r"packages\s*=\s*\[.*?\]", + new_block, + content, + count=1, + flags=re.DOTALL, + ) + + if updated == content: + print("pyproject.toml packages already in sync.") + return + + pyproject.write_text(updated) + print(f"Synced {len(packages)} packages in pyproject.toml") + + +if __name__ == "__main__": + sync_packages() diff --git a/scripts/validate_metadata/validate_metadata.py b/scripts/validate_metadata/validate_metadata.py index e84c483b9..9b7801784 100644 --- a/scripts/validate_metadata/validate_metadata.py +++ b/scripts/validate_metadata/validate_metadata.py @@ -62,7 +62,7 @@ def parse_args() -> argparse.Namespace: "--dir", type=validate_dir, required=True, - help="Path to the component or pipeline directory (must contain OWNERS and metadata.yaml files)", + help="Path to a component/pipeline directory or a subcategory 
containing multiple components/pipelines", ) return parser.parse_args() From b7ed7737121ac29e53ce9e8c1360dffd8495c7ad Mon Sep 17 00:00:00 2001 From: Vani Haripriya Mudadla Date: Wed, 11 Feb 2026 14:23:06 -0600 Subject: [PATCH 3/3] Created sample component and pipeline under subcategories Signed-off-by: Vani Haripriya Mudadla --- components/training/README.md | 7 ++++ components/training/sklearn_models/OWNERS | 7 ++++ components/training/sklearn_models/README.md | 5 +++ .../training/sklearn_models/__init__.py | 1 + .../sklearn_models/logistic_regression/OWNERS | 7 ++++ .../logistic_regression/README.md | 39 +++++++++++++++++++ .../logistic_regression/__init__.py | 3 ++ .../logistic_regression/component.py | 34 ++++++++++++++++ .../logistic_regression/metadata.yaml | 17 ++++++++ .../logistic_regression/tests/__init__.py | 1 + .../tests/test_component_local.py | 17 ++++++++ .../tests/test_component_unit.py | 27 +++++++++++++ .../sklearn_models/shared/__init__.py | 1 + .../shared/sklearn_models_utils.py | 4 ++ pipelines/training/README.md | 7 ++++ pipelines/training/ml_workflows/OWNERS | 7 ++++ pipelines/training/ml_workflows/README.md | 5 +++ pipelines/training/ml_workflows/__init__.py | 1 + .../ml_workflows/batch_training/OWNERS | 7 ++++ .../ml_workflows/batch_training/README.md | 30 ++++++++++++++ .../ml_workflows/batch_training/__init__.py | 3 ++ .../ml_workflows/batch_training/metadata.yaml | 18 +++++++++ .../ml_workflows/batch_training/pipeline.py | 31 +++++++++++++++ .../batch_training/tests/__init__.py | 1 + .../tests/test_pipeline_local.py | 17 ++++++++ .../tests/test_pipeline_unit.py | 27 +++++++++++++ .../training/ml_workflows/shared/__init__.py | 1 + .../ml_workflows/shared/ml_workflows_utils.py | 4 ++ pyproject.toml | 6 +++ 29 files changed, 335 insertions(+) create mode 100644 components/training/README.md create mode 100644 components/training/sklearn_models/OWNERS create mode 100644 components/training/sklearn_models/README.md create mode 100644 
components/training/sklearn_models/__init__.py create mode 100644 components/training/sklearn_models/logistic_regression/OWNERS create mode 100644 components/training/sklearn_models/logistic_regression/README.md create mode 100644 components/training/sklearn_models/logistic_regression/__init__.py create mode 100644 components/training/sklearn_models/logistic_regression/component.py create mode 100644 components/training/sklearn_models/logistic_regression/metadata.yaml create mode 100644 components/training/sklearn_models/logistic_regression/tests/__init__.py create mode 100644 components/training/sklearn_models/logistic_regression/tests/test_component_local.py create mode 100644 components/training/sklearn_models/logistic_regression/tests/test_component_unit.py create mode 100644 components/training/sklearn_models/shared/__init__.py create mode 100644 components/training/sklearn_models/shared/sklearn_models_utils.py create mode 100644 pipelines/training/README.md create mode 100644 pipelines/training/ml_workflows/OWNERS create mode 100644 pipelines/training/ml_workflows/README.md create mode 100644 pipelines/training/ml_workflows/__init__.py create mode 100644 pipelines/training/ml_workflows/batch_training/OWNERS create mode 100644 pipelines/training/ml_workflows/batch_training/README.md create mode 100644 pipelines/training/ml_workflows/batch_training/__init__.py create mode 100644 pipelines/training/ml_workflows/batch_training/metadata.yaml create mode 100644 pipelines/training/ml_workflows/batch_training/pipeline.py create mode 100644 pipelines/training/ml_workflows/batch_training/tests/__init__.py create mode 100644 pipelines/training/ml_workflows/batch_training/tests/test_pipeline_local.py create mode 100644 pipelines/training/ml_workflows/batch_training/tests/test_pipeline_unit.py create mode 100644 pipelines/training/ml_workflows/shared/__init__.py create mode 100644 pipelines/training/ml_workflows/shared/ml_workflows_utils.py diff --git 
a/components/training/README.md b/components/training/README.md new file mode 100644 index 000000000..c301e306c --- /dev/null +++ b/components/training/README.md @@ -0,0 +1,7 @@ +# Training Components + +This directory contains components in the **Training** category: + +## Subcategories + +- [Sklearn Models](./sklearn_models/README.md) diff --git a/components/training/sklearn_models/OWNERS b/components/training/sklearn_models/OWNERS new file mode 100644 index 000000000..3423f6e47 --- /dev/null +++ b/components/training/sklearn_models/OWNERS @@ -0,0 +1,7 @@ +approvers: + # TODO: Add your GitHub username here (must be a Kubeflow community member) + # - your-github-username +reviewers: + # TODO: Add reviewers' GitHub usernames here + # - reviewer1 + # - reviewer2 \ No newline at end of file diff --git a/components/training/sklearn_models/README.md b/components/training/sklearn_models/README.md new file mode 100644 index 000000000..43209e35d --- /dev/null +++ b/components/training/sklearn_models/README.md @@ -0,0 +1,5 @@ +# Sklearn Models + +This subcategory contains components in the **Sklearn Models** group: + +- [Logistic Regression](./logistic_regression/README.md): Logistic Regression component. 
diff --git a/components/training/sklearn_models/__init__.py b/components/training/sklearn_models/__init__.py new file mode 100644 index 000000000..3dbd4594d --- /dev/null +++ b/components/training/sklearn_models/__init__.py @@ -0,0 +1 @@ +"""Assets in the sklearn_models subcategory.""" diff --git a/components/training/sklearn_models/logistic_regression/OWNERS b/components/training/sklearn_models/logistic_regression/OWNERS new file mode 100644 index 000000000..3423f6e47 --- /dev/null +++ b/components/training/sklearn_models/logistic_regression/OWNERS @@ -0,0 +1,7 @@ +approvers: + # TODO: Add your GitHub username here (must be a Kubeflow community member) + # - your-github-username +reviewers: + # TODO: Add reviewers' GitHub usernames here + # - reviewer1 + # - reviewer2 \ No newline at end of file diff --git a/components/training/sklearn_models/logistic_regression/README.md b/components/training/sklearn_models/logistic_regression/README.md new file mode 100644 index 000000000..dc321ee25 --- /dev/null +++ b/components/training/sklearn_models/logistic_regression/README.md @@ -0,0 +1,39 @@ +# Logistic Regression ✨ + +> ⚠️ **Stability: alpha** — This asset is not yet stable and may change. + +## Overview 🧾 + +Logistic Regression component. + +TODO: Add a detailed description of what this component does. + +Args: input_param: Description of the component parameter. # Add descriptions for other parameters + +Returns: Description of what the component returns. 
+ +## Inputs 📥 + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `input_param` | `str` | `None` | | + +## Outputs 📤 + +| Name | Type | Description | +|------|------|-------------| +| Output | `str` | | + +## Metadata 🗂️ + +- **Name**: logistic_regression +- **Stability**: alpha +- **Dependencies**: + - Kubeflow: + - Name: Pipelines, Version: >=2.15.2 +- **Tags**: + - training +- **Last Verified**: 2026-02-11 20:18:36+00:00 +- **Owners**: + - Approvers: None + - Reviewers: None diff --git a/components/training/sklearn_models/logistic_regression/__init__.py b/components/training/sklearn_models/logistic_regression/__init__.py new file mode 100644 index 000000000..9474e0323 --- /dev/null +++ b/components/training/sklearn_models/logistic_regression/__init__.py @@ -0,0 +1,3 @@ +from .component import logistic_regression + +__all__ = ["logistic_regression"] diff --git a/components/training/sklearn_models/logistic_regression/component.py b/components/training/sklearn_models/logistic_regression/component.py new file mode 100644 index 000000000..8a0b38ee9 --- /dev/null +++ b/components/training/sklearn_models/logistic_regression/component.py @@ -0,0 +1,34 @@ +from kfp import dsl + + +@dsl.component( + base_image="python:3.11", + # packages_to_install=["numpy", "pandas"], # Add your dependencies here +) +def logistic_regression( + # Add your component parameters here + input_param: str, + # Add your output artifacts here + # output_artifact: dsl.Output[dsl.Artifact] +) -> str: # Specify your return type + """Logistic Regression component. + + TODO: Add a detailed description of what this component does. + + Args: + input_param: Description of the component parameter. + # Add descriptions for other parameters + + Returns: + Description of what the component returns. 
+ """ + # TODO: Implement your component logic here + + +if __name__ == "__main__": + from kfp.compiler import Compiler + + Compiler().compile( + logistic_regression, + package_path=__file__.replace(".py", "_component.yaml"), + ) diff --git a/components/training/sklearn_models/logistic_regression/metadata.yaml b/components/training/sklearn_models/logistic_regression/metadata.yaml new file mode 100644 index 000000000..23ed8dd88 --- /dev/null +++ b/components/training/sklearn_models/logistic_regression/metadata.yaml @@ -0,0 +1,17 @@ +--- +name: logistic_regression +stability: alpha # New component without proven track record +dependencies: + kubeflow: + - name: Pipelines + version: '>=2.15.2' + # external_services: # Add if component uses external services + # - name: Example Service + # version: ">=1.0.0" +tags: + - training + # Add more relevant tags here +lastVerified: 2026-02-11T20:18:36Z +# links: # Add relevant links +# documentation: https://your-docs-url.com +# issue_tracker: https://github.com/kubeflow/pipelines-components/issues diff --git a/components/training/sklearn_models/logistic_regression/tests/__init__.py b/components/training/sklearn_models/logistic_regression/tests/__init__.py new file mode 100644 index 000000000..92cbc4c96 --- /dev/null +++ b/components/training/sklearn_models/logistic_regression/tests/__init__.py @@ -0,0 +1 @@ +# Test package for component tests diff --git a/components/training/sklearn_models/logistic_regression/tests/test_component_local.py b/components/training/sklearn_models/logistic_regression/tests/test_component_local.py new file mode 100644 index 000000000..ec36beff0 --- /dev/null +++ b/components/training/sklearn_models/logistic_regression/tests/test_component_local.py @@ -0,0 +1,17 @@ +"""Local runner tests for the logistic_regression component.""" + +from ..component import logistic_regression + + +class TestLogisticRegressionLocalRunner: + """Test component with LocalRunner (subprocess execution).""" + + def 
test_local_execution(self, setup_and_teardown_subprocess_runner): # noqa: F811 + """Test component execution with LocalRunner.""" + # TODO: Implement local runner tests for your component + + # Example test structure: + result = logistic_regression(input_param="test_value") + + # Add assertions about expected outputs if needed + assert result is not None diff --git a/components/training/sklearn_models/logistic_regression/tests/test_component_unit.py b/components/training/sklearn_models/logistic_regression/tests/test_component_unit.py new file mode 100644 index 000000000..2acffb144 --- /dev/null +++ b/components/training/sklearn_models/logistic_regression/tests/test_component_unit.py @@ -0,0 +1,27 @@ +"""Tests for the logistic_regression component.""" + +from ..component import logistic_regression + + +class TestLogisticRegressionUnitTests: + """Unit tests for component logic.""" + + def test_component_function_exists(self): + """Test that the component function is properly imported.""" + assert callable(logistic_regression) + assert hasattr(logistic_regression, "python_func") + + def test_component_with_default_parameters(self): + """Test component with valid input parameters.""" + # TODO: Implement unit tests for your component + + # Example test structure: + result = logistic_regression.python_func(input_param="test_value") + assert isinstance(result, str) + assert "test_value" in result + + # TODO: Add more comprehensive unit tests + # @mock.patch("external_library.some_function") + # def test_component_with_mocked_dependencies(self, mock_function): + # """Test component behavior with mocked external calls.""" + # pass diff --git a/components/training/sklearn_models/shared/__init__.py b/components/training/sklearn_models/shared/__init__.py new file mode 100644 index 000000000..48e5b99a9 --- /dev/null +++ b/components/training/sklearn_models/shared/__init__.py @@ -0,0 +1 @@ +"""Shared utilities for the sklearn_models subcategory.""" diff --git 
a/components/training/sklearn_models/shared/sklearn_models_utils.py b/components/training/sklearn_models/shared/sklearn_models_utils.py new file mode 100644 index 000000000..3e8a8f588 --- /dev/null +++ b/components/training/sklearn_models/shared/sklearn_models_utils.py @@ -0,0 +1,4 @@ +"""Shared utility functions for the sklearn_models subcategory.""" + + +# TODO: Add shared utility functions, classes, or constants here. diff --git a/pipelines/training/README.md b/pipelines/training/README.md new file mode 100644 index 000000000..d7f473063 --- /dev/null +++ b/pipelines/training/README.md @@ -0,0 +1,7 @@ +# Training Pipelines + +This directory contains pipelines in the **Training** category: + +## Subcategories + +- [Ml Workflows](./ml_workflows/README.md) diff --git a/pipelines/training/ml_workflows/OWNERS b/pipelines/training/ml_workflows/OWNERS new file mode 100644 index 000000000..3423f6e47 --- /dev/null +++ b/pipelines/training/ml_workflows/OWNERS @@ -0,0 +1,7 @@ +approvers: + # TODO: Add your GitHub username here (must be a Kubeflow community member) + # - your-github-username +reviewers: + # TODO: Add reviewers' GitHub usernames here + # - reviewer1 + # - reviewer2 \ No newline at end of file diff --git a/pipelines/training/ml_workflows/README.md b/pipelines/training/ml_workflows/README.md new file mode 100644 index 000000000..82615ec09 --- /dev/null +++ b/pipelines/training/ml_workflows/README.md @@ -0,0 +1,5 @@ +# Ml Workflows + +This subcategory contains pipelines in the **Ml Workflows** group: + +- [Batch Training](./batch_training/README.md): TODO: Add pipeline description. 
diff --git a/pipelines/training/ml_workflows/__init__.py b/pipelines/training/ml_workflows/__init__.py new file mode 100644 index 000000000..562b1f6b0 --- /dev/null +++ b/pipelines/training/ml_workflows/__init__.py @@ -0,0 +1 @@ +"""Assets in the ml_workflows subcategory.""" diff --git a/pipelines/training/ml_workflows/batch_training/OWNERS b/pipelines/training/ml_workflows/batch_training/OWNERS new file mode 100644 index 000000000..3423f6e47 --- /dev/null +++ b/pipelines/training/ml_workflows/batch_training/OWNERS @@ -0,0 +1,7 @@ +approvers: + # TODO: Add your GitHub username here (must be a Kubeflow community member) + # - your-github-username +reviewers: + # TODO: Add reviewers' GitHub usernames here + # - reviewer1 + # - reviewer2 \ No newline at end of file diff --git a/pipelines/training/ml_workflows/batch_training/README.md b/pipelines/training/ml_workflows/batch_training/README.md new file mode 100644 index 000000000..62c3dd5ec --- /dev/null +++ b/pipelines/training/ml_workflows/batch_training/README.md @@ -0,0 +1,30 @@ +# Batch Training ✨ + +> ⚠️ **Stability: alpha** — This asset is not yet stable and may change. + +## Overview 🧾 + +TODO: Add pipeline description. + +TODO: Add a detailed description of what this pipeline does. + +## Inputs 📥 + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `input_param` | `str` | `default_value` | Description of the pipeline parameter. 
| + +## Metadata 🗂️ + +- **Name**: batch_training +- **Stability**: alpha +- **Dependencies**: + - Kubeflow: + - Name: Pipelines, Version: >=2.15.2 +- **Tags**: + - training + - pipeline +- **Last Verified**: 2026-02-11 20:18:46+00:00 +- **Owners**: + - Approvers: None + - Reviewers: None diff --git a/pipelines/training/ml_workflows/batch_training/__init__.py b/pipelines/training/ml_workflows/batch_training/__init__.py new file mode 100644 index 000000000..6a2953237 --- /dev/null +++ b/pipelines/training/ml_workflows/batch_training/__init__.py @@ -0,0 +1,3 @@ +from .pipeline import batch_training + +__all__ = ["batch_training"] diff --git a/pipelines/training/ml_workflows/batch_training/metadata.yaml b/pipelines/training/ml_workflows/batch_training/metadata.yaml new file mode 100644 index 000000000..3394eddf8 --- /dev/null +++ b/pipelines/training/ml_workflows/batch_training/metadata.yaml @@ -0,0 +1,18 @@ +--- +name: batch_training +stability: alpha # New pipeline without proven track record +dependencies: + kubeflow: + - name: Pipelines + version: '>=2.15.2' + # external_services: # Add if pipeline uses external services + # - name: Example Service + # version: ">=1.0.0" +tags: + - training + - pipeline + # Add more relevant tags here +lastVerified: 2026-02-11T20:18:46Z +# links: # Add relevant links +# documentation: https://your-docs-url.com +# issue_tracker: https://github.com/kubeflow/pipelines-components/issues diff --git a/pipelines/training/ml_workflows/batch_training/pipeline.py b/pipelines/training/ml_workflows/batch_training/pipeline.py new file mode 100644 index 000000000..b43e7814c --- /dev/null +++ b/pipelines/training/ml_workflows/batch_training/pipeline.py @@ -0,0 +1,31 @@ +from kfp import dsl + + +@dsl.pipeline( + name="batch-training", + description="TODO: Add a brief description of this pipeline", +) +def batch_training( + # Add your pipeline parameters here + input_param: str = "default_value", +): + """TODO: Add pipeline description. 
+ + TODO: Add a detailed description of what this pipeline does. + + Args: + input_param: Description of the pipeline parameter. + + Returns: + Pipeline outputs or None if no outputs are needed. + """ + # TODO: Implement your pipeline logic here + + +if __name__ == "__main__": + from kfp.compiler import Compiler + + Compiler().compile( + batch_training, + package_path=__file__.replace(".py", "_pipeline.yaml"), + ) diff --git a/pipelines/training/ml_workflows/batch_training/tests/__init__.py b/pipelines/training/ml_workflows/batch_training/tests/__init__.py new file mode 100644 index 000000000..d9eb4f378 --- /dev/null +++ b/pipelines/training/ml_workflows/batch_training/tests/__init__.py @@ -0,0 +1 @@ +# Test package for pipeline tests diff --git a/pipelines/training/ml_workflows/batch_training/tests/test_pipeline_local.py b/pipelines/training/ml_workflows/batch_training/tests/test_pipeline_local.py new file mode 100644 index 000000000..43558a7e8 --- /dev/null +++ b/pipelines/training/ml_workflows/batch_training/tests/test_pipeline_local.py @@ -0,0 +1,17 @@ +"""Local runner tests for the batch_training pipeline.""" + +from ..pipeline import batch_training + + +class TestBatchTrainingLocalRunner: + """Test pipeline with LocalRunner (subprocess execution).""" + + def test_local_execution(self, setup_and_teardown_subprocess_runner): # noqa: F811 + """Test pipeline execution with LocalRunner.""" + # TODO: Implement local runner tests for your pipeline + + # Example test structure: + result = batch_training(input_param="test_value") + + # Add assertions about expected outputs if needed + assert result is not None diff --git a/pipelines/training/ml_workflows/batch_training/tests/test_pipeline_unit.py b/pipelines/training/ml_workflows/batch_training/tests/test_pipeline_unit.py new file mode 100644 index 000000000..650ef559c --- /dev/null +++ b/pipelines/training/ml_workflows/batch_training/tests/test_pipeline_unit.py @@ -0,0 +1,27 @@ +"""Tests for the batch_training 
pipeline.""" + +from ..pipeline import batch_training + + +class TestBatchTrainingUnitTests: + """Unit tests for pipeline logic.""" + + def test_pipeline_function_exists(self): + """Test that the pipeline function is properly imported.""" + assert callable(batch_training) + assert hasattr(batch_training, "python_func") + + def test_pipeline_with_default_parameters(self): + """Test pipeline with valid input parameters.""" + # TODO: Implement unit tests for your pipeline + + # Example test structure: + result = batch_training.python_func(input_param="test_value") + assert isinstance(result, str) + assert "test_value" in result + + # TODO: Add more comprehensive unit tests + # @mock.patch("external_library.some_function") + # def test_pipeline_with_mocked_dependencies(self, mock_function): + # """Test pipeline behavior with mocked external calls.""" + # pass diff --git a/pipelines/training/ml_workflows/shared/__init__.py b/pipelines/training/ml_workflows/shared/__init__.py new file mode 100644 index 000000000..1c2bc45d9 --- /dev/null +++ b/pipelines/training/ml_workflows/shared/__init__.py @@ -0,0 +1 @@ +"""Shared utilities for the ml_workflows subcategory.""" diff --git a/pipelines/training/ml_workflows/shared/ml_workflows_utils.py b/pipelines/training/ml_workflows/shared/ml_workflows_utils.py new file mode 100644 index 000000000..ad3af8a7c --- /dev/null +++ b/pipelines/training/ml_workflows/shared/ml_workflows_utils.py @@ -0,0 +1,4 @@ +"""Shared utility functions for the ml_workflows subcategory.""" + + +# TODO: Add shared utility functions, classes, or constants here. 
diff --git a/pyproject.toml b/pyproject.toml index 16ab88a6c..e52fa9139 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,11 +55,17 @@ packages = [ "kfp_components.components.deployment", "kfp_components.components.evaluation", "kfp_components.components.training", + "kfp_components.components.training.sklearn_models", + "kfp_components.components.training.sklearn_models.logistic_regression", + "kfp_components.components.training.sklearn_models.shared", "kfp_components.pipelines", "kfp_components.pipelines.data_processing", "kfp_components.pipelines.deployment", "kfp_components.pipelines.evaluation", "kfp_components.pipelines.training", + "kfp_components.pipelines.training.ml_workflows", + "kfp_components.pipelines.training.ml_workflows.batch_training", + "kfp_components.pipelines.training.ml_workflows.shared", ] [tool.setuptools.package-dir]