From 4f9b409e064b5e13d7c53ac9bfdf4dccede5e00f Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Tue, 24 Feb 2026 08:45:42 +0530 Subject: [PATCH 01/10] Integration issue fixes --- .../image_creation/tasks/build_base_image.yml | 19 +++ .../templates/base_image_template.j2 | 4 - .../templates/compute_images_templates.j2 | 5 - .../image_creation/tasks/build_base_image.yml | 19 +++ .../templates/base_image_template.j2 | 6 +- .../templates/compute_images_templates.j2 | 5 - build_stream/api/local_repo/routes.py | 2 +- build_stream/infra/db/alembic.ini | 2 +- .../nfs_playbook_queue_result_repository.py | 39 ++++-- .../orchestrator/local_repo/result_poller.py | 114 +++++++++++------- .../tasks/deploy_build_stream.yml | 6 +- 11 files changed, 141 insertions(+), 80 deletions(-) diff --git a/build_image_aarch64/roles/image_creation/tasks/build_base_image.yml b/build_image_aarch64/roles/image_creation/tasks/build_base_image.yml index 463bfe8131..cd09de18b9 100644 --- a/build_image_aarch64/roles/image_creation/tasks/build_base_image.yml +++ b/build_image_aarch64/roles/image_creation/tasks/build_base_image.yml @@ -13,6 +13,22 @@ # limitations under the License. --- +- name: Normalize build stream inputs for base image + ansible.builtin.set_fact: + enable_build_stream: "{{ enable_build_stream | default(false) | bool }}" + build_stream_job_id: "{{ build_stream_job_id | default('') }}" + image_key: "{{ image_key | default('') }}" + base_image_suffix: "" + +- name: Set base image suffix when build stream inputs present + ansible.builtin.set_fact: + base_image_suffix: "{{ ('-' ~ build_stream_job_id ~ '-' ~ (image_key | default(''))) | regex_replace('^-+', '') }}" + rhel_base_image_name: "{{ rhel_aarch64_base_image_name ~ ('-' ~ build_stream_job_id ~ '-' ~ (image_key | default(''))) | regex_replace('^-+', '') }}" + when: + - enable_build_stream | bool + - (build_stream_job_id | default('') | length) > 0 + - (image_key | default('') | length) > 0 + - name: Create temporary inventory with ochami group ansible.builtin.copy: dest: "{{ aarch64_inventory_file }}" @@ -32,6 +48,9 @@ src: "{{ openchami_base_image_vars_template }}" dest: "{{ openchami_aarch64_base_image_vars_path }}" mode: "{{ dir_permissions_644 }}" + vars: + # Override rhel_base_image_name for build stream + rhel_base_image_name: "{{ rhel_base_image_name | default(rhel_aarch64_base_image_name) }}" - name: Invoking Openchami playbook for rhel-base image build ansible.builtin.shell: | diff --git a/build_image_aarch64/roles/image_creation/templates/base_image_template.j2 b/build_image_aarch64/roles/image_creation/templates/base_image_template.j2 index 2b672750f6..df117cb5c6 100644 --- a/build_image_aarch64/roles/image_creation/templates/base_image_template.j2 +++ b/build_image_aarch64/roles/image_creation/templates/base_image_template.j2 @@ -1,10 +1,6 @@ openchami_work_dir: "{{ openchami_work_dir }}" rhel_tag: "{{ rhel_tag }}" -{% if enable_build_stream %} -rhel_base_image_name: "{{ rhel_aarch64_base_image_name }}-{{ build_stream_image_key }}-{{ build_stream_job_id }}" -{% else %} rhel_base_image_name: "{{ rhel_aarch64_base_image_name }}" -{% endif %} rhel_base_image: "{{ oim_node_name }}/{{ rhel_aarch64_base_image_name }}" cluster_name: "{{ oim_node_name }}" cluster_domain: "{{ domain_name }}" diff --git a/build_image_aarch64/roles/image_creation/templates/compute_images_templates.j2 b/build_image_aarch64/roles/image_creation/templates/compute_images_templates.j2 index 3ffacc8538..0ab9cb2298 100644 --- a/build_image_aarch64/roles/image_creation/templates/compute_images_templates.j2 +++ b/build_image_aarch64/roles/image_creation/templates/compute_images_templates.j2 @@ -1,12 +1,7 @@ openchami_work_dir: "{{ openchami_work_dir }}" rhel_tag: "{{ rhel_tag }}" -{% if enable_build_stream %} -rhel_base_image: "{{ oim_node_name }}/{{ rhel_aarch64_base_image_name }}-{{ build_stream_image_key }}-{{ build_stream_job_id }}" -{% set image_name_suffix = '-' + build_stream_image_key + '-' + build_stream_job_id %} -{% else %} rhel_base_image: "{{ oim_node_name }}/{{ rhel_aarch64_base_image_name }}" {% set image_name_suffix = compute_image_suffix | default('') %} -{% endif %} base_compute_image_name: "{{ item.key }}{{ image_name_suffix }}" rhel_base_compute_image_name: "rhel-{{ item.key }}{{ image_name_suffix }}" rhel_base_compute_image: "{{ oim_node_name }}/rhel-{{ item.key }}{{ image_name_suffix }}" diff --git a/build_image_x86_64/roles/image_creation/tasks/build_base_image.yml b/build_image_x86_64/roles/image_creation/tasks/build_base_image.yml index de6b2e0c33..a4a2c56d96 100644 --- a/build_image_x86_64/roles/image_creation/tasks/build_base_image.yml +++ b/build_image_x86_64/roles/image_creation/tasks/build_base_image.yml @@ -13,6 +13,22 @@ # limitations under the License. --- +- name: Normalize build stream inputs for base image + ansible.builtin.set_fact: + enable_build_stream: "{{ enable_build_stream | default(false) | bool }}" + build_stream_job_id: "{{ build_stream_job_id | default('') }}" + image_key: "{{ image_key | default('') }}" + base_image_suffix: "" + +- name: Set base image suffix when build stream inputs present + ansible.builtin.set_fact: + base_image_suffix: "{{ ('-' ~ build_stream_job_id ~ '-' ~ (image_key | default(''))) | regex_replace('^-+', '') }}" + rhel_base_image_name: "{{ rhel_x86_64_base_image_name ~ ('-' ~ build_stream_job_id ~ '-' ~ (image_key | default(''))) | regex_replace('^-+', '') }}" + when: + - enable_build_stream | bool + - (build_stream_job_id | default('') | length) > 0 + - (image_key | default('') | length) > 0 + - name: Create x86_64_base_image.log as a file ansible.builtin.file: path: "{{ openchami_x86_64_base_image_log_path }}" @@ -24,6 +40,9 @@ src: "{{ openchami_base_image_vars_template }}" dest: "{{ openchami_x86_64_base_image_vars_path }}" mode: "{{ dir_permissions_644 }}" + vars: + # Override rhel_base_image_name for build stream + rhel_base_image_name: "{{ rhel_base_image_name | default(rhel_x86_64_base_image_name) }}" - name: Invoking Openchami playbook for rhel-base image build ansible.builtin.shell: | diff --git a/build_image_x86_64/roles/image_creation/templates/base_image_template.j2 b/build_image_x86_64/roles/image_creation/templates/base_image_template.j2 index 8ad96d8342..55033fb4d3 100644 --- a/build_image_x86_64/roles/image_creation/templates/base_image_template.j2 +++ b/build_image_x86_64/roles/image_creation/templates/base_image_template.j2 @@ -1,10 +1,6 @@ openchami_work_dir: "{{ openchami_work_dir }}" -{% if enable_build_stream %} -rhel_base_image_name: "{{ rhel_x86_64_base_image_name }}-{{ build_stream_image_key }}-{{ build_stream_job_id }}" -{% else %} rhel_base_image_name: "{{ rhel_x86_64_base_image_name }}" -{% endif %} -rhel_base_image: "{{ oim_node_name }}/{{ rhel_x86_64_base_image_name }}" +rhel_base_image: "{{ oim_node_name }}/{{ rhel_base_image_name }}" cluster_name: "{{ oim_node_name }}" cluster_domain: "{{ domain_name }}" rhel_base_mounts: {{ ochami_mounts | join(' ') }} diff --git a/build_image_x86_64/roles/image_creation/templates/compute_images_templates.j2 b/build_image_x86_64/roles/image_creation/templates/compute_images_templates.j2 index f979671432..aa7c6c2080 100644 --- a/build_image_x86_64/roles/image_creation/templates/compute_images_templates.j2 +++ b/build_image_x86_64/roles/image_creation/templates/compute_images_templates.j2 @@ -1,11 +1,6 @@ openchami_work_dir: "{{ openchami_work_dir }}" -{% if enable_build_stream %} -rhel_base_image: "{{ oim_node_name }}/rhel-{{ build_stream_image_key }}-{{ build_stream_job_id }}" -{% set image_name_suffix = '-' + build_stream_image_key + '-' + build_stream_job_id %} -{% else %} rhel_base_image: "{{ oim_node_name }}/{{ rhel_x86_64_base_image_name }}" {% set image_name_suffix = compute_image_suffix | default('') %} -{% endif %} base_compute_image_name: "{{ item.key }}{{ image_name_suffix }}" rhel_base_compute_image_name: "rhel-{{ item.key }}{{ image_name_suffix }}" rhel_base_compute_image: "{{ oim_node_name }}/rhel-{{ item.key }}{{ image_name_suffix }}" diff --git a/build_stream/api/local_repo/routes.py b/build_stream/api/local_repo/routes.py index 64bf489f75..6f973022ae 100644 --- a/build_stream/api/local_repo/routes.py +++ b/build_stream/api/local_repo/routes.py @@ -160,7 +160,7 @@ def create_local_repository( ) # Provide helpful message for terminal state violations if exc.state == "FAILED": - message = f"Job {job_id} stage is in {exc.state} state and cannot be retried. Reset the stage using /stages/create-local-repository/reset endpoint." + message = f"Job {job_id} stage is in {exc.state} state and cannot be retried. Please create a new job to proceed." else: message = f"Job {job_id} stage is in {exc.state} state and cannot be modified." diff --git a/build_stream/infra/db/alembic.ini b/build_stream/infra/db/alembic.ini index 4cd223e550..7c76dd4651 100644 --- a/build_stream/infra/db/alembic.ini +++ b/build_stream/infra/db/alembic.ini @@ -1,6 +1,6 @@ [alembic] script_location = %(here)s/alembic -sqlalchemy.url = postgresql://build_stream:build_stream@localhost:5432/build_stream_db +sqlalchemy.url = postgresql://%(DB_USER)s:%(DB_PASSWORD)s@%(DB_HOST)s:5432/%(DB_NAME)s [loggers] keys = root,sqlalchemy,alembic diff --git a/build_stream/infra/repositories/nfs_playbook_queue_result_repository.py b/build_stream/infra/repositories/nfs_playbook_queue_result_repository.py index ddd801a9e4..ec32cc20da 100644 --- a/build_stream/infra/repositories/nfs_playbook_queue_result_repository.py +++ b/build_stream/infra/repositories/nfs_playbook_queue_result_repository.py @@ -49,6 +49,9 @@ def __init__(self, queue_base_path: str = DEFAULT_QUEUE_BASE) -> None: self._results_dir = self._queue_base / RESULTS_DIR_NAME self._archive_dir = self._queue_base / ARCHIVE_DIR_NAME self._processed_files: Set[str] = set() + # Clear cache on startup to ensure we don't miss any files + self.clear_processed_cache() + logger.info("Initialized NfsPlaybookQueueResultRepository with cleared cache") def get_unprocessed_results(self) -> List[Path]: """Return list of result files not yet processed. @@ -56,13 +59,20 @@ def get_unprocessed_results(self) -> List[Path]: Returns: List of paths to unprocessed result JSON files. """ - if not self._results_dir.is_dir(): - return [] - result_files = [] - for file_path in sorted(self._results_dir.glob("*.json")): - if file_path.name not in self._processed_files: - result_files.append(file_path) + + # Check results directory + if self._results_dir.is_dir(): + for file_path in sorted(self._results_dir.glob("*.json")): + if file_path.name not in self._processed_files: + result_files.append(file_path) + + # Also check archive/results directory for any missed files + if self._archive_dir.is_dir(): + for file_path in sorted(self._archive_dir.glob("*.json")): + if file_path.name not in self._processed_files: + result_files.append(file_path) + logger.info(f"Found unprocessed result in archive: {file_path.name}") return result_files @@ -107,12 +117,19 @@ def archive_result(self, result_path: Path) -> None: archive_path = self._archive_dir / result_path.name try: - shutil.move(str(result_path), str(archive_path)) + # Only move if not already in archive + if result_path.parent != self._archive_dir: + shutil.move(str(result_path), str(archive_path)) + log_secure_info( + "info", + "Result file moved to archive", + ) + else: + log_secure_info( + "info", + "Result file already in archive", + ) self._processed_files.add(result_path.name) - log_secure_info( - "info", - "Result file archived", - ) except OSError: # pylint: disable=unused-variable log_secure_info( "error", diff --git a/build_stream/orchestrator/local_repo/result_poller.py b/build_stream/orchestrator/local_repo/result_poller.py index 705fb09923..003d468b89 100644 --- a/build_stream/orchestrator/local_repo/result_poller.py +++ b/build_stream/orchestrator/local_repo/result_poller.py @@ -117,57 +117,76 @@ def _on_result_received(self, result: PlaybookResult) -> None: Args: result: Playbook execution result from NFS queue. """ + # Import here to avoid circular imports + from infra.db.session import get_db_session + from infra.db.repositories import SqlStageRepository, SqlAuditEventRepository + try: - # Find stage - stage_name = StageName(result.stage_name) - stage = self._stage_repo.find_by_job_and_name(result.job_id, stage_name) - - if stage is None: - logger.error( - "Stage not found for result: job_id=%s, stage=%s", - result.job_id, - result.stage_name, - ) - return - - # Update stage based on result - if result.status == "success": - stage.complete() + # Use a fresh session for this operation to ensure proper transaction management + with get_db_session() as session: + # Create repositories with the same session + stage_repo = SqlStageRepository(session=session) + audit_repo = SqlAuditEventRepository(session=session) + + # Find stage + stage_name = StageName(result.stage_name) + stage = stage_repo.find_by_job_and_name(result.job_id, stage_name) + + if stage is None: + logger.error( + "Stage not found for result: job_id=%s, stage=%s", + result.job_id, + result.stage_name, + ) + return + + # Update stage based on result + if result.status == "success": + stage.complete() + logger.info( + "Stage completed successfully: job_id=%s, stage=%s", + result.job_id, + result.stage_name, + ) + else: + error_code = result.error_code or "PLAYBOOK_FAILED" + error_summary = result.error_summary or "Playbook execution failed" + stage.fail(error_code=error_code, error_summary=error_summary) + logger.warning( + "Stage failed: job_id=%s, stage=%s, error=%s", + result.job_id, + result.stage_name, + error_code, + ) + + # Save updated stage (will be committed when context exits) + stage_repo.save(stage) logger.info( - "Stage completed successfully: job_id=%s, stage=%s", + "Stage state saved to database: job_id=%s, stage=%s, new_state=%s", result.job_id, result.stage_name, - ) - else: - error_code = result.error_code or "PLAYBOOK_FAILED" - error_summary = result.error_summary or "Playbook execution failed" - stage.fail(error_code=error_code, error_summary=error_summary) - logger.warning( - "Stage failed: job_id=%s, stage=%s, error=%s", - result.job_id, - result.stage_name, - error_code, + stage.stage_state.value, ) - # Save updated stage - self._stage_repo.save(stage) - - # Emit audit event - event = AuditEvent( - event_id=str(self._uuid_generator.generate()), - job_id=result.job_id, - event_type="STAGE_COMPLETED" if result.status == "success" else "STAGE_FAILED", - correlation_id=result.request_id, - client_id=result.job_id, # Using job_id as client_id placeholder - timestamp=datetime.now(timezone.utc), - details={ - "stage_name": result.stage_name, - "status": result.status, - "duration_seconds": result.duration_seconds, - "exit_code": result.exit_code, - }, - ) - self._audit_repo.save(event) + # Emit audit event + event = AuditEvent( + event_id=str(self._uuid_generator.generate()), + job_id=result.job_id, + event_type="STAGE_COMPLETED" if result.status == "success" else "STAGE_FAILED", + correlation_id=result.request_id, + client_id=result.job_id, # Using job_id as client_id placeholder + timestamp=datetime.now(timezone.utc), + details={ + "stage_name": result.stage_name, + "status": result.status, + "duration_seconds": result.duration_seconds, + "exit_code": result.exit_code, + }, + ) + audit_repo.save(event) + + # Session will be automatically committed when exiting the context + logger.info("Database transaction committed for stage update") log_secure_info( "info", @@ -177,7 +196,10 @@ def _on_result_received(self, result: PlaybookResult) -> None: except Exception as exc: # pylint: disable=broad-except logger.exception( - "Error handling result: job_id=%s, error=%s", + "Error handling result: job_id=%s, stage=%s, error=%s", result.job_id, + result.stage_name, exc, ) + # Don't re-raise the exception to avoid rolling back the transaction + # The stage state has already been saved successfully diff --git a/prepare_oim/roles/deploy_containers/build_stream/tasks/deploy_build_stream.yml b/prepare_oim/roles/deploy_containers/build_stream/tasks/deploy_build_stream.yml index 4a9aaf74d0..26daa9cef9 100644 --- a/prepare_oim/roles/deploy_containers/build_stream/tasks/deploy_build_stream.yml +++ b/prepare_oim/roles/deploy_containers/build_stream/tasks/deploy_build_stream.yml @@ -263,8 +263,10 @@ command: > python -m alembic -c {{ bs_rsync_destination }}infra/db/alembic.ini upgrade head env: - DATABASE_URL: "postgresql://{{ postgres_user }}:{{ postgres_password }}@{{ admin_ip }}:5432/{{ postgres_db_name }}" - PYTHONPATH: "{{ bs_rsync_destination }}" + DB_USER: "{{ postgres_user }}" + DB_PASSWORD: "{{ postgres_password }}" + DB_HOST: "{{ admin_ip }}" + DB_NAME: "{{ postgres_db_name }}" register: alembic_result changed_when: "'Running upgrade' in alembic_result.stdout" From 98e6743bb6c9d5293f116815d177a6e1da13dc49 Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Tue, 24 Feb 2026 08:57:11 +0530 Subject: [PATCH 02/10] fix for build image failure --- .../roles/image_creation/templates/base_image_template.j2 | 1 + .../roles/image_creation/templates/base_image_template.j2 | 3 +++ 2 files changed, 4 insertions(+) diff --git a/build_image_aarch64/roles/image_creation/templates/base_image_template.j2 b/build_image_aarch64/roles/image_creation/templates/base_image_template.j2 index df117cb5c6..e251e3dea2 100644 --- a/build_image_aarch64/roles/image_creation/templates/base_image_template.j2 +++ b/build_image_aarch64/roles/image_creation/templates/base_image_template.j2 @@ -4,6 +4,7 @@ rhel_base_image_name: "{{ rhel_aarch64_base_image_name }}" rhel_base_image: "{{ oim_node_name }}/{{ rhel_aarch64_base_image_name }}" cluster_name: "{{ oim_node_name }}" cluster_domain: "{{ domain_name }}" +group_name: base rhel_base_mounts: {{ ochami_mounts | join(' ') }} image_build_name: {{ ochami_aarch64_image | join(' ') }} rhel_base_command_options: {{ ochami_base_command | join(' ') }} diff --git a/build_image_x86_64/roles/image_creation/templates/base_image_template.j2 b/build_image_x86_64/roles/image_creation/templates/base_image_template.j2 index 55033fb4d3..341287226c 100644 --- a/build_image_x86_64/roles/image_creation/templates/base_image_template.j2 +++ b/build_image_x86_64/roles/image_creation/templates/base_image_template.j2 @@ -3,9 +3,12 @@ rhel_base_image_name: "{{ rhel_x86_64_base_image_name }}" rhel_base_image: "{{ oim_node_name }}/{{ rhel_base_image_name }}" cluster_name: "{{ oim_node_name }}" cluster_domain: "{{ domain_name }}" +group_name: base rhel_base_mounts: {{ ochami_mounts | join(' ') }} image_build_name: {{ ochami_x86_64_image | join(' ') }} rhel_base_command_options: {{ ochami_base_command | join(' ') }} +# Override OpenCHAMI defaults to ensure correct mount path +rhel_tag: "{{ rhel_tag }}" rhel_repos: {% for repo in rhel_x86_64_repos %} From 042d71f4bee84a79508c1d685975a1290ea91617 Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Tue, 24 Feb 2026 09:19:03 +0530 Subject: [PATCH 03/10] fix for OptimisticLockError exception --- .../repositories/nfs_input_repository.py | 5 +- .../use_cases/create_build_image.py | 48 +++++++++++++------ 2 files changed, 36 insertions(+), 17 deletions(-) diff --git a/build_stream/infra/repositories/nfs_input_repository.py b/build_stream/infra/repositories/nfs_input_repository.py index 3023ec962f..d4889ac424 100644 --- a/build_stream/infra/repositories/nfs_input_repository.py +++ b/build_stream/infra/repositories/nfs_input_repository.py @@ -141,7 +141,8 @@ def get_aarch64_inv_host(self, job_id: str) -> Optional[InventoryHost]: logger.warning("Empty build_stream_config.yml for job %s", job_id) return None - inventory_host = config.get("aarch64_inventory_host") + # Try both key names for backward compatibility + inventory_host = config.get("aarch64_inventory_host_ip") or config.get("aarch64_inventory_host") if inventory_host: logger.info( "Retrieved inventory_host for job %s: %s", @@ -150,7 +151,7 @@ def get_aarch64_inv_host(self, job_id: str) -> Optional[InventoryHost]: ) return InventoryHost(str(inventory_host)) - logger.info("No aarch64_inventory_host configured for job %s", job_id) + logger.info("No aarch64_inventory_host_ip configured for job %s", job_id) return None except yaml.YAMLError as exc: diff --git a/build_stream/orchestrator/build_image/use_cases/create_build_image.py b/build_stream/orchestrator/build_image/use_cases/create_build_image.py index 9fb42b46dc..784df51224 100644 --- a/build_stream/orchestrator/build_image/use_cases/create_build_image.py +++ b/build_stream/orchestrator/build_image/use_cases/create_build_image.py @@ -306,12 +306,18 @@ def _get_inventory_host( try: return InventoryHost(command.inventory_host) except ValueError as exc: - stage.start() - stage.fail( - error_code="INVALID_INVENTORY_HOST", - error_summary=f"Invalid inventory host format: {str(exc)}", + # Refresh stage from database to avoid OptimisticLockError + fresh_stage = self._stage_repo.find_by_job_and_name( + command.job_id, + stage.stage_name ) - self._stage_repo.save(stage) + if fresh_stage: + fresh_stage.start() + fresh_stage.fail( + error_code="INVALID_INVENTORY_HOST", + error_summary=f"Invalid inventory host format: {str(exc)}", + ) + self._stage_repo.save(fresh_stage) log_secure_info( "error", f"Invalid inventory host for job {command.job_id}", @@ -330,12 +336,18 @@ def _get_inventory_host( correlation_id=str(command.correlation_id), ) except InventoryHostMissingError as exc: - stage.start() - stage.fail( - error_code="INVENTORY_HOST_MISSING", - error_summary=exc.message, + # Refresh stage from database to avoid OptimisticLockError + fresh_stage = self._stage_repo.find_by_job_and_name( + command.job_id, + stage.stage_name ) - self._stage_repo.save(stage) + if fresh_stage: + fresh_stage.start() + fresh_stage.fail( + error_code="INVENTORY_HOST_MISSING", + error_summary=exc.message, + ) + self._stage_repo.save(fresh_stage) log_secure_info( "error", f"Inventory host missing for job {command.job_id}", @@ -374,12 +386,18 @@ def _create_inventory_file( ) return inventory_file_path except IOError as exc: - stage.start() - stage.fail( - error_code="INVENTORY_FILE_CREATION_FAILED", - error_summary=f"Failed to create inventory file: {str(exc)}", + # Refresh stage from database to avoid OptimisticLockError + fresh_stage = self._stage_repo.find_by_job_and_name( + command.job_id, + stage.stage_name ) - self._stage_repo.save(stage) + if fresh_stage: + fresh_stage.start() + fresh_stage.fail( + error_code="INVENTORY_FILE_CREATION_FAILED", + error_summary=f"Failed to create inventory file: {str(exc)}", + ) + self._stage_repo.save(fresh_stage) log_secure_info( "error", f"Failed to create inventory file for job {command.job_id}", From cd4495dda10dac801f9ccb74e7c5a199d72478f9 Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Tue, 24 Feb 2026 09:19:58 +0530 Subject: [PATCH 04/10] removing unwanted comment --- build_stream/infra/repositories/nfs_input_repository.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/build_stream/infra/repositories/nfs_input_repository.py b/build_stream/infra/repositories/nfs_input_repository.py index d4889ac424..bb67735fad 100644 --- a/build_stream/infra/repositories/nfs_input_repository.py +++ b/build_stream/infra/repositories/nfs_input_repository.py @@ -141,8 +141,7 @@ def get_aarch64_inv_host(self, job_id: str) -> Optional[InventoryHost]: logger.warning("Empty build_stream_config.yml for job %s", job_id) return None - # Try both key names for backward compatibility - inventory_host = config.get("aarch64_inventory_host_ip") or config.get("aarch64_inventory_host") + inventory_host = config.get("aarch64_inventory_host_ip") if inventory_host: logger.info( "Retrieved inventory_host for job %s: %s", From 1d64e6810caf37fcb5ce57cc9ecc7bbf7b3009c8 Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Tue, 24 Feb 2026 12:34:01 +0530 Subject: [PATCH 05/10] timeout value for playbook execution --- prepare_oim/roles/deploy_containers/build_stream/vars/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prepare_oim/roles/deploy_containers/build_stream/vars/main.yml b/prepare_oim/roles/deploy_containers/build_stream/vars/main.yml index 59e97ca33b..bba8d6176e 100644 --- a/prepare_oim/roles/deploy_containers/build_stream/vars/main.yml +++ b/prepare_oim/roles/deploy_containers/build_stream/vars/main.yml @@ -53,7 +53,7 @@ watcher_cpu_quota: "50%" build_stream_watcher_playbook_queue_base: "{{ oim_shared_path }}/omnia/playbook_queue" watcher_poll_interval_seconds: 2 watcher_max_concurrent_jobs: 5 -watcher_default_timeout_minutes: 30 +watcher_default_timeout_minutes: 100 watcher_log_level: "INFO" # Directory & File Modes From 60f1d9407b4333cd09822ef6e687f23a5b79a79c Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Tue, 24 Feb 2026 12:48:29 +0530 Subject: [PATCH 06/10] updating timeout value --- prepare_oim/roles/deploy_containers/build_stream/vars/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prepare_oim/roles/deploy_containers/build_stream/vars/main.yml b/prepare_oim/roles/deploy_containers/build_stream/vars/main.yml index bba8d6176e..8001f01fd8 100644 --- a/prepare_oim/roles/deploy_containers/build_stream/vars/main.yml +++ b/prepare_oim/roles/deploy_containers/build_stream/vars/main.yml @@ -53,7 +53,7 @@ watcher_cpu_quota: "50%" build_stream_watcher_playbook_queue_base: "{{ oim_shared_path }}/omnia/playbook_queue" watcher_poll_interval_seconds: 2 watcher_max_concurrent_jobs: 5 -watcher_default_timeout_minutes: 100 +watcher_default_timeout_minutes: 150 watcher_log_level: "INFO" # Directory & File Modes From c065a10178948a731564f08098c0f654732a6cec Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Tue, 24 Feb 2026 14:59:41 +0530 Subject: [PATCH 07/10] inventory host name and credentials in prepare_oim --- .../repositories/nfs_input_repository.py | 2 +- prepare_oim/prepare_oim.yml | 24 ++++++++++++++----- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/build_stream/infra/repositories/nfs_input_repository.py b/build_stream/infra/repositories/nfs_input_repository.py index bb67735fad..b38cbe2a93 100644 --- a/build_stream/infra/repositories/nfs_input_repository.py +++ b/build_stream/infra/repositories/nfs_input_repository.py @@ -192,7 +192,7 @@ def create_inventory_file(self, inventory_host: InventoryHost, job_id: str) -> P inventory_file = inventory_dir / "inv" # Create inventory content - inventory_content = f"[build_hosts]\n{inventory_host.value}\n" + inventory_content = f"[admin_aarch64]\n{inventory_host.value}\n" # Write inventory file with open(inventory_file, "w", encoding="utf-8") as f: diff --git a/prepare_oim/prepare_oim.yml b/prepare_oim/prepare_oim.yml index afaefbe8ac..7fbe5c45cf 100644 --- a/prepare_oim/prepare_oim.yml +++ b/prepare_oim/prepare_oim.yml @@ -16,6 +16,11 @@ - name: Check if upgrade is in progress ansible.builtin.import_playbook: ../utils/upgrade_checkup.yml +- name: Include input project directory + when: not project_dir_status | default(false) | bool + ansible.builtin.import_playbook: ../utils/include_input_dir.yml + tags: always + - name: Set_fact for fetch omnia config credentials hosts: localhost connection: local @@ -24,7 +29,19 @@ - name: Set dynamic run tags including 'prepare_oim' when: not config_file_status | default(false) | bool ansible.builtin.set_fact: - omnia_run_tags: "{{ (ansible_run_tags | default([]) + ['prepare_oim']) + ['openldap'] | unique }}" + omnia_run_tags: >- + {{ + ( + ansible_run_tags | default([]) + + ['prepare_oim', 'openldap', 'provision'] + + ( + ['slurm', 'slurm_custom', 'csi_driver_powerscale', 'ldms', 'telemetry'] + if (lookup('file', hostvars['localhost']['input_project_dir'] ~ '/software_config.json') + | from_json).softwares | map(attribute='name') | list + else [] + ) + ) | unique + }} cacheable: true - name: Invoke validate_config.yml to perform L1 and L2 validations with prepare_oim tag @@ -34,11 +51,6 @@ - name: Invoke get_config_credentials.yml ansible.builtin.import_playbook: ../utils/credential_utility/get_config_credentials.yml -- name: Include input project directory - when: not project_dir_status | default(false) | bool - ansible.builtin.import_playbook: ../utils/include_input_dir.yml - tags: always - - name: Create oim group and provision group ansible.builtin.import_playbook: ../utils/create_container_group.yml vars: From 8e5c262320c712d3b7510b03c451eac45faddffa Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Wed, 25 Feb 2026 00:49:06 +0530 Subject: [PATCH 08/10] auto continue on user prompt when build_stream is enabled --- .../slurm_config/tasks/drain_and_remove_node.yml | 11 ++++++++++- .../roles/validation/tasks/validate_metadata.yml | 12 +++++++++++- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/discovery/roles/slurm_config/tasks/drain_and_remove_node.yml b/discovery/roles/slurm_config/tasks/drain_and_remove_node.yml index da1c41d3fe..0ebc37aa12 100644 --- a/discovery/roles/slurm_config/tasks/drain_and_remove_node.yml +++ b/discovery/roles/slurm_config/tasks/drain_and_remove_node.yml @@ -89,7 +89,16 @@ Options: 1. Press Ctrl+C then 'A' to abort 2. Press Enter to force removal (jobs will be killed) - when: not force_scancel_node + when: + - not force_scancel_node + - not (enable_build_stream | default(false) | bool) + + - name: Auto-force removal when build stream is enabled + ansible.builtin.debug: + msg: "Build stream is enabled, automatically forcing removal of node {{ node_to_remove }} (jobs will be killed)." + when: + - not force_scancel_node + - enable_build_stream | default(false) | bool - name: Force cancel jobs if timeout reached ansible.builtin.command: scancel -f -w {{ node_to_remove }} diff --git a/local_repo/roles/validation/tasks/validate_metadata.yml b/local_repo/roles/validation/tasks/validate_metadata.yml index ebbef0ba71..53d30ecca4 100644 --- a/local_repo/roles/validation/tasks/validate_metadata.yml +++ b/local_repo/roles/validation/tasks/validate_metadata.yml @@ -40,11 +40,21 @@ {{ metadata_warn_msg }} Do you want to continue? (yes/no) register: user_input - when: not metadata_compare.identical + when: + - not metadata_compare.identical + - not (enable_build_stream | default(false) | bool) + + - name: Auto-continue when build stream is enabled + ansible.builtin.debug: + msg: "Build stream is enabled, automatically accepting metadata changes." + when: + - not metadata_compare.identical + - enable_build_stream | default(false) | bool - name: Fail if user chooses not to continue ansible.builtin.fail: msg: "User choose not to continue due to metadata change." when: - not metadata_compare.identical + - not (enable_build_stream | default(false) | bool) - user_input.user_input | lower != 'yes' From 781d92b2b94388984c30009ea840c740bff43097 Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Wed, 25 Feb 2026 08:09:22 +0530 Subject: [PATCH 09/10] Fix for optional credentials --- .../image_creation/tasks/build_base_image.yml | 4 ++-- .../tasks/build_compute_image.yml | 2 +- .../image_creation/tasks/build_base_image.yml | 4 ++-- .../tasks/build_compute_image.yml | 2 +- .../update_config/tasks/credential_status.yml | 19 ++++++++++++++++++- .../update_config/tasks/fetch_credentials.yml | 1 + .../tasks/fetch_optional_credentials.yml | 8 ++++++++ 7 files changed, 33 insertions(+), 7 deletions(-) diff --git a/build_image_aarch64/roles/image_creation/tasks/build_base_image.yml b/build_image_aarch64/roles/image_creation/tasks/build_base_image.yml index cd09de18b9..943b79acd9 100644 --- a/build_image_aarch64/roles/image_creation/tasks/build_base_image.yml +++ b/build_image_aarch64/roles/image_creation/tasks/build_base_image.yml @@ -22,8 +22,8 @@ - name: Set base image suffix when build stream inputs present ansible.builtin.set_fact: - base_image_suffix: "{{ ('-' ~ build_stream_job_id ~ '-' ~ (image_key | default(''))) | regex_replace('^-+', '') }}" - rhel_base_image_name: "{{ rhel_aarch64_base_image_name ~ ('-' ~ build_stream_job_id ~ '-' ~ (image_key | default(''))) | regex_replace('^-+', '') }}" + base_image_suffix: "_{{ build_stream_job_id }}-{{ image_key | default('') }}" + rhel_base_image_name: "{{ rhel_aarch64_base_image_name }}_{{ build_stream_job_id }}-{{ image_key | default('') }}" when: - enable_build_stream | bool - (build_stream_job_id | default('') | length) > 0 diff --git a/build_image_aarch64/roles/image_creation/tasks/build_compute_image.yml b/build_image_aarch64/roles/image_creation/tasks/build_compute_image.yml index 459a9f4149..0d9254c001 100644 --- a/build_image_aarch64/roles/image_creation/tasks/build_compute_image.yml +++ b/build_image_aarch64/roles/image_creation/tasks/build_compute_image.yml @@ -22,7 +22,7 @@ - name: Set compute image suffix when build stream inputs present ansible.builtin.set_fact: - compute_image_suffix: "{{ ('-' ~ build_stream_job_id ~ '-' ~ (image_key | default(''))) | regex_replace('^-+', '') }}" + compute_image_suffix: "_{{ build_stream_job_id }}-{{ image_key | default('') }}" when: - enable_build_stream | bool - (build_stream_job_id | default('') | length) > 0 diff --git a/build_image_x86_64/roles/image_creation/tasks/build_base_image.yml b/build_image_x86_64/roles/image_creation/tasks/build_base_image.yml index a4a2c56d96..6c2a332931 100644 --- a/build_image_x86_64/roles/image_creation/tasks/build_base_image.yml +++ b/build_image_x86_64/roles/image_creation/tasks/build_base_image.yml @@ -22,8 +22,8 @@ - name: Set base image suffix when build stream inputs present ansible.builtin.set_fact: - base_image_suffix: "{{ ('-' ~ build_stream_job_id ~ '-' ~ (image_key | default(''))) | regex_replace('^-+', '') }}" - rhel_base_image_name: "{{ rhel_x86_64_base_image_name ~ ('-' ~ build_stream_job_id ~ '-' ~ (image_key | default(''))) | regex_replace('^-+', '') }}" + base_image_suffix: "_{{ build_stream_job_id }}-{{ image_key | default('') }}" + rhel_base_image_name: "{{ rhel_x86_64_base_image_name }}_{{ build_stream_job_id }}-{{ image_key | default('') }}" when: - enable_build_stream | bool - (build_stream_job_id | default('') | length) > 0 diff --git a/build_image_x86_64/roles/image_creation/tasks/build_compute_image.yml b/build_image_x86_64/roles/image_creation/tasks/build_compute_image.yml index 6c13902f81..84b7e521b2 100644 --- a/build_image_x86_64/roles/image_creation/tasks/build_compute_image.yml +++ b/build_image_x86_64/roles/image_creation/tasks/build_compute_image.yml @@ -22,7 +22,7 @@ - name: Set compute image suffix when build stream inputs present ansible.builtin.set_fact: - compute_image_suffix: "{{ ('-' ~ build_stream_job_id ~ '-' ~ (image_key | default(''))) | regex_replace('^-+', '') }}" + compute_image_suffix: "_{{ build_stream_job_id }}-{{ image_key | default('') }}" when: - enable_build_stream | bool - (build_stream_job_id | default('') | length) > 0 diff --git a/utils/credential_utility/roles/update_config/tasks/credential_status.yml b/utils/credential_utility/roles/update_config/tasks/credential_status.yml index 1d0f112d41..1e18615171 100644 --- a/utils/credential_utility/roles/update_config/tasks/credential_status.yml +++ b/utils/credential_utility/roles/update_config/tasks/credential_status.yml @@ -29,7 +29,7 @@ - >- (field.file is not defined or field.file != credential_files[1].file_path) and ((vars[field.username] is not defined or vars[field.username] == "" or (vars[field.username] | length == 0)) and - (mandatory_credentials_status or conditional_mandatory_credentials_status)) + (mandatory_credentials_status or conditional_mandatory_credentials_status or optional_credentials_status)) or (field.file is defined and field.file == credential_files[1].file_path) and (vars['build_stream_auth_username'] is not defined or vars['build_stream_auth_username'] == "" or @@ -51,9 +51,26 @@ (vars['build_stream_auth_password_hash'] is not defined or vars['build_stream_auth_password_hash'] == "" or (vars['build_stream_auth_password_hash'] is defined and (vars['build_stream_auth_password_hash'] | length == 0))) +# Initialize password status for optional credentials when username is provided and password is empty +- name: Initialize password status for optional credentials + ansible.builtin.set_fact: + password_status: true + when: + - field.password is defined + - field.password is not search('switch') + - field.file is not defined or field.file != credential_files[1].file_path + - optional_credentials_status | default(false) | bool + - vars[field.username] is defined + - vars[field.username] != "" + - vars[field.password] is defined + - (vars[field.password] == "" or (vars[field.password] | length == 0)) + # Reset credential status after processing - name: Reset credentials status ansible.builtin.set_fact: + mandatory_credentials_status: false + conditional_mandatory_credentials_status: false + optional_credentials_status: false username_status: false password_status: false when: reset_status | default(false) diff --git a/utils/credential_utility/roles/update_config/tasks/fetch_credentials.yml b/utils/credential_utility/roles/update_config/tasks/fetch_credentials.yml index b441b0bfbf..e4fa5c5f6c 100644 --- a/utils/credential_utility/roles/update_config/tasks/fetch_credentials.yml +++ b/utils/credential_utility/roles/update_config/tasks/fetch_credentials.yml @@ -17,6 +17,7 @@ ansible.builtin.set_fact: mandatory_credentials_status: false conditional_mandatory_credentials_status: false + optional_credentials_status: false username_status: false password_status: false diff --git a/utils/credential_utility/roles/update_config/tasks/fetch_optional_credentials.yml b/utils/credential_utility/roles/update_config/tasks/fetch_optional_credentials.yml index 9450fe3d4e..b65c9f892c 100644 --- a/utils/credential_utility/roles/update_config/tasks/fetch_optional_credentials.yml +++ b/utils/credential_utility/roles/update_config/tasks/fetch_optional_credentials.yml @@ -13,6 +13,10 @@ # limitations under the License. --- +- name: Set optional credentials status + ansible.builtin.set_fact: + optional_credentials_status: true + - name: Notify user about optional inputs ansible.builtin.debug: msg: "{{ optional_warning_msg }}" @@ -22,3 +26,7 @@ loop: "{{ type.value }}" loop_control: loop_var: field + +- name: Reset optional credentials status + ansible.builtin.set_fact: + optional_credentials_status: false From 30e80e66da34e5582c0ce88b470a4038c7b2e1a7 Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Wed, 25 Feb 2026 08:17:34 +0530 Subject: [PATCH 10/10] ansible lint fixes --- .../roles/slurm_config/tasks/drain_and_remove_node.yml | 6 +++--- local_repo/roles/validation/tasks/validate_metadata.yml | 6 +++--- utils/credential_utility/roles/update_config/vars/main.yml | 4 ++++ 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/discovery/roles/slurm_config/tasks/drain_and_remove_node.yml b/discovery/roles/slurm_config/tasks/drain_and_remove_node.yml index 0ebc37aa12..83c86781b6 100644 --- a/discovery/roles/slurm_config/tasks/drain_and_remove_node.yml +++ b/discovery/roles/slurm_config/tasks/drain_and_remove_node.yml @@ -89,14 +89,14 @@ Options: 1. Press Ctrl+C then 'A' to abort 2. Press Enter to force removal (jobs will be killed) - when: + when: - not force_scancel_node - not (enable_build_stream | default(false) | bool) - name: Auto-force removal when build stream is enabled ansible.builtin.debug: - msg: "Build stream is enabled, automatically forcing removal of node {{ node_to_remove }} (jobs will be killed)." - when: + msg: "{{ build_stream_auto_force_node_msg }}" + when: - not force_scancel_node - enable_build_stream | default(false) | bool diff --git a/local_repo/roles/validation/tasks/validate_metadata.yml b/local_repo/roles/validation/tasks/validate_metadata.yml index 53d30ecca4..aa0fbb0417 100644 --- a/local_repo/roles/validation/tasks/validate_metadata.yml +++ b/local_repo/roles/validation/tasks/validate_metadata.yml @@ -40,14 +40,14 @@ {{ metadata_warn_msg }} Do you want to continue? (yes/no) register: user_input - when: + when: - not metadata_compare.identical - not (enable_build_stream | default(false) | bool) - name: Auto-continue when build stream is enabled ansible.builtin.debug: - msg: "Build stream is enabled, automatically accepting metadata changes." - when: + msg: "{{ build_stream_auto_accept_metadata_msg }}" + when: - not metadata_compare.identical - enable_build_stream | default(false) | bool diff --git a/utils/credential_utility/roles/update_config/vars/main.yml b/utils/credential_utility/roles/update_config/vars/main.yml index 1dcddca9c8..a6eff521e1 100644 --- a/utils/credential_utility/roles/update_config/vars/main.yml +++ b/utils/credential_utility/roles/update_config/vars/main.yml @@ -58,6 +58,10 @@ docker_hub_warning: | Proceed to enter your Docker credentials if you want to avoid pull rate limits. Press Enter. +# Build stream automation messages +build_stream_auto_force_node_msg: "Build stream is enabled, automatically forcing removal of node {{ node_to_remove }} (jobs will be killed)." +build_stream_auto_accept_metadata_msg: "Build stream is enabled, automatically accepting metadata changes." + omnia_credentials: provision: mandatory: