From c42088bf25e5e19224b691ba77e473aa46060ea3 Mon Sep 17 00:00:00 2001
From: Jordan Patterson
Date: Mon, 3 Nov 2025 16:52:27 -0500
Subject: [PATCH 1/3] actions: Run hipFile tests on NVIDIA platform.

---
Review notes (text between the "---" line and the first "diff --git" line is
not part of the commit):
 - hipfile-nvidia.yml reads the ref from the GITHUB_REF environment variable
   instead of interpolating the github.ref expression into the shell script,
   which removes a script-injection vector.
 - The quotes around -Werror are escaped so the flag stays inside the
   double-quoted bash -c argument.
 - Indentation in this series was reconstructed from a whitespace-collapsed
   copy; confirm context lines against the tree before applying. The blob id
   on the hipfile-nvidia.yml "index" line was not recomputed.

 .github/workflows/build_ais.yml      |   5 +
 .github/workflows/hipfile-nvidia.yml | 154 +++++++++++++++++++++++++++
 2 files changed, 159 insertions(+)
 create mode 100644 .github/workflows/hipfile-nvidia.yml

diff --git a/.github/workflows/build_ais.yml b/.github/workflows/build_ais.yml
index 73d3031f..eefeaa1a 100644
--- a/.github/workflows/build_ais.yml
+++ b/.github/workflows/build_ais.yml
@@ -401,3 +401,8 @@ jobs:
         if: ${{ always() }}
         run: |
           docker stop ${AIS_CONTAINER_NAME}
+  Run_hipFile_NVIDIA:
+    uses: ./.github/workflows/hipfile-nvidia.yml
+    needs: build_AIS_image
+    with:
+      platform: ${{inputs.platform}}
diff --git a/.github/workflows/hipfile-nvidia.yml b/.github/workflows/hipfile-nvidia.yml
new file mode 100644
index 00000000..2234e80e
--- /dev/null
+++ b/.github/workflows/hipfile-nvidia.yml
@@ -0,0 +1,154 @@
+# Reusable workflow: builds hipFile for the NVIDIA platform inside the AIS CI
+# dev image and runs the unit and system tests on a linux/NVIDIA runner.
+name: hipFile NVIDIA
+run-name: Build and run tests on NVIDIA
+env:
+  AIS_MOUNT_PATH: /mnt/ais/ext4
+  AIS_DOCKER_REGISTRY: ghcr.io/rocm/hipfile
+  AIS_CI_IMAGE_NAME: ais_ci_${{inputs.platform}}
+on:
+  workflow_call:
+    inputs:
+      platform:
+        required: true
+        type: string
+permissions:
+  contents: read
+  packages: read
+jobs:
+  NVIDIA_tests:
+    runs-on: [linux, NVIDIA]
+    steps:
+      # GITHUB_REF is read from the environment instead of interpolating the
+      # github.ref expression into the script, so a crafted ref name cannot
+      # inject shell commands.
+      - name: Get PR number and store it as a environment variable
+        run: echo "AIS_PR_NUMBER=$(echo "${GITHUB_REF}" | sed 's|[^0-9]||g')" >> "$GITHUB_ENV"
+      - name: Set AIS CI image environment variables
+        run: |
+          echo "AIS_CI_DEV_IMAGE=${{ env.AIS_DOCKER_REGISTRY }}/${{ env.AIS_CI_IMAGE_NAME }}_dev:$(echo "${GITHUB_REF}" \
+            | sed 's|[^a-zA-Z0-9]|-|g')" >> "$GITHUB_ENV"
+          echo "AIS_CONTAINER_NAME=${AIS_PR_NUMBER}_${{ github.job }}" >> "$GITHUB_ENV"
+      - name: Fetching code repository...
+        uses: actions/checkout@v5
+      - name: Authenticating to GitHub Container Registry
+        uses: docker/login-action@v3.6.0
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+      # Detach the container and run separate commands to it.
+      # Thus we can make separate explicit steps in the Github CI
+      # as if we were able to parameterize the container image in the first place.
+      - name: Starting Docker Container
+        run: |
+          docker run \
+            -dt \
+            --rm \
+            --ipc host \
+            -e NVIDIA_GDS=enabled \
+            --runtime=nvidia \
+            --gpus all \
+            --pull always \
+            --cap-add=CAP_SYSLOG \
+            -v $(pwd):/mnt/ais:ro \
+            -v ${{ env.AIS_MOUNT_PATH }}:/mnt/ais-fs \
+            --name ${AIS_CONTAINER_NAME} \
+            ${AIS_CI_DEV_IMAGE}
+      - name: Make copy of the code repository
+        run: |
+          docker exec \
+            ${AIS_CONTAINER_NAME} \
+            /bin/bash -c '
+              cp -R /mnt/ais /ais
+              mkdir /ais/build
+            '
+      - name: Make temporary directory to run tests in
+        run: |
+          ROCTMPDIR=$(docker exec \
+            ${AIS_CONTAINER_NAME} \
+            /bin/bash -c 'mktemp -d -p /mnt/ais-fs/ci')
+          echo "ROCTMPDIR=${ROCTMPDIR}" >> "$GITHUB_ENV"
+      # The cmake command line is passed to bash inside double quotes, so any
+      # inner double quotes must be escaped to stay part of that one argument.
+      - name: Generate build files for hipFile targeting the NVIDIA platform
+        run: |
+          docker exec \
+            -w /ais/build \
+            ${AIS_CONTAINER_NAME} \
+            /bin/bash -c "
+              cmake \
+                -DCMAKE_CXX_COMPILER=g++ \
+                -DCMAKE_CXX_FLAGS=\"-Werror\" \
+                -DBUILD_ROCFILE=OFF \
+                -DBUILD_HIPFILE=ON \
+                -DCMAKE_HIP_PLATFORM=nvidia \
+                -DBUILD_AIS_DOCS=ON \
+                -DAIS_CAPABLE_DIR=\"${ROCTMPDIR}\" \
+                ..
+            "
+      - name: Build hipFile for the NVIDIA platform
+        run: |
+          docker exec \
+            -w /ais/build \
+            ${AIS_CONTAINER_NAME} \
+            /bin/bash -c '
+              cmake --build . --parallel
+            '
+      - name: Get start time of tests
+        run: |
+          AIS_START_TIME=$(docker exec \
+            ${AIS_CONTAINER_NAME} \
+            /bin/bash -c 'date +%s')
+          echo "AIS_START_TIME=${AIS_START_TIME}" >> "$GITHUB_ENV"
+      - name: Test hipFile unit and system tests for the NVIDIA platform
+        id: unit
+        run: |
+          docker exec \
+            -w /ais/build \
+            ${AIS_CONTAINER_NAME} \
+            /bin/bash -c '
+              ctest -V -L "unit|system" --parallel
+            '
+      - name: Gather logs
+        if: ${{ failure() && steps.unit.conclusion == 'failure' }}
+        id: gather
+        run: |
+          docker exec \
+            -w /ais/build \
+            ${AIS_CONTAINER_NAME} \
+            /bin/bash -c "
+              find -name cufile.log -print0 | tar -cf nvidia-logs.tar --null -T -
+              dmesg -T --since \"@${AIS_START_TIME}\" > dmesg.log
+              tar -rf nvidia-logs.tar dmesg.log
+            "
+      - name: Copy nvidia logs from container
+        if: ${{ failure() && steps.gather.conclusion == 'success' }}
+        id: copy_log
+        run: |
+          docker cp \
+            ${AIS_CONTAINER_NAME}:/ais/build/nvidia-logs.tar \
+            nvidia-logs-${{inputs.platform}}.tar
+      - name: Upload nvidia logs
+        if: ${{ failure() && steps.copy_log.conclusion == 'success' }}
+        uses: actions/upload-artifact@v5
+        with:
+          name: nvidia-logs-${{inputs.platform}}.tar
+          path: nvidia-logs-${{inputs.platform}}.tar
+          retention-days: 7
+      - name: Clean up temporary directory
+        if: ${{ always() }}
+        run: |
+          docker exec \
+            -w /ais/build \
+            ${AIS_CONTAINER_NAME} \
+            /bin/bash -c "
+              if [ -n \"$ROCTMPDIR\" ] && [ -d \"$ROCTMPDIR\" ]
+              then
+                rm -rf \"$ROCTMPDIR\"
+              fi
+            "
+      - name: Cleanup & Stop the Docker container
+        if: ${{ always() }}
+        run: |
+          docker stop ${AIS_CONTAINER_NAME}

From aa38a738dfc202a9c3c8a4c38bee15bb2e6ccebb Mon Sep 17 00:00:00 2001
From: Jordan Patterson
Date: Mon, 3 Nov 2025 17:00:14 -0500
Subject: [PATCH 2/3] actions: Fix Github repo URLs.

---
 .github/workflows/build_ais.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build_ais.yml b/.github/workflows/build_ais.yml
index eefeaa1a..a7650964 100644
--- a/.github/workflows/build_ais.yml
+++ b/.github/workflows/build_ais.yml
@@ -2,9 +2,9 @@ name: AIS # Mono-workflow - May be advantageous to split up
 run-name: Build, Test, and Analyze AIS
 env:
   AIS_MOUNT_PATH: /mnt/ais/ext4
-  AIS_DOCKER_REGISTRY: ghcr.io/rocm/rocfile
+  AIS_DOCKER_REGISTRY: ghcr.io/rocm/hipfile
   AIS_CI_IMAGE_NAME: ais_ci_${{inputs.platform}}
-  AIS_PR_BASE_URL: https://github.com/ROCm/rocFile/pull
+  AIS_PR_BASE_URL: https://github.com/ROCm/hipFile/pull
 on:
   workflow_call:
     inputs:

From e77b68356e28890c0fb5ee2456ec7ce2aeeff5f8 Mon Sep 17 00:00:00 2001
From: Jordan Patterson
Date: Mon, 3 Nov 2025 18:15:48 -0500
Subject: [PATCH 3/3] hipFile: Use --ais-capable-dir argument for system tests.

---
 hipfile/test/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hipfile/test/CMakeLists.txt b/hipfile/test/CMakeLists.txt
index 9c79357e..f26f2d1d 100644
--- a/hipfile/test/CMakeLists.txt
+++ b/hipfile/test/CMakeLists.txt
@@ -62,7 +62,6 @@ endif()
 
 ais_gtest_discover_tests(
     hipfile_tests
-    WORKING_DIRECTORY ${AIS_CAPABLE_DIR}
     PROPERTIES LABELS unit
     TEST_LIST hipfile_unit_tests
 )
@@ -81,6 +80,7 @@ endif()
 
 gtest_discover_tests(
     hipfile_system_tests
+    EXTRA_ARGS --ais-capable-dir ${AIS_CAPABLE_DIR}
     WORKING_DIRECTORY ${AIS_CAPABLE_DIR}
     PROPERTIES LABELS system
 )