-
Notifications
You must be signed in to change notification settings - Fork 251
179 lines (154 loc) · 6.63 KB
/
pytest-gpu.yaml
File metadata and controls
179 lines (154 loc) · 6.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
# Runner information:
# - OpenACC/OpenMP on NVIDIA runs on runners labeled `nvidiagpu`
# - OpenMP on AMD runs on runners labeled `amdgpu`
#
# Changes vs original:
# * Respect CUDA_VISIBLE_DEVICES for NVIDIA jobs by restricting Docker to those devices with --gpus "device=…" (falls back to "all" when the variable is unset)
# * Tag images and container names with ${{ runner.name }} to avoid cross-runner races and maximize cache reuse
# * Remove docker prune / global container deletes (we assume disk space is fine)
# * Add comments throughout
# Workflow-level settings for the GPU CI pipeline.
name: CI-gpu

# The token handed to jobs only gets read access to repository contents.
permissions:
  contents: read

# One live run per workflow/ref pair: a newer run cancels the one in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

env:
  RESOURCE_GROUP: CI-gpu

# Triggers: pushes and pull requests targeting main, plus manual dispatch.
on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main
  workflow_dispatch:
    inputs:
      tags:
        description: 'Run GPU tests'
jobs:
  # Builds the Devito Docker image on a self-hosted GPU runner and runs the
  # GPU test suites inside it. The matrix provides one variant per vendor
  # (NVIDIA / AMD); each variant supplies its own base image, runner label,
  # driver-probe command and `docker run` flags.
  build:
    name: ${{ matrix.name }}
    runs-on:
      - self-hosted
      - ${{ matrix.runner_label }}

    outputs:
      unique: ${{ steps.uniquetag.outputs.unique }}

    strategy:
      # A failure in one variant must not cancel the other.
      fail-fast: false
      matrix:
        name: [pytest-gpu-acc-nvidia, pytest-gpu-omp-amd]
        # Single-element list: every variant runs the same examples.
        # Folded scalar (>-) joins the lines with single spaces.
        test_examples:
          - >-
            examples/seismic/tti/tti_example.py
            examples/seismic/acoustic/acoustic_example.py
            examples/seismic/viscoacoustic/viscoacoustic_example.py
            examples/seismic/viscoelastic/viscoelastic_example.py
            examples/seismic/elastic/elastic_example.py

        include:
          # -------------------- NVIDIA job --------------------
          - name: pytest-gpu-acc-nvidia
            test_files: >-
              tests/test_adjoint.py
              tests/test_gpu_common.py
              tests/test_gpu_openacc.py
              tests/test_operator.py::TestEstimateMemory
            base: "devitocodes/bases:nvidia-nvc12"
            runner_label: nvidiagpu
            test_drive_cmd: "nvidia-smi"
            # Respect CUDA_VISIBLE_DEVICES and also hard-limit Docker to that device.
            # NOTE: CUDA_VISIBLE_DEVICES must be set by the runner (systemd drop-in etc.).
            # The shell variables below are expanded when the step script runs,
            # not when the workflow is parsed.
            dockerflags: >-
              --init --rm -t
              --name ${CONTAINER_BASENAME}
              --gpus "device=${CUDA_VISIBLE_DEVICES:-all}"

          # -------------------- AMD job -----------------------
          - name: pytest-gpu-omp-amd
            test_files: >-
              tests/test_adjoint.py
              tests/test_gpu_common.py
              tests/test_gpu_openmp.py
              tests/test_operator.py::TestEstimateMemory
            runner_label: amdgpu
            base: "devitocodes/bases:amd"
            test_drive_cmd: "rocm-smi"
            # Unchanged, still passes through required /dev nodes etc.
            dockerflags: >-
              --init --network=host
              --device=/dev/kfd --device=/dev/dri
              --ipc=host
              --group-add video --group-add "$(getent group render | cut -d: -f3)"
              --cap-add=SYS_PTRACE --security-opt seccomp=unconfined
              --rm -t
              --name ${CONTAINER_BASENAME}

    steps:
      - name: Checkout devito
        uses: actions/checkout@v6

      - name: Generate unique CI tag
        id: uniquetag
        run: |
          # Checksum of run id + attempt number, so a re-run gets a new tag.
          UNIQUE=$(echo "${GITHUB_RUN_ID}_${GITHUB_RUN_ATTEMPT}" | cksum | cut -f 1 -d " ")
          echo "Unique ID: ${UNIQUE}"
          echo "unique=${UNIQUE}" >> "$GITHUB_OUTPUT"

      - name: Set per-runner tags
        env:
          UNIQUE: ${{ steps.uniquetag.outputs.unique }}
        run: |
          # Image and container names embed the runner name (spaces replaced by
          # underscores); both are exported to later steps through $GITHUB_ENV.
          echo "DOCKER_IMAGE=${{ matrix.name }}-${RUNNER_NAME// /_}-${UNIQUE}" >> "$GITHUB_ENV"
          echo "CONTAINER_BASENAME=testrun-${{ matrix.name }}-${RUNNER_NAME// /_}-${{ github.sha }}" >> "$GITHUB_ENV"

      - name: Ensure buildx builder
        run: |
          # Create the per-runner builder only if it does not exist yet.
          docker buildx inspect "${RUNNER_NAME// /_}" >/dev/null 2>&1 || \
            docker buildx create --name "${RUNNER_NAME// /_}" --driver docker-container
          docker buildx use "${RUNNER_NAME// /_}"

      - name: Build docker image
        run: |
          # The ci-run label is what the cleanup step filters on.
          docker buildx build . \
            --builder "${RUNNER_NAME// /_}" \
            --load \
            --label ci-run="$GITHUB_RUN_ID" \
            --rm --pull \
            --file docker/Dockerfile.devito \
            --tag "${DOCKER_IMAGE}" \
            --build-arg base="${{ matrix.base }}"

      - name: Probe gpu
        run: |
          # Make sure CUDA_VISIBLE_DEVICES is at least *something* on NVIDIA
          # runners; fall back to "all" so the driver probe does not fail.
          # (Writing to $GITHUB_ENV only affects later steps; this step relies
          # on the ":-all" default inside the docker flags.)
          if [[ "${{ matrix.runner_label }}" == "nvidiagpu" && -z "${CUDA_VISIBLE_DEVICES:-}" ]]; then
            echo "CUDA_VISIBLE_DEVICES=all" >> "$GITHUB_ENV"
          fi

          # Drop any stale container that still holds our name, then run a
          # simple driver-probe command (nvidia-smi / rocm-smi).
          docker rm -f "${CONTAINER_BASENAME}" 2>/dev/null || true
          docker run ${{ matrix.dockerflags }} "${DOCKER_IMAGE}" ${{ matrix.test_drive_cmd }}

      - name: Test with pytest
        env:
          # Scoped to this step only and forwarded into the container by
          # `--env CODECOV_TOKEN` below. The secret is deliberately not
          # written to $GITHUB_ENV nor spliced into a script body.
          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
        run: |
          # Codecov's env helper emits a list of `-e VAR` flags (GITHUB_SHA, etc.).
          # Read them into an array so every flag reaches docker as its own
          # argument; a single quoted string would be passed as one bogus flag
          # (or as an empty argument when the helper returns nothing).
          # `read -d ''` returns non-zero at end of input, hence `|| true`.
          # `curl -f` turns an HTTP error into empty output instead of
          # executing the error page as a script.
          read -r -d '' -a ci_env < <(bash <(curl -fsS https://codecov.io/env)) || true

          # Run the test suite using the matrix-defined flags
          docker run \
            ${{ matrix.dockerflags }} \
            "${ci_env[@]}" \
            --env CI=true \
            --env PYTHONFAULTHANDLER=1 \
            --env DEVITO_LOGGING=DEBUG \
            --env CODECOV_TOKEN \
            "${DOCKER_IMAGE}" \
            pytest -vvv --capture=no --showlocals \
              --log-cli-level=DEBUG -o log_cli=true \
              --full-trace --durations=10 \
              --cov --cov-config=.coveragerc --cov-report=xml \
              ${{ matrix.test_files }}

      - name: Test examples
        run: |
          docker run \
            ${{ matrix.dockerflags }} \
            "${DOCKER_IMAGE}" \
            pytest ${{ matrix.test_examples }}

      - name: Test examples with MPI
        run: |
          docker run \
            ${{ matrix.dockerflags }} \
            --env DEVITO_MPI=1 \
            "${DOCKER_IMAGE}" \
            mpiexec -n 2 pytest ${{ matrix.test_examples }}

      - name: Builder & image cleanup (keep 3 days of cache)
        # Runs even when an earlier step failed, so images do not pile up.
        if: always()
        run: |
          # Remove only the test image we built
          docker rmi -f "${DOCKER_IMAGE}" || true

          # Classic image layers created in this job
          docker image prune -f --filter label=ci-run="$GITHUB_RUN_ID"

          # BuildKit cache: target the per-runner builder explicitly
          docker builder prune --builder "${RUNNER_NAME// /_}" \
            -f \
            --filter "until=72h"