xchplot2/compose.yaml at main · Jsewill/xchplot2 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
# compose.yaml — podman-first (also works with docker compose).
#
# Four services share one Containerfile, parameterized via build args:
# three vendor-specific GPU services (cuda, rocm, intel) plus a CPU-only
# one (cpu). Pick one based on your hardware; the build context is the
# same so the AdaptiveCpp + xchplot2 build layers cache across services.
#
# Build & run examples:
#
#   # NVIDIA (default sm_89 / RTX 4090; override via $CUDA_ARCH=120 etc.)
#   podman compose build cuda
#   podman compose run --rm cuda test 22 <plot_id_hex> 2 0 0 -G -o /out
#
#   # NVIDIA Pascal/Volta (sm_61 / GTX 10-series, sm_70 / V100): CUDA 13.x
#   # dropped codegen for pre-Turing archs, so pin to a 12.x base image.
#   # scripts/build-container.sh does this automatically when it detects
#   # CUDA_ARCH < 75; if invoking compose directly, set the base manually:
#   CUDA_ARCH=61 \
#     BASE_DEVEL=docker.io/nvidia/cuda:12.9.1-devel-ubuntu24.04 \
#     BASE_RUNTIME=docker.io/nvidia/cuda:12.9.1-devel-ubuntu24.04 \
#     podman compose build cuda
#
#   # AMD ROCm — $ACPP_GFX is required (there is no default); set it to
#   # your card's gfx target (rocminfo | grep gfx).
#   #   gfx1031 = Navi 22 (RX 6700/6700 XT/6800M)
#   #   gfx1100 = Navi 31 (RX 7900 XTX/XT)
#   #   gfx900  = Vega 10 (RX Vega 56/64, MI25)
#   ACPP_GFX=gfx1031 podman compose build rocm
#   podman compose run --rm rocm test 22 <plot_id_hex> 2 0 0 -G -o /out
#
#   # Intel oneAPI (experimental, untested).
#   podman compose build intel
#
# Plot files land in ./plots/ on the host (mounted at /out in the
# container).

services:
  cuda:
    # NVIDIA service: builds the shared Containerfile with the CUDA
    # backend switched on and tags the result as xchplot2:cuda.
    image: xchplot2:cuda
    build:
      dockerfile: Containerfile
      context: .
      args:
        # Both base images default to the CUDA 13.x line (latest,
        # sm_75 and newer). CUDA 13.0 dropped codegen for pre-Turing
        # GPUs (Pascal/Volta, CUDA_ARCH < 75), so
        # scripts/build-container.sh overrides both to
        # nvidia/cuda:12.9.1 when it detects one. Set BASE_DEVEL
        # explicitly to bypass that auto-pick, e.g. when
        # cross-targeting an arch the host doesn't have.
        BASE_DEVEL: "${BASE_DEVEL:-docker.io/nvidia/cuda:13.0.0-devel-ubuntu24.04}"
        # NOTE(review): the runtime stage also defaults to the -devel
        # image; presumably deliberate — confirm before slimming it.
        BASE_RUNTIME: "${BASE_RUNTIME:-docker.io/nvidia/cuda:13.0.0-devel-ubuntu24.04}"
        # Target SM architecture; 89 unless $CUDA_ARCH is exported.
        CUDA_ARCH: "${CUDA_ARCH:-89}"
        XCHPLOT2_BUILD_CUDA: "ON"
        ACPP_TARGETS: "generic"
        INSTALL_CUDA_HEADERS: "0"
    # Plot files written to /out appear in ./plots on the host.
    volumes:
      - "./plots:/out"
    # GPU pass-through via a device reservation, which both engines
    # understand:
    #   - Docker, given nvidia-container-toolkit plus `nvidia-ctk
    #     runtime configure --runtime=docker && systemctl restart
    #     docker`
    #   - Podman 5.x with podman-compose 1.x+, where it is equivalent
    #     to `--device nvidia.com/gpu=all` via CDI
    # An earlier `devices: nvidia.com/gpu=all` shorthand worked on
    # podman, but Docker silently ignored it as an unknown device
    # path. The container then had no libcuda.so.1 and failed mid-plot
    # with a confusing "No matching device" error.
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities:
                - gpu

  rocm:
    # AMD service: builds the shared Containerfile against the HIP
    # backend and tags the result as xchplot2:rocm.
    image: xchplot2:rocm
    build:
      dockerfile: Containerfile
      context: .
      args:
        # The base images stay on ROCm 6.2.x for two reasons:
        #   1. The rocm-llvm package in ROCm 7.x no longer ships
        #      LLVMConfig.cmake, so AdaptiveCpp's find_package(LLVM)
        #      can't run.
        #   2. ROCm 6.2 ships LLVM 18.0git, matching Ubuntu's llvm-18,
        #      so the device bitcode (ocml.bc, ockl.bc) is readable by
        #      an AdaptiveCpp built against Ubuntu's LLVM — no
        #      "Unknown attribute kind" mismatch.
        # AdaptiveCpp is therefore built against Ubuntu's
        # /usr/lib/llvm-18 (the Containerfile default), while ROCm
        # provides its own clang + device libs at /opt/rocm/llvm for
        # the HIP backend at runtime.
        BASE_DEVEL: "docker.io/rocm/dev-ubuntu-24.04:6.2-complete"
        BASE_RUNTIME: "docker.io/rocm/dev-ubuntu-24.04:6.2-complete"
        XCHPLOT2_BUILD_CUDA: "OFF"
        # CUDA headers are left out on the AMD path because they
        # conflict with HIP's uchar1/etc. typedefs. The __has_include
        # guard in CudaHalfShim.hpp handles their absence cleanly.
        INSTALL_CUDA_HEADERS: "0"
        # IMPORTANT: ACPP_GFX must be supplied by the caller; there is
        # deliberately no usable default. A real default such as
        # gfx1100 would AOT-compile the SYCL kernels for the wrong
        # amdgcn ISA on any other card. HIP loads such code without
        # error, but the kernels then execute as silent no-ops at
        # runtime (sort returns input, AES match finds zero results,
        # plot content diverges from the canonical reference). That
        # failure mode is extremely confusing to diagnose: it looks
        # like a correctness bug in the kernels rather than a
        # build-time config error.
        #
        # When composing under sudo, pass the variable through (sudo
        # strips the environment by default):
        #   ACPP_GFX=gfx1031 sudo -E podman compose build rocm
        #   sudo ACPP_GFX=gfx1031 podman compose build rocm
        #
        # Common gfx targets (see `rocminfo | grep gfx`):
        #   gfx900  = Vega 10 (RX Vega 56/64, MI25)
        #   gfx906  = Vega 20 (Radeon VII, MI50)
        #   gfx1030 = RDNA2 Navi 21 (RX 6800/6800 XT/6900 XT)
        #   gfx1031 = RDNA2 Navi 22 (RX 6700/6700 XT/6800M)
        #   gfx1100 = RDNA3 Navi 31 (RX 7900 XTX/XT)
        #   gfx1101 = RDNA3 Navi 32 (RX 7800 XT/7700 XT)
        #
        # The fallback uses ${VAR:-placeholder}, NOT ${VAR:?error}.
        # podman-compose evaluates :? across ALL services during YAML
        # parse, not just the one being built, so :? would trip a
        # parse-time error when building cuda / intel / cpu without
        # ACPP_GFX set. The placeholder is intentionally invalid as a
        # gfx target, so AdaptiveCpp's HIP backend fails loudly with
        # the placeholder string in its error message.
        ACPP_TARGETS: "hip:${ACPP_GFX:-MISSING-set-ACPP_GFX-or-use-scripts-build-container-sh}"
    # Device nodes passed through to the container for the AMD GPU.
    devices:
      - /dev/kfd
      - /dev/dri
    group_add:
      - video
    # Under rootless podman the default seccomp filter + capability
    # set blocks some of the KFD IOCTLs that libhsa-runtime64 issues
    # during DMA setup. The symptom is a segfault inside the HSA
    # runtime on the first host→device copy; rocminfo-level queries
    # still work, so the failure is subtle and confusing. The two
    # settings below loosen the sandbox just enough for HSA's DMA
    # path. If rootless still fails on your host, run rootful +
    # privileged instead:
    #   sudo podman run --rm --privileged --device /dev/kfd \
    #        --device /dev/dri -v $PWD/plots:/out xchplot2:rocm \
    #        plot -k 28 -n 10 -f <farmer-pk> -c <pool-contract> -o /out
    security_opt:
      - seccomp=unconfined
    cap_add:
      - SYS_ADMIN
    # Plot files written to /out appear in ./plots on the host.
    volumes:
      - "./plots:/out"

  intel:
    # Intel oneAPI service: builds the shared Containerfile on the
    # oneAPI base images and tags the result as xchplot2:intel.
    image: xchplot2:intel
    build:
      dockerfile: Containerfile
      context: .
      args:
        # NOTE(review): both base images follow the floating :latest
        # tag, so rebuilds can pick up a different oneAPI release —
        # consider pinning once a known-good version is identified.
        BASE_DEVEL: "docker.io/intel/oneapi-basekit:latest"
        BASE_RUNTIME: "docker.io/intel/oneapi-runtime:latest"
        XCHPLOT2_BUILD_CUDA: "OFF"
        ACPP_TARGETS: "generic"
        # NOTE(review): presumably needed for the same cuda_fp16.h
        # include that the cpu service documents — confirm.
        INSTALL_CUDA_HEADERS: "1"
    # Only /dev/dri is passed through to the container.
    devices:
      - /dev/dri
    # Plot files written to /out appear in ./plots on the host.
    volumes:
      - "./plots:/out"

  cpu:
    # CPU-only service: the SYCL kernels are compiled for the host CPU
    # by AdaptiveCpp's OpenMP backend, so no GPU runtime is needed.
    # Plotting is 1-2 orders of magnitude slower than on a GPU. Useful
    # for headless CI, dev machines without a GPU, or as an extra
    # worker on a heterogeneous `--devices` list. See the CPU section
    # of the README.
    image: xchplot2:cpu
    build:
      dockerfile: Containerfile
      context: .
      args:
        BASE_DEVEL: "docker.io/ubuntu:24.04"
        BASE_RUNTIME: "docker.io/ubuntu:24.04"
        XCHPLOT2_BUILD_CUDA: "OFF"
        ACPP_TARGETS: "omp"
        # Headers only, no libcudart link: AdaptiveCpp's
        # libkernel/half.hpp includes cuda_fp16.h on every build path.
        INSTALL_CUDA_HEADERS: "1"
    # Plot files written to /out appear in ./plots on the host.
    volumes:
      - "./plots:/out"