xchplot2/build.rs at main · Jsewill/xchplot2 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
// build.rs — drive the existing CMake build to produce the static libs
// that the Rust `[[bin]] xchplot2` then links against.
//
// The CMake build is the authoritative one (CUDA, separable compilation,
// pos2-chip FetchContent, the keygen-rs Rust shim). We just call it from
// here so a `cargo install` works end-to-end on a machine with the build
// dependencies listed in README.md (CMake ≥ 3.24, CUDA Toolkit, C++20
// compiler, and a Rust toolchain — the last one cargo provides).

use std::env;
use std::path::PathBuf;
use std::process::Command;

/// Query `nvidia-smi` for the compute capability of the local GPU and
/// express it the way CMake wants it: a bare integer string ("89" for an
/// sm_89 RTX 4090, "120" for an sm_120 RTX 5090).
///
/// Any failure along the way — nvidia-smi not runnable, non-zero exit,
/// non-UTF-8 or empty output, an unparsable first line — yields `None`
/// so callers can fall back cleanly.
fn detect_cuda_arch() -> Option<String> {
    let probe = Command::new("nvidia-smi")
        .args(["--query-gpu=compute_cap", "--format=csv,noheader,nounits"])
        .output()
        .ok()
        .filter(|p| p.status.success())?;
    let report = std::str::from_utf8(&probe.stdout).ok()?.trim();

    // Only the first GPU's line is considered. Multi-GPU hosts that need
    // a fat binary can set $CUDA_ARCHITECTURES, which accepts CMake's
    // `89;120` multi-arch syntax. Empty output has no first line and
    // therefore maps to None.
    report
        .lines()
        .next()
        .and_then(|row| row.trim().parse::<f32>().ok()) // "8.9" -> 8.9
        .map(|cap| ((cap * 10.0).round() as u32).to_string()) // -> "89"
}

/// NVIDIA compute capability to target by default, or `None` when there
/// is no NVIDIA card we are prepared to build for.
///
/// The value normally comes from `detect_cuda_arch()`. When that probe
/// yields nothing, a sysfs check (`nvidia_gpu_present()`) decides whether
/// an NVIDIA card exists at all; if so the arch is the smallest entry of
/// `$CUDA_ARCHITECTURES`, or sm_75 (Turing) when that is unset or
/// unparsable, and a cargo:warning explains the guess.
///
/// Two filters then apply. Each emits a cargo:warning and returns `None`
/// so callers fall through to the AMD / Intel detection:
///
/// * Below sm_50 (Maxwell first-gen / GTX 750-class), the
///   README-documented minimum. The floor used to be sm_61 on the
///   assumption that AdaptiveCpp's `half.hpp` needed sm_53+ FP16
///   intrinsics, but `cuda_fp16.hpp` implements those through
///   `NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, …)` with an fp32 emulation
///   fallback, and CUDA 12.x compiles cleanly for sm_50/52/53.
/// * Below sm_75 while nvcc reports CUDA 13 or newer, which dropped
///   codegen for sm_50-72 (CMakeLists' nvcc-vs-arch preflight catches
///   that pairing with a FATAL_ERROR + fix block).
fn usable_nvidia_arch() -> Option<String> {
    let arch = if let Some(probed) = detect_cuda_arch() {
        probed
    } else {
        // nvidia-smi is missing or its compute_cap query failed. Probe
        // sysfs so hosts with old drivers or partial enumeration still
        // get NVIDIA-aware build flags. Sysfs cannot tell us the real
        // compute_cap, so $CUDA_ARCHITECTURES wins when set; otherwise
        // sm_75 is assumed because it works on every CUDA 12.x / 13.x
        // toolkit without the Maxwell/Pascal/Volta drop.
        if !nvidia_gpu_present() {
            return None;
        }
        let requested = env::var("CUDA_ARCHITECTURES")
            .ok()
            .and_then(|list| min_arch(&list));
        let (fallback_arch, source) = if let Some(lowest) = requested {
            (lowest.to_string(), "$CUDA_ARCHITECTURES")
        } else {
            ("75".to_string(), "default (Turing)")
        };
        println!(
            "cargo:warning=xchplot2: nvidia-smi --query-gpu=compute_cap \
             failed, but /sys/class/drm reports an NVIDIA GPU (vendor \
             0x10de). Falling back to sm_{fallback_arch} ({source}). If \
             your card is older or newer, set $CUDA_ARCHITECTURES \
             explicitly (e.g. CUDA_ARCHITECTURES=89 for an RTX 4090) \
             — autodetect can't read the compute_cap from sysfs alone.");
        fallback_arch
    };

    let n = arch.parse::<u32>().ok()?;

    if n < 50 {
        println!(
            "cargo:warning=xchplot2: nvidia-smi detected sm_{arch} — below our \
             minimum supported compute capability (sm_50 / Maxwell). CUDA 11.x \
             was the last toolkit to compile for Kepler (sm_30-37); we don't \
             support that path. Ignoring NVIDIA for default targeting; if \
             this card is your only GPU, force the build with \
             CUDA_ARCHITECTURES={arch} + XCHPLOT2_BUILD_CUDA=ON and an \
             appropriately-old CUDA toolkit, or fall back to \
             ACPP_TARGETS=omp for AdaptiveCpp's CPU OpenMP backend.");
        return None;
    }

    // Kept lazy so nvcc is only invoked for cards older than Turing.
    let nvcc_is_13_or_newer = || matches!(detect_nvcc_major(), Some(major) if major >= 13);
    if n < 75 && nvcc_is_13_or_newer() {
        println!(
            "cargo:warning=xchplot2: nvidia-smi detected sm_{arch} (Maxwell / \
             Pascal / Volta) but nvcc is CUDA 13.x, which dropped codegen \
             for sm_50-72. Ignoring NVIDIA for default targeting; install \
             CUDA 12.9 (last toolkit with Maxwell-Volta support) and re-run, \
             or use scripts/build-container.sh which auto-pins the right \
             base image. CMakeLists' preflight will FATAL_ERROR with the \
             exact remediation if you force-build anyway.");
        return None;
    }

    Some(arch)
}

/// Report whether `nvcc` can actually be executed from $PATH.
///
/// Serves as the fall-back signal for XCHPLOT2_BUILD_CUDA when no GPU is
/// enumerable (headless CI / container builds). `nvcc --version` is run
/// instead of doing a plain PATH lookup so that stale symlinks don't pass.
fn detect_nvcc() -> bool {
    match Command::new("nvcc").arg("--version").output() {
        Ok(probe) => probe.status.success(),
        // Spawn failure: nvcc is not on PATH or is not executable.
        Err(_) => false,
    }
}

/// Extract nvcc's major version from the output of `nvcc --version`.
///
/// The line of interest looks like
///   "Cuda compilation tools, release 13.0, V13.0.48"
/// and the number is taken from the token right after the word
/// "release". Returns `None` when nvcc can't be run, exits non-zero,
/// prints non-UTF-8, or the token can't be parsed — callers treat that
/// as "skip the version-vs-arch compat check" rather than blocking the
/// build.
fn detect_nvcc_major() -> Option<u32> {
    let probe = Command::new("nvcc").arg("--version").output().ok()?;
    if !probe.status.success() {
        return None;
    }
    let report = std::str::from_utf8(&probe.stdout).ok()?;

    // For the first line containing the standalone word "release", grab
    // the token that follows it. A line where "release" is the last
    // word produces Some(None), which ends the search with None.
    let version_token = report
        .lines()
        .find_map(|line| {
            let mut words = line.split_whitespace().skip_while(|w| *w != "release");
            words.next().map(|_| words.next())
        })
        .flatten()?; // "13.0,"

    version_token
        .trim_end_matches(',')
        .split('.')
        .next()?
        .parse()
        .ok()
}

/// Minimum integer arch from a CMake-style CUDA_ARCHITECTURES list
/// ("61", "61;86", "61;86;120").
///
/// Each `;`-separated entry is normalised before parsing:
///   * surrounding whitespace is trimmed;
///   * CMake's `-real` / `-virtual` code-generation suffixes are dropped
///     ("89-real" -> 89);
///   * "sm_" / "compute_" prefixes that Cargo users sometimes pass
///     through are tolerated;
///   * a trailing arch-/family-specific feature letter (`a` / `f`, as in
///     "90a") is dropped.
///
/// Entries that still aren't a plain integer afterwards — "native",
/// "all", "all-major", empty segments — are ignored. Returns `None` when
/// nothing in the list parses.
fn min_arch(arch_list: &str) -> Option<u32> {
    arch_list
        .split(';')
        .filter_map(|entry| {
            let entry = entry.trim();
            // "75-virtual" / "89-real" select PTX vs. SASS generation for
            // the same numeric arch; only the number matters here.
            let entry = entry
                .strip_suffix("-real")
                .or_else(|| entry.strip_suffix("-virtual"))
                .unwrap_or(entry);
            let entry = entry
                .trim_start_matches("sm_")
                .trim_start_matches("compute_");
            // "90a" / "100f": feature-set variants of the same base arch.
            let entry = entry
                .strip_suffix('a')
                .or_else(|| entry.strip_suffix('f'))
                .unwrap_or(entry);
            entry.parse().ok()
        })
        .min()
}

/// Look through /sys/class/drm for a card whose PCI vendor ID is Intel's
/// (0x8086).
///
/// Heuristic used to default XCHPLOT2_BUILD_CUDA=OFF on Intel hosts,
/// mirroring what rocminfo already does for AMD. Returns false on
/// non-Linux hosts or whenever the sysfs directory can't be read, so
/// callers fall back to the next signal.
fn detect_intel_gpu() -> bool {
    const INTEL_VENDOR_ID: &str = "0x8086";

    let drm_nodes = match std::fs::read_dir("/sys/class/drm") {
        Ok(nodes) => nodes,
        Err(_) => return false,
    };

    drm_nodes
        .flatten()
        .filter(|node| {
            // Keep only the card itself (card0, card1, ...); connector
            // nodes such as card0-DP-1 carry a '-' and are skipped.
            let label = node.file_name();
            let label = label.to_string_lossy();
            label.starts_with("card") && !label.contains('-')
        })
        .any(|card| {
            std::fs::read_to_string(card.path().join("device/vendor"))
                .map(|id| id.trim() == INTEL_VENDOR_ID)
                .unwrap_or(false)
        })
}

/// Is there an NVIDIA GPU on this host? Answers via a sysfs PCI
/// vendor-ID probe (0x10de), the same fallback shape as
/// `amd_gpu_present()`.
///
/// `usable_nvidia_arch()` relies on this to recover when
/// `nvidia-smi --query-gpu=compute_cap` fails (older driver, partial
/// enumeration, container missing the nvidia-smi binary, etc.) even
/// though the host clearly has an NVIDIA card. The compute capability is
/// not available from here; callers fall back to `$CUDA_ARCHITECTURES`
/// or a sensible default when this returns true.
fn nvidia_gpu_present() -> bool {
    const NVIDIA_VENDOR_ID: &str = "0x10de";

    std::fs::read_dir("/sys/class/drm")
        .map(|drm_nodes| {
            drm_nodes
                .flatten()
                .filter(|node| {
                    // Card nodes only; names containing '-' are skipped.
                    let label = node.file_name();
                    let label = label.to_string_lossy();
                    label.starts_with("card") && !label.contains('-')
                })
                .any(|card| {
                    std::fs::read_to_string(card.path().join("device/vendor"))
                        .map(|id| id.trim() == NVIDIA_VENDOR_ID)
                        .unwrap_or(false)
                })
        })
        // Unreadable /sys/class/drm: no evidence of a card.
        .unwrap_or(false)
}

/// Is there an AMD GPU on this host? Independent of which ACPP_TARGETS
/// string we would pick for it: `detect_amd_gfx` may return None for AMD
/// cards we choose to route through SSCP (RDNA1 default), yet the GPU is
/// still present and BUILD_CUDA detection should still treat the host as
/// "AMD host, skip CUDA TUs".
///
/// First choice is rocminfo: any `Name:` line whose value starts with
/// "gfx" counts. When that finds nothing, a /sys/class/drm vendor-ID
/// probe (0x1002) is used instead. The fallback matters because rocminfo
/// is reliably absent from $PATH when ROCm lives in /opt/rocm/bin and
/// /etc/profile.d/rocm.sh was not sourced in the shell running
/// `cargo install`, or when `cargo install` runs under systemd / sudo /
/// chroot with a stripped PATH. Without it the BUILD_CUDA selector falls
/// through to the `nvcc present → ON, "CI fallback"` arm, the build
/// links CUB, and the streaming pipeline dies on first sort dispatch
/// against the AMD card.
fn amd_gpu_present() -> bool {
    const AMD_VENDOR_ID: &str = "0x1002";

    let rocminfo_lists_gfx = Command::new("rocminfo")
        .output()
        .ok()
        .filter(|probe| probe.status.success())
        .and_then(|probe| String::from_utf8(probe.stdout).ok())
        .map(|report| {
            report.lines().any(|line| {
                matches!(
                    line.trim().strip_prefix("Name:"),
                    Some(value) if value.trim().starts_with("gfx")
                )
            })
        })
        .unwrap_or(false);
    if rocminfo_lists_gfx {
        return true;
    }

    // PCI fallback — same pattern as detect_intel_gpu(). Needs no
    // user-space tools, only readable sysfs (true on every Linux host
    // with the amdgpu / radeon kernel module loaded).
    if let Ok(drm_nodes) = std::fs::read_dir("/sys/class/drm") {
        drm_nodes
            .flatten()
            .filter(|node| {
                let label = node.file_name();
                let label = label.to_string_lossy();
                label.starts_with("card") && !label.contains('-')
            })
            .any(|card| {
                std::fs::read_to_string(card.path().join("device/vendor"))
                    .map(|id| id.trim() == AMD_VENDOR_ID)
                    .unwrap_or(false)
            })
    } else {
        false
    }
}

/// Architecture of the first AMD GPU that `rocminfo` lists, e.g.
/// "gfx1100" for an RX 7900 XTX.
///
/// `None` has two meanings: rocminfo is missing / reports no gfx agent,
/// OR we deliberately want the caller to fall through to
/// ACPP_TARGETS=generic (currently the default for RDNA1
/// gfx1010/1011/1012). Use amd_gpu_present() to tell "no AMD GPU at all"
/// apart from "AMD GPU present but routed through generic SSCP".
fn detect_amd_gfx() -> Option<String> {
    let probe = Command::new("rocminfo").output().ok()?;
    if !probe.status.success() {
        return None;
    }
    let report = std::str::from_utf8(&probe.stdout).ok()?;

    // First `Name:` field whose value is a gfx architecture.
    let name = report
        .lines()
        .filter_map(|line| line.trim().strip_prefix("Name:"))
        .map(str::trim)
        .find(|agent| agent.starts_with("gfx"))?;

    // Everything except RDNA1 is reported as-is.
    if !matches!(name, "gfx1010" | "gfx1011" | "gfx1012") {
        return Some(name.to_string());
    }

    // RDNA1 (gfx1010/1011/1012) isn't a direct AdaptiveCpp HIP AOT
    // target. An earlier default AOT-compiled for the close-ISA gfx1013
    // (a community workaround), but that was observed to silently
    // produce no-op kernels on at least one W5700 / ROCm 6 /
    // AdaptiveCpp 25.10 setup: every dispatch completed without writing,
    // surfacing far downstream as "T1 match produced 0 entries". A
    // separate build on the same host with ACPP_TARGETS=generic (SSCP
    // JIT) dispatched and produced correct output through k=24.
    //
    // So RDNA1 now defaults to generic, signalled by returning None.
    // Two opt-in escape hatches remain for users who validated their
    // stack on the legacy path:
    //   XCHPLOT2_FORCE_GFX_SPOOF=1 — gfx1013 AOT spoof
    //   XCHPLOT2_NO_GFX_SPOOF=1    — native gfx1010 AOT (may fail to
    //                                compile if AdaptiveCpp doesn't
    //                                advertise it as a HIP target).
    // A flag counts as set when it is non-empty and not "0".
    let env_flag = |var: &str| {
        env::var(var)
            .map(|v| !v.is_empty() && v != "0")
            .unwrap_or(false)
    };
    let force_spoof = env_flag("XCHPLOT2_FORCE_GFX_SPOOF");
    let no_spoof = env_flag("XCHPLOT2_NO_GFX_SPOOF");

    if force_spoof {
        println!(
            "cargo:warning=xchplot2: RDNA1 {name} detected, \
             XCHPLOT2_FORCE_GFX_SPOOF set — building for \
             gfx1013 (legacy community workaround). The \
             default switched to ACPP_TARGETS=generic (SSCP \
             JIT) after the spoof was observed to silently \
             produce no-op kernels on some W5700 setups; \
             unset XCHPLOT2_FORCE_GFX_SPOOF if your plots \
             fail with 'T1 match produced 0 entries'.");
        Some("gfx1013".to_string())
    } else if no_spoof {
        println!(
            "cargo:warning=xchplot2: RDNA1 {name} detected, \
             XCHPLOT2_NO_GFX_SPOOF set — AOT-targeting {name} \
             natively. If AdaptiveCpp doesn't advertise {name} \
             as a HIP target on your toolchain, the build will \
             fail; unset XCHPLOT2_NO_GFX_SPOOF to fall back to \
             the (working-on-most-cards) generic SSCP JIT.");
        Some(name.to_string())
    } else {
        println!(
            "cargo:warning=xchplot2: RDNA1 {name} detected — \
             defaulting to ACPP_TARGETS=generic (SSCP JIT). \
             The previous gfx1013 community workaround was \
             observed to silently produce no-op kernels on \
             at least one W5700 / ROCm 6 setup. Override: \
             XCHPLOT2_FORCE_GFX_SPOOF=1 (back to gfx1013 AOT) \
             or XCHPLOT2_NO_GFX_SPOOF=1 (try native {name})."
        );
        None
    }
}

/// Check that `cmd` can be launched from PATH and that `cmd --version`
/// exits successfully. preflight() uses this to spot missing toolchain
/// pieces before cmake gets to fail with a cryptic message.
fn command_runs(cmd: &str) -> bool {
    match Command::new(cmd).arg("--version").output() {
        Ok(probe) => probe.status.success(),
        // Could not spawn: not on PATH or not executable.
        Err(_) => false,
    }
}

/// Can `ld.lld` be found, either runnable from PATH or sitting in one of
/// the conventional LLVM-{16..20} install prefixes? Mirrors the
/// find_program HINTS list in CMakeLists.txt's FetchContent block.
/// AdaptiveCpp's CMake aborts with "Cannot find ld.lld" without it.
fn ld_lld_findable() -> bool {
    // Checked in order, after the PATH probe has failed.
    const KNOWN_LOCATIONS: &[&str] = &[
        // /usr/lib/llvm-NN
        "/usr/lib/llvm-20/bin/ld.lld",
        "/usr/lib/llvm-19/bin/ld.lld",
        "/usr/lib/llvm-18/bin/ld.lld",
        "/usr/lib/llvm-17/bin/ld.lld",
        "/usr/lib/llvm-16/bin/ld.lld",
        // /usr/lib/llvmNN
        "/usr/lib/llvm20/bin/ld.lld",
        "/usr/lib/llvm19/bin/ld.lld",
        "/usr/lib/llvm18/bin/ld.lld",
        // /usr/lib64/llvmNN
        "/usr/lib64/llvm20/bin/ld.lld",
        "/usr/lib64/llvm19/bin/ld.lld",
        "/usr/lib64/llvm18/bin/ld.lld",
        // /opt/llvm-NN
        "/opt/llvm-20/bin/ld.lld",
        "/opt/llvm-19/bin/ld.lld",
        "/opt/llvm-18/bin/ld.lld",
    ];

    command_runs("ld.lld")
        || KNOWN_LOCATIONS
            .iter()
            .any(|candidate| std::path::Path::new(candidate).exists())
}

/// True when AdaptiveCpp is already installed — at $ACPP_PREFIX if set,
/// otherwise the install-deps.sh default of /opt/adaptivecpp.
///
/// "Installed" means the package config file
/// `<prefix>/lib/cmake/AdaptiveCpp/AdaptiveCppConfig.cmake` exists.
/// When this is true the FetchContent fallback won't fire and
/// AdaptiveCpp's own build-time deps (notably ld.lld) aren't needed for
/// our build.
///
/// An empty `ACPP_PREFIX` is treated the same as an unset one, matching
/// how `main` handles an empty `ACPP_TARGETS`. Otherwise a pass-through
/// `ACPP_PREFIX=` would probe `/lib/cmake/...` instead of the default.
fn adaptivecpp_installed() -> bool {
    let prefix = env::var("ACPP_PREFIX")
        .ok()
        .filter(|p| !p.is_empty())
        .unwrap_or_else(|| "/opt/adaptivecpp".to_string());
    std::path::Path::new(&prefix)
        .join("lib/cmake/AdaptiveCpp/AdaptiveCppConfig.cmake")
        .exists()
}

/// Detect a container engine on PATH, preferring podman (matches
/// scripts/build-container.sh's default). Used to phrase the preflight
/// panic differently when the user already has tooling that lets them
/// skip the host-side install entirely.
fn detect_container_engine() -> Option<&'static str> {
    if command_runs("podman") { return Some("podman"); }
    if command_runs("docker") { return Some("docker"); }
    None
}

/// Check the critical build-time prerequisites and collect a
/// human-readable entry for each one that is missing; an empty vector
/// means every check passed.
///
/// `build_cuda_on` says whether XCHPLOT2_BUILD_CUDA=ON was requested;
/// only then is nvcc required.
///
/// Cargo install users in particular don't read the Build section of
/// README.md (and don't expect to need to), so a friendly preflight is
/// much better than letting CMake or AdaptiveCpp fail with cryptic
/// errors deep into a build.
fn preflight(build_cuda_on: bool) -> Vec<String> {
    let mut missing: Vec<String> = Vec::new();
    let mut note_if = |absent: bool, what: &str| {
        if absent {
            missing.push(what.to_string());
        }
    };

    note_if(
        !command_runs("cmake"),
        "cmake (3.24+) — apt install cmake / dnf install cmake / pacman -S cmake",
    );

    // Any one of the usual C++ driver names is enough.
    let have_cxx = ["c++", "g++", "clang++"].iter().any(|cxx| command_runs(cxx));
    note_if(
        !have_cxx,
        "C++20 compiler (g++ ≥ 13 or clang++ ≥ 18) — apt install build-essential, dnf install gcc-c++, or pacman -S base-devel",
    );

    // ld.lld is only required when FetchContent will rebuild
    // AdaptiveCpp; a pre-installed AdaptiveCpp linked against ld.lld at
    // its own install time, so consumers don't need it again.
    note_if(
        !(adaptivecpp_installed() || ld_lld_findable()),
        "ld.lld (apt: lld-18, dnf/pacman: lld) — required by AdaptiveCpp's FetchContent build",
    );

    note_if(
        build_cuda_on && !detect_nvcc(),
        "nvcc (CUDA Toolkit 12+) — XCHPLOT2_BUILD_CUDA=ON requested but no nvcc on PATH",
    );

    missing
}

/// Build-script entry point.
///
/// Steps, in order:
///   1. Resolve the CUDA architecture, the AdaptiveCpp target list and
///      the XCHPLOT2_BUILD_CUDA toggle (environment override first, then
///      hardware / toolchain probes, then a fallback).
///   2. Preflight host prerequisites and the nvcc-vs-architecture
///      combination, panicking with actionable instructions on failure.
///   3. Configure the CMake project under `$OUT_DIR/cmake-build` and build
///      the `xchplot2_cli` target.
///   4. Emit `cargo:` directives for the static archives, the AdaptiveCpp
///      runtime, and (conditionally) libomp, the CUDA runtime and the HIP
///      runtime, followed by the rebuild triggers.
///
/// # Panics
///
/// Panics when `CARGO_MANIFEST_DIR` / `OUT_DIR` are unset, when the build
/// directory cannot be created, when a prerequisite is missing, when the
/// detected nvcc cannot target the requested architecture, or when cmake
/// cannot be spawned or exits unsuccessfully.
fn main() {
    let manifest_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap());
    let out_dir      = PathBuf::from(env::var("OUT_DIR").unwrap());
    let cmake_build  = out_dir.join("cmake-build");
    std::fs::create_dir_all(&cmake_build).expect("create cmake-build dir");

    // Architecture precedence:
    //   1. $CUDA_ARCHITECTURES if set (lets the user pick or list multiple).
    //   2. nvidia-smi probe of the build machine's local GPU.
    //   3. 89 (sm_89, RTX 4090 / Ada Lovelace) as a sensible default for
    //      machines without nvidia-smi (e.g. CI, headless package builds).
    let (cuda_arch, source) = match env::var("CUDA_ARCHITECTURES") {
        Ok(v) => (v, "$CUDA_ARCHITECTURES"),
        Err(_) => match detect_cuda_arch() {
            Some(v) => (v, "nvidia-smi probe"),
            None    => ("89".to_string(), "fallback (no nvidia-smi)"),
        },
    };
    println!("cargo:warning=xchplot2: building for CUDA arch {cuda_arch} ({source})");

    // AdaptiveCpp target precedence:
    //   1. $ACPP_TARGETS if set.
    //   2. NVIDIA: "generic" (LLVM SSCP). Empirically a few percent
    //      faster than cuda:sm_<arch> on our kernels.
    //   3. AMD:    hip:gfx<...> via rocminfo. SSCP's HIP path is less
    //      mature, so AOT-compile for the gfx target.
    //   4. generic (LLVM SSCP, JITs on first use).
    let (acpp_targets, acpp_source) = match env::var("ACPP_TARGETS") {
        // Treat an empty env var the same as unset — Containerfile build
        // args propagate as `ACPP_TARGETS=` when the user doesn't override
        // them, and acpp rejects an empty target string.
        Ok(v) if !v.is_empty() => (v, "$ACPP_TARGETS"),
        Ok(_) | Err(_) => {
            // Prefer a USABLE NVIDIA GPU (sm_61+) over AMD, otherwise fall
            // through to AMD / fallback. `detect_cuda_arch` alone would
            // trigger on an ancient secondary NVIDIA card even when AMD is
            // the real plotting target (see usable_nvidia_arch).
            if usable_nvidia_arch().is_some() {
                ("generic".to_string(), "NVIDIA detected — using SSCP")
            } else if let Some(gfx) = detect_amd_gfx() {
                (format!("hip:{gfx}"), "rocminfo probe")
            } else {
                ("generic".to_string(), "fallback (LLVM SSCP)")
            }
        }
    };
    println!("cargo:warning=xchplot2: ACPP_TARGETS={acpp_targets} ({acpp_source})");

    // XCHPLOT2_BUILD_CUDA toggles whether the CUB sort + nvcc-compiled
    // CUDA TUs (AesGpu.cu, SortCuda.cu, AesGpuBitsliced.cu) are built.
    // Autodetect prefers actual GPU vendor over toolchain availability:
    // dual-toolchain hosts (AMD / Intel GPU, CUDA Toolkit also installed)
    // would otherwise try to compile SortCuda.cu through nvcc + AdaptiveCpp
    // — which has triggered upstream `half.hpp` compile errors for at
    // least one Radeon Pro W5700 user. Priority order:
    //   NVIDIA GPU → ON      (CUB is the fast path)
    //   AMD GPU    → OFF     (SYCL/HIP path; CUB unused anyway)
    //   Intel GPU  → OFF     (SYCL/L0 path)
    //   no GPU, nvcc present → ON  (CI / container build)
    //   no GPU, no nvcc      → OFF
    let (build_cuda, bc_source) = match env::var("XCHPLOT2_BUILD_CUDA") {
        Ok(v) if !v.is_empty() => (v, "$XCHPLOT2_BUILD_CUDA"),
        _ => {
            // Same usable-arch gate as the ACPP_TARGETS block: an
            // ancient secondary NVIDIA card (e.g. sm_52 alongside an
            // AMD W5700) must NOT claim the CUB path, because
            // AdaptiveCpp half.hpp references sm_53+ FP16 intrinsics
            // that the old card's cuda_fp16.h guards out.
            let nvidia_gpu = usable_nvidia_arch().is_some();
            // amd_gpu_present, NOT detect_amd_gfx().is_some() — the
            // latter returns None for RDNA1 (we route those through
            // SSCP instead of an AOT hip:* target), but the GPU is
            // there and we MUST skip CUDA TUs to avoid running
            // SortCuda.cu's CUB calls against AMD silicon.
            let amd_gpu    = amd_gpu_present();
            let intel_gpu  = detect_intel_gpu();
            if nvidia_gpu {
                ("ON".to_string(), "NVIDIA GPU detected")
            } else if amd_gpu {
                ("OFF".to_string(), "AMD GPU detected — skipping CUDA TUs")
            } else if intel_gpu {
                ("OFF".to_string(), "Intel GPU detected — skipping CUDA TUs")
            } else if detect_nvcc() {
                ("ON".to_string(), "no GPU probe, nvcc present — assuming CI/container")
            } else {
                ("OFF".to_string(), "no GPU, no nvcc — skipping CUDA TUs")
            }
        },
    };
    println!("cargo:warning=xchplot2: XCHPLOT2_BUILD_CUDA={build_cuda} ({bc_source})");

    // Preflight critical system deps BEFORE invoking cmake. Cargo
    // install users land here without reading README.md's Build
    // section; without preflight, missing deps surface as cryptic
    // CMake / AdaptiveCpp errors deep in the configure / build.
    let missing = preflight(build_cuda == "ON");
    if !missing.is_empty() {
        let bullets = missing.iter()
            .map(|m| format!("  - {m}"))
            .collect::<Vec<_>>()
            .join("\n");
        // Surface the container path proactively when we can already
        // see podman/docker — for many users that's the smoothest fix
        // because the toolchain stays bundled in the image.
        let next_steps = match detect_container_engine() {
            Some(engine) => format!(
                "Two ways forward, pick whichever fits:\n\n  \
                   - Install those packages on the host:\n      \
                       ./scripts/install-deps.sh --gpu nvidia    # auto-detects vendor + AdaptiveCpp\n\n  \
                   - Or, since you have {engine} installed, build inside a container —\n    \
                     toolchain stays in the image, no host changes needed:\n      \
                       ./scripts/build-container.sh\n      \
                       {engine} compose run --rm cuda plot ...    # or rocm / intel / cpu\n\n\
                 If install-deps.sh just ran and you're still seeing this, check\n\
                 its tail output — it names the failed package before exiting."
            ),
            None => format!(
                "Two ways forward, pick whichever fits:\n\n  \
                   - Install those packages on the host:\n      \
                       ./scripts/install-deps.sh --gpu nvidia    # auto-detects vendor + AdaptiveCpp\n\n  \
                   - Or build inside a container (no host toolchain needed beyond\n    \
                     podman or docker — install whichever you prefer first):\n      \
                       ./scripts/build-container.sh\n\n\
                 If install-deps.sh just ran and you're still seeing this, check\n\
                 its tail output — it names the failed package before exiting."
            ),
        };
        panic!("\nxchplot2: build prerequisites missing:\n{bullets}\n\n{next_steps}\n");
    }

    // CUDA 13.0 dropped codegen for sm_50/52/53/60/61/62/70/72 entirely
    // — its nvcc fails the CMake TryCompile probe with "Unsupported gpu
    // architecture 'compute_61'" on Pascal, "compute_70" on Volta, etc.
    // Catch that mismatch HERE so the failure surfaces with a clear fix
    // path, not buried in a CMakeError.log 40 lines into a TryCompile.
    // Skipped when nvcc version or arch list can't be parsed (treat as
    // "preflight not actionable, let cmake try" — preserves prior
    // behaviour for unusual setups).
    if build_cuda == "ON" {
        if let (Some(nvcc_major), Some(min)) = (detect_nvcc_major(), min_arch(&cuda_arch)) {
            if nvcc_major >= 13 && min < 75 {
                // Container detection: Docker writes /.dockerenv, Podman writes
                // /run/.containerenv. Either presence means the host-side fixes
                // (apt install cuda-toolkit, set CUDA_PATH) are not actionable
                // from inside this build — the user needs to rebuild the image
                // with a different BASE_DEVEL.
                let in_container = std::path::Path::new("/.dockerenv").exists()
                    || std::path::Path::new("/run/.containerenv").exists();
                let fix_block = if in_container {
                    format!(
                        "You're building inside a container — the toolkit comes from the\n\
                         base image, not the host. Rebuild the image with a CUDA 12.x base:\n  \
                           - Recommended: rerun scripts/build-container.sh on the host;\n    \
                             it auto-pins nvidia/cuda:12.9.1 when CUDA_ARCH < 75.\n  \
                           - Or pass --build-arg explicitly:\n      \
                               podman build -t xchplot2:cuda \\\n        \
                                 --build-arg BASE_DEVEL=docker.io/nvidia/cuda:12.9.1-devel-ubuntu24.04 \\\n        \
                                 --build-arg BASE_RUNTIME=docker.io/nvidia/cuda:12.9.1-devel-ubuntu24.04 \\\n        \
                                 --build-arg CUDA_ARCH={min} \\\n        \
                                 .\n  \
                           - Or via compose with env vars:\n      \
                               CUDA_ARCH={min} \\\n        \
                                 BASE_DEVEL=docker.io/nvidia/cuda:12.9.1-devel-ubuntu24.04 \\\n        \
                                 BASE_RUNTIME=docker.io/nvidia/cuda:12.9.1-devel-ubuntu24.04 \\\n        \
                                 podman compose build cuda\n"
                    )
                } else {
                    "Fix one of:\n  \
                       - Install CUDA 12.9 (last toolkit with Pascal/Volta support):\n      \
                           Ubuntu/Debian:  sudo apt install cuda-toolkit-12-9\n      \
                           Arch:           pacman -S cuda  (or pin to a 12.x channel)\n    \
                         then point the build at it:\n      \
                           CUDA_PATH=/usr/local/cuda-12.9 cargo install \\\n      \
                             --git https://github.com/Jsewill/xchplot2 --force\n  \
                       - Or override the arch (only valid if you actually have a Turing+ card):\n      \
                           CUDA_ARCHITECTURES=75 cargo install \\\n      \
                             --git https://github.com/Jsewill/xchplot2 --force\n  \
                       - Or use the container path — scripts/build-container.sh auto-pins\n    \
                         the 12.9 base image when it detects a pre-Turing GPU.\n".to_string()
                };
                panic!(
                    "\nxchplot2: CUDA Toolkit {nvcc_major}.x dropped codegen for sm_{min} \
                     (Pascal / Volta / pre-Turing).\n\
                     \n\
                     Detected:\n  \
                       nvcc {nvcc_major}.x\n  \
                       target arch: sm_{min} (from CUDA_ARCHITECTURES={cuda_arch})\n\
                     \n\
                     {fix_block}"
                );
            }
        }
    }

    // ---- configure ----
    // Paths are handed to cmake as OS strings, so a non-UTF-8 manifest or
    // OUT_DIR path does not panic here.
    let status = Command::new("cmake")
        .arg("-S").arg(&manifest_dir)
        .arg("-B").arg(&cmake_build)
        .arg("-DCMAKE_BUILD_TYPE=Release")
        .arg(format!("-DCMAKE_CUDA_ARCHITECTURES={cuda_arch}"))
        .arg(format!("-DACPP_TARGETS={acpp_targets}"))
        .arg(format!("-DXCHPLOT2_BUILD_CUDA={build_cuda}"))
        .status()
        .expect("failed to invoke cmake — is it installed?");
    if !status.success() {
        panic!("cmake configure failed");
    }

    // ---- build only the static libs we need; skip the cmake-built
    // executable (we're producing our own via cargo) and the parity tests.
    let status = Command::new("cmake")
        .arg("--build").arg(&cmake_build)
        .args(["--target", "xchplot2_cli", "--parallel"])
        .status()
        .expect("failed to invoke cmake --build");
    if !status.success() {
        panic!("cmake build of xchplot2_cli failed");
    }

    // ---- tell rustc where each static lib lives ----
    let cb = cmake_build.display();
    println!("cargo:rustc-link-search=native={cb}");
    println!("cargo:rustc-link-search=native={cb}/fse");
    println!("cargo:rustc-link-search=native={cb}/keygen-rs-target/release");

    // Order matters: xchplot2_cli depends on pos2_gpu_host depends on pos2_gpu.
    // Wrap in --start-group/--end-group so the static linker resolves any
    // remaining cross-archive references without us having to pin order.
    //
    // --allow-multiple-definition: pos2_keygen.a is a Rust staticlib, so it
    // bundles its own copy of libstd (rust_eh_personality, ARGV_INIT_ARRAY,
    // EMPTY_PANIC). The host xchplot2 binary also brings in libstd. Both
    // copies come from the same toolchain and are bit-identical, so letting
    // the linker pick the first is safe. The clean alternative is to make
    // keygen-rs a Rust workspace member with crate-type = ["rlib"], but
    // that breaks the standalone CMake-only build path which expects a
    // staticlib for the cmake-built executable.
    println!("cargo:rustc-link-arg=-Wl,--allow-multiple-definition");
    println!("cargo:rustc-link-arg=-Wl,--start-group");
    println!("cargo:rustc-link-lib=static=xchplot2_cli");
    println!("cargo:rustc-link-lib=static=pos2_gpu_host");
    println!("cargo:rustc-link-lib=static=pos2_gpu");
    println!("cargo:rustc-link-lib=static=pos2_keygen");
    println!("cargo:rustc-link-lib=static=fse");
    println!("cargo:rustc-link-arg=-Wl,--end-group");

    // ---- AdaptiveCpp runtime ----
    // The static archives produced by CMake reference hipsycl::rt::* symbols
    // that live in libacpp-rt + libacpp-common (shared). CMake writes the
    // exact lib directory to $cmake_build/acpp-prefix.txt during configure;
    // honour that, then $ACPP_PREFIX / standard locations as fallbacks.
    let acpp_lib_dir = std::fs::read_to_string(cmake_build.join("acpp-prefix.txt"))
        .ok()
        .map(|s| s.trim().to_string())
        .filter(|s| !s.is_empty())
        .or_else(|| env::var("ACPP_PREFIX").ok().map(|p| format!("{p}/lib")))
        .or_else(|| env::var("AdaptiveCpp_ROOT").ok().map(|p| format!("{p}/lib")))
        .unwrap_or_else(|| {
            for guess in ["/opt/adaptivecpp/lib", "/usr/local/lib",
                          "/usr/lib/x86_64-linux-gnu", "/usr/lib"] {
                if std::path::Path::new(&format!("{guess}/libacpp-rt.so")).exists() {
                    return guess.to_string();
                }
            }
            "/opt/adaptivecpp/lib".to_string()
        });
    println!("cargo:rustc-link-search=native={acpp_lib_dir}");
    println!("cargo:rustc-link-arg=-Wl,-rpath,{acpp_lib_dir}");
    println!("cargo:rustc-link-lib=acpp-rt");
    println!("cargo:rustc-link-lib=acpp-common");

    // ---- LLVM OpenMP runtime (SYCL→OMP backend) ----
    // AdaptiveCpp's OMP backend lowers SYCL nd_range kernels to OpenMP
    // parallel loops. The compiled .o files reference libomp's runtime
    // symbols (__kmpc_fork_call, __kmpc_global_thread_num, __kmpc_barrier,
    // __kmpc_for_static_init_8u / _fini). cc / rust-lld don't auto-link
    // libomp — pos2_gpu's SYCL TUs would then fail to link with
    //
    //   rust-lld: error: undefined symbol: __kmpc_fork_call
    //
    // Only fire on builds where ACPP_TARGETS includes an omp target; HIP
    // and SSCP-with-CUDA backends translate to their own runtimes and
    // don't need libomp at link time. The flavoured spellings
    // ("omp.library-only", "omp.accelerated") select the same OpenMP
    // backend as plain "omp", so they are matched too.
    //
    // Locations:
    //   Ubuntu/Debian (apt libomp-18-dev): /usr/lib/llvm-18/lib/libomp.so
    //   Arch (pacman openmp):              /usr/lib/libomp.so
    //   AdaptiveCpp install (bundled):     $ACPP_PREFIX/lib/libomp.so
    let targets_omp = acpp_targets
        .split(';')
        .map(str::trim)
        .any(|t| t == "omp" || t.starts_with("omp."));
    if targets_omp {
        for guess in ["/usr/lib/llvm-18/lib", "/usr/lib/llvm-19/lib",
                      "/usr/lib/llvm-20/lib", "/usr/lib"] {
            if std::path::Path::new(&format!("{guess}/libomp.so")).exists()
                || std::path::Path::new(&format!("{guess}/libomp.so.5")).exists() {
                println!("cargo:rustc-link-search=native={guess}");
                println!("cargo:rustc-link-arg=-Wl,-rpath,{guess}");
                break;
            }
        }
        println!("cargo:rustc-link-lib=omp");
    }

    // ---- CUDA runtime ----
    // Only needed when XCHPLOT2_BUILD_CUDA=ON — then the nvcc-compiled
    // TUs (SortCuda, AesGpu, AesGpuBitsliced) pull in cudart / cudadevrt.
    // On the AMD/Intel OFF path there's no CUDA Toolkit on the image and
    // nothing in the static archives references cudart, so emitting
    // `-lcudart` would make rust-lld fail with "unable to find library".
    if build_cuda == "ON" {
        // Order matters: the *first* libcudart_static.a the linker
        // finds wins. If the user has multiple toolkits installed and
        // /usr/local/cuda symlinks to a stale CUDA 11.x leftover, we'd
        // statically link the wrong runtime and fail on the v2 ABI.
        // The reliable source of truth is the nvcc that CMake actually
        // invoked — its sibling lib dirs always hold a matching
        // libcudart_static.a. Canonicalize `which nvcc` to resolve
        // the `/usr/local/cuda` symlink chain and put that toolkit's
        // lib dirs first on the search list. See cuda-only branch
        // commit history for the user-bug-report context.
        let nvcc_toolkit_root = nvcc_canonical_toolkit_root();
        let cuda_root = env::var("CUDA_PATH")
            .or_else(|_| env::var("CUDA_HOME"))
            .ok()
            .or_else(|| nvcc_toolkit_root.clone())
            .unwrap_or_else(|| {
                for guess in ["/opt/cuda", "/usr/local/cuda"] {
                    if std::path::Path::new(guess).exists() { return guess.to_string(); }
                }
                "/opt/cuda".to_string()
            });
        // nvcc's own toolkit dirs FIRST (when distinct from cuda_root),
        // so the linker resolves libcudart_static.a from there ahead
        // of any stale lookalikes under /usr/local/cuda or /opt/cuda.
        if let Some(ref root) = nvcc_toolkit_root {
            if root != &cuda_root {
                println!("cargo:rustc-link-search=native={root}/targets/x86_64-linux/lib");
                println!("cargo:rustc-link-search=native={root}/lib64");
                println!("cargo:rustc-link-search=native={root}/lib");
            }
        }
        println!("cargo:rustc-link-search=native={cuda_root}/lib64");
        println!("cargo:rustc-link-search=native={cuda_root}/lib");
        // Per-host-triple library layout used by recent NVIDIA toolkits
        // (apt repo cuda-toolkit-12-5+ reorganised x86_64 too, not just
        // ARM). Also covers Jetson JetPack/L4T (aarch64-linux) and
        // GH200/SBSA servers. Harmless when the dir doesn't exist.
        println!("cargo:rustc-link-search=native={cuda_root}/targets/x86_64-linux/lib");
        println!("cargo:rustc-link-search=native={cuda_root}/targets/aarch64-linux/lib");
        println!("cargo:rustc-link-search=native={cuda_root}/targets/sbsa-linux/lib");
        // Distro-packaged CUDA fallbacks. Debian/Ubuntu's
        // `apt install nvidia-cuda-toolkit` ships libcudart_static.a /
        // libcudadevrt.a at the multi-arch path /usr/lib/x86_64-linux-gnu,
        // not the /usr/local/cuda layout the NVIDIA apt repo / runfile
        // installer uses. Fedora/RHEL parks them at /usr/lib64. Emit
        // both as additional search paths so cargo install works on
        // stock distro packages too. Gated on dir existence so we don't
        // pollute the search list on non-Linux hosts.
        for extra in ["/usr/lib/x86_64-linux-gnu", "/usr/lib64"] {
            if std::path::Path::new(extra).is_dir() {
                println!("cargo:rustc-link-search=native={extra}");
            }
        }
        // Static-link the CUDA runtime so we don't depend on whatever
        // libcudart.so happens to be earliest on the user's link path.
        // Reported failure was `undefined symbol: cudaGetDeviceProperties_v2`
        // — that symbol was added in CUDA 12.0; users with a stale
        // pre-12 libcudart.so somewhere on the linker path (mixed
        // installs, post-upgrade leftovers, certain WSL setups) saw
        // the linker resolve against the old lib even though nvcc
        // compiled against 12-era headers. libcudart_static.a is the
        // toolkit's own runtime, so it always matches our headers and
        // there's nothing to mismatch against. Costs ~600 KB of binary
        // size; eliminates a whole class of distro-install bugs.
        //
        // cudart_static drags in libculibos (CUDA's internal OS shim)
        // plus pthread/dl/rt (already linked below). cudadevrt is
        // .a-only (no .so) — separable-compilation device-code linker,
        // always static.
        println!("cargo:rustc-link-lib=static=cudart_static");
        println!("cargo:rustc-link-lib=static=culibos");
        println!("cargo:rustc-link-lib=static=cudadevrt");

        // WSL defensive rpath. libcudart_static's dlopen("libcuda.so.1")
        // needs /usr/lib/wsl/lib on the runtime loader path. WSL distros
        // usually set that up via /etc/ld.so.conf.d/ld.wsl.conf, but
        // non-wslg / custom images can be missing the entry — then the
        // binary installs fine but fails at first GPU call. Bake
        // /usr/lib/wsl/lib into the binary's runtime search path.
        //
        // --disable-new-dtags emits DT_RPATH (legacy) instead of
        // DT_RUNPATH. We need DT_RPATH because it propagates to dlopen
        // calls made from libraries linked into this binary (libcudart);
        // DT_RUNPATH only helps DT_NEEDED resolution we declare directly.
        //
        // No cost on non-WSL: loader hits the missing dir, skips it.
        println!("cargo:rustc-link-arg=-Wl,-rpath,/usr/lib/wsl/lib");
        println!("cargo:rustc-link-arg=-Wl,--disable-new-dtags");
    }

    // ---- HIP runtime ----
    // When ACPP_TARGETS is "hip:gfxXXXX", AdaptiveCpp's HIP backend
    // compiles SYCL kernels into HIP fat binaries whose host-side
    // launcher stubs reference __hipPushCallConfiguration /
    // __hipRegisterFatBinary / hipLaunchKernel from libamdhip64. Without
    // -lamdhip64 rust-lld fails with "undefined symbol: __hip*".
    // Honour $ROCM_PATH if set, else fall back to /opt/rocm (standard
    // bare-metal + all official ROCm container images).
    // Link libamdhip64 whenever ROCm is reachable, not just when
    // ACPP_TARGETS is hip-prefixed. ACPP_TARGETS=generic (SSCP JIT) on
    // an AMD host still needs the HIP runtime at load time —
    // librt-backend-hip.so dlopens libamdhip64, but glibc doesn't walk
    // the binary's RUNPATH for transitive backend deps. By making
    // libamdhip64 a direct dependency of the binary, the loader pulls
    // it in at startup via RUNPATH, and AdaptiveCpp's runtime dlopen
    // finds the already-loaded handle. Without this, an AMD-host
    // build with the new RDNA1 default (generic instead of the
    // gfx1013 spoof) fails at first queue construction with
    // "No matching device" because HIP can't initialise.
    //
    // We pass the full .so path (rather than `cargo:rustc-link-lib=amdhip64`
    // which becomes `-lamdhip64`) because the SSCP path emits no host-
    // side HIP symbol references, and the linker's default --as-needed
    // would drop a name-only -l flag from NEEDED. A positional path
    // argument bypasses --as-needed and keeps the library in the link.
    // Same approach as CMakeLists.txt's `link_libraries(.../libamdhip64.so)`.
    let rocm_root = env::var("ROCM_PATH")
        .unwrap_or_else(|_| "/opt/rocm".to_string());
    let amdhip_lib = format!("{rocm_root}/lib/libamdhip64.so");
    let amdhip_present = std::path::Path::new(&amdhip_lib).exists();
    // ACPP_TARGETS is a ';'-separated list, so look for a hip:* entry at
    // any position rather than only at the start of the string.
    let targets_hip = acpp_targets
        .split(';')
        .any(|t| t.trim().starts_with("hip:"));
    if targets_hip || amdhip_present {
        println!("cargo:rustc-link-search=native={rocm_root}/lib");
        println!("cargo:rustc-link-search=native={rocm_root}/hip/lib");
        println!("cargo:rustc-link-arg=-Wl,-rpath,{rocm_root}/lib");
        if amdhip_present {
            // Wrap with --no-as-needed/--as-needed: even a positional
            // .so path gets dropped from NEEDED by ld's --as-needed
            // when no symbol references it (true for the SSCP path
            // that has zero host-side HIP symbol refs). The library
            // itself must end up in DT_NEEDED so AdaptiveCpp's runtime
            // dlopen finds it already loaded; otherwise HIP backend
            // never initialises and we throw "No matching device".
            println!("cargo:rustc-link-arg=-Wl,--no-as-needed");
            println!("cargo:rustc-link-arg={amdhip_lib}");
            println!("cargo:rustc-link-arg=-Wl,--as-needed");
        } else {
            // Fallback: ROCm not at /opt/rocm/lib but the user set
            // ACPP_TARGETS=hip:* explicitly. AOT HIP fat binaries
            // reference HIP symbols directly, so --as-needed keeps
            // -lamdhip64 in NEEDED on that path.
            println!("cargo:rustc-link-lib=amdhip64");
        }
    }

    // C++ stdlib + POSIX bits the static libs (Rust std + pthread inside
    // pos2_keygen, std::async + std::thread in pos2_gpu_host) reach for.
    println!("cargo:rustc-link-lib=stdc++");
    println!("cargo:rustc-link-lib=pthread");
    println!("cargo:rustc-link-lib=dl");
    println!("cargo:rustc-link-lib=m");
    println!("cargo:rustc-link-lib=rt");

    // ---- rebuild triggers ----
    for p in &[
        "src", "tools", "keygen-rs/src", "keygen-rs/Cargo.toml",
        "keygen-rs/Cargo.lock", "CMakeLists.txt", "build.rs",
    ] {
        println!("cargo:rerun-if-changed={p}");
    }
    // Once a build script emits any rerun-if-* directive, Cargo re-runs it
    // only for the listed triggers. Every environment variable consulted
    // above therefore has to be named here, otherwise changing e.g.
    // ACPP_TARGETS or XCHPLOT2_BUILD_CUDA would silently reuse the
    // previous configuration.
    for var in &[
        "CUDA_ARCHITECTURES", "CUDA_PATH", "CUDA_HOME",
        "ACPP_TARGETS", "XCHPLOT2_BUILD_CUDA",
        "ACPP_PREFIX", "AdaptiveCpp_ROOT", "ROCM_PATH",
    ] {
        println!("cargo:rerun-if-env-changed={var}");
    }
}

/// Find the CUDA toolkit root that owns the first reachable `nvcc`.
///
/// Candidate binaries are considered in this order: `$CUDA_PATH/bin/nvcc`,
/// `$CUDA_HOME/bin/nvcc`, then `nvcc` inside each `$PATH` entry. The first
/// candidate that is a regular file and can be canonicalized (symlinks
/// resolved) decides the answer: its parent-of-`bin` directory is the
/// toolkit root, whose lib subdirectories hold the libcudart_static.a
/// matching what that nvcc compiled against.
///
/// Returns `None` when no candidate qualifies, or when the selected
/// toolkit path is not valid UTF-8 — the caller then falls back to the
/// legacy /opt/cuda / /usr/local/cuda probe.
fn nvcc_canonical_toolkit_root() -> Option<String> {
    // Explicit toolkit overrides take precedence over PATH entries.
    let from_overrides = ["CUDA_PATH", "CUDA_HOME"]
        .iter()
        .filter_map(|name| env::var(name).ok())
        .map(|root| std::path::PathBuf::from(root).join("bin").join("nvcc"));

    let search_path = env::var("PATH").ok();
    let from_path = search_path
        .iter()
        .flat_map(|raw| env::split_paths(raw))
        .map(|dir| dir.join("nvcc"));

    from_overrides
        .chain(from_path)
        .filter(|candidate| candidate.is_file())
        .filter_map(|candidate| std::fs::canonicalize(&candidate).ok())
        .find_map(|resolved| {
            // <root>/bin/nvcc → <root>. A binary too close to the
            // filesystem root has no grandparent and is skipped; a
            // non-UTF-8 root ends the search with None.
            resolved
                .parent()
                .and_then(|bin| bin.parent())
                .map(|toolkit| toolkit.to_str().map(String::from))
        })
        .flatten()
}