diff --git a/Project.toml b/Project.toml
index d18d00a0..311ac256 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,6 +1,6 @@
 name = "GPUCompiler"
 uuid = "61eb1bfa-7361-4325-ad38-22787b887f55"
-version = "1.12.0"
+version = "2.0.0"
 authors = ["Tim Besard <tim.besard@gmail.com>"]
 
 [workspace]
diff --git a/src/interface.jl b/src/interface.jl
index dd82630b..acaa8936 100644
--- a/src/interface.jl
+++ b/src/interface.jl
@@ -298,7 +298,7 @@ pass_by_ref(@nospecialize(job::CompilerJob)) = false
 valid_function_pointer(@nospecialize(job::CompilerJob), ptr::Ptr{Cvoid}) = false
 
 # Care is required for anything that impacts:
-#   - method_table
+#   - method_tables
 #   - inference_params
 #   - optimization_params
 # By default that is just always_inline
@@ -306,11 +306,11 @@ valid_function_pointer(@nospecialize(job::CompilerJob), ptr::Ptr{Cvoid}) = false
 struct GPUCompilerCacheToken
     target_type::Type
     always_inline::Bool
-    method_table::Core.MethodTable
+    method_tables::Tuple{Vararg{Core.MethodTable}}
 end
 
 ci_cache_token(@nospecialize(job::CompilerJob)) =
-    GPUCompilerCacheToken(typeof(job.config.target), job.config.always_inline, method_table(job))
+    GPUCompilerCacheToken(typeof(job.config.target), job.config.always_inline, method_tables(job))
 
 # the codeinstance cache to use -- should only be used for the constructor
 if VERSION >= v"1.11.0-DEV.1552"
@@ -327,10 +327,38 @@ function ci_cache(@nospecialize(job::CompilerJob))
 end
 end
 
-# the method table to use
-# deprecate method_table on next-breaking release
-method_table(@nospecialize(job::CompilerJob)) = GLOBAL_METHOD_TABLE
-method_table_view(@nospecialize(job::CompilerJob)) = get_method_table_view(job.world, method_table(job))
+"""
+    method_tables(job::CompilerJob) -> Tuple{Vararg{Core.MethodTable}}
+
+Return the back-end's method tables, highest priority first (the first match wins).
+During inference they are stacked on top of GPUCompiler's internal runtime-intrinsic
+overlay table, and they also form the discriminator component of
+[`ci_cache_token`](@ref). The default is `()`, i.e. no back-end tables.
+
+A back-end with a single table declares it as follows:
+
+    Base.Experimental.@MethodTable(my_method_table)
+    GPUCompiler.method_tables(::MyCompilerJob) = (my_method_table,)
+
+Back-ends whose overlays live in several `Core.MethodTable`s (e.g. a package-local one
+plus one from a shared intrinsics library) return all of them. To control the
+inference-side `Core.Compiler.MethodTableView` directly, override the internal hook
+[`method_table_view`](@ref) instead; most back-ends should not need to.
+"""
+function method_tables(@nospecialize(job::CompilerJob))
+    return ()
+end
+
+# Assemble the method table view used for inference: GPUCompiler's runtime-intrinsic
+# overlay table sits at the bottom and the back-end's `method_tables(job)` are stacked
+# above it, first entry outermost, so earlier tables take priority. Back-ends should
+# normally customize `method_tables(job)` rather than override this function.
+function method_table_view(@nospecialize(job::CompilerJob))
+    bottom = CC.OverlayMethodTable(job.world, GLOBAL_METHOD_TABLE)
+    return foldr(method_tables(job); init=bottom) do mt, below
+        StackedMethodTable(job.world, mt, below)
+    end
+end
 
 # the inference parameters to use when constructing the GPUInterpreter
 function inference_params(@nospecialize(job::CompilerJob))
diff --git a/src/jlgen.jl b/src/jlgen.jl
index 21216d79..af115c6b 100644
--- a/src/jlgen.jl
+++ b/src/jlgen.jl
@@ -295,8 +295,6 @@ end # !HAS_INTEGRATED_CACHE
 
 ## method overrides
 
-Base.Experimental.@MethodTable(GLOBAL_METHOD_TABLE)
-
 # Implements a priority lookup for method tables, where the first match in the stack get's returned.
 # An alternative to this would be to use a "Union" where we would query the parent method table and
 # do a most-specific match.
@@ -402,8 +400,6 @@ else
     maybe_cached(mtv::CC.MethodTableView) = mtv
 end
 
-get_method_table_view(world::UInt, mt::CC.MethodTable) = CC.OverlayMethodTable(world, mt)
-
 # VERSION >= v"1.14.0-DEV.1691"
 const INFERENCE_CACHE_TYPE = isdefined(CC, :InferenceCache) ? CC.InferenceCache : Vector{CC.InferenceResult}
 
@@ -493,7 +489,11 @@ CC.lock_mi_inference(interp::GPUInterpreter, mi::MethodInstance) = nothing
 CC.unlock_mi_inference(interp::GPUInterpreter, mi::MethodInstance) = nothing
 
 function CC.add_remark!(interp::GPUInterpreter, sv::CC.InferenceState, msg)
-    @safe_debug "Inference remark during GPU compilation of $(sv.linfo): $msg"
+    # NOTE: deliberately a no-op. Emitting any logging here pulls all the components
+    # needed to evaluate the log message into the IR for the compile job, even when the
+    # remark never fires — those ccalls into the Julia CPU runtime then poison AOT
+    # compilation. See JuliaGPU/GPUCompiler.jl#749.
+    return nothing
 end
 
 CC.may_optimize(interp::GPUInterpreter) = true
diff --git a/src/runtime.jl b/src/runtime.jl
index 2b11d915..ea41a433 100644
--- a/src/runtime.jl
+++ b/src/runtime.jl
@@ -82,8 +82,9 @@ function compile(def, return_type, types, llvm_return_type=nothing, llvm_types=n
     #        using the new nonrecursive codegen to handle function lookup ourselves?
     if def isa Symbol
         args = [gensym() for typ in types]
-        @eval @inline $def($(args...)) =
-            ccall($("extern $llvm_name"), llvmcall, $return_type, ($(types...),), $(args...))
+        @eval GPUCompiler.@device_function($return_type,
+            @inline $def($(args...)) =
+                ccall($("extern $llvm_name"), llvmcall, $return_type, ($(types...),), $(args...)))
     end
 
     return
diff --git a/src/utils.jl b/src/utils.jl
index b98ead9d..8278bd2e 100644
--- a/src/utils.jl
+++ b/src/utils.jl
@@ -242,3 +242,35 @@ end
         return inits
     end
 end
+
+
+## device function definitions
+
+# GPUCompiler-owned method table holding overlays for GPU runtime intrinsics. Back-end
+# method tables (declared via `method_tables(job)`) are stacked on top of this by
+# `method_table_view`, so back-end overrides win first and these overlays remain
+# reachable underneath. This is an internal table; back-ends must not `@overlay` it.
+Base.Experimental.@MethodTable(GLOBAL_METHOD_TABLE)
+
+"""
+    @device_function rt ex
+
+Define `ex` twice: as a CPU-visible stub whose body is just `rt(1)`, and as an overlay
+in `GLOBAL_METHOD_TABLE` holding the real device body. This keeps bodies such as
+`ccall("extern gpu_*", ...)` out of the native cache (so `compile=all` sysimages and
+juliac don't try to resolve nonexistent symbols), while GPU compilation still finds
+the real body through the stacked method table.
+"""
+macro device_function(rt, ex)
+    device_def = macroexpand(__module__, ex)
+    stub = splitdef(device_def)
+    # the CPU stub merely constructs a value of the expected type. NOTE: the argument
+    # is 1 (rather than 0) so that a `Ptr` result doesn't get lowered to C_NULL.
+    stub[:body] = quote
+        $rt(1)
+    end
+    cpu_def = combinedef(stub)
+    # NOTE: no `@consistent_overlay` because the CPU stub returns a fake value
+    overlay = :(Base.Experimental.@overlay($(GPUCompiler).GLOBAL_METHOD_TABLE, $device_def))
+    return esc(Expr(:block, cpu_def, overlay))
+end
diff --git a/test/helpers/native.jl b/test/helpers/native.jl
index 656028f4..b86e4e79 100644
--- a/test/helpers/native.jl
+++ b/test/helpers/native.jl
@@ -19,7 +19,7 @@ module Runtime end
 NativeCompilerJob = CompilerJob{NativeCompilerTarget,CompilerParams}
 GPUCompiler.runtime_module(::NativeCompilerJob) = Runtime
 
-GPUCompiler.method_table(@nospecialize(job::NativeCompilerJob)) = job.config.params.method_table
+GPUCompiler.method_tables(@nospecialize(job::NativeCompilerJob)) = (job.config.params.method_table,)
 GPUCompiler.can_safepoint(@nospecialize(job::NativeCompilerJob)) = job.config.params.entry_safepoint
 
 function create_job(@nospecialize(func), @nospecialize(types);
diff --git a/test/utils.jl b/test/utils.jl
index 4ce2258c..edbadc42 100644
--- a/test/utils.jl
+++ b/test/utils.jl
@@ -171,3 +171,37 @@ end
     # Check that we can call this function from the CPU, to support deferred codegen for Enzyme.
     @test ccall("extern deferred_codegen", llvmcall, UInt, (UInt,), 3) == 3
 end
+
+@testset "@device_function macro" begin
+    using InteractiveUtils
+    # The macro should (1) define a CPU-visible function that returns the expected type
+    # without referencing the `extern gpu_*` symbol (so AOT compilation can link), and
+    # (2) register an overlay in GPUCompiler.GLOBAL_METHOD_TABLE that GPU compilation
+    # finds via the stacked method-table view.
+    test_mod = @eval module $(gensym("DeviceFunctionTest"))
+        using GPUCompiler
+
+        GPUCompiler.@device_function(Ptr{Nothing},
+            @inline test_device_ptr() = ccall("extern gpu_test", llvmcall, Ptr{Nothing}, ()))
+
+        GPUCompiler.@device_function(Nothing,
+            @inline test_device_nothing() = ccall("extern gpu_test2", llvmcall, Nothing, ()))
+    end
+
+    @test isdefined(test_mod, :test_device_ptr)
+    @test isdefined(test_mod, :test_device_nothing)
+
+    # the overlays should be findable in GLOBAL_METHOD_TABLE. `findsup` always returns a
+    # tuple and falls back to the CPU method, so check the match is not the CPU stub.
+    mt_view = Core.Compiler.OverlayMethodTable(Base.get_world_counter(),
+                                               GPUCompiler.GLOBAL_METHOD_TABLE)
+    for f in (test_mod.test_device_ptr, test_mod.test_device_nothing)
+        found = first(Core.Compiler.findsup(Tuple{typeof(f)}, mt_view))
+        @test found !== nothing && found.method !== which(f, Tuple{})
+    end
+
+    # the CPU stubs must not reference the extern gpu_* symbol — that's the whole point
+    buf = IOBuffer()
+    code_llvm(buf, test_mod.test_device_ptr, Tuple{}; debuginfo=:none)
+    @test !occursin("gpu_test", String(take!(buf)))
+end