JuliaMath · dannys4 · Apr 27, 2026 · Apr 21, 2026 · Apr 21, 2026
diff --git a/src/FFTA.jl b/src/FFTA.jl
@@ -10,6 +10,7 @@ using Reexport: @reexport
 @reexport using AbstractFFTs
 
 include("callgraph.jl")
+include("singleton_twiddle.jl")
 include("algos.jl")
 include("plan.jl")
 end
diff --git a/src/algos.jl b/src/algos.jl
@@ -57,7 +57,7 @@ function fft_composite!(out::AbstractVector{T}, in::AbstractVector{U}, start_out
     right_idx = idx + root.right
     left = g[left_idx]
     right = g[right_idx]
-    # N  = root.sz
+    N  = root.sz
     N1 = left.sz
     N2 = right.sz
     s_in = root.s_in
@@ -67,14 +67,17 @@ function fft_composite!(out::AbstractVector{T}, in::AbstractVector{U}, start_out
     Lt = left.type
 
     w1 = _conj(root.w, d)
-    wj1 = one(T)
+    Rtype = real(T)
+    # The composite twiddle at position (j1, k2) is `cispi(dir · 2 j1 k2 / N)`.
+    # Singleton's recurrence advances `wk2 = cispi(dir · 2 j1 k2 / N)` in k2
+    # for fixed j1; (α, β) depend on j1 so we reset them at each outer step.
+    dir = twiddle_direction(w1)
     tmp = g.workspace[idx]
 
     if Rt === BLUESTEIN
         R_bluestein_scratchspace = prealloc_blue(N2, d, T)
     end
     for j1 in 0:N1-1
-        wk2 = wj1
         R_start_in  = start_in + j1 * s_in
         R_start_out = 1 + N2 * j1
 
@@ -87,13 +90,13 @@ function fft_composite!(out::AbstractVector{T}, in::AbstractVector{U}, start_out
         end
 
         if j1 > 0
+            αi, βi = singleton_params(dir * Rtype(2 * j1) / Rtype(N))
+            ci, si = one(Rtype), zero(Rtype)
             @inbounds for k2 in 1:N2-1
-                tmp[R_start_out + k2] *= wk2
-                wk2 *= wj1
+                ci, si = singleton_step(ci, si, αi, βi)
+                tmp[R_start_out + k2] *= Complex(ci, si)
             end
         end
-
-        wj1 *= w1
     end
 
     if Lt === BLUESTEIN
@@ -134,16 +137,17 @@ function fft_dft!(out::AbstractVector{T}, in::AbstractVector{T}, N::Int, start_o
     end
     out[start_out] = tmp
 
-    wk = wkn = w
+    Rtype = real(T)
+    dir = twiddle_direction(w)
     @inbounds for d in 1:N-1
-        tmp = in[start_in]
+        t = in[start_in]
+        αk, βk = singleton_params(dir * Rtype(2 * d) / Rtype(N))
+        ck, sk = one(Rtype), zero(Rtype)
         @inbounds for k in 1:N-1
-            tmp += wkn*in[start_in + k*stride_in]
-            wkn *= wk
+            ck, sk = singleton_step(ck, sk, αk, βk)
+            t += Complex(ck, sk) * in[start_in + k*stride_in]
         end
-        out[start_out + d*stride_out] = tmp
-        wk *= w
-        wkn = wk
+        out[start_out + d*stride_out] = t
     end
 end
 
@@ -156,16 +160,16 @@ function fft_dft!(out::AbstractVector{Complex{T}}, in::AbstractVector{T}, N::Int
     end
     out[start_out] = tmp
 
-    wk = wkn = w
+    dir = twiddle_direction(w)
     @inbounds for d in 1:halfN
-        tmp = Complex{T}(in[start_in])
+        t = Complex{T}(in[start_in])
+        αk, βk = singleton_params(dir * T(2 * d) / T(N))
+        ck, sk = one(T), zero(T)
         @inbounds for k in 1:N-1
-            tmp += wkn*in[start_in + k*stride_in]
-            wkn *= wk
+            ck, sk = singleton_step(ck, sk, αk, βk)
+            t += Complex{T}(ck, sk) * in[start_in + k*stride_in]
         end
-        out[start_out + d*stride_out] = tmp
-        wk *= w
-        wkn = wk
+        out[start_out + d*stride_out] = t
     end
 end
 
@@ -214,38 +218,45 @@ function fft_pow2_radix4!(out::AbstractVector{T}, in::AbstractVector{U}, N::Int,
     # ...othersize split the problem in four and recur
     m = N ÷ 4
 
-    w1 = w
-    w2 = w * w1
-    w3 = w * w2
-    w4 = w2 * w2
-
-    fft_pow2_radix4!(out, in, m, start_out                 , stride_out, start_in              , stride_in*4, w4)
-    fft_pow2_radix4!(out, in, m, start_out +   m*stride_out, stride_out, start_in +   stride_in, stride_in*4, w4)
-    fft_pow2_radix4!(out, in, m, start_out + 2*m*stride_out, stride_out, start_in + 2*stride_in, stride_in*4, w4)
-    fft_pow2_radix4!(out, in, m, start_out + 3*m*stride_out, stride_out, start_in + 3*stride_in, stride_in*4, w4)
-
-    wkoe = wkeo = wkoo = one(T)
+    Rtype = real(T)
+    dir = twiddle_direction(w)
+    # Recursive sub-problem step `cispi(dir · 2 / m) = w^4`; use `cispi`
+    # directly so the sub-tree gets a < 1 ULP starting phase.
+    w_sub = cispi(dir * Rtype(2) / Rtype(m))
+
+    fft_pow2_radix4!(out, in, m, start_out                 , stride_out, start_in              , stride_in*4, w_sub)
+    fft_pow2_radix4!(out, in, m, start_out +   m*stride_out, stride_out, start_in +   stride_in, stride_in*4, w_sub)
+    fft_pow2_radix4!(out, in, m, start_out + 2*m*stride_out, stride_out, start_in + 2*stride_in, stride_in*4, w_sub)
+    fft_pow2_radix4!(out, in, m, start_out + 3*m*stride_out, stride_out, start_in + 3*stride_in, stride_in*4, w_sub)
+
+    # Singleton recurrence for the three running twiddles `w^k`, `w^2k`, `w^3k`.
+    α1, β1 = singleton_params(dir * Rtype(2) / Rtype(N))
+    α2, β2 = singleton_params(dir * Rtype(4) / Rtype(N))
+    α3, β3 = singleton_params(dir * Rtype(6) / Rtype(N))
+    c1, s1 = one(Rtype), zero(Rtype)
+    c2, s2 = one(Rtype), zero(Rtype)
+    c3, s3 = one(Rtype), zero(Rtype)
 
     @inbounds for k in 0:m-1
         kee = start_out +  k          * stride_out
         koe = start_out + (k +     m) * stride_out
         keo = start_out + (k + 2 * m) * stride_out
         koo = start_out + (k + 3 * m) * stride_out
         y_kee, y_koe, y_keo, y_koo = out[kee], out[koe], out[keo], out[koo]
-        ỹ_keo = y_keo * wkeo
-        ỹ_koe = y_koe * wkoe
-        ỹ_koo = y_koo * wkoo
-        y_kee_p_y_keo = y_kee + ỹ_keo
-        y_kee_m_y_keo = y_kee - ỹ_keo
-        ỹ_koe_p_ỹ_koo = ỹ_koe + ỹ_koo
-        ỹ_koe_m_ỹ_koo = -(ỹ_koe - ỹ_koo) * minusi
-        out[kee] = y_kee_p_y_keo + ỹ_koe_p_ỹ_koo
-        out[koe] = y_kee_m_y_keo + ỹ_koe_m_ỹ_koo
-        out[keo] = y_kee_p_y_keo - ỹ_koe_p_ỹ_koo
-        out[koo] = y_kee_m_y_keo - ỹ_koe_m_ỹ_koo
-        wkoe *= w1
-        wkeo *= w2
-        wkoo *= w3
+        t_keo = y_keo * Complex(c2, s2)
+        t_koe = y_koe * Complex(c1, s1)
+        t_koo = y_koo * Complex(c3, s3)
+        y_kee_p_y_keo = y_kee + t_keo
+        y_kee_m_y_keo = y_kee - t_keo
+        t_koe_p_t_koo = t_koe + t_koo
+        t_koe_m_t_koo = -(t_koe - t_koo) * minusi
+        out[kee] = y_kee_p_y_keo + t_koe_p_t_koo
+        out[koe] = y_kee_m_y_keo + t_koe_m_t_koo
+        out[keo] = y_kee_p_y_keo - t_koe_p_t_koo
+        out[koo] = y_kee_m_y_keo - t_koe_m_t_koo
+        c1, s1 = singleton_step(c1, s1, α1, β1)
+        c2, s2 = singleton_step(c2, s2, α2, β2)
+        c3, s3 = singleton_step(c3, s3, α3, β3)
     end
 end
 
@@ -279,24 +290,32 @@ function fft_pow3!(out::AbstractVector{T}, in::AbstractVector{U}, N::Int, start_
     # Size of subproblem
     Nprime = N ÷ 3
 
-    # Dividing into subproblems
-    fft_pow3!(out, in, Nprime, start_out,                       stride_out, start_in,               stride_in*3, w^3, minus120)
-    fft_pow3!(out, in, Nprime, start_out +   Nprime*stride_out, stride_out, start_in +   stride_in, stride_in*3, w^3, minus120)
-    fft_pow3!(out, in, Nprime, start_out + 2*Nprime*stride_out, stride_out, start_in + 2*stride_in, stride_in*3, w^3, minus120)
+    Rtype = real(T)
+    dir = twiddle_direction(w)
+    # Recursive sub-problem step cispi(dir · 2 / Nprime) = w^3.
+    w_sub = cispi(dir * Rtype(2) / Rtype(Nprime))
 
-    w1 = w
-    w2 = w * w1
-    wk1 = wk2 = one(T)
+    # Dividing into subproblems
+    fft_pow3!(out, in, Nprime, start_out,                       stride_out, start_in,               stride_in*3, w_sub, minus120)
+    fft_pow3!(out, in, Nprime, start_out +   Nprime*stride_out, stride_out, start_in +   stride_in, stride_in*3, w_sub, minus120)
+    fft_pow3!(out, in, Nprime, start_out + 2*Nprime*stride_out, stride_out, start_in + 2*stride_in, stride_in*3, w_sub, minus120)
+
+    α1, β1 = singleton_params(dir * Rtype(2) / Rtype(N))
+    α2, β2 = singleton_params(dir * Rtype(4) / Rtype(N))
+    c1, s1 = one(Rtype), zero(Rtype)
+    c2, s2 = one(Rtype), zero(Rtype)
     for k in 0:Nprime-1
         k0 = start_out + stride_out * k
         k1 = start_out + stride_out * (k + Nprime)
         k2 = start_out + stride_out * (k + 2 * Nprime)
         y_k0, y_k1, y_k2 = out[k0], out[k1], out[k2]
-        @muladd out[k0] = y_k0 + y_k1 * wk1            + y_k2 * wk2
-        @muladd out[k1] = y_k0 + y_k1 * wk1 * plus120  + y_k2 * wk2 * minus120
-        @muladd out[k2] = y_k0 + y_k1 * wk1 * minus120 + y_k2 * wk2 * plus120
-        wk1 *= w1
-        wk2 *= w2
+        wk1 = Complex(c1, s1)
+        wk2 = Complex(c2, s2)
+        @muladd out[k0] = y_k0 + y_k1*wk1 + y_k2*wk2
+        @muladd out[k1] = y_k0 + y_k1*wk1*plus120 + y_k2*wk2*minus120
+        @muladd out[k2] = y_k0 + y_k1*wk1*minus120 + y_k2*wk2*plus120
+        c1, s1 = singleton_step(c1, s1, α1, β1)
+        c2, s2 = singleton_step(c2, s2, α2, β2)
     end
 end
 

diff --git a/src/singleton_twiddle.jl b/src/singleton_twiddle.jl
@@ -0,0 +1,48 @@
+# Singleton's stable trigonometric recurrence for twiddle factors.
+#
+# The naïve `w *= step` recurrence gives O(N) relative error growth,
+# which is the issue described in #118. Singleton (1967) showed that
+# if one precomputes
+#     α = 2 sin²(θ/2) = 1 - cos(θ)
+#     β = sin(θ)
+# then the recurrence
+#     c_{k+1} = c_k - (α c_k + β s_k)
+#     s_{k+1} = s_k - (α s_k - β c_k)
+# produces (cos(kθ), sin(kθ)) with RMS error growing only as √k · ε
+# (and worst-case ~ k · ε / something sub-linear in typical cases)
+# rather than k · ε. The point is that `α c_k + β s_k` is a *small*
+# correction (α ≈ θ²/2, β ≈ θ) subtracted from an ~O(1) base, so the
+# update stays in the high-significand bits.
+#
+# The cost is ~8 flops per step (2× the naïve complex multiply but an
+# order of magnitude faster than a fresh `cispi`), and the extra trig
+# (`sincospi(θ/2)`) happens once per kernel call.
+
+# Direction lives in `sign(imag(w))`; when `w` is real (N = 2 or any
+# degenerate case where `imag` rounds to zero) both directions collapse
+# to the same twiddle set so we pick +1.
+@inline function twiddle_direction(w::Complex{T}) where {T<:Real}
+    s = imag(w)
+    s > 0 ? one(T) : (s < 0 ? -one(T) : one(T))
+end
+
+# Recurrence coefficients for stepping by `cispi(freq) = e^(iπ·freq)`.
+# Uses `sincospi(freq/2)` so that `α` and `β` are exact-to-ULP even
+# for very small frequencies — writing `1 - cos(θ)` directly suffers
+# catastrophic cancellation there.
+@inline function singleton_params(freq::T) where {T<:Real}
+    s_h, c_h = sincospi(freq / 2)
+    α = 2 * s_h * s_h
+    β = 2 * s_h * c_h
+    (α, β)
+end
+
+# Advance `(c, s) = (cos(kθ), sin(kθ))` to `(cos((k+1)θ), sin((k+1)θ))`.
+# Computed as `c - (αc + βs)` rather than `(1-α)c - βs` on purpose:
+# the correction is small so subtracting it from `c` preserves the
+# high-order bits and the recurrence self-heals.
+@inline function singleton_step(c::T, s::T, α::T, β::T) where {T<:Real}
+    c_new = c - muladd(α, c, β * s)
+    s_new = s - muladd(α, s, -(β * c))
+    (c_new, s_new)
+end
diff --git a/test/onedim/accuracy.jl b/test/onedim/accuracy.jl
@@ -0,0 +1,72 @@
+using FFTA, Test, Random, LinearAlgebra
+
+# Regression test for issue #118. Singleton's trigonometric recurrence
+# (see src/singleton_twiddle.jl) brings the twiddle error from O(N)
+# down to roughly O(log N · √log N) + an O(√N) twiddle term. Bounds
+# below are set with ~2-3× headroom over what the current
+# implementation produces (max over 5 seeds on aarch64 NEON); they
+# still fail comfortably against the pre-fix naive `w *= step`
+# recurrence, which ballooned past ~4000 ULP at N = 16384.
+
+Random.seed!(42)
+
+# (N, max eps ratio) across the power-of-2 ladder. Covers both even
+# powers (= powers of 4, recursion bottoms at N = 4) and odd powers
+# (recursion bottoms at N = 2), which hit different base cases in
+# `fft_pow2_radix4!`.
+const POWERS_OF_2 = (
+    (1 << 4,   3.0),   # 16    = 4^2
+    (1 << 5,   3.0),   # 32
+    (1 << 6,   3.0),   # 64    = 4^3
+    (1 << 7,   3.0),   # 128
+    (1 << 8,   4.0),   # 256   = 4^4
+    (1 << 9,   4.0),   # 512
+    (1 << 10,  8.0),   # 1024  = 4^5
+    (1 << 11,  8.0),   # 2048
+    (1 << 12, 10.0),   # 4096  = 4^6
+    (1 << 13, 12.0),   # 8192
+    (1 << 14, 14.0),   # 16384 = 4^7
+    (1 << 15, 20.0),   # 32768
+    (1 << 16, 22.0),   # 65536 = 4^8
+    (1 << 17, 28.0),   # 131072
+    (1 << 18, 28.0),   # 262144 = 4^9
+)
+
+const POWERS_OF_3 = (
+    (3^1,    3.0),
+    (3^2,    3.0),
+    (3^3,    3.0),
+    (3^4,    4.0),
+    (3^5,    5.0),
+    (3^6,    7.0),
+    (3^7,   10.0),
+    (3^8,   13.0),
+    (3^9,   18.0),
+)
+
+function _worst_relerr(N::Int)
+    worst = 0.0
+    for seed in 1:5
+        Random.seed!(seed)
+        x64 = randn(ComplexF64, N)
+        x32 = ComplexF32.(x64)
+        y32 = fft(x32)
+        y_ref = ComplexF32.(fft(x64))
+        relerr = norm(y32 .- y_ref) / norm(y_ref)
+        worst = max(worst, relerr / eps(Float32))
+    end
+    return worst
+end
+
+@testset "twiddle accuracy (issue #118)" begin
+    @testset "powers of 2" begin
+        @testset "N = $N" for (N, bound) in POWERS_OF_2
+            @test _worst_relerr(N) ≤ bound
+        end
+    end
+    @testset "powers of 3" begin
+        @testset "N = $N" for (N, bound) in POWERS_OF_3
+            @test _worst_relerr(N) ≤ bound
+        end
+    end
+end
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -58,6 +58,9 @@ Random.seed!(1)
             @testset verbose = false "Backward" begin
                 include("onedim/complex_backward.jl")
             end
+            @testset verbose = false "Accuracy" begin
+                include("onedim/accuracy.jl")
+            end
         end
         @testset verbose = true "Real" begin
             @testset verbose = false "Forward" begin