Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/FFTA.jl
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ using Reexport: @reexport
@reexport using AbstractFFTs

include("callgraph.jl")
include("singleton_twiddle.jl")
include("algos.jl")
include("plan.jl")
end
135 changes: 77 additions & 58 deletions src/algos.jl
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ function fft_composite!(out::AbstractVector{T}, in::AbstractVector{U}, start_out
right_idx = idx + root.right
left = g[left_idx]
right = g[right_idx]
# N = root.sz
N = root.sz
N1 = left.sz
N2 = right.sz
s_in = root.s_in
Expand All @@ -67,14 +67,17 @@ function fft_composite!(out::AbstractVector{T}, in::AbstractVector{U}, start_out
Lt = left.type

w1 = _conj(root.w, d)
wj1 = one(T)
Rtype = real(T)
# The composite twiddle at position (j1, k2) is `cispi(dir · 2 j1 k2 / N)`.
# Singleton's recurrence advances `wk2 = cispi(dir · 2 j1 k2 / N)` in k2
# for fixed j1; (α, β) depend on j1 so we reset them at each outer step.
dir = twiddle_direction(w1)
tmp = g.workspace[idx]

if Rt === BLUESTEIN
R_bluestein_scratchspace = prealloc_blue(N2, d, T)
end
for j1 in 0:N1-1
wk2 = wj1
R_start_in = start_in + j1 * s_in
R_start_out = 1 + N2 * j1

Expand All @@ -87,13 +90,13 @@ function fft_composite!(out::AbstractVector{T}, in::AbstractVector{U}, start_out
end

if j1 > 0
αi, βi = singleton_params(dir * Rtype(2 * j1) / Rtype(N))
ci, si = one(Rtype), zero(Rtype)
@inbounds for k2 in 1:N2-1
tmp[R_start_out + k2] *= wk2
wk2 *= wj1
ci, si = singleton_step(ci, si, αi, βi)
tmp[R_start_out + k2] *= Complex(ci, si)
end
end

wj1 *= w1
end

if Lt === BLUESTEIN
Expand Down Expand Up @@ -134,16 +137,17 @@ function fft_dft!(out::AbstractVector{T}, in::AbstractVector{T}, N::Int, start_o
end
out[start_out] = tmp

wk = wkn = w
Rtype = real(T)
dir = twiddle_direction(w)
@inbounds for d in 1:N-1
tmp = in[start_in]
t = in[start_in]
αk, βk = singleton_params(dir * Rtype(2 * d) / Rtype(N))
ck, sk = one(Rtype), zero(Rtype)
@inbounds for k in 1:N-1
tmp += wkn*in[start_in + k*stride_in]
wkn *= wk
ck, sk = singleton_step(ck, sk, αk, βk)
t += Complex(ck, sk) * in[start_in + k*stride_in]
end
out[start_out + d*stride_out] = tmp
wk *= w
wkn = wk
out[start_out + d*stride_out] = t
end
end

Expand All @@ -156,16 +160,16 @@ function fft_dft!(out::AbstractVector{Complex{T}}, in::AbstractVector{T}, N::Int
end
out[start_out] = tmp

wk = wkn = w
dir = twiddle_direction(w)
@inbounds for d in 1:halfN
tmp = Complex{T}(in[start_in])
t = Complex{T}(in[start_in])
αk, βk = singleton_params(dir * T(2 * d) / T(N))
ck, sk = one(T), zero(T)
@inbounds for k in 1:N-1
tmp += wkn*in[start_in + k*stride_in]
wkn *= wk
ck, sk = singleton_step(ck, sk, αk, βk)
t += Complex{T}(ck, sk) * in[start_in + k*stride_in]
end
out[start_out + d*stride_out] = tmp
wk *= w
wkn = wk
out[start_out + d*stride_out] = t
end
end

Expand Down Expand Up @@ -214,38 +218,45 @@ function fft_pow2_radix4!(out::AbstractVector{T}, in::AbstractVector{U}, N::Int,
# ...othersize split the problem in four and recur
m = N ÷ 4

w1 = w
w2 = w * w1
w3 = w * w2
w4 = w2 * w2

fft_pow2_radix4!(out, in, m, start_out , stride_out, start_in , stride_in*4, w4)
fft_pow2_radix4!(out, in, m, start_out + m*stride_out, stride_out, start_in + stride_in, stride_in*4, w4)
fft_pow2_radix4!(out, in, m, start_out + 2*m*stride_out, stride_out, start_in + 2*stride_in, stride_in*4, w4)
fft_pow2_radix4!(out, in, m, start_out + 3*m*stride_out, stride_out, start_in + 3*stride_in, stride_in*4, w4)

wkoe = wkeo = wkoo = one(T)
Rtype = real(T)
dir = twiddle_direction(w)
# Recursive sub-problem step `cispi(dir · 2 / m) = w^4`; use `cispi`
# directly so the sub-tree gets a < 1 ULP starting phase.
w_sub = cispi(dir * Rtype(2) / Rtype(m))

fft_pow2_radix4!(out, in, m, start_out , stride_out, start_in , stride_in*4, w_sub)
fft_pow2_radix4!(out, in, m, start_out + m*stride_out, stride_out, start_in + stride_in, stride_in*4, w_sub)
fft_pow2_radix4!(out, in, m, start_out + 2*m*stride_out, stride_out, start_in + 2*stride_in, stride_in*4, w_sub)
fft_pow2_radix4!(out, in, m, start_out + 3*m*stride_out, stride_out, start_in + 3*stride_in, stride_in*4, w_sub)

# Singleton recurrence for the three running twiddles `w^k`, `w^2k`, `w^3k`.
α1, β1 = singleton_params(dir * Rtype(2) / Rtype(N))
α2, β2 = singleton_params(dir * Rtype(4) / Rtype(N))
α3, β3 = singleton_params(dir * Rtype(6) / Rtype(N))
c1, s1 = one(Rtype), zero(Rtype)
c2, s2 = one(Rtype), zero(Rtype)
c3, s3 = one(Rtype), zero(Rtype)

@inbounds for k in 0:m-1
kee = start_out + k * stride_out
koe = start_out + (k + m) * stride_out
keo = start_out + (k + 2 * m) * stride_out
koo = start_out + (k + 3 * m) * stride_out
y_kee, y_koe, y_keo, y_koo = out[kee], out[koe], out[keo], out[koo]
ỹ_keo = y_keo * wkeo
ỹ_koe = y_koe * wkoe
ỹ_koo = y_koo * wkoo
y_kee_p_y_keo = y_kee + ỹ_keo
y_kee_m_y_keo = y_kee - ỹ_keo
ỹ_koe_p_ỹ_koo = ỹ_koe + ỹ_koo
ỹ_koe_m_ỹ_koo = -(ỹ_koe - ỹ_koo) * minusi
out[kee] = y_kee_p_y_keo + ỹ_koe_p_ỹ_koo
out[koe] = y_kee_m_y_keo + ỹ_koe_m_ỹ_koo
out[keo] = y_kee_p_y_keo - ỹ_koe_p_ỹ_koo
out[koo] = y_kee_m_y_keo - ỹ_koe_m_ỹ_koo
wkoe *= w1
wkeo *= w2
wkoo *= w3
t_keo = y_keo * Complex(c2, s2)
t_koe = y_koe * Complex(c1, s1)
t_koo = y_koo * Complex(c3, s3)
y_kee_p_y_keo = y_kee + t_keo
y_kee_m_y_keo = y_kee - t_keo
t_koe_p_t_koo = t_koe + t_koo
t_koe_m_t_koo = -(t_koe - t_koo) * minusi
out[kee] = y_kee_p_y_keo + t_koe_p_t_koo
out[koe] = y_kee_m_y_keo + t_koe_m_t_koo
out[keo] = y_kee_p_y_keo - t_koe_p_t_koo
out[koo] = y_kee_m_y_keo - t_koe_m_t_koo
c1, s1 = singleton_step(c1, s1, α1, β1)
c2, s2 = singleton_step(c2, s2, α2, β2)
c3, s3 = singleton_step(c3, s3, α3, β3)
end
end

Expand Down Expand Up @@ -279,24 +290,32 @@ function fft_pow3!(out::AbstractVector{T}, in::AbstractVector{U}, N::Int, start_
# Size of subproblem
Nprime = N ÷ 3

# Dividing into subproblems
fft_pow3!(out, in, Nprime, start_out, stride_out, start_in, stride_in*3, w^3, minus120)
fft_pow3!(out, in, Nprime, start_out + Nprime*stride_out, stride_out, start_in + stride_in, stride_in*3, w^3, minus120)
fft_pow3!(out, in, Nprime, start_out + 2*Nprime*stride_out, stride_out, start_in + 2*stride_in, stride_in*3, w^3, minus120)
Rtype = real(T)
dir = twiddle_direction(w)
# Recursive sub-problem step cispi(dir · 2 / Nprime) = w^3.
w_sub = cispi(dir * Rtype(2) / Rtype(Nprime))

w1 = w
w2 = w * w1
wk1 = wk2 = one(T)
# Dividing into subproblems
fft_pow3!(out, in, Nprime, start_out, stride_out, start_in, stride_in*3, w_sub, minus120)
fft_pow3!(out, in, Nprime, start_out + Nprime*stride_out, stride_out, start_in + stride_in, stride_in*3, w_sub, minus120)
fft_pow3!(out, in, Nprime, start_out + 2*Nprime*stride_out, stride_out, start_in + 2*stride_in, stride_in*3, w_sub, minus120)

α1, β1 = singleton_params(dir * Rtype(2) / Rtype(N))
α2, β2 = singleton_params(dir * Rtype(4) / Rtype(N))
c1, s1 = one(Rtype), zero(Rtype)
c2, s2 = one(Rtype), zero(Rtype)
for k in 0:Nprime-1
k0 = start_out + stride_out * k
k1 = start_out + stride_out * (k + Nprime)
k2 = start_out + stride_out * (k + 2 * Nprime)
y_k0, y_k1, y_k2 = out[k0], out[k1], out[k2]
@muladd out[k0] = y_k0 + y_k1 * wk1 + y_k2 * wk2
@muladd out[k1] = y_k0 + y_k1 * wk1 * plus120 + y_k2 * wk2 * minus120
@muladd out[k2] = y_k0 + y_k1 * wk1 * minus120 + y_k2 * wk2 * plus120
wk1 *= w1
wk2 *= w2
wk1 = Complex(c1, s1)
wk2 = Complex(c2, s2)
@muladd out[k0] = y_k0 + y_k1*wk1 + y_k2*wk2
@muladd out[k1] = y_k0 + y_k1*wk1*plus120 + y_k2*wk2*minus120
@muladd out[k2] = y_k0 + y_k1*wk1*minus120 + y_k2*wk2*plus120
c1, s1 = singleton_step(c1, s1, α1, β1)
c2, s2 = singleton_step(c2, s2, α2, β2)
end
end

Expand Down
48 changes: 48 additions & 0 deletions src/singleton_twiddle.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Singleton's stable trigonometric recurrence for twiddle factors.
#
# The naïve `w *= step` recurrence gives O(N) relative error growth,
# which is the issue described in #118. Singleton (1967) showed that
# if one precomputes
# α = 2 sin²(θ/2) = 1 - cos(θ)
# β = sin(θ)
# then the recurrence
# c_{k+1} = c_k - (α c_k + β s_k)
# s_{k+1} = s_k - (α s_k - β c_k)
# produces (cos(kθ), sin(kθ)) with RMS error growing only as √k · ε
# (and worst-case ~ k · ε / something sub-linear in typical cases)
# rather than k · ε. The point is that `α c_k + β s_k` is a *small*
# correction (α ≈ θ²/2, β ≈ θ) subtracted from an ~O(1) base, so the
# update stays in the high-significand bits.
#
# The cost is ~8 flops per step (2× the naïve complex multiply but an
# order of magnitude faster than a fresh `cispi`), and the extra trig
# (`sincospi(θ/2)`) happens once per kernel call.

# Direction lives in `sign(imag(w))`; when `w` is real (N = 2 or any
# degenerate case where `imag` rounds to zero) both directions collapse
# to the same twiddle set so we pick +1.
@inline function twiddle_direction(w::Complex{T}) where {T<:Real}
s = imag(w)
s > 0 ? one(T) : (s < 0 ? -one(T) : one(T))
end

# Recurrence coefficients for stepping by `cispi(freq) = e^(iπ·freq)`.
# Uses `sincospi(freq/2)` so that `α` and `β` are exact-to-ULP even
# for very small frequencies — writing `1 - cos(θ)` directly suffers
# catastrophic cancellation there.
@inline function singleton_params(freq::T) where {T<:Real}
s_h, c_h = sincospi(freq / 2)
α = 2 * s_h * s_h
β = 2 * s_h * c_h
(α, β)
end

# Advance `(c, s) = (cos(kθ), sin(kθ))` to `(cos((k+1)θ), sin((k+1)θ))`.
# Computed as `c - (αc + βs)` rather than `(1-α)c - βs` on purpose:
# the correction is small so subtracting it from `c` preserves the
# high-order bits and the recurrence self-heals.
@inline function singleton_step(c::T, s::T, α::T, β::T) where {T<:Real}
c_new = c - muladd(α, c, β * s)
s_new = s - muladd(α, s, -(β * c))
(c_new, s_new)
end
72 changes: 72 additions & 0 deletions test/onedim/accuracy.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
using FFTA, Test, Random, LinearAlgebra

# Regression test for issue #118. Singleton's trigonometric recurrence
# (see src/singleton_twiddle.jl) brings the twiddle error from O(N)
# down to roughly O(log N · √log N) + an O(√N) twiddle term. Bounds
# below are set with ~2-3× headroom over what the current
# implementation produces (max over 5 seeds on aarch64 NEON); they
# still fail comfortably against the pre-fix naive `w *= step`
# recurrence, which ballooned past ~4000 ULP at N = 16384.

Random.seed!(42)

# (N, max eps ratio) across the power-of-2 ladder. Covers both even
# powers (= powers of 4, recursion bottoms at N = 4) and odd powers
# (recursion bottoms at N = 2), which hit different base cases in
# `fft_pow2_radix4!`.
const POWERS_OF_2 = (
(1 << 4, 3.0), # 16 = 4^2
(1 << 5, 3.0), # 32
(1 << 6, 3.0), # 64 = 4^3
(1 << 7, 3.0), # 128
(1 << 8, 4.0), # 256 = 4^4
(1 << 9, 4.0), # 512
(1 << 10, 8.0), # 1024 = 4^5
(1 << 11, 8.0), # 2048
(1 << 12, 10.0), # 4096 = 4^6
(1 << 13, 12.0), # 8192
(1 << 14, 14.0), # 16384 = 4^7
(1 << 15, 20.0), # 32768
(1 << 16, 22.0), # 65536 = 4^8
(1 << 17, 28.0), # 131072
(1 << 18, 28.0), # 262144 = 4^9
)

const POWERS_OF_3 = (
(3^1, 3.0),
(3^2, 3.0),
(3^3, 3.0),
(3^4, 4.0),
(3^5, 5.0),
(3^6, 7.0),
(3^7, 10.0),
(3^8, 13.0),
(3^9, 18.0),
)

function _worst_relerr(N::Int)
worst = 0.0
for seed in 1:5
Random.seed!(seed)
x64 = randn(ComplexF64, N)
x32 = ComplexF32.(x64)
y32 = fft(x32)
y_ref = ComplexF32.(fft(x64))
relerr = norm(y32 .- y_ref) / norm(y_ref)
worst = max(worst, relerr / eps(Float32))
end
return worst
end

@testset "twiddle accuracy (issue #118)" begin
@testset "powers of 2" begin
@testset "N = $N" for (N, bound) in POWERS_OF_2
@test _worst_relerr(N) ≤ bound
end
end
@testset "powers of 3" begin
@testset "N = $N" for (N, bound) in POWERS_OF_3
@test _worst_relerr(N) ≤ bound
end
end
end
3 changes: 3 additions & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,9 @@ Random.seed!(1)
@testset verbose = false "Backward" begin
include("onedim/complex_backward.jl")
end
@testset verbose = false "Accuracy" begin
include("onedim/accuracy.jl")
end
end
@testset verbose = true "Real" begin
@testset verbose = false "Forward" begin
Expand Down
Loading