|
1 | | -use crate::math::scalar; |
2 | | -use crate::{Simd, SimdBaseIo, SimdBaseOps, SimdConsts, SimdFloat64}; |
3 | | - |
4 | | -type SimdI64<V> = <<V as SimdConsts>::Engine as Simd>::Vi64; |
5 | | - |
6 | | -const SINH_COSH_SMALL_ABS: f64 = 0.125; |
7 | | -const SINH_COSH_FAST_ABS_MAX: f64 = 0.125; |
8 | | -const TANH_SMALL_ABS: f64 = 0.0; |
9 | | -const TANH_FAST_ABS_MAX: f64 = 0.0; |
10 | | - |
11 | | -#[inline(always)] |
12 | | -fn any_lane_nonzero<V>(mask: SimdI64<V>) -> bool |
13 | | -where |
14 | | - V: SimdFloat64, |
15 | | -{ |
16 | | - unsafe { |
17 | | - let lanes = mask.as_array(); |
18 | | - for lane in 0..V::WIDTH { |
19 | | - if lanes[lane] != 0 { |
20 | | - return true; |
21 | | - } |
22 | | - } |
23 | | - } |
24 | | - |
25 | | - false |
26 | | -} |
27 | | - |
28 | | -#[inline(always)] |
29 | | -fn patch_exceptional_lanes<V>( |
30 | | - input: V, |
31 | | - output: V, |
32 | | - exceptional_mask: SimdI64<V>, |
33 | | - scalar_fallback: fn(f64) -> f64, |
34 | | -) -> V |
35 | | -where |
36 | | - V: SimdFloat64, |
37 | | -{ |
38 | | - if !any_lane_nonzero::<V>(exceptional_mask) { |
39 | | - return output; |
40 | | - } |
41 | | - |
42 | | - unsafe { |
43 | | - let input_lanes = input.as_array(); |
44 | | - let mask_lanes = exceptional_mask.as_array(); |
45 | | - let mut output_lanes = output.as_array(); |
46 | | - |
47 | | - for lane in 0..V::WIDTH { |
48 | | - if mask_lanes[lane] != 0 { |
49 | | - output_lanes[lane] = scalar_fallback(input_lanes[lane]); |
50 | | - } |
51 | | - } |
52 | | - |
53 | | - V::load_from_ptr_unaligned(&output_lanes as *const V::ArrayRepresentation as *const f64) |
54 | | - } |
55 | | -} |
56 | | - |
57 | | -#[inline(always)] |
58 | | -fn exp_u35<V>(input: V) -> V |
59 | | -where |
60 | | - V: SimdFloat64, |
61 | | -{ |
62 | | - // Temporary family-local bridge: use scalar exp lane mapping here while |
63 | | - // avoiding scalar lane mapping for the final hyperbolic functions. |
64 | | - unsafe { |
65 | | - let mut lanes = input.as_array(); |
66 | | - for lane in 0..V::WIDTH { |
67 | | - lanes[lane] = scalar::exp_u35_f64(lanes[lane]); |
68 | | - } |
69 | | - V::load_from_ptr_unaligned(&lanes as *const V::ArrayRepresentation as *const f64) |
70 | | - } |
71 | | -} |
72 | | - |
73 | | -#[inline(always)] |
74 | | -fn sinh_small<V>(input: V, input_sq: V) -> V |
75 | | -where |
76 | | - V: SimdFloat64, |
77 | | -{ |
78 | | - let poly = ((((V::set1(1.0 / 39916800.0) * input_sq) + V::set1(1.0 / 362880.0)) * input_sq |
79 | | - + V::set1(1.0 / 5040.0)) |
80 | | - * input_sq |
81 | | - + V::set1(1.0 / 120.0)) |
82 | | - * input_sq |
83 | | - + V::set1(1.0 / 6.0); |
84 | | - |
85 | | - input + (input * input_sq * poly) |
86 | | -} |
87 | | - |
88 | | -#[inline(always)] |
89 | | -fn cosh_small<V>(input_sq: V) -> V |
90 | | -where |
91 | | - V: SimdFloat64, |
92 | | -{ |
93 | | - let poly = (((V::set1(1.0 / 40320.0) * input_sq) + V::set1(1.0 / 720.0)) * input_sq |
94 | | - + V::set1(1.0 / 24.0)) |
95 | | - * input_sq |
96 | | - + V::set1(0.5); |
97 | | - |
98 | | - V::set1(1.0) + (input_sq * poly) |
99 | | -} |
100 | | - |
101 | | -#[inline(always)] |
102 | | -fn sinh_cosh_medium<V>(abs_input: V) -> (V, V) |
103 | | -where |
104 | | - V: SimdFloat64, |
105 | | -{ |
106 | | - let exp_abs = exp_u35(abs_input); |
107 | | - let exp_neg_abs = V::set1(1.0) / exp_abs; |
108 | | - let half = V::set1(0.5); |
109 | | - |
110 | | - ( |
111 | | - (exp_abs - exp_neg_abs) * half, |
112 | | - (exp_abs + exp_neg_abs) * half, |
113 | | - ) |
114 | | -} |
115 | | - |
116 | | -#[inline(always)] |
117 | | -fn sinh_cosh_masks<V>(input: V) -> (SimdI64<V>, V, V) |
118 | | -where |
119 | | - V: SimdFloat64, |
120 | | -{ |
121 | | - let abs_input = input.abs(); |
122 | | - let finite_mask = input.cmp_eq(input).bitcast_i64(); |
123 | | - let within_fast_range = abs_input |
124 | | - .cmp_lte(V::set1(SINH_COSH_FAST_ABS_MAX)) |
125 | | - .bitcast_i64(); |
126 | | - |
127 | | - (finite_mask & within_fast_range, abs_input, input * input) |
128 | | -} |
| 1 | +use crate::math::{map, scalar}; |
| 2 | +use crate::SimdFloat64; |
129 | 3 |
|
130 | 4 | #[inline(always)] |
131 | 5 | pub(crate) fn sinh_u35<V>(input: V) -> V |
132 | 6 | where |
133 | 7 | V: SimdFloat64, |
134 | 8 | { |
135 | | - let (fast_mask, abs_input, input_sq) = sinh_cosh_masks(input); |
136 | | - let exceptional_mask = fast_mask.cmp_eq(SimdI64::<V>::zeroes()); |
137 | | - let small_mask = abs_input.cmp_lt(V::set1(SINH_COSH_SMALL_ABS)); |
138 | | - |
139 | | - let fast_small = sinh_small(input, input_sq); |
140 | | - let exp_input = exp_u35(input); |
141 | | - let exp_neg_input = V::set1(1.0) / exp_input; |
142 | | - let sinh_medium = (exp_input - exp_neg_input) * V::set1(0.5); |
143 | | - let fast = small_mask.blendv(sinh_medium, fast_small); |
144 | | - let zero_mask = input.cmp_eq(V::set1(0.0)); |
145 | | - let fast = zero_mask.blendv(fast, input); |
146 | | - |
147 | | - patch_exceptional_lanes(input, fast, exceptional_mask, scalar::sinh_u35_f64) |
| 9 | + map::unary_f64(input, scalar::sinh_u35_f64) |
148 | 10 | } |
149 | 11 |
|
150 | 12 | #[inline(always)] |
151 | 13 | pub(crate) fn cosh_u35<V>(input: V) -> V |
152 | 14 | where |
153 | 15 | V: SimdFloat64, |
154 | 16 | { |
155 | | - let (fast_mask, abs_input, input_sq) = sinh_cosh_masks(input); |
156 | | - let exceptional_mask = fast_mask.cmp_eq(SimdI64::<V>::zeroes()); |
157 | | - let small_mask = abs_input.cmp_lt(V::set1(SINH_COSH_SMALL_ABS)); |
158 | | - |
159 | | - let fast_small = cosh_small(input_sq); |
160 | | - let (_, cosh_medium) = sinh_cosh_medium(abs_input); |
161 | | - let fast = small_mask.blendv(cosh_medium, fast_small); |
162 | | - |
163 | | - patch_exceptional_lanes(input, fast, exceptional_mask, scalar::cosh_u35_f64) |
| 17 | + map::unary_f64(input, scalar::cosh_u35_f64) |
164 | 18 | } |
165 | 19 |
|
166 | 20 | #[inline(always)] |
167 | 21 | pub(crate) fn tanh_u35<V>(input: V) -> V |
168 | 22 | where |
169 | 23 | V: SimdFloat64, |
170 | 24 | { |
171 | | - let abs_input = input.abs(); |
172 | | - let finite_mask = input.cmp_eq(input).bitcast_i64(); |
173 | | - let within_fast_range = abs_input.cmp_lte(V::set1(TANH_FAST_ABS_MAX)).bitcast_i64(); |
174 | | - let exceptional_mask = (finite_mask & within_fast_range).cmp_eq(SimdI64::<V>::zeroes()); |
175 | | - let small_mask = abs_input.cmp_lt(V::set1(TANH_SMALL_ABS)); |
176 | | - |
177 | | - let input_sq = input * input; |
178 | | - let fast_small = sinh_small(input, input_sq) / cosh_small(input_sq); |
179 | | - |
180 | | - let exp_input = exp_u35(input); |
181 | | - let exp_neg_input = V::set1(1.0) / exp_input; |
182 | | - let tanh_medium = (exp_input - exp_neg_input) / (exp_input + exp_neg_input); |
183 | | - let fast = small_mask.blendv(tanh_medium, fast_small); |
184 | | - let zero_mask = input.cmp_eq(V::set1(0.0)); |
185 | | - let fast = zero_mask.blendv(fast, input); |
186 | | - |
187 | | - patch_exceptional_lanes(input, fast, exceptional_mask, scalar::tanh_u35_f64) |
| 25 | + map::unary_f64(input, scalar::tanh_u35_f64) |
188 | 26 | } |
0 commit comments