From 5e69a2fe85a2efa7d15233f9c4268dc30678b962 Mon Sep 17 00:00:00 2001 From: Gowind Date: Sat, 16 Nov 2024 00:10:39 +0100 Subject: [PATCH 1/6] Fix: Refs #207 Fix implementation of Jensen Shannon measure The Jensen Shannom measure implementation in SimSIMD is different from the implementation in scipy. It turns out that a scaling factory of 0.5 was missed and this fix seems to match the example provided in scipy documentation --- include/simsimd/probability.h | 17 ++++++++++++----- scripts/test.mjs | 9 +++++---- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/include/simsimd/probability.h b/include/simsimd/probability.h index 2865aa32..c2fe68c6 100644 --- a/include/simsimd/probability.h +++ b/include/simsimd/probability.h @@ -108,7 +108,7 @@ SIMSIMD_PUBLIC void simsimd_js_f16_sapphire(simsimd_f16_t const* a, simsimd_f16_ d += ai * SIMSIMD_LOG((ai + epsilon) / (mi + epsilon)); \ d += bi * SIMSIMD_LOG((bi + epsilon) / (mi + epsilon)); \ } \ - *result = (simsimd_distance_t)d / 2; \ + *result = SIMSIMD_SQRT(((simsimd_distance_t)d / 2)); \ } SIMSIMD_MAKE_KL(serial, f64, f64, SIMSIMD_DEREFERENCE, SIMSIMD_F32_DIVISION_EPSILON) // simsimd_kl_f64_serial @@ -198,6 +198,8 @@ SIMSIMD_PUBLIC void simsimd_js_f32_neon(simsimd_f32_t const *a, simsimd_f32_t co simsimd_f32_t epsilon = SIMSIMD_F32_DIVISION_EPSILON; float32x4_t epsilon_vec = vdupq_n_f32(epsilon); float32x4_t sum_vec = vdupq_n_f32(0); + float32x4_t half_vec = vdupq_n_f32(0.5f); + float32x4_t a_vec, b_vec; simsimd_js_f32_neon_cycle: @@ -219,12 +221,14 @@ SIMSIMD_PUBLIC void simsimd_js_f32_neon(simsimd_f32_t const *a, simsimd_f32_t co float32x4_t log_ratio_b_vec = _simsimd_log2_f32_neon(ratio_b_vec); float32x4_t prod_a_vec = vmulq_f32(a_vec, log_ratio_a_vec); float32x4_t prod_b_vec = vmulq_f32(b_vec, log_ratio_b_vec); + sum_vec = vaddq_f32(sum_vec, vaddq_f32(prod_a_vec, prod_b_vec)); if (n != 0) goto simsimd_js_f32_neon_cycle; simsimd_f32_t log2_normalizer = 0.693147181f; + sum_vec = vmulq_f32(sum_vec, half_vec); simsimd_f32_t sum = vaddvq_f32(sum_vec) * log2_normalizer; - *result = sum / 2; + *result = SIMSIMD_SQRT(sum); } #pragma clang attribute pop @@ -271,6 +275,8 @@ SIMSIMD_PUBLIC void simsimd_js_f16_neon(simsimd_f16_t const *a, simsimd_f16_t co float32x4_t sum_vec = vdupq_n_f32(0); simsimd_f32_t epsilon = SIMSIMD_F32_DIVISION_EPSILON; float32x4_t epsilon_vec = vdupq_n_f32(epsilon); + float32x4_t half_vec = vdupq_n_f32(0.5f); + float32x4_t a_vec, b_vec; simsimd_js_f16_neon_cycle: @@ -290,14 +296,15 @@ SIMSIMD_PUBLIC void simsimd_js_f16_neon(simsimd_f16_t const *a, simsimd_f16_t co float32x4_t ratio_b_vec = vdivq_f32(vaddq_f32(b_vec, epsilon_vec), vaddq_f32(m_vec, epsilon_vec)); float32x4_t log_ratio_a_vec = _simsimd_log2_f32_neon(ratio_a_vec); float32x4_t log_ratio_b_vec = _simsimd_log2_f32_neon(ratio_b_vec); - float32x4_t prod_a_vec = vmulq_f32(a_vec, log_ratio_a_vec); - float32x4_t prod_b_vec = vmulq_f32(b_vec, log_ratio_b_vec); + float32x4_t prod_a_vec = vmulq_f32(vmulq_f32(a_vec, log_ratio_a_vec), half_vec); + float32x4_t prod_b_vec = vmulq_f32(vmulq_f32(b_vec, log_ratio_b_vec), half_vec); sum_vec = vaddq_f32(sum_vec, vaddq_f32(prod_a_vec, prod_b_vec)); if (n) goto simsimd_js_f16_neon_cycle; simsimd_f32_t log2_normalizer = 0.693147181f; + sum_vec = vmulq_f32(sum_vec, half_vec); simsimd_f32_t sum = vaddvq_f32(sum_vec) * log2_normalizer; - *result = sum / 2; + *result = SIMSIMD_SQRT(sum); } #pragma clang attribute pop diff --git a/scripts/test.mjs b/scripts/test.mjs index 24d9baea..30e685d1 100644 --- a/scripts/test.mjs +++ b/scripts/test.mjs @@ -172,8 +172,9 @@ test("Kullback-Leibler C vs JS", () => { }); test("Jensen-Shannon C vs JS", () => { - const f32sDistribution = new Float32Array([1.0 / 6, 2.0 / 6, 3.0 / 6]); - const result = simsimd.jensenshannon(f32sDistribution, f32sDistribution); - const resultjs = fallback.jensenshannon(f32sDistribution, f32sDistribution); - assertAlmostEqual(resultjs, result, 0.01); + const f32sDistribution = new Float32Array([1.0, 0.0]); + const f32sDistribution2 = new Float32Array([0.5, 0.5]); + const result = simsimd.jensenshannon(f32sDistribution, f32sDistribution2); + const resultjs = fallback.jensenshannon(f32sDistribution, f32sDistribution2); + assertAlmostEqual(result, resultjs, 0.01); }); From e0654bdfdf19ab9ac72ae7b1e4710bb7f31ecd99 Mon Sep 17 00:00:00 2001 From: Gowind Date: Sat, 16 Nov 2024 00:22:45 +0100 Subject: [PATCH 2/6] feat: Add more tests --- scripts/test.mjs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/scripts/test.mjs b/scripts/test.mjs index 30e685d1..8bd0842b 100644 --- a/scripts/test.mjs +++ b/scripts/test.mjs @@ -177,4 +177,10 @@ test("Jensen-Shannon C vs JS", () => { const result = simsimd.jensenshannon(f32sDistribution, f32sDistribution2); const resultjs = fallback.jensenshannon(f32sDistribution, f32sDistribution2); assertAlmostEqual(result, resultjs, 0.01); + + const orthogonalVec1 = new Float32Array([1.0, 0.0, 0.0]); + const orthogonalVec2 = new Float32Array([0.0, 1.0, 0.0]); + const orthoResult = simsimd.jensenshannon(orthogonalVec1, orthogonalVec2); + const orthoResultJs = fallback.jensenshannon(orthogonalVec1, orthogonalVec2); + assertAlmostEqual(orthoResult, orthoResultJs, 0.01); }); From f98e02d0b44b166870c72fa5addc287c07576aa6 Mon Sep 17 00:00:00 2001 From: Gowind Date: Sun, 17 Nov 2024 00:30:26 +0100 Subject: [PATCH 3/6] refactor: use vmulq_n_f32 to avoid creating an extra vector and update Python tests --- include/simsimd/probability.h | 14 +++++--------- scripts/test.py | 5 +++-- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/include/simsimd/probability.h b/include/simsimd/probability.h index c2fe68c6..c4545c86 100644 --- a/include/simsimd/probability.h +++ b/include/simsimd/probability.h @@ -108,7 +108,7 @@ SIMSIMD_PUBLIC void simsimd_js_f16_sapphire(simsimd_f16_t const* a, simsimd_f16_ d += ai * SIMSIMD_LOG((ai + epsilon) / (mi + epsilon)); \ d += bi * SIMSIMD_LOG((bi + epsilon) / (mi + epsilon)); \ } \ - *result = SIMSIMD_SQRT(((simsimd_distance_t)d / 2)); \ + *result = SIMSIMD_SQRT(((simsimd_distance_t)d / 2)); \ } SIMSIMD_MAKE_KL(serial, f64, f64, SIMSIMD_DEREFERENCE, SIMSIMD_F32_DIVISION_EPSILON) // simsimd_kl_f64_serial @@ -198,8 +198,6 @@ SIMSIMD_PUBLIC void simsimd_js_f32_neon(simsimd_f32_t const *a, simsimd_f32_t co simsimd_f32_t epsilon = SIMSIMD_F32_DIVISION_EPSILON; float32x4_t epsilon_vec = vdupq_n_f32(epsilon); float32x4_t sum_vec = vdupq_n_f32(0); - float32x4_t half_vec = vdupq_n_f32(0.5f); - float32x4_t a_vec, b_vec; simsimd_js_f32_neon_cycle: @@ -226,7 +224,7 @@ SIMSIMD_PUBLIC void simsimd_js_f32_neon(simsimd_f32_t const *a, simsimd_f32_t co if (n != 0) goto simsimd_js_f32_neon_cycle; simsimd_f32_t log2_normalizer = 0.693147181f; - sum_vec = vmulq_f32(sum_vec, half_vec); + sum_vec = vmulq_n_f32(sum_vec, 0.5f); simsimd_f32_t sum = vaddvq_f32(sum_vec) * log2_normalizer; *result = SIMSIMD_SQRT(sum); } @@ -275,8 +273,6 @@ SIMSIMD_PUBLIC void simsimd_js_f16_neon(simsimd_f16_t const *a, simsimd_f16_t co float32x4_t sum_vec = vdupq_n_f32(0); simsimd_f32_t epsilon = SIMSIMD_F32_DIVISION_EPSILON; float32x4_t epsilon_vec = vdupq_n_f32(epsilon); - float32x4_t half_vec = vdupq_n_f32(0.5f); - float32x4_t a_vec, b_vec; simsimd_js_f16_neon_cycle: @@ -296,13 +292,13 @@ SIMSIMD_PUBLIC void simsimd_js_f16_neon(simsimd_f16_t const *a, simsimd_f16_t co float32x4_t ratio_b_vec = vdivq_f32(vaddq_f32(b_vec, epsilon_vec), vaddq_f32(m_vec, epsilon_vec)); float32x4_t log_ratio_a_vec = _simsimd_log2_f32_neon(ratio_a_vec); float32x4_t log_ratio_b_vec = _simsimd_log2_f32_neon(ratio_b_vec); - float32x4_t prod_a_vec = vmulq_f32(vmulq_f32(a_vec, log_ratio_a_vec), half_vec); - float32x4_t prod_b_vec = vmulq_f32(vmulq_f32(b_vec, log_ratio_b_vec), half_vec); + float32x4_t prod_a_vec = vmulq_f32(a_vec, log_ratio_a_vec); + float32x4_t prod_b_vec = vmulq_f32(b_vec, log_ratio_b_vec); sum_vec = vaddq_f32(sum_vec, vaddq_f32(prod_a_vec, prod_b_vec)); if (n) goto simsimd_js_f16_neon_cycle; simsimd_f32_t log2_normalizer = 0.693147181f; - sum_vec = vmulq_f32(sum_vec, half_vec); + sum_vec = vmulq_n_f32(sum_vec, 0.5f); simsimd_f32_t sum = vaddvq_f32(sum_vec) * log2_normalizer; *result = SIMSIMD_SQRT(sum); } diff --git a/scripts/test.py b/scripts/test.py index 7408d71e..1dc8dab7 100644 --- a/scripts/test.py +++ b/scripts/test.py @@ -38,7 +38,6 @@ python test.py """ - import os import math import time @@ -124,7 +123,7 @@ def baseline_wsum(x, y, alpha, beta): baseline_euclidean = lambda x, y: np.array(spd.euclidean(x, y)) #! SciPy returns a scalar baseline_sqeuclidean = spd.sqeuclidean baseline_cosine = spd.cosine - baseline_jensenshannon = lambda x, y: spd.jensenshannon(x, y) ** 2 + baseline_jensenshannon = lambda x, y: spd.jensenshannon(x, y) baseline_hamming = lambda x, y: spd.hamming(x, y) * len(x) baseline_jaccard = spd.jaccard @@ -453,6 +452,8 @@ def name_to_kernels(name: str): return baseline_fma, simd.fma elif name == "wsum": return baseline_wsum, simd.wsum + elif name == "jensenshannon": + return baseline_jensenshannon, simd.jensenshannon else: raise ValueError(f"Unknown kernel name: {name}") From 63d665e40ad653de4b445cf1256e376e5166c835 Mon Sep 17 00:00:00 2001 From: Gowind Date: Sun, 17 Nov 2024 00:55:30 +0100 Subject: [PATCH 4/6] refactor: Refs #207 Slightly improve precision --- include/simsimd/probability.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/simsimd/probability.h b/include/simsimd/probability.h index c4545c86..0d2f3666 100644 --- a/include/simsimd/probability.h +++ b/include/simsimd/probability.h @@ -217,16 +217,17 @@ SIMSIMD_PUBLIC void simsimd_js_f32_neon(simsimd_f32_t const *a, simsimd_f32_t co float32x4_t ratio_b_vec = vdivq_f32(vaddq_f32(b_vec, epsilon_vec), vaddq_f32(m_vec, epsilon_vec)); float32x4_t log_ratio_a_vec = _simsimd_log2_f32_neon(ratio_a_vec); float32x4_t log_ratio_b_vec = _simsimd_log2_f32_neon(ratio_b_vec); - float32x4_t prod_a_vec = vmulq_f32(a_vec, log_ratio_a_vec); - float32x4_t prod_b_vec = vmulq_f32(b_vec, log_ratio_b_vec); + float32x4_t prod_a_vec = vmulq_n_f32(vmulq_f32(a_vec, log_ratio_a_vec), 0.5f); + float32x4_t prod_b_vec = vmulq_n_f32(vmulq_f32(b_vec, log_ratio_b_vec), 0.5f); sum_vec = vaddq_f32(sum_vec, vaddq_f32(prod_a_vec, prod_b_vec)); if (n != 0) goto simsimd_js_f32_neon_cycle; simsimd_f32_t log2_normalizer = 0.693147181f; - sum_vec = vmulq_n_f32(sum_vec, 0.5f); simsimd_f32_t sum = vaddvq_f32(sum_vec) * log2_normalizer; *result = SIMSIMD_SQRT(sum); + + } #pragma clang attribute pop @@ -292,13 +293,12 @@ SIMSIMD_PUBLIC void simsimd_js_f16_neon(simsimd_f16_t const *a, simsimd_f16_t co float32x4_t ratio_b_vec = vdivq_f32(vaddq_f32(b_vec, epsilon_vec), vaddq_f32(m_vec, epsilon_vec)); float32x4_t log_ratio_a_vec = _simsimd_log2_f32_neon(ratio_a_vec); float32x4_t log_ratio_b_vec = _simsimd_log2_f32_neon(ratio_b_vec); - float32x4_t prod_a_vec = vmulq_f32(a_vec, log_ratio_a_vec); - float32x4_t prod_b_vec = vmulq_f32(b_vec, log_ratio_b_vec); + float32x4_t prod_a_vec = vmulq_n_f32(vmulq_f32(a_vec, log_ratio_a_vec), 0.5f); + float32x4_t prod_b_vec = vmulq_n_f32(vmulq_f32(b_vec, log_ratio_b_vec), 0.5f); sum_vec = vaddq_f32(sum_vec, vaddq_f32(prod_a_vec, prod_b_vec)); if (n) goto simsimd_js_f16_neon_cycle; simsimd_f32_t log2_normalizer = 0.693147181f; - sum_vec = vmulq_n_f32(sum_vec, 0.5f); simsimd_f32_t sum = vaddvq_f32(sum_vec) * log2_normalizer; *result = SIMSIMD_SQRT(sum); } From 5c8f81cc48c57c9d83086ae874a9c63fc96d05b8 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sun, 17 Nov 2024 13:57:58 +0000 Subject: [PATCH 5/6] Fix: Don't skip JS tests --- scripts/test.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/test.py b/scripts/test.py index 1dc8dab7..4e8781aa 100644 --- a/scripts/test.py +++ b/scripts/test.py @@ -840,12 +840,13 @@ def test_dense_bits(ndim, metric, capability, stats_fixture): collect_errors(metric, ndim, "bin8", accurate, accurate_dt, expected, expected_dt, result, result_dt, stats_fixture) -@pytest.mark.skip(reason="Problems inferring the tolerance bounds for numerical errors") +@pytest.mark.skipif(not numpy_available, reason="NumPy is not installed") +@pytest.mark.skipif(not scipy_available, reason="SciPy is not installed") @pytest.mark.repeat(50) @pytest.mark.parametrize("ndim", [11, 97, 1536]) @pytest.mark.parametrize("dtype", ["float32", "float16"]) @pytest.mark.parametrize("capability", possible_capabilities) -def test_jensen_shannon(ndim, dtype, capability): +def test_jensen_shannon(ndim, dtype, capability, stats_fixture): """Compares the simd.jensenshannon() function with scipy.spatial.distance.jensenshannon(), measuring the accuracy error for f16, and f32 types.""" np.random.seed() a = np.abs(np.random.randn(ndim)).astype(dtype) From 114490a37097b287585c13e078a27abfb63b716f Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sun, 17 Nov 2024 14:00:58 +0000 Subject: [PATCH 6/6] Improve: Avoid division on hot path --- include/simsimd/probability.h | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/include/simsimd/probability.h b/include/simsimd/probability.h index 0d2f3666..ca10b782 100644 --- a/include/simsimd/probability.h +++ b/include/simsimd/probability.h @@ -217,17 +217,15 @@ SIMSIMD_PUBLIC void simsimd_js_f32_neon(simsimd_f32_t const *a, simsimd_f32_t co float32x4_t ratio_b_vec = vdivq_f32(vaddq_f32(b_vec, epsilon_vec), vaddq_f32(m_vec, epsilon_vec)); float32x4_t log_ratio_a_vec = _simsimd_log2_f32_neon(ratio_a_vec); float32x4_t log_ratio_b_vec = _simsimd_log2_f32_neon(ratio_b_vec); - float32x4_t prod_a_vec = vmulq_n_f32(vmulq_f32(a_vec, log_ratio_a_vec), 0.5f); - float32x4_t prod_b_vec = vmulq_n_f32(vmulq_f32(b_vec, log_ratio_b_vec), 0.5f); - + float32x4_t prod_a_vec = vmulq_f32(a_vec, log_ratio_a_vec); + float32x4_t prod_b_vec = vmulq_f32(b_vec, log_ratio_b_vec); + sum_vec = vaddq_f32(sum_vec, vaddq_f32(prod_a_vec, prod_b_vec)); if (n != 0) goto simsimd_js_f32_neon_cycle; simsimd_f32_t log2_normalizer = 0.693147181f; - simsimd_f32_t sum = vaddvq_f32(sum_vec) * log2_normalizer; + simsimd_f32_t sum = vaddvq_f32(sum_vec) * log2_normalizer / 2; *result = SIMSIMD_SQRT(sum); - - } #pragma clang attribute pop @@ -293,13 +291,13 @@ SIMSIMD_PUBLIC void simsimd_js_f16_neon(simsimd_f16_t const *a, simsimd_f16_t co float32x4_t ratio_b_vec = vdivq_f32(vaddq_f32(b_vec, epsilon_vec), vaddq_f32(m_vec, epsilon_vec)); float32x4_t log_ratio_a_vec = _simsimd_log2_f32_neon(ratio_a_vec); float32x4_t log_ratio_b_vec = _simsimd_log2_f32_neon(ratio_b_vec); - float32x4_t prod_a_vec = vmulq_n_f32(vmulq_f32(a_vec, log_ratio_a_vec), 0.5f); - float32x4_t prod_b_vec = vmulq_n_f32(vmulq_f32(b_vec, log_ratio_b_vec), 0.5f); + float32x4_t prod_a_vec = vmulq_f32(a_vec, log_ratio_a_vec); + float32x4_t prod_b_vec = vmulq_f32(b_vec, log_ratio_b_vec); sum_vec = vaddq_f32(sum_vec, vaddq_f32(prod_a_vec, prod_b_vec)); if (n) goto simsimd_js_f16_neon_cycle; simsimd_f32_t log2_normalizer = 0.693147181f; - simsimd_f32_t sum = vaddvq_f32(sum_vec) * log2_normalizer; + simsimd_f32_t sum = vaddvq_f32(sum_vec) * log2_normalizer / 2; *result = SIMSIMD_SQRT(sum); }