Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
45 commits
Select commit Hold shift + click to select a range
d322818
fix: add scalar
richyreachy Mar 16, 2026
625cd0c
Merge branch 'main' into fix/fix_performance
richyreachy Mar 16, 2026
4ea17e9
refactor: add scalar
richyreachy Mar 16, 2026
efecee9
fix: fix scalar
richyreachy Mar 16, 2026
5f5ef13
fix: remove inline
richyreachy Mar 17, 2026
58a9cc8
refactor: separate avx512 fp16 and use -m flag instead of -m march
richyreachy Mar 17, 2026
6ed3306
add fp16 avx512fp16
richyreachy Mar 17, 2026
cf4a2ee
merge with fp16 separation
richyreachy Mar 17, 2026
5ac9e0d
Merge branch 'refactor/separate_fp16' into fix/fix_performance
richyreachy Mar 17, 2026
ddd3dc5
fix: format cmake config
richyreachy Mar 17, 2026
0b21b7a
fix: avx512 fp16
richyreachy Mar 17, 2026
f8ea918
fix: fp16 typo
richyreachy Mar 17, 2026
2b78014
revert: use march back since performance degrades
richyreachy Mar 17, 2026
f91a91e
fix: fix typo according to greptile
richyreachy Mar 17, 2026
28c5a37
fix: fix neon
richyreachy Mar 18, 2026
61eff0c
fix: fix naming
richyreachy Mar 18, 2026
2f6472d
fix: fix naming
richyreachy Mar 18, 2026
97586a2
fix: int4
richyreachy Mar 18, 2026
9aebde3
fix: fix sparse
richyreachy Mar 18, 2026
50c3522
fix: fix sparse
richyreachy Mar 18, 2026
c63e206
fix: fix sparse
richyreachy Mar 18, 2026
6e1c474
fix: fix int8 scalar
richyreachy Mar 18, 2026
f2370a1
fix: fix sparse
richyreachy Mar 18, 2026
9012959
fix: fix ut
richyreachy Mar 18, 2026
6c1c8bb
refactor: change cmake march
richyreachy Mar 18, 2026
bb4e8cd
refactor: change cmake march
richyreachy Mar 18, 2026
d9a2b73
fix: fix ut
richyreachy Mar 18, 2026
4bf3541
Merge branch 'main' into refactor/add_scalar_dist_function_and_setup_…
richyreachy Mar 18, 2026
1d37aeb
fix: fix avx512fp16
richyreachy Mar 18, 2026
37166e0
refactor: change math batch
richyreachy Mar 18, 2026
d00aa56
fix: fix cmake config
richyreachy Mar 18, 2026
9061a95
fix: mips fp16
richyreachy Mar 18, 2026
0daf6fe
fix: mips fp16
richyreachy Mar 18, 2026
6cd1365
Merge branch 'main' into refactor/add_scalar_dist_function_and_setup_…
richyreachy Mar 18, 2026
5e7b9ac
fix: update turbo cmake
richyreachy Mar 18, 2026
c1a7132
fix: fip mips fp16 scalar
richyreachy Mar 19, 2026
c1ea0d0
fix: add fp32 mips
richyreachy Mar 19, 2026
ff58d68
fix: missout icelake
richyreachy Mar 19, 2026
106c513
fix: mips fp32 neon
richyreachy Mar 19, 2026
fa958d5
Merge branch 'main' into refactor/add_scalar_dist_function_and_setup_…
richyreachy Mar 19, 2026
1cd42f7
Merge branch 'main' into refactor/add_scalar_dist_function_and_setup_…
richyreachy Mar 19, 2026
3d8bdf7
fix: fix cmake config
richyreachy Mar 19, 2026
9804a8f
Merge branch 'main' into refactor/add_scalar_dist_function_and_setup_…
richyreachy Mar 19, 2026
91332ac
fix: add avx512fp16
richyreachy Mar 19, 2026
dc4d33c
fix: cmake config
richyreachy Mar 19, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@ include(${PROJECT_ROOT_DIR}/cmake/bazel.cmake)
include(${PROJECT_ROOT_DIR}/cmake/option.cmake)

if(NOT ANDROID AND AUTO_DETECT_ARCH AND CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|i686|i386|x64")
setup_compiler_march_for_x86(MATH_MARCH_FLAG_SSE MATH_MARCH_FLAG_AVX2 MATH_MARCH_FLAG_AVX512)
message(STATUS "best compiler march, sse: " ${MATH_MARCH_FLAG_SSE} ", avx2: " ${MATH_MARCH_FLAG_AVX2} ", avx512: " ${MATH_MARCH_FLAG_AVX512})
setup_compiler_march_for_x86(MATH_MARCH_FLAG_SSE MATH_MARCH_FLAG_AVX2 MATH_MARCH_FLAG_AVX512 MATH_MARCH_FLAG_AVX512FP16)
message(STATUS "best compiler march, sse: " ${MATH_MARCH_FLAG_SSE} ", avx2: " ${MATH_MARCH_FLAG_AVX2} ", avx512: " ${MATH_MARCH_FLAG_AVX512} ", avx512fp16: " ${MATH_MARCH_FLAG_AVX512FP16})
endif()

include_directories(${PROJECT_ROOT_DIR}/src/include)
Expand Down
36 changes: 20 additions & 16 deletions cmake/option.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,8 @@ option(ENABLE_OPENMP "Enable OpenMP support" OFF)

set(ARCH_OPTIONS
ENABLE_NEHALEM ENABLE_SANDYBRIDGE ENABLE_HASWELL ENABLE_BROADWELL ENABLE_SKYLAKE
ENABLE_SKYLAKE_AVX512 ENABLE_ICELAKE ENABLE_SAPPHIRERAPIDS ENABLE_EMERALDRAPIDS
ENABLE_GRANITERAPIDS ENABLE_ZEN1 ENABLE_ZEN2 ENABLE_ZEN3
ENABLE_SKYLAKE_AVX512 ENABLE_ICELAKE ENABLE_SAPPHIRERAPIDS ENABLE_EMERALDRAPIDS ENABLE_GRANITERAPIDS
ENABLE_ZEN1 ENABLE_ZEN2 ENABLE_ZEN3
ENABLE_ARMV8A ENABLE_ARMV8.1A ENABLE_ARMV8.2A ENABLE_ARMV8.3A ENABLE_ARMV8.4A
ENABLE_ARMV8.5A ENABLE_ARMV8.6A
ENABLE_NATIVE
Expand Down Expand Up @@ -103,30 +103,34 @@ function(_setup_x86_march)
endif()
endfunction()

function(setup_compiler_march_for_x86 VAR_NAME_SSE VAR_NAME_AVX2 VAR_NAME_AVX512)
function(setup_compiler_march_for_x86 VAR_NAME_SSE VAR_NAME_AVX2 VAR_NAME_AVX512 VAR_NAME_AVX512FP16)
#sse
set(${VAR_NAME_SSE} "-march=corei7" PARENT_SCOPE)

#avx 2
set(${VAR_NAME_AVX2} "-march=core-avx2" PARENT_SCOPE)

#avx512
set(_x86_flags
"graniterapids" "emeraldrapids" "sapphirerapids"
"icelake-server" "skylake-avx512"
)
foreach(_arch IN LISTS _x86_flags)
check_c_compiler_flag("-march=${_arch}" _COMP_SUPP_${_arch})
if(_COMP_SUPP_${_arch})
set(${VAR_NAME_AVX512} "-march=${_arch}" PARENT_SCOPE)
return()
set(_x86_flags_avx512 "icelake-server" "skylake-avx512" "core-avx2" "x86-64")
foreach(_arch_avx512 IN LISTS _x86_flags_avx512)
check_c_compiler_flag("-march=${_arch_avx512}" _COMP_SUPP_${_arch_avx512})
if(_COMP_SUPP_${_arch_avx512})
set(${VAR_NAME_AVX512} "-march=${_arch_avx512}" PARENT_SCOPE)
break()
endif()
endforeach()


set(${VAR_NAME_AVX512} "-march=core-avx2" PARENT_SCOPE)
message(WARNING "No known avx512 microarchitecture flag found. Set up as core-avx2")

#avx512fp16
set(_x86_flags_avx512fp16
"sapphirerapids" "icelake-server" "skylake-avx512" "core-avx2" "x86-64"
)
foreach(_arch_avx512fp16 IN LISTS _x86_flags_avx512fp16)
check_c_compiler_flag("-march=${_arch_avx512fp16}" _COMP_SUPP_${_arch_avx512fp16})
if(_COMP_SUPP_${_arch_avx512fp16})
set(${VAR_NAME_AVX512FP16} "-march=${_arch_avx512fp16}" PARENT_SCOPE)
break()
endif()
endforeach()
endfunction()

if(MSVC)
Expand Down
27 changes: 21 additions & 6 deletions src/ailego/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@ endif()

if(NOT ANDROID AND AUTO_DETECT_ARCH)
if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|i686|i386|x64")
setup_compiler_march_for_x86(MATH_MARCH_FLAG_SSE MATH_MARCH_FLAG_AVX2 MATH_MARCH_FLAG_AVX512)
message(STATUS "best compiler march, sse: " ${MATH_MARCH_FLAG_SSE} ", avx2: " ${MATH_MARCH_FLAG_AVX2} ", avx512: " ${MATH_MARCH_FLAG_AVX512})
setup_compiler_march_for_x86(MATH_MARCH_FLAG_SSE MATH_MARCH_FLAG_AVX2 MATH_MARCH_FLAG_AVX512 MATH_MARCH_FLAG_AVX512FP16)
message(STATUS "best compiler march, sse: " ${MATH_MARCH_FLAG_SSE} ", avx2: " ${MATH_MARCH_FLAG_AVX2} ", avx512: " ${MATH_MARCH_FLAG_AVX512} ", avx512fp16: " ${MATH_MARCH_FLAG_AVX512FP16})

file(GLOB_RECURSE MATH_FILES_SSE
${CMAKE_CURRENT_SOURCE_DIR}/math/*_sse.cc
Expand All @@ -42,16 +42,23 @@ if(NOT ANDROID AND AUTO_DETECT_ARCH)
)

file(GLOB_RECURSE MATH_FILES_AVX512
${CMAKE_CURRENT_SOURCE_DIR}/math/*_dispatch.cc
${CMAKE_CURRENT_SOURCE_DIR}/math/*_dispatch.c
${CMAKE_CURRENT_SOURCE_DIR}/math/*_avx512.cc
${CMAKE_CURRENT_SOURCE_DIR}/math/*_avx512.c
${CMAKE_CURRENT_SOURCE_DIR}/math_batch/*_dispatch.cc
${CMAKE_CURRENT_SOURCE_DIR}/math_batch/*_dispatch.c
${CMAKE_CURRENT_SOURCE_DIR}/math_batch/*_avx512.cc
${CMAKE_CURRENT_SOURCE_DIR}/math_batch/*_avx512.c
)

file(GLOB_RECURSE MATH_FILES_AVX512FP16
${CMAKE_CURRENT_SOURCE_DIR}/math/*_dispatch.cc
${CMAKE_CURRENT_SOURCE_DIR}/math/*_dispatch.c
${CMAKE_CURRENT_SOURCE_DIR}/math/*_avx512fp16.cc
${CMAKE_CURRENT_SOURCE_DIR}/math/*_avx512fp16.c
${CMAKE_CURRENT_SOURCE_DIR}/math_batch/*_dispatch.cc
${CMAKE_CURRENT_SOURCE_DIR}/math_batch/*_dispatch.c
${CMAKE_CURRENT_SOURCE_DIR}/math_batch/*_avx512fp16.cc
${CMAKE_CURRENT_SOURCE_DIR}/math_batch/*_avx512fp16.c
)

foreach(MATH_FILE ${MATH_FILES_SSE})
set_source_files_properties(
${MATH_FILE}
Expand All @@ -75,6 +82,14 @@ if(NOT ANDROID AND AUTO_DETECT_ARCH)
COMPILE_FLAGS "${MATH_MARCH_FLAG_AVX512}"
)
endforeach()

foreach(MATH_FILE ${MATH_FILES_AVX512FP16})
set_source_files_properties(
${MATH_FILE}
PROPERTIES
COMPILE_FLAGS "${MATH_MARCH_FLAG_AVX512FP16}"
)
endforeach()
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64|ARM64")
# set(CMAKE_CXX_FLAGS "-march=armv8-a")
# set(CMAKE_C_FLAGS "-march=armv8-a")
Expand Down
170 changes: 50 additions & 120 deletions src/ailego/math/euclidean_distance_matrix.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@
namespace zvec {
namespace ailego {

//--------------------------------------------------
// Dense
//--------------------------------------------------
/*! Squared Euclidean Distance Matrix
*/
template <typename T, size_t M, size_t N, typename = void>
Expand All @@ -48,6 +51,46 @@ struct SquaredEuclideanDistanceMatrix<
}
};

/*! Squared Euclidean Distance Matrix (INT4, M=1, N=1)
 *
 *  Explicit specialization for a single vector against a single query.
 *  Compute() is only declared here; its body is provided out of line.
 *  NOTE(review): uint8_t presumably carries two packed 4-bit values per
 *  byte, so `dim` would count 4-bit elements and must be even -- confirm
 *  against the out-of-line definition.
 */
template <>
struct SquaredEuclideanDistanceMatrix<uint8_t, 1, 1> {
//! Type of value
using ValueType = uint8_t;

//! Compute the distance between matrix and query
//! (m: matrix data, q: query data, dim: dimension, out: result destination)
static void Compute(const ValueType *m, const ValueType *q, size_t dim,
float *out);
};

/*! Squared Euclidean Distance Matrix (INT8, M=1, N=1)
 *
 *  Explicit specialization for a single vector against a single query.
 *  Compute() is only declared here; its body is provided out of line.
 */
template <>
struct SquaredEuclideanDistanceMatrix<int8_t, 1, 1> {
//! Type of value
using ValueType = int8_t;

//! Compute the distance between matrix and query
//! (m: matrix data, q: query data, dim: dimension, out: result destination)
static void Compute(const ValueType *m, const ValueType *q, size_t dim,
float *out);
};

/*! Squared Euclidean Distance Matrix (FP16, M=1, N=1)
 *
 *  Explicit specialization for a single vector against a single query.
 *  Compute() is only declared here; its body is provided out of line.
 *  The inputs are Float16 while the result is written as float.
 */
template <>
struct SquaredEuclideanDistanceMatrix<Float16, 1, 1> {
//! Type of value
using ValueType = Float16;

//! Compute the distance between matrix and query
//! (m: matrix data, q: query data, dim: dimension, out: result destination)
static void Compute(const ValueType *m, const ValueType *q, size_t dim,
float *out);
};

/*! Squared Euclidean Distance Matrix (FP32, M=1, N=1)
 *
 *  Explicit specialization for a single vector against a single query.
 *  Compute() is only declared here; its body is provided out of line.
 */
template <>
struct SquaredEuclideanDistanceMatrix<float, 1, 1> {
//! Type of value
using ValueType = float;

//! Compute the distance between matrix and query
//! (m: matrix data, q: query data, dim: dimension, out: result destination)
static void Compute(const ValueType *m, const ValueType *q, size_t dim,
float *out);
};

/*! Squared Euclidean Distance Matrix
*/
template <typename T, size_t M, size_t N>
Expand Down Expand Up @@ -353,32 +396,6 @@ struct SquaredEuclideanDistanceMatrix<uint8_t, M, 1,
}
};

#if !defined(__SSE4_1__)
/*! Squared Euclidean Distance Matrix (INT4, M=1, N=1)
*/
template <>
struct SquaredEuclideanDistanceMatrix<uint8_t, 1, 1> {
//! Type of value
using ValueType = uint8_t;

//! Compute the distance between matrix and query
static inline void Compute(const ValueType *m, const ValueType *q, size_t dim,
float *out) {
ailego_assert(m && q && dim && !(dim & 1) && out);

float sum = 0.0;
for (size_t i = 0; i < (dim >> 1); ++i) {
uint8_t m_val = m[i];
uint8_t q_val = q[i];
sum +=
Int4SquaredDiffTable[((m_val << 4) & 0xf0) | ((q_val >> 0) & 0xf)] +
Int4SquaredDiffTable[((m_val >> 0) & 0xf0) | ((q_val >> 4) & 0xf)];
}
*out = sum;
}
};
#endif // !__SSE4_1__

/*! Euclidean Distance Matrix
*/
template <typename T, size_t M, size_t N,
Expand Down Expand Up @@ -424,76 +441,26 @@ struct EuclideanDistanceMatrix<
}
};

#if !defined(__SSE4_1__)
/*! Euclidean Distance Matrix (INT4, M=1, N=1)
*/
template <>
struct EuclideanDistanceMatrix<uint8_t, 1, 1> {
//! Type of value
using ValueType = uint8_t;

//! Compute the distance between matrix and query
static inline void Compute(const ValueType *m, const ValueType *q, size_t dim,
float *out) {
ailego_assert(m && q && dim && !(dim & 1) && out);

float sum = 0.0;
for (size_t i = 0; i < (dim >> 1); ++i) {
uint8_t m_val = m[i];
uint8_t q_val = q[i];
sum +=
Int4SquaredDiffTable[((m_val << 4) & 0xf0) | ((q_val >> 0) & 0xf)] +
Int4SquaredDiffTable[((m_val >> 0) & 0xf0) | ((q_val >> 4) & 0xf)];
}
*out = std::sqrt(sum);
}
};
#endif // !__SSE4_1__

#if defined(__SSE__) || defined(__ARM_NEON)
/*! Squared Euclidean Distance Matrix (FP32, M=1, N=1)
*/
template <>
struct SquaredEuclideanDistanceMatrix<float, 1, 1> {
//! Type of value
using ValueType = float;

//! Compute the distance between matrix and query
static void Compute(const ValueType *m, const ValueType *q, size_t dim,
float *out);
};
#endif // __SSE__ || __ARM_NEON

#if defined(__SSE__) || (defined(__ARM_NEON) && (defined(__aarch64__)))
/*! Euclidean Distance Matrix (FP32, M=1, N=1)
*/
template <>
struct EuclideanDistanceMatrix<float, 1, 1> {
//! Type of value
using ValueType = float;

//! Compute the distance between matrix and query
static void Compute(const ValueType *m, const ValueType *q, size_t dim,
float *out);
};
#endif // __SSE__ || __ARM_NEON && __aarch64__

#if (defined(__F16C__) && defined(__AVX__)) || \
(defined(__ARM_NEON) && defined(__aarch64__))
/*! Squared Euclidean Distance Matrix (FP16, M=1, N=1)
*/
template <>
struct SquaredEuclideanDistanceMatrix<Float16, 1, 1> {
struct EuclideanDistanceMatrix<int8_t, 1, 1> {
//! Type of value
using ValueType = Float16;
using ValueType = int8_t;

//! Compute the distance between matrix and query
static void Compute(const ValueType *m, const ValueType *q, size_t dim,
float *out);
};

/*! Euclidean Distance Matrix (FP16, M=1, N=1)
*/
template <>
struct EuclideanDistanceMatrix<Float16, 1, 1> {
//! Type of value
Expand All @@ -503,58 +470,21 @@ struct EuclideanDistanceMatrix<Float16, 1, 1> {
static void Compute(const ValueType *m, const ValueType *q, size_t dim,
float *out);
};
#endif // (__F16C__ && __AVX__) || (__ARM_NEON && __aarch64__)

#if defined(__SSE4_1__)
/*! Squared Euclidean Distance Matrix (INT8, M=1, N=1)
*/
template <>
struct SquaredEuclideanDistanceMatrix<int8_t, 1, 1> {
//! Type of value
using ValueType = int8_t;

//! Compute the distance between matrix and query
static void Compute(const ValueType *m, const ValueType *q, size_t dim,
float *out);
};

/*! Euclidean Distance Matrix (INT8, M=1, N=1)
*/
template <>
struct EuclideanDistanceMatrix<int8_t, 1, 1> {
//! Type of value
using ValueType = int8_t;

//! Compute the distance between matrix and query
static void Compute(const ValueType *m, const ValueType *q, size_t dim,
float *out);
};

/*! Squared Euclidean Distance Matrix (INT4, M=1, N=1)
*/
template <>
struct SquaredEuclideanDistanceMatrix<uint8_t, 1, 1> {
struct EuclideanDistanceMatrix<float, 1, 1> {
//! Type of value
using ValueType = uint8_t;
using ValueType = float;

//! Compute the distance between matrix and query
static void Compute(const ValueType *m, const ValueType *q, size_t dim,
float *out);
};

/*! Euclidean Distance Matrix (INT4, M=1, N=1)
*/
template <>
struct EuclideanDistanceMatrix<uint8_t, 1, 1> {
//! Type of value
using ValueType = uint8_t;

//! Compute the distance between matrix and query
static void Compute(const ValueType *m, const ValueType *q, size_t dim,
float *out);
};
#endif // __SSE4_1__

//--------------------------------------------------
// Sparse
//--------------------------------------------------
/*! Squared Euclidean Distance Sparse Matrix
*/
template <typename T>
Expand Down
14 changes: 6 additions & 8 deletions src/ailego/math/euclidean_distance_matrix_fp16_avx.cc
Original file line number Diff line number Diff line change
Expand Up @@ -21,15 +21,13 @@ namespace ailego {

#if defined(__AVX__)

void SquaredEuclideanDistanceAVX(const Float16 *lhs, const Float16 *rhs,
size_t size, float *out) {
ACCUM_FP16_1X1_AVX(lhs, rhs, size, out, 0ull, )
}
float SquaredEuclideanDistanceFp16AVX(const Float16 *lhs, const Float16 *rhs,
size_t size) {
float score{0.0f};

ACCUM_FP16_1X1_AVX(lhs, rhs, size, &score, 0ull, )

//! EuclideanDistance
void EuclideanDistanceAVX(const Float16 *lhs, const Float16 *rhs, size_t size,
float *out) {
ACCUM_FP16_1X1_AVX(lhs, rhs, size, out, 0ull, std::sqrt)
return score;
}

#endif // __AVX__
Expand Down
Loading
Loading