diff --git a/Makefile b/Makefile index 0d6c0dc..3619716 100644 --- a/Makefile +++ b/Makefile @@ -16,6 +16,7 @@ install: cp libdto.so.1.0 /usr/lib64/ ln -sf /usr/lib64/libdto.so.1.0 /usr/lib64/libdto.so.1 ln -sf /usr/lib64/libdto.so.1.0 /usr/lib64/libdto.so + cp dto.h /usr/include/ install-local: ln -sf ./libdto.so.1.0 ./libdto.so.1 diff --git a/README.md b/README.md index c9f3fe0..b4ee91e 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,16 @@ can be enabled or disabled using an environment variable DTO_AUTO_ADJUST_KNOBS. DTO can also be used to learn certain application characterics by building histogram of various API types and sizes. The histogram can be built using an environment variable DTO_COLLECT_STATS. +Finally, DTO offers an API that lets an application pass a function pointer to be called while DSA performs the copy, so the calling thread can do other work instead of idling. The function signature is: + +```c +void dto_memcpy_async(void *dest, const void *src, size_t n, callback_t cb, void* args); +``` +where callback_t cb is a pointer to a function of type void (*)(void *) in the calling application, and args is passed to it unchanged. If the callback returns before DSA completes the operation, the configured wait method is used for the remainder of the wait. + + + + ```bash dto.c: DSA Transparent Offload shared library dto-test.c: Sample multi-threaded test application @@ -179,4 +189,4 @@ When linking DTO using LD_PRELOAD environment variable special care is required in the script. - When the application is started by a script with #! which invokes another script with #!, for unknown reasons DTO causes a segmentation fault during a memset operation on an 8K sized buffer. This can be avoided by setting the minimum - DTO size above 8K, or by avoiding this invocation sequence. \ No newline at end of file + DTO size above 8K, or by avoiding this invocation sequence.
diff --git a/dto.c b/dto.c
index b7a3a1c..3bf2dd1 100644
--- a/dto.c
+++ b/dto.c
@@ -23,6 +23,7 @@
 #include
 #include
 #include
+#include "dto.h"
 
 #define likely(x) __builtin_expect((x), 1)
 #define unlikely(x) __builtin_expect((x), 0)
@@ -48,6 +49,10 @@
 #define DTO_INITIALIZED 0
 #define DTO_INITIALIZING 1
 
+#define NSEC_PER_SEC (1000000000)
+#define MSEC_PER_SEC (1000)
+#define NSEC_PER_MSEC (NSEC_PER_SEC/MSEC_PER_SEC)
+
 // thread specific variables
 static __thread struct dsa_hw_desc thr_desc;
 static __thread struct dsa_completion_record thr_comp __attribute__((aligned(32)));
@@ -107,6 +112,7 @@ static enum numa_aware is_numa_aware;
 static size_t dsa_min_size = DTO_DEFAULT_MIN_SIZE;
 static int wait_method = WAIT_YIELD;
 static size_t cpu_size_fraction; // range of values is 0 to 99
+static uint64_t wait_time = 100000; // 100 us, in nanoseconds; init_dto() converts it to TSC cycles
 
 static uint8_t dto_dsa_memcpy = 1;
 static uint8_t dto_dsa_memmove = 1;
@@ -122,6 +128,7 @@ static uint8_t fork_handler_registered;
 enum memop {
 	MEMSET = 0x0,
 	MEMCOPY,
+	MEMCOPY_ASYNC,
 	MEMMOVE,
 	MEMCMP,
 	MAX_MEMOP,
@@ -130,6 +137,7 @@ enum memop {
 static const char * const memop_names[] = {
 	[MEMSET] = "set",
 	[MEMCOPY] = "cpy",
+	[MEMCOPY_ASYNC] = "cpy_async",
 	[MEMMOVE] = "mov",
 	[MEMCMP] = "cmp"
 };
@@ -557,6 +565,7 @@ static void print_stats(void)
 	clock_gettime(CLOCK_BOOTTIME, &dto_end_time);
 
 	LOG_TRACE("DTO Run Time: %ld ms\n", TS_NS(dto_start_time, dto_end_time)/1000000);
+	LOG_TRACE("DTO CPU Fraction: %.2f \n", cpu_size_fraction/100.0);
 
 	// display stats
 	for (int t = 0; t < 2; ++t) {
@@ -1340,6 +1349,30 @@ static int init_dto(void)
 			LOG_ERROR("Didn't find any usable DSAs. Falling back to using CPUs.\n");
 			use_std_lib_calls = 1;
 		}
+		/*
+		 * Convert wait_time from nanoseconds to TSC cycles. CPUID leaf 0x15
+		 * returns the TSC/crystal ratio (EAX = denominator, EBX = numerator)
+		 * and the crystal frequency in Hz (ECX). Any of them may be reported
+		 * as 0, and the leaf may be absent, so validate before dividing.
+		 */
+		unsigned int num = 0, den = 0, freq = 0;
+		unsigned int unused = 0;
+
+		if (__get_cpuid( 0x15, &den, &num, &freq, &unused ) && den != 0 && num != 0 && freq != 0) {
+			freq /= 1000;
+			LOG_TRACE( "Core Freq = %u kHz\n", freq );
+			LOG_TRACE( "TSC Mult = %u\n", num );
+			LOG_TRACE( "TSC Den = %u\n", den );
+			freq *= num;
+			freq /= den;
+			LOG_TRACE( "CPU freq = %u kHz\n", freq );
+			LOG_TRACE( "Requested wait: %llu nsec\n", (unsigned long long)wait_time );
+			wait_time = ((unsigned long long)wait_time * freq) / NSEC_PER_MSEC;
+			LOG_TRACE( "Requested wait duration: %llu cycles\n", (unsigned long long)wait_time );
+		} else {
+			LOG_ERROR("CPUID leaf 0x15 did not report a usable TSC frequency; wait time left in nanoseconds.\n");
+		}
+
 
 		// display configuration
 		LOG_TRACE("log_level: %d, collect_stats: %d, use_std_lib_calls: %d, dsa_min_size: %lu, "
@@ -1484,6 +1517,66 @@ static bool is_overlapping_buffers (void *dest, const void *src, size_t n)
 	return true;
 }
 
+/*
+ * Copy n bytes from src to dest on DSA and run cb(args) on the calling thread
+ * while the descriptor is in flight.
+ *
+ * cb is invoked only if the descriptor was submitted successfully; a NULL cb
+ * is skipped. Once cb returns, dsa_wait() is called for the completion.
+ * Bytes not reported in thr_bytes_completed (all of them when submission
+ * fails) are copied with orig_memcpy() before returning.
+ */
+__attribute__((visibility("default"))) void dto_memcpy_async(void *dest, const void *src, size_t n, callback_t cb, void* args) {
+	int result = 0;
+	struct dto_wq *wq = get_wq(dest);
+#ifdef DTO_STATS_SUPPORT
+	struct timespec st, et;
+	size_t orig_n = n;
+	DTO_COLLECT_STATS_START(collect_stats, st);
+#endif
+
+	thr_desc.opcode = DSA_OPCODE_MEMMOVE;
+	thr_desc.flags = IDXD_OP_FLAG_CRAV | IDXD_OP_FLAG_RCR;
+	if (dto_dsa_cc && (wq->dsa_gencap & GENCAP_CC_MEMORY))
+		thr_desc.flags |= IDXD_OP_FLAG_CC;
+	thr_desc.completion_addr = (uint64_t)&thr_comp;
+
+	thr_bytes_completed = 0;
+
+	thr_desc.src_addr = (uint64_t) src;
+	thr_desc.dst_addr = (uint64_t) dest;
+	thr_desc.xfer_size = (uint32_t) n;
+	thr_comp.status = 0;
+	result = dsa_submit(wq, &thr_desc);
+	if (result == SUCCESS) {
+		/* A NULL callback just means there is nothing to overlap; don't crash on it. */
+		if (cb)
+			cb(args);
+		result = dsa_wait(wq, &thr_desc, &thr_comp.status);
+	}
+#ifdef DTO_STATS_SUPPORT
+	DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMCOPY_ASYNC, n, thr_bytes_completed, result);
+#endif
+	if (thr_bytes_completed != n) {
+		/* fallback to std call if job is only partially completed */
+		n -= thr_bytes_completed;
+		if (thr_comp.result == 0) {
+			dest = (void *)((uint64_t)dest + thr_bytes_completed);
+			src = (const void *)((uint64_t)src + thr_bytes_completed);
+		}
+#ifdef DTO_STATS_SUPPORT
+		DTO_COLLECT_STATS_START(collect_stats, st);
+#endif
+
+		orig_memcpy(dest, src, n);
+
+#ifdef DTO_STATS_SUPPORT
+		/* Account the CPU remainder under the same memop as the DSA part. */
+		DTO_COLLECT_STATS_CPU_END(collect_stats, st, et, MEMCOPY_ASYNC, n, orig_n);
+#endif
+	}
+}
+
 static void dto_memcpymove(void *dest, const void *src, size_t n, bool is_memcpy, int *result)
 {
 	struct dto_wq *wq = get_wq(dest);
diff --git a/dto.h b/dto.h
new file mode 100644
index 0000000..30045dd
--- /dev/null
+++ b/dto.h
@@ -0,0 +1,26 @@
+
+#ifndef DTO_H
+#define DTO_H
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Work to run on the calling thread while a DSA copy is in flight. */
+typedef void(*callback_t)(void*);
+
+/*
+ * Copy n bytes from src to dest using DSA. cb(args) is called on the calling
+ * thread after the copy has been submitted and before waiting for it; it is
+ * not called if submission fails. The copy is complete on return.
+ */
+void dto_memcpy_async(void *dest, const void *src, size_t n, callback_t cb, void* args);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+